119 files changed, 15683 insertions, 1514 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index eaff24a19502..cfe512fd1caf 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -220,6 +220,7 @@ source "fs/pstore/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
+source "fs/f2fs/Kconfig"
 endif # MISC_FILESYSTEMS
diff --git a/fs/Makefile b/fs/Makefile
index 1d7af79288a0..9d53192236fc 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -123,6 +123,7 @@ obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)          += ocfs2/
 obj-$(CONFIG_BTRFS_FS)          += btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
+obj-$(CONFIG_F2FS_FS)           += f2fs/
 obj-y                           += exofs/ # Multiple modules
 obj-$(CONFIG_CEPH_FS)           += ceph/
 obj-$(CONFIG_PSTORE)            += pstore/
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index e9bad5093a3f..5f95d1ed9c6d 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -45,6 +45,14 @@ static int adfs_readpage(struct file *file, struct page *page)
        return block_read_full_page(page, adfs_get_block);
 }
+/*
+ * Clean up after a failed write: when the attempted write extended past
+ * the inode's current i_size, truncate the page cache from the attempted
+ * end offset back down to i_size.
+ */
+static void adfs_write_failed(struct address_space *mapping, loff_t to)
+{
+        struct inode *host = mapping->host;
+
+        /* The write stayed within i_size, so there is nothing to undo. */
+        if (to <= host->i_size)
+                return;
+
+        truncate_pagecache(host, to, host->i_size);
+}
 static int adfs_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
@@ -55,11 +63,8 @@ static int adfs_write_begin(struct file *file, struct address_space *mapping,
        ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                                adfs_get_block,
                                &ADFS_I(mapping->host)->mmu_private);
-        if (unlikely(ret)) {
+        if (unlikely(ret))
-                loff_t isize = mapping->host->i_size;
+                adfs_write_failed(mapping, pos + len);
-                if (pos + len > isize)
-                        vmtruncate(mapping->host, isize);
-        }
        return ret;
 }
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 2f4c935cb327..af3261b78102 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -39,7 +39,6 @@ const struct file_operations affs_file_operations = {
 };
 const struct inode_operations affs_file_inode_operations = {
-        .truncate       = affs_truncate,
        .setattr        = affs_notify_change,
 };
@@ -402,6 +401,16 @@ static int affs_readpage(struct file *file, struct page *page)
        return block_read_full_page(page, affs_get_block);
 }
+/*
+ * Clean up after a failed write: when the attempted write extended past
+ * the inode's current i_size, truncate the page cache back to i_size and
+ * then call affs_truncate() on the inode.
+ */
+static void affs_write_failed(struct address_space *mapping, loff_t to)
+{
+        struct inode *host = mapping->host;
+
+        /* The write stayed within i_size, so there is nothing to undo. */
+        if (to <= host->i_size)
+                return;
+
+        truncate_pagecache(host, to, host->i_size);
+        affs_truncate(host);
+}
 static int affs_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
@@ -412,11 +421,8 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
        ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                                affs_get_block,
                                &AFFS_I(mapping->host)->mmu_private);
-        if (unlikely(ret)) {
+        if (unlikely(ret))
-                loff_t isize = mapping->host->i_size;
+                affs_write_failed(mapping, pos + len);
-                if (pos + len > isize)
-                        vmtruncate(mapping->host, isize);
-        }
        return ret;
 }
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 15c484268229..0e092d08680e 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -237,9 +237,12 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
        if ((attr->ia_valid & ATTR_SIZE) &&
            attr->ia_size != i_size_read(inode)) {
-                error = vmtruncate(inode, attr->ia_size);
+                error = inode_newsize_ok(inode, attr->ia_size);
                if (error)
                        return error;
+                truncate_setsize(inode, attr->ia_size);
+                affs_truncate(inode);
        }
        setattr_copy(inode, attr);
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index f20e8a71062f..ad3ea1497cc3 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -161,6 +161,14 @@ static int bfs_readpage(struct file *file, struct page *page)
        return block_read_full_page(page, bfs_get_block);
 }
+/*
+ * Clean up after a failed write: when the attempted write extended past
+ * the inode's current i_size, truncate the page cache from the attempted
+ * end offset back down to i_size.
+ */
+static void bfs_write_failed(struct address_space *mapping, loff_t to)
+{
+        struct inode *host = mapping->host;
+
+        /* The write stayed within i_size, so there is nothing to undo. */
+        if (to <= host->i_size)
+                return;
+
+        truncate_pagecache(host, to, host->i_size);
+}
 static int bfs_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
@@ -169,11 +177,8 @@ static int bfs_write_begin(struct file *file, struct address_space *mapping,
        ret = block_write_begin(mapping, pos, len, flags, pagep,
                                bfs_get_block);
-        if (unlikely(ret)) {
+        if (unlikely(ret))
-                loff_t isize = mapping->host->i_size;
+                bfs_write_failed(mapping, pos + len);
-                if (pos + len > isize)
-                        vmtruncate(mapping->host, isize);
-        }
        return ret;
 }
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c7b67cf24bba..eea5da7a2b9a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1138,13 +1138,13 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
                switch (tm->op) {
                case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
                        BUG_ON(tm->slot < n);
-                case MOD_LOG_KEY_REMOVE:
-                        n++;
                case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
+                case MOD_LOG_KEY_REMOVE:
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
+                        n++;
                        break;
                case MOD_LOG_KEY_REPLACE:
                        BUG_ON(tm->slot >= n);
@@ -4611,12 +4611,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        u32 nritems;
        int ret;
-        if (level) {
-                ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
-                                              MOD_LOG_KEY_REMOVE);
-                BUG_ON(ret < 0);
-        }
        nritems = btrfs_header_nritems(parent);
        if (slot != nritems - 1) {
                if (level)
@@ -4627,6 +4621,10 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                              btrfs_node_key_ptr_offset(slot + 1),
                              sizeof(struct btrfs_key_ptr) *
                              (nritems - slot - 1));
+        } else if (level) {
+                ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
+                                              MOD_LOG_KEY_REMOVE);
+                BUG_ON(ret < 0);
        }
        nritems--;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 67ed24ae86bb..16d9e8e191e6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4262,16 +4262,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
        if (dentry->d_name.len > BTRFS_NAME_LEN)
                return ERR_PTR(-ENAMETOOLONG);
-        if (unlikely(d_need_lookup(dentry))) {
+        ret = btrfs_inode_by_name(dir, dentry, &location);
-                memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key));
-                kfree(dentry->d_fsdata);
-                dentry->d_fsdata = NULL;
-                /* This thing is hashed, drop it for now */
-                d_drop(dentry);
-        } else {
-                ret = btrfs_inode_by_name(dir, dentry, &location);
-        }
        if (ret < 0)
                return ERR_PTR(ret);
@@ -4341,11 +4332,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
        struct dentry *ret;
        ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
-        if (unlikely(d_need_lookup(dentry))) {
-                spin_lock(&dentry->d_lock);
-                dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
-                spin_unlock(&dentry->d_lock);
-        }
        return ret;
 }
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 67bef6d01484..746ce532e130 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -41,12 +41,12 @@ static struct fscache_object *cachefiles_alloc_object(
        _enter("{%s},%p,", cache->cache.identifier, cookie);
-        lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL);
+        lookup_data = kmalloc(sizeof(*lookup_data), cachefiles_gfp);
        if (!lookup_data)
                goto nomem_lookup_data;
        /* create a new object record and a temporary leaf image */
-        object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
+        object = kmem_cache_alloc(cachefiles_object_jar, cachefiles_gfp);
        if (!object)
                goto nomem_object;
@@ -63,7 +63,7 @@ static struct fscache_object *cachefiles_alloc_object(
         * - stick the length on the front and leave space on the back for the
         *   encoder
         */
-        buffer = kmalloc((2 + 512) + 3, GFP_KERNEL);
+        buffer = kmalloc((2 + 512) + 3, cachefiles_gfp);
        if (!buffer)
                goto nomem_buffer;
@@ -219,7 +219,7 @@ static void cachefiles_update_object(struct fscache_object *_object)
                return;
        }
-        auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL);
+        auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp);
        if (!auxdata) {
                _leave(" [nomem]");
                return;
@@ -441,6 +441,54 @@ truncate_failed:
 }
 /*
+ * Invalidate an object
+ */
+static void cachefiles_invalidate_object(struct fscache_operation *op)
+{
+        struct cachefiles_object *object =
+                container_of(op->object, struct cachefiles_object, fscache);
+        struct cachefiles_cache *cache =
+                container_of(object->fscache.cache,
+                             struct cachefiles_cache, cache);
+        const struct cred *saved_cred;
+        struct path path;
+        uint64_t ni_size;
+        int ret;
+
+        /* Ask the cookie's get_attr op for the size to restore afterwards. */
+        op->object->cookie->def->get_attr(op->object->cookie->netfs_data,
+                                          &ni_size);
+
+        _enter("{OBJ%x},[%llu]",
+               op->object->debug_id, (unsigned long long)ni_size);
+
+        /* Without a backing dentry there is no file to truncate. */
+        if (!object->backer)
+                goto done;
+
+        ASSERT(S_ISREG(object->backer->d_inode->i_mode));
+
+        fscache_set_store_limit(&object->fscache, ni_size);
+
+        path.dentry = object->backer;
+        path.mnt = cache->mnt;
+
+        /*
+         * Truncate the backing file to zero and then back out to ni_size,
+         * bracketed by cachefiles_begin_secure()/cachefiles_end_secure().
+         */
+        cachefiles_begin_secure(cache, &saved_cred);
+        ret = vfs_truncate(&path, 0);
+        if (ret == 0)
+                ret = vfs_truncate(&path, ni_size);
+        cachefiles_end_secure(cache, saved_cred);
+
+        if (ret != 0) {
+                /* On failure, drop the store limit back to zero. */
+                fscache_set_store_limit(&object->fscache, 0);
+                if (ret == -EIO)
+                        cachefiles_io_error_obj(object,
+                                                "Invalidate failed");
+        }
+
+done:
+        /* The operation is completed whether or not a truncate was done. */
+        fscache_op_complete(op, true);
+        _leave("");
+}
+/*
 * dissociate a cache from all the pages it was backing
 */
 static void cachefiles_dissociate_pages(struct fscache_cache *cache)
@@ -455,6 +503,7 @@ const struct fscache_cache_ops cachefiles_cache_ops = {
        .lookup_complete        = cachefiles_lookup_complete,
        .grab_object            = cachefiles_grab_object,
        .update_object          = cachefiles_update_object,
+        .invalidate_object      = cachefiles_invalidate_object,
        .drop_object            = cachefiles_drop_object,
        .put_object             = cachefiles_put_object,
        .sync_cache             = cachefiles_sync_cache,
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index bd6bc1bde2d7..49382519907a 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -23,6 +23,8 @@ extern unsigned cachefiles_debug;
 #define CACHEFILES_DEBUG_KLEAVE 2
 #define CACHEFILES_DEBUG_KDEBUG 4
+/*
+ * GFP mask for cachefiles' own allocations.
+ * NOTE(review): presumably chosen so that cache allocations fail early
+ * rather than retrying or using emergency reserves - confirm against the
+ * gfp flag documentation.
+ */
+#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC)
 /*
 * node records
 */
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
index 81b8b2b3a674..33b58c60f2d1 100644
--- a/fs/cachefiles/key.c
+++ b/fs/cachefiles/key.c
@@ -78,7 +78,7 @@ char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type)
        _debug("max: %d", max);
-        key = kmalloc(max, GFP_KERNEL);
+        key = kmalloc(max, cachefiles_gfp);
        if (!key)
                return NULL;
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index b0b5f7cdfffa..8c01c5fcdf75 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -40,8 +40,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
        printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
               prefix, fscache_object_states[object->fscache.state],
               object->fscache.flags, work_busy(&object->fscache.work),
-               object->fscache.events,
+               object->fscache.events, object->fscache.event_mask);
-               object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK);
        printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
               prefix, object->fscache.n_ops, object->fscache.n_in_progress,
               object->fscache.n_exclusive);
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index c994691d9445..480992259707 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -77,25 +77,25 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
        struct page *backpage = monitor->back_page, *backpage2;
        int ret;
-        kenter("{ino=%lx},{%lx,%lx}",
+        _enter("{ino=%lx},{%lx,%lx}",
               object->backer->d_inode->i_ino,
               backpage->index, backpage->flags);
        /* skip if the page was truncated away completely */
        if (backpage->mapping != bmapping) {
-                kleave(" = -ENODATA [mapping]");
+                _leave(" = -ENODATA [mapping]");
                return -ENODATA;
        }
        backpage2 = find_get_page(bmapping, backpage->index);
        if (!backpage2) {
-                kleave(" = -ENODATA [gone]");
+                _leave(" = -ENODATA [gone]");
                return -ENODATA;
        }
        if (backpage != backpage2) {
                put_page(backpage2);
-                kleave(" = -ENODATA [different]");
+                _leave(" = -ENODATA [different]");
                return -ENODATA;
        }
@@ -114,7 +114,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
                if (PageUptodate(backpage))
                        goto unlock_discard;
-                kdebug("reissue read");
+                _debug("reissue read");
                ret = bmapping->a_ops->readpage(NULL, backpage);
                if (ret < 0)
                        goto unlock_discard;
@@ -129,7 +129,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
        }
        /* it'll reappear on the todo list */
-        kleave(" = -EINPROGRESS");
+        _leave(" = -EINPROGRESS");
        return -EINPROGRESS;
 unlock_discard:
@@ -137,7 +137,7 @@ unlock_discard:
        spin_lock_irq(&object->work_lock);
        list_del(&monitor->op_link);
        spin_unlock_irq(&object->work_lock);
-        kleave(" = %d", ret);
+        _leave(" = %d", ret);
        return ret;
 }
@@ -174,11 +174,13 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
                _debug("- copy {%lu}", monitor->back_page->index);
        recheck:
-                if (PageUptodate(monitor->back_page)) {
+                if (test_bit(FSCACHE_COOKIE_INVALIDATING,
+                             &object->fscache.cookie->flags)) {
+                        error = -ESTALE;
+                } else if (PageUptodate(monitor->back_page)) {
                        copy_highpage(monitor->netfs_page, monitor->back_page);
+                        fscache_mark_page_cached(monitor->op,
-                        pagevec_add(&pagevec, monitor->netfs_page);
+                                                 monitor->netfs_page);
-                        fscache_mark_pages_cached(monitor->op, &pagevec);
                        error = 0;
                } else if (!PageError(monitor->back_page)) {
                        /* the page has probably been truncated */
@@ -198,6 +200,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
                fscache_end_io(op, monitor->netfs_page, error);
                page_cache_release(monitor->netfs_page);
+                fscache_retrieval_complete(op, 1);
                fscache_put_retrieval(op);
                kfree(monitor);
@@ -239,7 +242,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
        _debug("read back %p{%lu,%d}",
               netpage, netpage->index, page_count(netpage));
-        monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
+        monitor = kzalloc(sizeof(*monitor), cachefiles_gfp);
        if (!monitor)
                goto nomem;
@@ -258,13 +261,14 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
                        goto backing_page_already_present;
                if (!newpage) {
-                        newpage = page_cache_alloc_cold(bmapping);
+                        newpage = __page_cache_alloc(cachefiles_gfp |
+                                                     __GFP_COLD);
                        if (!newpage)
                                goto nomem_monitor;
                }
                ret = add_to_page_cache(newpage, bmapping,
-                                        netpage->index, GFP_KERNEL);
+                                        netpage->index, cachefiles_gfp);
                if (ret == 0)
                        goto installed_new_backing_page;
                if (ret != -EEXIST)
@@ -335,11 +339,11 @@ backing_page_already_present:
 backing_page_already_uptodate:
        _debug("- uptodate");
-        pagevec_add(pagevec, netpage);
+        fscache_mark_page_cached(op, netpage);
-        fscache_mark_pages_cached(op, pagevec);
        copy_highpage(netpage, backpage);
        fscache_end_io(op, netpage, 0);
+        fscache_retrieval_complete(op, 1);
 success:
        _debug("success");
@@ -357,10 +361,13 @@ out:
 read_error:
        _debug("read error %d", ret);
-        if (ret == -ENOMEM)
+        if (ret == -ENOMEM) {
+                fscache_retrieval_complete(op, 1);
                goto out;
+        }
 io_error:
        cachefiles_io_error_obj(object, "Page read error on backing file");
+        fscache_retrieval_complete(op, 1);
        ret = -ENOBUFS;
        goto out;
@@ -370,6 +377,7 @@ nomem_monitor:
        fscache_put_retrieval(monitor->op);
        kfree(monitor);
 nomem:
+        fscache_retrieval_complete(op, 1);
        _leave(" = -ENOMEM");
        return -ENOMEM;
 }
@@ -408,7 +416,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
        _enter("{%p},{%lx},,,", object, page->index);
        if (!object->backer)
-                return -ENOBUFS;
+                goto enobufs;
        inode = object->backer->d_inode;
        ASSERT(S_ISREG(inode->i_mode));
@@ -417,7 +425,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
        /* calculate the shift required to use bmap */
        if (inode->i_sb->s_blocksize > PAGE_SIZE)
-                return -ENOBUFS;
+                goto enobufs;
        shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
@@ -448,15 +456,20 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
                                                       &pagevec);
        } else if (cachefiles_has_space(cache, 0, 1) == 0) {
                /* there's space in the cache we can use */
-                pagevec_add(&pagevec, page);
+                fscache_mark_page_cached(op, page);
-                fscache_mark_pages_cached(op, &pagevec);
+                fscache_retrieval_complete(op, 1);
                ret = -ENODATA;
        } else {
-                ret = -ENOBUFS;
+                goto enobufs;
        }
        _leave(" = %d", ret);
        return ret;
+enobufs:
+        fscache_retrieval_complete(op, 1);
+        _leave(" = -ENOBUFS");
+        return -ENOBUFS;
 }
 /*
@@ -465,8 +478,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 */
 static int cachefiles_read_backing_file(struct cachefiles_object *object,
                                        struct fscache_retrieval *op,
-                                        struct list_head *list,
+                                        struct list_head *list)
-                                        struct pagevec *mark_pvec)
 {
        struct cachefiles_one_read *monitor = NULL;
        struct address_space *bmapping = object->backer->d_inode->i_mapping;
@@ -485,7 +497,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
                       netpage, netpage->index, page_count(netpage));
                if (!monitor) {
-                        monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
+                        monitor = kzalloc(sizeof(*monitor), cachefiles_gfp);
                        if (!monitor)
                                goto nomem;
@@ -500,13 +512,14 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
                                goto backing_page_already_present;
                        if (!newpage) {
-                                newpage = page_cache_alloc_cold(bmapping);
+                                newpage = __page_cache_alloc(cachefiles_gfp |
+                                                             __GFP_COLD);
                                if (!newpage)
                                        goto nomem;
                        }
                        ret = add_to_page_cache(newpage, bmapping,
-                                                netpage->index, GFP_KERNEL);
+                                                netpage->index, cachefiles_gfp);
                        if (ret == 0)
                                goto installed_new_backing_page;
                        if (ret != -EEXIST)
@@ -536,10 +549,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
                _debug("- monitor add");
                ret = add_to_page_cache(netpage, op->mapping, netpage->index,
-                                        GFP_KERNEL);
+                                        cachefiles_gfp);
                if (ret < 0) {
                        if (ret == -EEXIST) {
                                page_cache_release(netpage);
+                                fscache_retrieval_complete(op, 1);
                                continue;
                        }
                        goto nomem;
@@ -612,10 +626,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
                _debug("- uptodate");
                ret = add_to_page_cache(netpage, op->mapping, netpage->index,
-                                        GFP_KERNEL);
+                                        cachefiles_gfp);
                if (ret < 0) {
                        if (ret == -EEXIST) {
                                page_cache_release(netpage);
+                                fscache_retrieval_complete(op, 1);
                                continue;
                        }
                        goto nomem;
@@ -626,16 +641,17 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
                page_cache_release(backpage);
                backpage = NULL;
-                if (!pagevec_add(mark_pvec, netpage))
+                fscache_mark_page_cached(op, netpage);
-                        fscache_mark_pages_cached(op, mark_pvec);
                page_cache_get(netpage);
                if (!pagevec_add(&lru_pvec, netpage))
                        __pagevec_lru_add_file(&lru_pvec);
+                /* the netpage is unlocked and marked up to date here */
                fscache_end_io(op, netpage, 0);
                page_cache_release(netpage);
                netpage = NULL;
+                fscache_retrieval_complete(op, 1);
                continue;
        }
@@ -661,6 +677,7 @@ out:
        list_for_each_entry_safe(netpage, _n, list, lru) {
                list_del(&netpage->lru);
                page_cache_release(netpage);
+                fscache_retrieval_complete(op, 1);
        }
        _leave(" = %d", ret);
@@ -669,15 +686,17 @@ out:
 nomem:
        _debug("nomem");
        ret = -ENOMEM;
-        goto out;
+        goto record_page_complete;
 read_error:
        _debug("read error %d", ret);
        if (ret == -ENOMEM)
-                goto out;
+                goto record_page_complete;
 io_error:
        cachefiles_io_error_obj(object, "Page read error on backing file");
        ret = -ENOBUFS;
+record_page_complete:
+        fscache_retrieval_complete(op, 1);
        goto out;
 }
@@ -709,7 +728,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
               *nr_pages);
        if (!object->backer)
-                return -ENOBUFS;
+                goto all_enobufs;
        space = 1;
        if (cachefiles_has_space(cache, 0, *nr_pages) < 0)
@@ -722,7 +741,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
        /* calculate the shift required to use bmap */
        if (inode->i_sb->s_blocksize > PAGE_SIZE)
-                return -ENOBUFS;
+                goto all_enobufs;
        shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
@@ -762,7 +781,10 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
                        nrbackpages++;
                } else if (space && pagevec_add(&pagevec, page) == 0) {
                        fscache_mark_pages_cached(op, &pagevec);
+                        fscache_retrieval_complete(op, 1);
                        ret = -ENODATA;
+                } else {
+                        fscache_retrieval_complete(op, 1);
                }
        }
@@ -775,18 +797,18 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
        /* submit the apparently valid pages to the backing fs to be read from
         * disk */
        if (nrbackpages > 0) {
-                ret2 = cachefiles_read_backing_file(object, op, &backpages,
+                ret2 = cachefiles_read_backing_file(object, op, &backpages);
-                                                    &pagevec);
                if (ret2 == -ENOMEM || ret2 == -EINTR)
                        ret = ret2;
        }
-        if (pagevec_count(&pagevec) > 0)
-                fscache_mark_pages_cached(op, &pagevec);
        _leave(" = %d [nr=%u%s]",
               ret, *nr_pages, list_empty(pages) ? " empty" : "");
        return ret;
+all_enobufs:
+        fscache_retrieval_complete(op, *nr_pages);
+        return -ENOBUFS;
 }
 /*
@@ -806,7 +828,6 @@ int cachefiles_allocate_page(struct fscache_retrieval *op,
 {
        struct cachefiles_object *object;
        struct cachefiles_cache *cache;
-        struct pagevec pagevec;
        int ret;
        object = container_of(op->op.object,
@@ -817,14 +838,12 @@ int cachefiles_allocate_page(struct fscache_retrieval *op,
        _enter("%p,{%lx},", object, page->index);
        ret = cachefiles_has_space(cache, 0, 1);
-        if (ret == 0) {
+        if (ret == 0)
-                pagevec_init(&pagevec, 0);
+                fscache_mark_page_cached(op, page);
-                pagevec_add(&pagevec, page);
+        else
-                fscache_mark_pages_cached(op, &pagevec);
-        } else {
                ret = -ENOBUFS;
-        }
+        fscache_retrieval_complete(op, 1);
        _leave(" = %d", ret);
        return ret;
 }
@@ -874,6 +893,7 @@ int cachefiles_allocate_pages(struct fscache_retrieval *op,
                ret = -ENOBUFS;
        }
+        fscache_retrieval_complete(op, *nr_pages);
        _leave(" = %d", ret);
        return ret;
 }
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index e18b183b47e1..73b46288b54b 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -174,7 +174,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object,
        ASSERT(dentry);
        ASSERT(dentry->d_inode);
-        auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
+        auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp);
        if (!auxbuf) {
                _leave(" = -ENOMEM");
                return -ENOMEM;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6690269f5dde..064d1a68d2c1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -267,6 +267,14 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
        kfree(req->r_pages);
 }
+/*
+ * Call unlock_page() on each of the first @num_pages entries of @pages.
+ */
+static void ceph_unlock_page_vector(struct page **pages, int num_pages)
+{
+        int idx = 0;
+
+        while (idx < num_pages) {
+                unlock_page(pages[idx]);
+                idx++;
+        }
+}
 /*
 * start an async read(ahead) operation.  return nr_pages we submitted
 * a read for on success, or negative error code.
@@ -347,6 +355,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
        return nr_pages;
 out_pages:
+        ceph_unlock_page_vector(pages, nr_pages);
        ceph_release_page_vector(pages, nr_pages);
 out:
        ceph_osdc_put_request(req);
@@ -1078,23 +1087,51 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
                            struct page **pagep, void **fsdata)
 {
        struct inode *inode = file->f_dentry->d_inode;
+        struct ceph_inode_info *ci = ceph_inode(inode);
+        struct ceph_file_info *fi = file->private_data;
        struct page *page;
        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-        int r;
+        int r, want, got = 0;
+        /* Also want the LAZYIO cap when the file was opened in lazy mode. */
+        if (fi->fmode & CEPH_FILE_MODE_LAZY)
+                want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+        else
+                want = CEPH_CAP_FILE_BUFFER;
+        dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
+             inode, ceph_vinop(inode), pos, len, inode->i_size);
+        r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len);
+        if (r < 0)
+                return r;
+        dout("write_begin %p %llx.%llx %llu~%u  got cap refs on %s\n",
+             inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
+        /* Neither BUFFER nor LAZYIO was granted: drop the refs and bail. */
+        if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) {
+                ceph_put_cap_refs(ci, got);
+                return -EAGAIN;
+        }
        do {
                /* get a page */
                page = grab_cache_page_write_begin(mapping, index, 0);
-                if (!page)
+                if (!page) {
-                        return -ENOMEM;
+                        r = -ENOMEM;
-                *pagep = page;
+                        break;
+                }
                dout("write_begin file %p inode %p page %p %d~%d\n", file,
                     inode, page, (int)pos, (int)len);
                r = ceph_update_writeable_page(file, pos, len, page);
+                if (r)
+                        page_cache_release(page);
        } while (r == -EAGAIN);
+        if (r) {
+                ceph_put_cap_refs(ci, got);
+        } else {
+                *pagep = page;
+                /*
+                 * Pass the cap refs to ceph_write_end() encoded in the
+                 * pointer value itself; it decodes them with
+                 * (unsigned long)fsdata.  Storing through an int * would
+                 * fill only part of the pointer-sized slot, leaving the
+                 * decoded value wrong on 64-bit big-endian machines.
+                 */
+                *fsdata = (void *)(unsigned long)got;
+        }
        return r;
 }
@@ -1108,10 +1145,12 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
                          struct page *page, void *fsdata)
 {
        struct inode *inode = file->f_dentry->d_inode;
+        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_mds_client *mdsc = fsc->mdsc;
        unsigned from = pos & (PAGE_CACHE_SIZE - 1);
        int check_cap = 0;
+        int got = (unsigned long)fsdata;
        dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
             inode, page, (int)pos, (int)copied, (int)len);
@@ -1134,6 +1173,19 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
        up_read(&mdsc->snap_rwsem);
        page_cache_release(page);
+        if (copied > 0) {
+                int dirty;
+                spin_lock(&ci->i_ceph_lock);
+                dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+                spin_unlock(&ci->i_ceph_lock);
+                if (dirty)
+                        __mark_inode_dirty(inode, dirty);
+        }
+        dout("write_end %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
+             inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
+        ceph_put_cap_refs(ci, got);
        if (check_cap)
                ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 3251e9cc6401..a1d9bb30c1bf 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -236,8 +236,10 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
        if (!ctx) {
                cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
                if (cap) {
+                        spin_lock(&mdsc->caps_list_lock);
                        mdsc->caps_use_count++;
                        mdsc->caps_total_count++;
+                        spin_unlock(&mdsc->caps_list_lock);
                }
                return cap;
        }
@@ -1349,11 +1351,15 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
                if (!ci->i_head_snapc)
                        ci->i_head_snapc = ceph_get_snap_context(
                                ci->i_snap_realm->cached_context);
-                dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
+                dout(" inode %p now dirty snapc %p auth cap %p\n",
-                        ci->i_head_snapc);
+                     &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
                BUG_ON(!list_empty(&ci->i_dirty_item));
                spin_lock(&mdsc->cap_dirty_lock);
-                list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+                if (ci->i_auth_cap)
+                        list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+                else
+                        list_add(&ci->i_dirty_item,
+                                 &mdsc->cap_dirty_migrating);
                spin_unlock(&mdsc->cap_dirty_lock);
                if (ci->i_flushing_caps == 0) {
                        ihold(inode);
@@ -2388,7 +2394,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
                            &atime);
        /* max size increase? */
-        if (max_size != ci->i_max_size) {
+        if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
                dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
                ci->i_max_size = max_size;
                if (max_size >= ci->i_wanted_max_size) {
@@ -2745,6 +2751,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
        /* make sure we re-request max_size, if necessary */
        spin_lock(&ci->i_ceph_lock);
+        ci->i_wanted_max_size = 0;  /* reset */
        ci->i_requested_max_size = 0;
        spin_unlock(&ci->i_ceph_lock);
 }
@@ -2840,8 +2847,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
        case CEPH_CAP_OP_IMPORT:
                handle_cap_import(mdsc, inode, h, session,
                                  snaptrace, snaptrace_len);
-                ceph_check_caps(ceph_inode(inode), 0, session);
-                goto done_unlocked;
        }
        /* the rest require a cap */
@@ -2858,6 +2863,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
        switch (op) {
        case CEPH_CAP_OP_REVOKE:
        case CEPH_CAP_OP_GRANT:
+        case CEPH_CAP_OP_IMPORT:
                handle_cap_grant(inode, h, session, cap, msg->middle);
                goto done_unlocked;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d4dfdcf76d7f..e51558fca3a3 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -712,63 +712,53 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
        struct ceph_osd_client *osdc =
                &ceph_sb_to_client(inode->i_sb)->client->osdc;
        loff_t endoff = pos + iov->iov_len;
-        int want, got = 0;
+        int got = 0;
-        int ret, err;
+        int ret, err, written;
        if (ceph_snap(inode) != CEPH_NOSNAP)
                return -EROFS;
 retry_snap:
+        written = 0;
        if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
                return -ENOSPC;
        __ceph_do_pending_vmtruncate(inode);
-        dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
-             inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
-             inode->i_size);
-        if (fi->fmode & CEPH_FILE_MODE_LAZY)
-                want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
-        else
-                want = CEPH_CAP_FILE_BUFFER;
-        ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
-        if (ret < 0)
-                goto out_put;
-        dout("aio_write %p %llx.%llx %llu~%u  got cap refs on %s\n",
-             inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
-             ceph_cap_string(got));
-        if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
-            (iocb->ki_filp->f_flags & O_DIRECT) ||
-            (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
-            (fi->flags & CEPH_F_SYNC)) {
-                ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
-                        &iocb->ki_pos);
-        } else {
-                /*
-                 * buffered write; drop Fw early to avoid slow
-                 * revocation if we get stuck on balance_dirty_pages
-                 */
-                int dirty;
-                spin_lock(&ci->i_ceph_lock);
-                dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
-                spin_unlock(&ci->i_ceph_lock);
-                ceph_put_cap_refs(ci, got);
+        /*
+         * try to do a buffered write.  if we don't have sufficient
+         * caps, we'll get -EAGAIN from generic_file_aio_write, or a
+         * short write if we only get caps for some pages.
+         */
+        if (!(iocb->ki_filp->f_flags & O_DIRECT) &&
+            !(inode->i_sb->s_flags & MS_SYNCHRONOUS) &&
+            !(fi->flags & CEPH_F_SYNC)) {
                ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+                if (ret >= 0)
+                        written = ret;
                if ((ret >= 0 || ret == -EIOCBQUEUED) &&
                    ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
                     || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
-                        err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
+                        err = vfs_fsync_range(file, pos, pos + written - 1, 1);
                        if (err < 0)
                                ret = err;
                }
+                if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff)
+                        goto out;
+        }
-                if (dirty)
+        dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
-                        __mark_inode_dirty(inode, dirty);
+             inode, ceph_vinop(inode), pos + written,
+             (unsigned)iov->iov_len - written, inode->i_size);
+        ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff);
+        if (ret < 0)
                goto out;
-        }
+        dout("aio_write %p %llx.%llx %llu~%u  got cap refs on %s\n",
+             inode, ceph_vinop(inode), pos + written,
+             (unsigned)iov->iov_len - written, ceph_cap_string(got));
+        ret = ceph_sync_write(file, iov->iov_base + written,
+                              iov->iov_len - written, &iocb->ki_pos);
        if (ret >= 0) {
                int dirty;
                spin_lock(&ci->i_ceph_lock);
@@ -777,13 +767,10 @@ retry_snap:
                if (dirty)
                        __mark_inode_dirty(inode, dirty);
        }
-out_put:
        dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
-             inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
+             inode, ceph_vinop(inode), pos + written,
-             ceph_cap_string(got));
+             (unsigned)iov->iov_len - written, ceph_cap_string(got));
        ceph_put_cap_refs(ci, got);
 out:
        if (ret == -EOLDSNAPC) {
                dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ba95eea201bf..2971eaa65cdc 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1466,7 +1466,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        u64 to;
-        int wrbuffer_refs, wake = 0;
+        int wrbuffer_refs, finish = 0;
 retry:
        spin_lock(&ci->i_ceph_lock);
@@ -1498,15 +1498,18 @@ retry:
        truncate_inode_pages(inode->i_mapping, to);
        spin_lock(&ci->i_ceph_lock);
-        ci->i_truncate_pending--;
+        if (to == ci->i_truncate_size) {
-        if (ci->i_truncate_pending == 0)
+                ci->i_truncate_pending = 0;
-                wake = 1;
+                finish = 1;
+        }
        spin_unlock(&ci->i_ceph_lock);
+        if (!finish)
+                goto retry;
        if (wrbuffer_refs == 0)
                ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
-        if (wake)
-                wake_up_all(&ci->i_cap_wq);
+        wake_up_all(&ci->i_cap_wq);
 }
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1bcf712655d9..9165eb8309eb 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1590,7 +1590,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
        } else if (rpath || rino) {
                *ino = rino;
                *ppath = rpath;
-                *pathlen = strlen(rpath);
+                *pathlen = rpath ? strlen(rpath) : 0;
                dout(" path %.*s\n", *pathlen, rpath);
        }
@@ -1876,9 +1876,14 @@ finish:
 static void __wake_requests(struct ceph_mds_client *mdsc,
                            struct list_head *head)
 {
-        struct ceph_mds_request *req, *nreq;
+        struct ceph_mds_request *req;
+        LIST_HEAD(tmp_list);
+        list_splice_init(head, &tmp_list);
-        list_for_each_entry_safe(req, nreq, head, r_wait) {
+        while (!list_empty(&tmp_list)) {
+                req = list_entry(tmp_list.next,
+                                 struct ceph_mds_request, r_wait);
                list_del_init(&req->r_wait);
                __do_request(mdsc, req);
        }
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 2eb43f211325..e86aa9948124 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -403,8 +403,6 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
                seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
        if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
                seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
-        if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
-                seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
        if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
                seq_printf(m, ",osdkeepalivetimeout=%d",
                           opt->osd_keepalive_timeout);
@@ -849,7 +847,7 @@ static int ceph_register_bdi(struct super_block *sb,
                fsc->backing_dev_info.ra_pages =
                        default_backing_dev_info.ra_pages;
-        err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
+        err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
                           atomic_long_inc_return(&bdi_seq));
        if (!err)
                sb->s_bdi = &fsc->backing_dev_info;
diff --git a/fs/dcache.c b/fs/dcache.c
index 3a463d0c4fe8..19153a0a810c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -455,24 +455,6 @@ void d_drop(struct dentry *dentry)
 EXPORT_SYMBOL(d_drop);
 /*
- * d_clear_need_lookup - drop a dentry from cache and clear the need lookup flag
- * @dentry: dentry to drop
- *
- * This is called when we do a lookup on a placeholder dentry that needed to be
- * looked up.  The dentry should have been hashed in order for it to be found by
- * the lookup code, but now needs to be unhashed while we do the actual lookup
- * and clear the DCACHE_NEED_LOOKUP flag.
- */
-void d_clear_need_lookup(struct dentry *dentry)
-{
-        spin_lock(&dentry->d_lock);
-        __d_drop(dentry);
-        dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
-        spin_unlock(&dentry->d_lock);
-}
-EXPORT_SYMBOL(d_clear_need_lookup);
-/*
 * Finish off a dentry we've decided to kill.
 * dentry->d_lock must be held, returns with it unlocked.
 * If ref is non-zero, then decrement the refcount too.
@@ -565,13 +547,7 @@ repeat:
        if (d_unhashed(dentry))
                goto kill_it;
-        /*
+        dentry->d_flags |= DCACHE_REFERENCED;
-         * If this dentry needs lookup, don't set the referenced flag so that it
-         * is more likely to be cleaned up by the dcache shrinker in case of
-         * memory pressure.
-         */
-        if (!d_need_lookup(dentry))
-                dentry->d_flags |= DCACHE_REFERENCED;
        dentry_lru_add(dentry);
        dentry->d_count--;
@@ -1583,7 +1559,7 @@ EXPORT_SYMBOL(d_find_any_alias);
 */
 struct dentry *d_obtain_alias(struct inode *inode)
 {
-        static const struct qstr anonstring = { .name = "" };
+        static const struct qstr anonstring = QSTR_INIT("/", 1);
        struct dentry *tmp;
        struct dentry *res;
@@ -1737,13 +1713,6 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
        }
        /*
-         * We are going to instantiate this dentry, unhash it and clear the
-         * lookup flag so we can do that.
-         */
-        if (unlikely(d_need_lookup(found)))
-                d_clear_need_lookup(found);
-        /*
         * Negative dentry: instantiate it unless the inode is a directory and
         * already has a dentry.
         */
diff --git a/fs/exec.c b/fs/exec.c
index 237d5342786c..18c45cac368f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1669,7 +1669,6 @@ int get_dumpable(struct mm_struct *mm)
        return __get_dumpable(mm->flags);
 }
-#ifdef __ARCH_WANT_SYS_EXECVE
 SYSCALL_DEFINE3(execve,
                const char __user *, filename,
                const char __user *const __user *, argv,
@@ -1697,23 +1696,3 @@ asmlinkage long compat_sys_execve(const char __user * filename,
        return error;
 }
 #endif
-#endif
-#ifdef __ARCH_WANT_KERNEL_EXECVE
-int kernel_execve(const char *filename,
-                  const char *const argv[],
-                  const char *const envp[])
-{
-        int ret = do_execve(filename,
-                        (const char __user *const __user *)argv,
-                        (const char __user *const __user *)envp);
-        if (ret < 0)
-                return ret;
-        /*
-         * We were successful.  We won't be returning to our caller, but
-         * instead to user space by manipulating the kernel stack.
-         */
-        ret_from_kernel_execve(current_pt_regs());
-}
-#endif
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 606bb074c501..5df4bb4aab14 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -322,10 +322,10 @@ static int export_encode_fh(struct inode *inode, struct fid *fid,
        if (parent && (len < 4)) {
                *max_len = 4;
-                return 255;
+                return FILEID_INVALID;
        } else if (len < 2) {
                *max_len = 2;
-                return 255;
+                return FILEID_INVALID;
        }
        len = 2;
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
new file mode 100644
index 000000000000..fd27e7e6326e
--- /dev/null
+++ b/fs/f2fs/Kconfig
@@ -0,0 +1,53 @@
+config F2FS_FS
+        tristate "F2FS filesystem support (EXPERIMENTAL)"
+        depends on BLOCK
+        help
+          F2FS is based on Log-structured File System (LFS), which supports
+          versatile "flash-friendly" features. The design has been focused on
+          addressing the fundamental issues in LFS, which are snowball effect
+          of wandering tree and high cleaning overhead.
+          Since flash-based storages show different characteristics according to
+          the internal geometry or flash memory management schemes aka FTL, F2FS
+          and tools support various parameters not only for configuring on-disk
+          layout, but also for selecting allocation and cleaning algorithms.
+          If unsure, say N.
+config F2FS_STAT_FS
+        bool "F2FS Status Information"
+        depends on F2FS_FS && DEBUG_FS
+        default y
+        help
+          /sys/kernel/debug/f2fs/ contains information about all the partitions
+          mounted as f2fs. Each file shows the whole f2fs information.
+          /sys/kernel/debug/f2fs/status includes:
+            - major file system information managed by f2fs currently
+            - average SIT information about whole segments
+            - current memory footprint consumed by f2fs.
+config F2FS_FS_XATTR
+        bool "F2FS extended attributes"
+        depends on F2FS_FS
+        default y
+        help
+          Extended attributes are name:value pairs associated with inodes by
+          the kernel or by users (see the attr(5) manual page, or visit
+          <http://acl.bestbits.at/> for details).
+          If unsure, say N.
+config F2FS_FS_POSIX_ACL
+        bool "F2FS Access Control Lists"
+        depends on F2FS_FS_XATTR
+        select FS_POSIX_ACL
+        default y
+        help
+          POSIX Access Control Lists (ACLs) support permissions for users and
+          groups beyond the owner/group/world scheme.
+          To learn more about Access Control Lists, visit the POSIX ACLs for
+          Linux website <http://acl.bestbits.at/>.
+          If you don't know what Access Control Lists are, say N.
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
new file mode 100644
index 000000000000..27a0820340b9
--- /dev/null
+++ b/fs/f2fs/Makefile
@@ -0,0 +1,7 @@
+obj-$(CONFIG_F2FS_FS) += f2fs.o
+f2fs-y          := dir.o file.o inode.o namei.o hash.o super.o
+f2fs-y          += checkpoint.o gc.o data.o node.o segment.o recovery.o
+f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
+f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
+f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
new file mode 100644
index 000000000000..fed74d193ffb
--- /dev/null
+++ b/fs/f2fs/acl.c
@@ -0,0 +1,414 @@
+/*
+ * fs/f2fs/acl.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext2/acl.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/f2fs_fs.h>
+#include "f2fs.h"
+#include "xattr.h"
+#include "acl.h"
+/*
+ * Mode to use for ACL work on inode @i: F2FS_I(i)->i_acl_mode while the
+ * FI_ACL_MODE inode flag is set, otherwise the inode's own i_mode.
+ */
+#define get_inode_mode(i)       ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
+                                        (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
+/*
+ * On-disk byte size of an ACL with @count entries: one header, short entries
+ * for the first four slots, and full-size entries for every slot past four.
+ */
+static inline size_t f2fs_acl_size(int count)
+{
+        size_t bytes = sizeof(struct f2fs_acl_header);
+        if (count > 4) {
+                bytes += 4 * sizeof(struct f2fs_acl_entry_short);
+                bytes += (count - 4) * sizeof(struct f2fs_acl_entry);
+                return bytes;
+        }
+        bytes += count * sizeof(struct f2fs_acl_entry_short);
+        return bytes;
+}
+/*
+ * Derive the number of ACL entries from the byte size of an on-disk ACL.
+ * The layout assumed is the one f2fs_acl_size() produces: a header, up to
+ * four short entries, then full-size entries.
+ *
+ * Returns the entry count, or -1 when @size cannot match that layout,
+ * including a buffer too small to contain the header.
+ */
+static inline int f2fs_acl_count(size_t size)
+{
+        ssize_t s;
+        /*
+         * size is unsigned, so subtracting the header from a shorter buffer
+         * would wrap around and could yield a bogus positive count.
+         */
+        if (size < sizeof(struct f2fs_acl_header))
+                return -1;
+        size -= sizeof(struct f2fs_acl_header);
+        s = size - 4 * sizeof(struct f2fs_acl_entry_short);
+        if (s < 0) {
+                /* four or fewer entries: all of them short */
+                if (size % sizeof(struct f2fs_acl_entry_short))
+                        return -1;
+                return size / sizeof(struct f2fs_acl_entry_short);
+        } else {
+                /* four short entries followed by full-size ones */
+                if (s % sizeof(struct f2fs_acl_entry))
+                        return -1;
+                return s / sizeof(struct f2fs_acl_entry) + 4;
+        }
+}
+/*
+ * Convert an on-disk f2fs ACL xattr value into an in-memory posix_acl.
+ * @value: raw xattr payload (header followed by packed entries); may be NULL
+ * @size:  number of bytes available at @value
+ *
+ * Returns NULL when there is no payload or it holds zero entries, a newly
+ * allocated posix_acl on success, or an ERR_PTR: -EINVAL for a truncated,
+ * wrongly versioned or otherwise malformed payload, -ENOMEM when the ACL
+ * cannot be allocated.
+ */
+static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size)
+{
+        int i, count;
+        struct posix_acl *acl;
+        struct f2fs_acl_header *hdr;
+        struct f2fs_acl_entry *entry;
+        const char *end;
+        if (!value)
+                return NULL;
+        /* the header must be fully present before a_version is read */
+        if (size < sizeof(struct f2fs_acl_header))
+                return ERR_PTR(-EINVAL);
+        hdr = (struct f2fs_acl_header *)value;
+        entry = (struct f2fs_acl_entry *)(hdr + 1);
+        end = value + size;
+        if (hdr->a_version != cpu_to_le32(F2FS_ACL_VERSION))
+                return ERR_PTR(-EINVAL);
+        count = f2fs_acl_count(size);
+        if (count < 0)
+                return ERR_PTR(-EINVAL);
+        if (count == 0)
+                return NULL;
+        acl = posix_acl_alloc(count, GFP_KERNEL);
+        if (!acl)
+                return ERR_PTR(-ENOMEM);
+        for (i = 0; i < count; i++) {
+                /* every entry carries at least the short tag + perm form */
+                if ((char *)entry + sizeof(struct f2fs_acl_entry_short) > end)
+                        goto fail;
+                acl->a_entries[i].e_tag  = le16_to_cpu(entry->e_tag);
+                acl->a_entries[i].e_perm = le16_to_cpu(entry->e_perm);
+                switch (acl->a_entries[i].e_tag) {
+                case ACL_USER_OBJ:
+                case ACL_GROUP_OBJ:
+                case ACL_MASK:
+                case ACL_OTHER:
+                        acl->a_entries[i].e_id = ACL_UNDEFINED_ID;
+                        entry = (struct f2fs_acl_entry *)((char *)entry +
+                                        sizeof(struct f2fs_acl_entry_short));
+                        break;
+                case ACL_USER:
+                        /* e_id is only present in the full-size form */
+                        if ((char *)entry + sizeof(struct f2fs_acl_entry) > end)
+                                goto fail;
+                        acl->a_entries[i].e_uid =
+                                make_kuid(&init_user_ns,
+                                                le32_to_cpu(entry->e_id));
+                        entry = (struct f2fs_acl_entry *)((char *)entry +
+                                        sizeof(struct f2fs_acl_entry));
+                        break;
+                case ACL_GROUP:
+                        if ((char *)entry + sizeof(struct f2fs_acl_entry) > end)
+                                goto fail;
+                        acl->a_entries[i].e_gid =
+                                make_kgid(&init_user_ns,
+                                                le32_to_cpu(entry->e_id));
+                        entry = (struct f2fs_acl_entry *)((char *)entry +
+                                        sizeof(struct f2fs_acl_entry));
+                        break;
+                default:
+                        goto fail;
+                }
+        }
+        /*
+         * count was derived from size alone; the tags actually found must
+         * consume the payload exactly.
+         */
+        if ((char *)entry != end)
+                goto fail;
+        return acl;
+fail:
+        posix_acl_release(acl);
+        return ERR_PTR(-EINVAL);
+}
+/*
+ * Serialise @acl into a freshly kmalloc'ed on-disk buffer.
+ * ACL_USER and ACL_GROUP entries are written in the full-size form with
+ * their id; ACL_USER_OBJ, ACL_GROUP_OBJ, ACL_MASK and ACL_OTHER use the
+ * short form.
+ *
+ * On success the buffer is returned and *@size is set from f2fs_acl_size();
+ * the caller frees the buffer.  Returns ERR_PTR(-ENOMEM) on allocation
+ * failure and ERR_PTR(-EINVAL) for an unknown entry tag.
+ */
+static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size)
+{
+        struct f2fs_acl_header *hdr;
+        struct f2fs_acl_entry *slot;
+        size_t step;
+        int idx;
+        /* worst case: every entry needs the full-size form */
+        hdr = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count *
+                        sizeof(struct f2fs_acl_entry), GFP_KERNEL);
+        if (!hdr)
+                return ERR_PTR(-ENOMEM);
+        hdr->a_version = cpu_to_le32(F2FS_ACL_VERSION);
+        slot = (struct f2fs_acl_entry *)(hdr + 1);
+        for (idx = 0; idx < acl->a_count; idx++) {
+                slot->e_tag  = cpu_to_le16(acl->a_entries[idx].e_tag);
+                slot->e_perm = cpu_to_le16(acl->a_entries[idx].e_perm);
+                switch (acl->a_entries[idx].e_tag) {
+                case ACL_USER:
+                        slot->e_id = cpu_to_le32(
+                                        from_kuid(&init_user_ns,
+                                                acl->a_entries[idx].e_uid));
+                        step = sizeof(struct f2fs_acl_entry);
+                        break;
+                case ACL_GROUP:
+                        slot->e_id = cpu_to_le32(
+                                        from_kgid(&init_user_ns,
+                                                acl->a_entries[idx].e_gid));
+                        step = sizeof(struct f2fs_acl_entry);
+                        break;
+                case ACL_USER_OBJ:
+                case ACL_GROUP_OBJ:
+                case ACL_MASK:
+                case ACL_OTHER:
+                        step = sizeof(struct f2fs_acl_entry_short);
+                        break;
+                default:
+                        kfree(hdr);
+                        return ERR_PTR(-EINVAL);
+                }
+                /* advance by the size of the form just written */
+                slot = (struct f2fs_acl_entry *)((char *)slot + step);
+        }
+        *size = f2fs_acl_size(acl->a_count);
+        return (void *)hdr;
+}
+/*
+ * Look up the ACL of the given @type (ACL_TYPE_ACCESS, otherwise the default
+ * ACL index is used) for @inode.
+ *
+ * Returns NULL when the POSIX_ACL mount option is off, when the xattr does
+ * not exist (-ENODATA) or is empty; the cached ACL when one is cached; a
+ * decoded ACL read from the xattr; or an ERR_PTR on failure.  Any non-error
+ * result, including NULL, is stored in the inode's ACL cache.
+ */
+struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT;
+        void *value = NULL;
+        struct posix_acl *acl;
+        int retval;
+        if (!test_opt(sbi, POSIX_ACL))
+                return NULL;
+        acl = get_cached_acl(inode, type);
+        if (acl != ACL_NOT_CACHED)
+                return acl;
+        if (type == ACL_TYPE_ACCESS)
+                name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
+        /* first call with a NULL buffer only asks for the value size */
+        retval = f2fs_getxattr(inode, name_index, "", NULL, 0);
+        if (retval > 0) {
+                value = kmalloc(retval, GFP_KERNEL);
+                if (!value)
+                        return ERR_PTR(-ENOMEM);
+                retval = f2fs_getxattr(inode, name_index, "", value, retval);
+        }
+        if (retval > 0) {
+                acl = f2fs_acl_from_disk(value, retval);
+        } else if (retval == -ENODATA || retval == 0) {
+                /*
+                 * Missing or zero-length xattr: there is nothing to decode,
+                 * and value may still be NULL here, so do not parse it.
+                 */
+                acl = NULL;
+        } else {
+                acl = ERR_PTR(retval);
+        }
+        kfree(value);
+        if (!IS_ERR(acl))
+                set_cached_acl(inode, type, acl);
+        return acl;
+}
+/*
+ * Store @acl as the ACL of the given @type on @inode (a NULL @acl passes a
+ * NULL value of size 0 to f2fs_setxattr()).
+ *
+ * For ACL_TYPE_ACCESS the ACL is first folded into the inode mode via
+ * posix_acl_equiv_mode(); when that reports the ACL is fully expressed by
+ * the mode bits, no ACL body is written.  ACL_TYPE_DEFAULT is only accepted
+ * on directories.
+ *
+ * Returns 0 on success or when the POSIX_ACL mount option is off,
+ * -EOPNOTSUPP for symlinks, -EACCES for a default ACL on a non-directory,
+ * -EINVAL for an unknown @type, or the error from the helpers called.
+ */
+static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        struct f2fs_inode_info *fi = F2FS_I(inode);
+        void *blob = NULL;
+        size_t blob_len = 0;
+        int name_index;
+        int ret;
+        if (!test_opt(sbi, POSIX_ACL))
+                return 0;
+        if (S_ISLNK(inode->i_mode))
+                return -EOPNOTSUPP;
+        if (type == ACL_TYPE_ACCESS) {
+                name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
+                if (acl) {
+                        ret = posix_acl_equiv_mode(acl, &inode->i_mode);
+                        if (ret < 0)
+                                return ret;
+                        set_acl_inode(fi, inode->i_mode);
+                        /* 0: the mode bits alone describe the ACL */
+                        if (ret == 0)
+                                acl = NULL;
+                }
+        } else if (type == ACL_TYPE_DEFAULT) {
+                name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT;
+                if (!S_ISDIR(inode->i_mode))
+                        return acl ? -EACCES : 0;
+        } else {
+                return -EINVAL;
+        }
+        if (acl) {
+                blob = f2fs_acl_to_disk(acl, &blob_len);
+                if (IS_ERR(blob)) {
+                        cond_clear_inode_flag(fi, FI_ACL_MODE);
+                        return (int)PTR_ERR(blob);
+                }
+        }
+        ret = f2fs_setxattr(inode, name_index, "", blob, blob_len);
+        kfree(blob);
+        /* only a successfully stored ACL goes into the cache */
+        if (ret == 0)
+                set_cached_acl(inode, type, acl);
+        cond_clear_inode_flag(fi, FI_ACL_MODE);
+        return ret;
+}
+/*
+ * Set up the ACLs of a new @inode from the default ACL of @dir.
+ *
+ * Symlinks get neither a umask adjustment nor an ACL lookup.  When @dir has
+ * no default ACL (or the POSIX_ACL mount option is off) the current umask is
+ * applied to the inode mode.  Otherwise the default ACL is copied to new
+ * directories and an access ACL is derived with posix_acl_create().
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int f2fs_init_acl(struct inode *inode, struct inode *dir)
+{
+        struct posix_acl *acl = NULL;
+        struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+        int ret = 0;
+        if (!S_ISLNK(inode->i_mode)) {
+                if (test_opt(sbi, POSIX_ACL)) {
+                        acl = f2fs_get_acl(dir, ACL_TYPE_DEFAULT);
+                        if (IS_ERR(acl))
+                                return PTR_ERR(acl);
+                }
+                if (acl == NULL)
+                        inode->i_mode &= ~current_umask();
+        }
+        /* nothing to inherit */
+        if (!test_opt(sbi, POSIX_ACL) || !acl)
+                goto out;
+        if (S_ISDIR(inode->i_mode)) {
+                ret = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
+                if (ret != 0)
+                        goto out;
+        }
+        ret = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
+        if (ret < 0)
+                return ret;
+        /* a positive result means an access ACL has to be stored */
+        if (ret > 0)
+                ret = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl);
+out:
+        posix_acl_release(acl);
+        return ret;
+}
+/*
+ * Bring the access ACL of @inode in line with its mode, as obtained through
+ * get_inode_mode().
+ *
+ * Returns 0 when the POSIX_ACL mount option is off, when the inode has no
+ * access ACL, or on success; -EOPNOTSUPP for symlinks; otherwise the error
+ * from the ACL lookup, posix_acl_chmod() or f2fs_set_acl().
+ */
+int f2fs_acl_chmod(struct inode *inode)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        mode_t mode = get_inode_mode(inode);
+        struct posix_acl *acl;
+        int ret;
+        if (!test_opt(sbi, POSIX_ACL))
+                return 0;
+        if (S_ISLNK(mode))
+                return -EOPNOTSUPP;
+        acl = f2fs_get_acl(inode, ACL_TYPE_ACCESS);
+        if (IS_ERR(acl))
+                return PTR_ERR(acl);
+        if (!acl)
+                return 0;
+        ret = posix_acl_chmod(&acl, GFP_KERNEL, mode);
+        if (!ret) {
+                ret = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl);
+                posix_acl_release(acl);
+        }
+        return ret;
+}
+/*
+ * xattr .list callback: report the ACL attribute name for @type.
+ * The NUL-terminated name is copied into @list when @list is non-NULL and
+ * @list_size is large enough.  Returns the name length including the
+ * terminator, or 0 when the POSIX_ACL mount option is off.
+ */
+static size_t f2fs_xattr_list_acl(struct dentry *dentry, char *list,
+                size_t list_size, const char *name, size_t name_len, int type)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+        const char *xname;
+        size_t size;
+        if (!test_opt(sbi, POSIX_ACL))
+                return 0;
+        xname = (type == ACL_TYPE_ACCESS) ? POSIX_ACL_XATTR_ACCESS :
+                                            POSIX_ACL_XATTR_DEFAULT;
+        size = strlen(xname) + 1;
+        if (list != NULL && list_size >= size)
+                memcpy(list, xname, size);
+        return size;
+}
+/*
+ * xattr .get callback: fetch the ACL of @type and encode it into @buffer in
+ * the posix_acl xattr format.
+ *
+ * Returns the result of posix_acl_to_xattr() on success, -EINVAL for a
+ * non-empty @name, -EOPNOTSUPP when the POSIX_ACL mount option is off,
+ * -ENODATA when the inode has no such ACL, or the ACL lookup error.
+ */
+static int f2fs_xattr_get_acl(struct dentry *dentry, const char *name,
+                void *buffer, size_t size, int type)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+        struct posix_acl *acl;
+        int ret;
+        if (strcmp(name, ""))
+                return -EINVAL;
+        if (!test_opt(sbi, POSIX_ACL))
+                return -EOPNOTSUPP;
+        acl = f2fs_get_acl(dentry->d_inode, type);
+        if (IS_ERR(acl))
+                return PTR_ERR(acl);
+        if (acl == NULL)
+                return -ENODATA;
+        ret = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
+        posix_acl_release(acl);
+        return ret;
+}
+/*
+ * xattr .set callback: decode @value (when given) from the posix_acl xattr
+ * format, validate it and store it as the ACL of @type.  A NULL @value
+ * stores a NULL ACL.
+ *
+ * Returns 0 on success, -EINVAL for a non-empty @name, -EOPNOTSUPP when the
+ * POSIX_ACL mount option is off, -EPERM when the caller fails
+ * inode_owner_or_capable(), or the error from decoding, validation or
+ * f2fs_set_acl().
+ */
+static int f2fs_xattr_set_acl(struct dentry *dentry, const char *name,
+                const void *value, size_t size, int flags, int type)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+        struct inode *inode = dentry->d_inode;
+        struct posix_acl *acl = NULL;
+        int ret = 0;
+        if (strcmp(name, ""))
+                return -EINVAL;
+        if (!test_opt(sbi, POSIX_ACL))
+                return -EOPNOTSUPP;
+        if (!inode_owner_or_capable(inode))
+                return -EPERM;
+        if (value != NULL) {
+                acl = posix_acl_from_xattr(&init_user_ns, value, size);
+                if (IS_ERR(acl))
+                        return PTR_ERR(acl);
+                if (acl)
+                        ret = posix_acl_valid(acl);
+        }
+        /* skip the store when validation rejected the ACL */
+        if (!ret)
+                ret = f2fs_set_acl(inode, type, acl);
+        posix_acl_release(acl);
+        return ret;
+}
+/*
+ * xattr handler for the default ACL attribute (POSIX_ACL_XATTR_DEFAULT).
+ * .flags holds ACL_TYPE_DEFAULT; presumably the xattr core hands it to the
+ * callbacks as their "type" argument -- confirm against the xattr core.
+ */
+const struct xattr_handler f2fs_xattr_acl_default_handler = {
+        .prefix = POSIX_ACL_XATTR_DEFAULT,
+        .flags = ACL_TYPE_DEFAULT,
+        .list = f2fs_xattr_list_acl,
+        .get = f2fs_xattr_get_acl,
+        .set = f2fs_xattr_set_acl,
+};
+/*
+ * xattr handler for the access ACL attribute (POSIX_ACL_XATTR_ACCESS).
+ * Shares its callbacks with the default-ACL handler; only the prefix and
+ * the ACL type in .flags differ.
+ */
+const struct xattr_handler f2fs_xattr_acl_access_handler = {
+        .prefix = POSIX_ACL_XATTR_ACCESS,
+        .flags = ACL_TYPE_ACCESS,
+        .list = f2fs_xattr_list_acl,
+        .get = f2fs_xattr_get_acl,
+        .set = f2fs_xattr_set_acl,
+};
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
new file mode 100644
index 000000000000..80f430674417
--- /dev/null
+++ b/fs/f2fs/acl.h
@@ -0,0 +1,57 @@
+/*
+ * fs/f2fs/acl.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext2/acl.h
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __F2FS_ACL_H__
+#define __F2FS_ACL_H__
+#include <linux/posix_acl_xattr.h>
+#define F2FS_ACL_VERSION        0x0001
+/*
+ * ACL entry carrying an id.  All members are little-endian
+ * (presumably the on-disk form of one ACL entry -- confirm in acl.c).
+ */
+struct f2fs_acl_entry {
+        __le16 e_tag;
+        __le16 e_perm;
+        __le32 e_id;
+};
+/*
+ * Same leading layout as struct f2fs_acl_entry but without e_id
+ * (presumably for entry kinds that need no id -- confirm in acl.c).
+ */
+struct f2fs_acl_entry_short {
+        __le16 e_tag;
+        __le16 e_perm;
+};
+/*
+ * Header holding a little-endian version number
+ * (presumably checked against F2FS_ACL_VERSION -- confirm in acl.c).
+ */
+struct f2fs_acl_header {
+        __le32 a_version;
+};
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+/* ACL support compiled in: implementations are provided outside this header */
+extern struct posix_acl *f2fs_get_acl(struct inode *inode, int type);
+extern int f2fs_acl_chmod(struct inode *inode);
+extern int f2fs_init_acl(struct inode *inode, struct inode *dir);
+#else
+/*
+ * ACL support compiled out: the hook names below expand to NULL and the two
+ * helpers become inline no-ops that return 0.
+ * NOTE(review): f2fs_check_acl and f2fs_set_acl have no declaration in the
+ * enabled branch above -- confirm these two defines are still needed.
+ */
+#define f2fs_check_acl  NULL
+#define f2fs_get_acl    NULL
+#define f2fs_set_acl    NULL
+static inline int f2fs_acl_chmod(struct inode *inode)
+{
+        return 0;
+}
+static inline int f2fs_init_acl(struct inode *inode, struct inode *dir)
+{
+        return 0;
+}
+#endif
+#endif /* __F2FS_ACL_H__ */
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
new file mode 100644
index 000000000000..6ef36c37e2be
--- /dev/null
+++ b/fs/f2fs/checkpoint.c
@@ -0,0 +1,794 @@
+/*
+ * fs/f2fs/checkpoint.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/f2fs_fs.h>
+#include <linux/pagevec.h>
+#include <linux/swap.h>
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+static struct kmem_cache *orphan_entry_slab;
+static struct kmem_cache *inode_entry_slab;
+/*
+ * Get the page at @index of the meta inode's mapping without reading it:
+ * the page is simply marked up to date once any writeback on it finished.
+ * This never fails; the page cache lookup is repeated until it succeeds.
+ */
+struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+        struct address_space *meta_mapping = sbi->meta_inode->i_mapping;
+        struct page *page;
+        for (;;) {
+                page = grab_cache_page(meta_mapping, index);
+                if (page)
+                        break;
+                cond_resched();
+        }
+        /* We wait writeback only inside grab_meta_page() */
+        wait_on_page_writeback(page);
+        SetPageUptodate(page);
+        return page;
+}
+/*
+ * Get the page at @index of the meta inode's mapping and read it with
+ * f2fs_readpage(..., READ_SYNC).  This never returns an erroneous page:
+ * both the page cache lookup and the read are repeated until they succeed.
+ */
+struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+        struct address_space *meta_mapping = sbi->meta_inode->i_mapping;
+        struct page *page;
+        for (;;) {
+                page = grab_cache_page(meta_mapping, index);
+                if (!page) {
+                        cond_resched();
+                        continue;
+                }
+                if (!f2fs_readpage(sbi, page, index, READ_SYNC))
+                        break;
+                /* read failed: drop this page and start over */
+                f2fs_put_page(page, 1);
+        }
+        mark_page_accessed(page);
+        return page;
+}
+/*
+ * ->writepage of f2fs_meta_aops.
+ *
+ * Waits for writeback already running on @page, then hands the page to
+ * write_meta_page().  If that fails the page is counted in
+ * wbc->pages_skipped and marked dirty again.  Returns the value from
+ * write_meta_page(); the page is unlocked here unless that value is
+ * AOP_WRITEPAGE_ACTIVATE.
+ */
+static int f2fs_write_meta_page(struct page *page,
+                                struct writeback_control *wbc)
+{
+        struct inode *inode = page->mapping->host;
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        int err;
+        wait_on_page_writeback(page);
+        err = write_meta_page(sbi, page, wbc);
+        if (err) {
+                wbc->pages_skipped++;
+                set_page_dirty(page);
+        }
+        /*
+         * Done on success and on failure alike.  NOTE(review): on failure
+         * the set_page_dirty() above presumably re-increments the counter
+         * through f2fs_set_meta_page_dirty(), keeping it balanced -- confirm.
+         */
+        dec_page_count(sbi, F2FS_DIRTY_META);
+        /* In this case, we should not unlock this page */
+        if (err != AOP_WRITEPAGE_ACTIVATE)
+                unlock_page(page);
+        return err;
+}
+/*
+ * ->writepages of f2fs_meta_aops.
+ *
+ * Does nothing for kupdate writeback or when no meta page is counted as
+ * dirty.  Otherwise asks sync_meta_pages() for bio_get_nr_vecs(bdev) pages
+ * of type META and subtracts the number it reports from wbc->nr_to_write.
+ * Always returns 0.
+ */
+static int f2fs_write_meta_pages(struct address_space *mapping,
+                                struct writeback_control *wbc)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+        struct block_device *bdev = sbi->sb->s_bdev;
+        long written;
+        if (wbc->for_kupdate)
+                return 0;
+        if (get_pages(sbi, F2FS_DIRTY_META) == 0)
+                return 0;
+        /* cp_mutex (also taken by write_checkpoint()) is held over the flush */
+        mutex_lock(&sbi->cp_mutex);
+        written = sync_meta_pages(sbi, META, bio_get_nr_vecs(bdev));
+        mutex_unlock(&sbi->cp_mutex);
+        wbc->nr_to_write -= written;
+        return 0;
+}
+/*
+ * Write back dirty pages of the meta inode's mapping.
+ *
+ * @sbi:         filesystem instance
+ * @type:        page type passed to f2fs_submit_bio() once pages were written
+ * @nr_to_write: upper bound on the number of pages to write; LONG_MAX means
+ *               "everything" and also makes the final bio submission
+ *               synchronous
+ *
+ * Dirty pages are looked up in batches through a pagevec.  Each page is
+ * locked, its dirty flag cleared for I/O and then handed to
+ * f2fs_write_meta_page(), whose return value is not checked here.
+ *
+ * Once @nr_to_write pages have been handed over, the whole scan stops.
+ * Previously only the inner per-batch loop was left, so the outer loop kept
+ * looking up and writing the remaining dirty pages one at a time and the
+ * limit had no effect; the post-increment test also let one extra page
+ * through.
+ *
+ * Returns the number of pages handed to f2fs_write_meta_page().
+ */
+long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
+                                                long nr_to_write)
+{
+        struct address_space *mapping = sbi->meta_inode->i_mapping;
+        pgoff_t index = 0, end = LONG_MAX;
+        struct pagevec pvec;
+        long nwritten = 0;
+        int done = 0;
+        struct writeback_control wbc = {
+                .for_reclaim = 0,
+        };
+        pagevec_init(&pvec, 0);
+        while (!done && index <= end) {
+                int i, nr_pages;
+                nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                                PAGECACHE_TAG_DIRTY,
+                                min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+                if (nr_pages == 0)
+                        break;
+                for (i = 0; i < nr_pages; i++) {
+                        struct page *page = pvec.pages[i];
+                        lock_page(page);
+                        BUG_ON(page->mapping != mapping);
+                        BUG_ON(!PageDirty(page));
+                        clear_page_dirty_for_io(page);
+                        f2fs_write_meta_page(page, &wbc);
+                        if (++nwritten >= nr_to_write) {
+                                /* budget used up: leave the outer loop too */
+                                done = 1;
+                                break;
+                        }
+                }
+                /* the batch must be released even when we stop early */
+                pagevec_release(&pvec);
+                cond_resched();
+        }
+        if (nwritten)
+                f2fs_submit_bio(sbi, type, nr_to_write == LONG_MAX);
+        return nwritten;
+}
+/*
+ * ->set_page_dirty of f2fs_meta_aops.
+ *
+ * Marks @page up to date.  If it was not dirty yet, dirties it, bumps the
+ * F2FS_DIRTY_META counter, flags the superblock dirty and returns 1.
+ * Returns 0 when the page was already dirty.
+ */
+static int f2fs_set_meta_page_dirty(struct page *page)
+{
+        struct address_space *mapping = page->mapping;
+        struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+        SetPageUptodate(page);
+        if (PageDirty(page))
+                return 0;
+        __set_page_dirty_nobuffers(page);
+        inc_page_count(sbi, F2FS_DIRTY_META);
+        F2FS_SET_SB_DIRT(sbi);
+        return 1;
+}
+/*
+ * Address space operations built from the meta page helpers above; only
+ * the writeback and dirtying hooks are provided (presumably installed on
+ * sbi->meta_inode's mapping -- confirm where the meta inode is set up).
+ */
+const struct address_space_operations f2fs_meta_aops = {
+        .writepage      = f2fs_write_meta_page,
+        .writepages     = f2fs_write_meta_pages,
+        .set_page_dirty = f2fs_set_meta_page_dirty,
+};
+/*
+ * Tell whether the orphan list has reached its limit.
+ *
+ * Considering 512 blocks in a segment, 5 blocks are needed for the cp and
+ * log segment summaries.  The remaining blocks hold orphan entries, so with
+ * one reserved segment per cp pack the limit is
+ * (blocks_per_seg - 5) * F2FS_ORPHANS_PER_BLOCK, i.e. at most 1020*507
+ * entries.
+ *
+ * Returns 0 while n_orphans is below the limit, -ENOSPC otherwise.
+ */
+int check_orphan_space(struct f2fs_sb_info *sbi)
+{
+        unsigned int limit;
+        int ret;
+        limit = (sbi->blocks_per_seg - 5) * F2FS_ORPHANS_PER_BLOCK;
+        mutex_lock(&sbi->orphan_inode_mutex);
+        ret = (sbi->n_orphans >= limit) ? -ENOSPC : 0;
+        mutex_unlock(&sbi->orphan_inode_mutex);
+        return ret;
+}
+/*
+ * Record @ino in sbi->orphan_inode_list, which is kept sorted by ascending
+ * inode number.  An inode number already on the list is left alone.
+ * The entry allocation is retried until it succeeds.
+ */
+void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+        struct list_head *head = &sbi->orphan_inode_list;
+        /* insertion point: the new entry goes right before this node */
+        struct list_head *insert_before = head;
+        struct orphan_inode_entry *cur, *entry;
+        mutex_lock(&sbi->orphan_inode_mutex);
+        list_for_each_entry(cur, head, list) {
+                if (cur->ino == ino)
+                        goto unlock;
+                if (cur->ino > ino) {
+                        /* first larger entry: insert in front of it */
+                        insert_before = &cur->list;
+                        break;
+                }
+        }
+        for (;;) {
+                entry = kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
+                if (entry)
+                        break;
+                cond_resched();
+        }
+        entry->ino = ino;
+        INIT_LIST_HEAD(&entry->list);
+        /*
+         * Adding "at the tail" of a node links the new entry right before
+         * that node; with insert_before == head this appends to the list.
+         */
+        list_add_tail(&entry->list, insert_before);
+        sbi->n_orphans++;
+unlock:
+        mutex_unlock(&sbi->orphan_inode_mutex);
+}
+/*
+ * Drop the entry for @ino from sbi->orphan_inode_list, if there is one,
+ * free it and decrement n_orphans.  Only the first match is removed.
+ */
+void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+        struct list_head *head = &sbi->orphan_inode_list;
+        struct orphan_inode_entry *cur;
+        mutex_lock(&sbi->orphan_inode_mutex);
+        list_for_each_entry(cur, head, list) {
+                if (cur->ino != ino)
+                        continue;
+                list_del(&cur->list);
+                kmem_cache_free(orphan_entry_slab, cur);
+                sbi->n_orphans--;
+                /* the walk ends here, so the freed entry is not touched */
+                break;
+        }
+        mutex_unlock(&sbi->orphan_inode_mutex);
+}
+/*
+ * Look up inode @ino with f2fs_iget(), clear its link count and drop the
+ * reference again.  A failing f2fs_iget() triggers BUG_ON().
+ */
+static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+        struct inode *inode = f2fs_iget(sbi->sb, ino);
+        BUG_ON(IS_ERR(inode));
+        clear_nlink(inode);
+        /* truncate all the data during iput */
+        iput(inode);
+}
+/*
+ * If the checkpoint has CP_ORPHAN_PRESENT_FLAG set, walk the orphan blocks
+ * of the checkpoint pack and run recover_orphan_inode() on every inode
+ * number stored in them, then clear the flag.  sbi->por_doing is 1 while
+ * this runs.  Always returns 0.
+ */
+int recover_orphan_inodes(struct f2fs_sb_info *sbi)
+{
+        block_t first_blk, nr_blks, blk, slot;
+        if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
+                return 0;
+        sbi->por_doing = 1;
+        /* orphan blocks start one block after the start of the cp pack */
+        first_blk = __start_cp_addr(sbi) + 1;
+        nr_blks = __start_sum_addr(sbi) - 1;
+        for (blk = 0; blk < nr_blks; blk++) {
+                struct page *page = get_meta_page(sbi, first_blk + blk);
+                struct f2fs_orphan_block *orphan_blk;
+                orphan_blk = (struct f2fs_orphan_block *)page_address(page);
+                for (slot = 0;
+                     slot < le32_to_cpu(orphan_blk->entry_count); slot++)
+                        recover_orphan_inode(sbi,
+                                        le32_to_cpu(orphan_blk->ino[slot]));
+                f2fs_put_page(page, 1);
+        }
+        /* clear Orphan Flag */
+        clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
+        sbi->por_doing = 0;
+        return 0;
+}
+/*
+ * Copy every entry of sbi->orphan_inode_list into meta pages starting at
+ * @start_blk, F2FS_ORPHANS_PER_BLOCK inode numbers per page.  Each page
+ * gets its 1-based position (blk_addr), the total number of orphan blocks
+ * (blk_count) and its own entry count, and is then marked dirty and
+ * released.
+ */
+static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
+{
+        struct list_head *head, *this, *next;
+        struct f2fs_orphan_block *orphan_blk = NULL;
+        struct page *page = NULL;
+        unsigned int nentries = 0;
+        unsigned short index = 1;
+        unsigned short orphan_blocks;
+        /*
+         * Round n_orphans up to whole blocks.
+         * NOTE(review): n_orphans is read before orphan_inode_mutex is taken.
+         */
+        orphan_blocks = (unsigned short)((sbi->n_orphans +
+                (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK);
+        mutex_lock(&sbi->orphan_inode_mutex);
+        head = &sbi->orphan_inode_list;
+        /* loop over each orphan inode entry and write it to an orphan block */
+        list_for_each_safe(this, next, head) {
+                struct orphan_inode_entry *orphan;
+                orphan = list_entry(this, struct orphan_inode_entry, list);
+                if (nentries == F2FS_ORPHANS_PER_BLOCK) {
+                        /*
+                         * an orphan block is full of 1020 entries,
+                         * then we need to flush current orphan blocks
+                         * and bring another one in memory
+                         */
+                        orphan_blk->blk_addr = cpu_to_le16(index);
+                        orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
+                        orphan_blk->entry_count = cpu_to_le32(nentries);
+                        set_page_dirty(page);
+                        f2fs_put_page(page, 1);
+                        index++;
+                        start_blk++;
+                        nentries = 0;
+                        page = NULL;
+                }
+                /* a page is only grabbed when none is currently being filled */
+                if (page)
+                        goto page_exist;
+                page = grab_meta_page(sbi, start_blk);
+                orphan_blk = (struct f2fs_orphan_block *)page_address(page);
+                memset(orphan_blk, 0, sizeof(*orphan_blk));
+page_exist:
+                orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
+        }
+        /* empty list: no page was ever grabbed, nothing to finish */
+        if (!page)
+                goto end;
+        /* finish the last (possibly partially filled) block */
+        orphan_blk->blk_addr = cpu_to_le16(index);
+        orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
+        orphan_blk->entry_count = cpu_to_le32(nentries);
+        set_page_dirty(page);
+        f2fs_put_page(page, 1);
+end:
+        mutex_unlock(&sbi->orphan_inode_mutex);
+}
+/*
+ * Verify the checksum embedded in one checkpoint block.
+ *
+ * @cp_block: start of the block
+ * @blk_size: size of the block in bytes
+ *
+ * The block's checksum_offset field gives both the position of the stored
+ * CRC and the number of leading bytes the CRC covers.  The stored value is
+ * little-endian (do_checkpoint() writes it with cpu_to_le32()), so it is
+ * converted before the comparison.  It is copied out with memcpy() because
+ * an offset taken from disk need not be aligned.
+ *
+ * Returns true when the offset is sane and the CRC matches.
+ */
+static bool cp_block_crc_valid(struct f2fs_checkpoint *cp_block,
+                                unsigned long blk_size)
+{
+        size_t crc_offset = le32_to_cpu(cp_block->checksum_offset);
+        __le32 stored_crc;
+        /* all four bytes of the stored CRC must lie inside the block */
+        if (crc_offset > blk_size - sizeof(stored_crc))
+                return false;
+        memcpy(&stored_crc, (unsigned char *)cp_block + crc_offset,
+                                        sizeof(stored_crc));
+        return f2fs_crc_valid(le32_to_cpu(stored_crc), cp_block, crc_offset);
+}
+/*
+ * Validate the checkpoint pack starting at block @cp_addr.
+ *
+ * The first block of the pack and the block at offset
+ * cp_pack_total_block_count - 1 are both read, must both carry a valid CRC
+ * and must both hold the same checkpoint_ver.
+ *
+ * Returns the first block's page (as obtained from get_meta_page(); the
+ * caller releases it with f2fs_put_page()) and stores the version in
+ * @version, or returns NULL if the pack is not valid.
+ */
+static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
+                                block_t cp_addr, unsigned long long *version)
+{
+        struct page *cp_page_1, *cp_page_2 = NULL;
+        unsigned long blk_size = sbi->blocksize;
+        struct f2fs_checkpoint *cp_block;
+        unsigned long long cur_version = 0, pre_version = 0;
+        /* Read the 1st cp block in this CP pack */
+        cp_page_1 = get_meta_page(sbi, cp_addr);
+        cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1);
+        if (!cp_block_crc_valid(cp_block, blk_size))
+                goto invalid_cp1;
+        pre_version = le64_to_cpu(cp_block->checkpoint_ver);
+        /* Read the 2nd cp block in this CP pack */
+        cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
+        cp_page_2 = get_meta_page(sbi, cp_addr);
+        cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2);
+        if (!cp_block_crc_valid(cp_block, blk_size))
+                goto invalid_cp2;
+        cur_version = le64_to_cpu(cp_block->checkpoint_ver);
+        if (cur_version == pre_version) {
+                *version = cur_version;
+                f2fs_put_page(cp_page_2, 1);
+                return cp_page_1;
+        }
+        /* versions differ: fall through and treat the pack as invalid */
+invalid_cp2:
+        f2fs_put_page(cp_page_2, 1);
+invalid_cp1:
+        f2fs_put_page(cp_page_1, 1);
+        return NULL;
+}
+/*
+ * Allocate sbi->ckpt and fill it from a valid checkpoint pack.
+ *
+ * Both packs are validated: the first at the superblock's cp_blkaddr, the
+ * second one segment later.  If both are valid, pack 2 is used only when
+ * ver_after() ranks its version after pack 1's.
+ *
+ * Returns 0 on success, -ENOMEM if the buffer cannot be allocated, or
+ * -EINVAL (after freeing the buffer) when neither pack is valid.
+ */
+int get_valid_checkpoint(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_super_block *fsb = sbi->raw_super;
+        unsigned long blk_size = sbi->blocksize;
+        unsigned long long cp1_version = 0, cp2_version = 0;
+        unsigned long long cp_start_blk_no;
+        struct page *cp1, *cp2, *chosen;
+        sbi->ckpt = kzalloc(blk_size, GFP_KERNEL);
+        if (!sbi->ckpt)
+                return -ENOMEM;
+        /* Finding the valid cp block involves reading both cp packs */
+        cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr);
+        cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
+        /* The second checkpoint pack should start at the next segment */
+        cp_start_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg);
+        cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version);
+        /* start from pack 1; pack 2 wins if it stands alone or is newer */
+        chosen = cp1;
+        if (cp2 && (!cp1 || ver_after(cp2_version, cp1_version)))
+                chosen = cp2;
+        if (!chosen) {
+                kfree(sbi->ckpt);
+                return -EINVAL;
+        }
+        memcpy(sbi->ckpt, page_address(chosen), blk_size);
+        f2fs_put_page(cp1, 1);
+        f2fs_put_page(cp2, 1);
+        return 0;
+}
+/*
+ * Account @page as a dirty dentry page of directory @inode.
+ *
+ * Does nothing for non-directories.  Otherwise the inode is put on
+ * sbi->dir_inode_list unless it is already there, the F2FS_DIRTY_DENTS
+ * counter and the inode's own dirty-dentry counter are bumped and the page
+ * is flagged Private.  The list entry is allocated (retrying until that
+ * succeeds) before dir_inode_lock is taken and freed again if it turns out
+ * not to be needed.
+ */
+void set_dirty_dir_page(struct inode *inode, struct page *page)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        struct list_head *head = &sbi->dir_inode_list;
+        struct dir_inode_entry *new, *entry;
+        int already_listed = 0;
+        if (!S_ISDIR(inode->i_mode))
+                return;
+        for (;;) {
+                new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+                if (new)
+                        break;
+                cond_resched();
+        }
+        new->inode = inode;
+        INIT_LIST_HEAD(&new->list);
+        spin_lock(&sbi->dir_inode_lock);
+        list_for_each_entry(entry, head, list) {
+                if (entry->inode == inode) {
+                        already_listed = 1;
+                        break;
+                }
+        }
+        if (already_listed) {
+                kmem_cache_free(inode_entry_slab, new);
+        } else {
+                list_add_tail(&new->list, head);
+                sbi->n_dirty_dirs++;
+                BUG_ON(!S_ISDIR(inode->i_mode));
+        }
+        /* the page accounting happens whether or not the inode was listed */
+        inc_page_count(sbi, F2FS_DIRTY_DENTS);
+        inode_inc_dirty_dents(inode);
+        SetPagePrivate(page);
+        spin_unlock(&sbi->dir_inode_lock);
+}
+/*
+ * Take directory @inode off sbi->dir_inode_list once its dirty-dentry
+ * counter has dropped to zero.  Non-directories and inodes that still have
+ * dirty dentry pages are left untouched.
+ */
+void remove_dirty_dir_inode(struct inode *inode)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        struct list_head *head = &sbi->dir_inode_list;
+        struct dir_inode_entry *entry;
+        if (!S_ISDIR(inode->i_mode))
+                return;
+        spin_lock(&sbi->dir_inode_lock);
+        if (!atomic_read(&F2FS_I(inode)->dirty_dents)) {
+                list_for_each_entry(entry, head, list) {
+                        if (entry->inode != inode)
+                                continue;
+                        list_del(&entry->list);
+                        kmem_cache_free(inode_entry_slab, entry);
+                        sbi->n_dirty_dirs--;
+                        break;
+                }
+        }
+        spin_unlock(&sbi->dir_inode_lock);
+}
+/*
+ * Repeatedly take the first inode on sbi->dir_inode_list and start
+ * writeback of its mapping with filemap_flush(), until the list is empty.
+ * The list itself is not modified here.
+ */
+void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
+{
+        struct list_head *head = &sbi->dir_inode_list;
+        for (;;) {
+                struct dir_inode_entry *first;
+                struct inode *dir;
+                spin_lock(&sbi->dir_inode_lock);
+                if (list_empty(head))
+                        break;
+                first = list_entry(head->next, struct dir_inode_entry, list);
+                dir = igrab(first->inode);
+                spin_unlock(&sbi->dir_inode_lock);
+                if (!dir) {
+                        /*
+                         * We should submit the bio, since several dentry
+                         * pages under writeback exist in the inode that is
+                         * being freed.
+                         */
+                        f2fs_submit_bio(sbi, DATA, true);
+                        continue;
+                }
+                filemap_flush(dir->i_mapping);
+                iput(dir);
+        }
+        /* the loop is only left with dir_inode_lock still held */
+        spin_unlock(&sbi->dir_inode_lock);
+}
+/*
+ * Freeze all the FS-operations for checkpoint.
+ *
+ * Locks are taken in this order: RENAME, DENTRY_OPS, DATA_WRITE, every lock
+ * type from DATA_NEW up to NODE_TRUNC, and finally NODE_WRITE.  DATA_WRITE
+ * is only kept once no dirty dentry page is left, NODE_WRITE only once no
+ * dirty node page is left; until then the flush is repeated.  All of these
+ * locks are still held on return (unblock_operations() releases
+ * NODE_WRITE down to RENAME).  sbi->write_inode is held only while the node
+ * pages are flushed.
+ */
+void block_operations(struct f2fs_sb_info *sbi)
+{
+        int t;
+        struct writeback_control wbc = {
+                .sync_mode = WB_SYNC_ALL,
+                .nr_to_write = LONG_MAX,
+                .for_reclaim = 0,
+        };
+        /* Stop renaming operation */
+        mutex_lock_op(sbi, RENAME);
+        mutex_lock_op(sbi, DENTRY_OPS);
+retry_dents:
+        /* write all the dirty dentry pages */
+        sync_dirty_dir_inodes(sbi);
+        mutex_lock_op(sbi, DATA_WRITE);
+        /* pages dirtied before DATA_WRITE was taken: drop it and flush again */
+        if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
+                mutex_unlock_op(sbi, DATA_WRITE);
+                goto retry_dents;
+        }
+        /* block all the operations */
+        for (t = DATA_NEW; t <= NODE_TRUNC; t++)
+                mutex_lock_op(sbi, t);
+        mutex_lock(&sbi->write_inode);
+        /*
+         * POR: we should ensure that there is no dirty node pages
+         * until finishing nat/sit flush.
+         */
+retry:
+        sync_node_pages(sbi, 0, &wbc);
+        mutex_lock_op(sbi, NODE_WRITE);
+        /* same pattern as for dentry pages above, now for node pages */
+        if (get_pages(sbi, F2FS_DIRTY_NODES)) {
+                mutex_unlock_op(sbi, NODE_WRITE);
+                goto retry;
+        }
+        mutex_unlock(&sbi->write_inode);
+}
+/*
+ * Release every operation lock from NODE_WRITE down to RENAME, walking
+ * the lock types in descending order.
+ */
+static void unblock_operations(struct f2fs_sb_info *sbi)
+{
+        int type = NODE_WRITE;
+        while (type >= RENAME) {
+                mutex_unlock_op(sbi, type);
+                type--;
+        }
+}
+/*
+ * Fill in the in-memory checkpoint (sbi->ckpt) and write out a complete
+ * checkpoint pack: one copy of the checkpoint block, the orphan blocks (if
+ * any), the data summaries, the node summaries (only when @is_umount) and
+ * a second copy of the checkpoint block.
+ *
+ * @is_umount selects CP_UMOUNT_FLAG and adds NR_CURSEG_NODE_TYPE blocks to
+ * the pack size.
+ */
+static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
+{
+        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+        nid_t last_nid = 0;
+        block_t start_blk;
+        struct page *cp_page;
+        unsigned int data_sum_blocks, orphan_blocks;
+        unsigned int crc32 = 0;
+        void *kaddr;
+        int i;
+        /* Flush all the NAT/SIT pages */
+        while (get_pages(sbi, F2FS_DIRTY_META))
+                sync_meta_pages(sbi, META, LONG_MAX);
+        next_free_nid(sbi, &last_nid);
+        /*
+         * modify checkpoint
+         * version number is already updated
+         */
+        ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
+        ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
+        ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
+        /* record segment number, block offset and alloc type per node log */
+        for (i = 0; i < 3; i++) {
+                ckpt->cur_node_segno[i] =
+                        cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE));
+                ckpt->cur_node_blkoff[i] =
+                        cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_NODE));
+                ckpt->alloc_type[i + CURSEG_HOT_NODE] =
+                                curseg_alloc_type(sbi, i + CURSEG_HOT_NODE);
+        }
+        /* the same three values per data log */
+        for (i = 0; i < 3; i++) {
+                ckpt->cur_data_segno[i] =
+                        cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA));
+                ckpt->cur_data_blkoff[i] =
+                        cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_DATA));
+                ckpt->alloc_type[i + CURSEG_HOT_DATA] =
+                                curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
+        }
+        ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
+        ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
+        ckpt->next_free_nid = cpu_to_le32(last_nid);
+        /* 2 cp  + n data seg summary + orphan inode blocks */
+        data_sum_blocks = npages_for_summary_flush(sbi);
+        if (data_sum_blocks < 3)
+                set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
+        else
+                clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
+        /* n_orphans rounded up to whole orphan blocks */
+        orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1)
+                                        / F2FS_ORPHANS_PER_BLOCK;
+        /* summaries follow the first cp block and the orphan blocks */
+        ckpt->cp_pack_start_sum = cpu_to_le32(1 + orphan_blocks);
+        if (is_umount) {
+                set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+                ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
+                        data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE);
+        } else {
+                clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+                ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
+                        data_sum_blocks + orphan_blocks);
+        }
+        if (sbi->n_orphans)
+                set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
+        else
+                clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
+        /* update SIT/NAT bitmap */
+        get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
+        get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
+        /* CRC covers the first checksum_offset bytes and is stored right there */
+        crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
+        *(__le32 *)((unsigned char *)ckpt +
+                                le32_to_cpu(ckpt->checksum_offset))
+                                = cpu_to_le32(crc32);
+        start_blk = __start_cp_addr(sbi);
+        /* write out checkpoint buffer at block 0 */
+        cp_page = grab_meta_page(sbi, start_blk++);
+        kaddr = page_address(cp_page);
+        memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
+        set_page_dirty(cp_page);
+        f2fs_put_page(cp_page, 1);
+        if (sbi->n_orphans) {
+                write_orphan_inodes(sbi, start_blk);
+                start_blk += orphan_blocks;
+        }
+        write_data_summaries(sbi, start_blk);
+        start_blk += data_sum_blocks;
+        if (is_umount) {
+                write_node_summaries(sbi, start_blk);
+                start_blk += NR_CURSEG_NODE_TYPE;
+        }
+        /* writeout checkpoint block (second copy, last block of the pack) */
+        cp_page = grab_meta_page(sbi, start_blk);
+        kaddr = page_address(cp_page);
+        memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
+        set_page_dirty(cp_page);
+        f2fs_put_page(cp_page, 1);
+        /* wait for previous submitted node/meta pages writeback */
+        while (get_pages(sbi, F2FS_WRITEBACK))
+                congestion_wait(BLK_RW_ASYNC, HZ / 50);
+        filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX);
+        filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX);
+        /* update user_block_counts */
+        sbi->last_valid_block_count = sbi->total_valid_block_count;
+        sbi->alloc_valid_block_count = 0;
+        /*
+         * Here, we only have one bio having CP pack.  With CP_ERROR_FLAG set
+         * the pack is not submitted and the superblock becomes read-only.
+         */
+        if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))
+                sbi->sb->s_flags |= MS_RDONLY;
+        else
+                sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
+        clear_prefree_segments(sbi);
+        F2FS_RESET_SB_DIRT(sbi);
+}
+/*
+ * We guarantee that this checkpoint procedure should not fail.
+ *
+ * @blocked:   false makes this function take cp_mutex and call
+ *             block_operations() itself.  With true both steps are skipped,
+ *             yet the locks and cp_mutex are released at the end all the
+ *             same, so the caller must already hold them.
+ * @is_umount: passed through to do_checkpoint().
+ */
+void write_checkpoint(struct f2fs_sb_info *sbi, bool blocked, bool is_umount)
+{
+        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+        unsigned long long ckpt_ver;
+        if (!blocked) {
+                mutex_lock(&sbi->cp_mutex);
+                block_operations(sbi);
+        }
+        f2fs_submit_bio(sbi, DATA, true);
+        f2fs_submit_bio(sbi, NODE, true);
+        f2fs_submit_bio(sbi, META, true);
+        /*
+         * update checkpoint pack index
+         * Increase the version number so that
+         * SIT entries and seg summaries are written at correct place
+         */
+        ckpt_ver = le64_to_cpu(ckpt->checkpoint_ver);
+        ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
+        /* write cached NAT/SIT entries to NAT/SIT area */
+        flush_nat_entries(sbi);
+        flush_sit_entries(sbi);
+        reset_victim_segmap(sbi);
+        /* write the checkpoint pack, then release the operation locks */
+        do_checkpoint(sbi, is_umount);
+        unblock_operations(sbi);
+        mutex_unlock(&sbi->cp_mutex);
+}
+/*
+ * Put the orphan bookkeeping of @sbi into its initial state: an empty
+ * list, a zero count and an initialised mutex.
+ */
+void init_orphan_info(struct f2fs_sb_info *sbi)
+{
+        sbi->n_orphans = 0;
+        INIT_LIST_HEAD(&sbi->orphan_inode_list);
+        mutex_init(&sbi->orphan_inode_mutex);
+}
+/*
+ * Create the two slab caches used in this file: one for orphan inode
+ * entries and one for dirty directory entries.
+ *
+ * Returns 0 on success or -ENOMEM; on failure no cache is left behind.
+ */
+int create_checkpoint_caches(void)
+{
+        orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry",
+                        sizeof(struct orphan_inode_entry), NULL);
+        if (unlikely(!orphan_entry_slab))
+                goto fail;
+        inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
+                        sizeof(struct dir_inode_entry), NULL);
+        if (unlikely(!inode_entry_slab))
+                goto destroy_orphan_entry;
+        return 0;
+destroy_orphan_entry:
+        kmem_cache_destroy(orphan_entry_slab);
+fail:
+        return -ENOMEM;
+}
+/*
+ * Destroy both slab caches set up by create_checkpoint_caches(), in the
+ * reverse order of their creation.
+ */
+void destroy_checkpoint_caches(void)
+{
+        kmem_cache_destroy(inode_entry_slab);
+        kmem_cache_destroy(orphan_entry_slab);
+}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
new file mode 100644
index 000000000000..655aeabc1dd4
--- /dev/null
+++ b/fs/f2fs/data.c
@@ -0,0 +1,702 @@
+/*
+ * fs/f2fs/data.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+/*
+ * Lock ordering for the change of data block address:
+ * ->data_page
+ *  ->node_page
+ *    update block addresses in the node page
+ */
+/*
+ * Store @new_addr in slot dn->ofs_in_node of the block address array kept
+ * in dn->node_page and mark that node page dirty.  Any writeback in flight
+ * on the node page is waited for before the page is modified.
+ */
+static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
+{
+        struct page *npage = dn->node_page;
+        unsigned int slot = dn->ofs_in_node;
+        __le32 *blkaddrs;
+        wait_on_page_writeback(npage);
+        /* The address array lives inside the node block, little-endian */
+        blkaddrs = blkaddr_in_node((struct f2fs_node *)page_address(npage));
+        blkaddrs[slot] = cpu_to_le32(new_addr);
+        set_page_dirty(npage);
+}
+/*
+ * Reserve one block for the slot described by @dn and record NEW_ADDR for
+ * it, both in the node page and in dn->data_blkaddr.
+ *
+ * Returns 0 on success, -EPERM when the inode carries FI_NO_ALLOC, or
+ * -ENOSPC when inc_valid_block_count() refuses the extra block.
+ */
+int reserve_new_block(struct dnode_of_data *dn)
+{
+        struct inode *inode = dn->inode;
+        if (is_inode_flag_set(F2FS_I(inode), FI_NO_ALLOC))
+                return -EPERM;
+        if (!inc_valid_block_count(F2FS_SB(inode->i_sb), inode, 1))
+                return -ENOSPC;
+        __set_data_blkaddr(dn, NEW_ADDR);
+        dn->data_blkaddr = NEW_ADDR;
+        sync_inode_page(dn);
+        return 0;
+}
+/*
+ * Try to map page offset @pgofs of @inode through the single extent cached
+ * in the in-memory inode.
+ *
+ * On a hit, @bh_result is mapped to the matching block, its b_size is set
+ * to the number of bytes left in the extent from @pgofs (capped at
+ * UINT_MAX) and 1 is returned.  Otherwise 0 is returned and @bh_result is
+ * left alone.  The extent is read under fi->ext.ext_lock (read side).
+ */
+static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
+                                        struct buffer_head *bh_result)
+{
+        struct f2fs_inode_info *fi = F2FS_I(inode);
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        unsigned int blkbits = inode->i_sb->s_blocksize_bits;
+        pgoff_t first, last;
+        block_t first_blk;
+        size_t nblocks;
+        int hit = 0;
+        read_lock(&fi->ext.ext_lock);
+        /* nothing cached: not even counted as a lookup */
+        if (fi->ext.len == 0)
+                goto out;
+        sbi->total_hit_ext++;
+        first = fi->ext.fofs;
+        last = fi->ext.fofs + fi->ext.len - 1;
+        first_blk = fi->ext.blk_addr;
+        if (pgofs < first || pgofs > last)
+                goto out;
+        clear_buffer_new(bh_result);
+        map_bh(bh_result, inode->i_sb, first_blk + pgofs - first);
+        /* blocks remaining in the extent, starting at pgofs */
+        nblocks = last - pgofs + 1;
+        bh_result->b_size = (nblocks < (UINT_MAX >> blkbits)) ?
+                                (nblocks << blkbits) : UINT_MAX;
+        sbi->read_hit_ext++;
+        hit = 1;
+out:
+        read_unlock(&fi->ext.ext_lock);
+        return hit;
+}
+/*
+ * Record @blk_addr as the new block address of the data block described by
+ * @dn and bring the inode's single cached extent in line with it.
+ *
+ * The node page slot is always updated.  The cached extent is then, under
+ * the write side of fi->ext.ext_lock, either (re)initialised, grown by one
+ * block at the front or back, or shrunk to one side of the updated offset.
+ * sync_inode_page() is called only when the extent was actually touched.
+ * @blk_addr must not be NEW_ADDR.
+ */
+void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
+{
+        struct f2fs_inode_info *fi = F2FS_I(dn->inode);
+        pgoff_t fofs, start_fofs, end_fofs;
+        block_t start_blkaddr, end_blkaddr;
+        BUG_ON(blk_addr == NEW_ADDR);
+        /* file offset (in blocks) of the slot being updated */
+        fofs = start_bidx_of_node(ofs_of_node(dn->node_page)) + dn->ofs_in_node;
+        /* Update the page address in the parent node */
+        __set_data_blkaddr(dn, blk_addr);
+        write_lock(&fi->ext.ext_lock);
+        /* snapshot of the extent as it was before this update */
+        start_fofs = fi->ext.fofs;
+        end_fofs = fi->ext.fofs + fi->ext.len - 1;
+        start_blkaddr = fi->ext.blk_addr;
+        end_blkaddr = fi->ext.blk_addr + fi->ext.len - 1;
+        /* Drop and initialize the matched extent */
+        if (fi->ext.len == 1 && fofs == start_fofs)
+                fi->ext.len = 0;
+        /* Initial extent */
+        if (fi->ext.len == 0) {
+                if (blk_addr != NULL_ADDR) {
+                        fi->ext.fofs = fofs;
+                        fi->ext.blk_addr = blk_addr;
+                        fi->ext.len = 1;
+                }
+                goto end_update;
+        }
+        /* Front merge */
+        if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) {
+                fi->ext.fofs--;
+                fi->ext.blk_addr--;
+                fi->ext.len++;
+                goto end_update;
+        }
+        /* Back merge */
+        if (fofs == end_fofs + 1 && blk_addr == end_blkaddr + 1) {
+                fi->ext.len++;
+                goto end_update;
+        }
+        /* Split the existing extent */
+        if (fi->ext.len > 1 &&
+                fofs >= start_fofs && fofs <= end_fofs) {
+                /*
+                 * Keep the part in front of fofs when fewer than half of
+                 * the blocks lie behind it, otherwise keep the part behind.
+                 */
+                if ((end_fofs - fofs) < (fi->ext.len >> 1)) {
+                        fi->ext.len = fofs - start_fofs;
+                } else {
+                        fi->ext.fofs = fofs + 1;
+                        fi->ext.blk_addr = start_blkaddr +
+                                        fofs - start_fofs + 1;
+                        fi->ext.len -= fofs - start_fofs + 1;
+                }
+                goto end_update;
+        }
+        /* the update is unrelated to the cached extent: leave it as is */
+        write_unlock(&fi->ext.ext_lock);
+        return;
+end_update:
+        write_unlock(&fi->ext.ext_lock);
+        sync_inode_page(dn);
+        return;
+}
+/*
+ * Return the data page of @inode at @index with a reference held and the
+ * page unlocked.
+ *
+ * An up-to-date page already in the page cache is returned directly.
+ * Otherwise the block address is looked up and the page is read in
+ * synchronously.  Returns ERR_PTR(-ENOENT) for a hole (NULL_ADDR),
+ * ERR_PTR(-EINVAL) for a reserved but unwritten block (NEW_ADDR),
+ * ERR_PTR(-ENOMEM) if no cache page can be grabbed, or the error from the
+ * node lookup / read.
+ */
+struct page *find_data_page(struct inode *inode, pgoff_t index)
+{
+        struct address_space *mapping = inode->i_mapping;
+        struct dnode_of_data dn;
+        struct page *page;
+        int err;
+        page = find_get_page(mapping, index);
+        if (page && PageUptodate(page))
+                return page;
+        f2fs_put_page(page, 0);
+        set_new_dnode(&dn, inode, NULL, NULL, 0);
+        err = get_dnode_of_data(&dn, index, RDONLY_NODE);
+        if (err)
+                goto fail;
+        /* only the looked-up address is needed from here on */
+        f2fs_put_dnode(&dn);
+        if (dn.data_blkaddr == NULL_ADDR) {
+                err = -ENOENT;
+                goto fail;
+        }
+        /* By fallocate(), there is no cached page, but with NEW_ADDR */
+        if (dn.data_blkaddr == NEW_ADDR) {
+                err = -EINVAL;
+                goto fail;
+        }
+        page = grab_cache_page(mapping, index);
+        if (!page) {
+                err = -ENOMEM;
+                goto fail;
+        }
+        err = f2fs_readpage(F2FS_SB(inode->i_sb), page,
+                                        dn.data_blkaddr, READ_SYNC);
+        if (err) {
+                f2fs_put_page(page, 1);
+                goto fail;
+        }
+        unlock_page(page);
+        return page;
+fail:
+        return ERR_PTR(err);
+}
+/*
+ * Return the data page of @inode at @index, locked and with a reference.
+ *
+ * A hole is reported as ERR_PTR(-ENOENT) rather than filled in, because
+ * the callers (functions in dir.c and GC) need to know whether the page
+ * exists or not.  Other failures are returned as ERR_PTR() as well.
+ */
+struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
+{
+        struct address_space *mapping = inode->i_mapping;
+        struct dnode_of_data dn;
+        struct page *page;
+        int err;
+        set_new_dnode(&dn, inode, NULL, NULL, 0);
+        err = get_dnode_of_data(&dn, index, RDONLY_NODE);
+        if (err)
+                goto fail;
+        f2fs_put_dnode(&dn);
+        if (dn.data_blkaddr == NULL_ADDR) {
+                err = -ENOENT;
+                goto fail;
+        }
+        page = grab_cache_page(mapping, index);
+        if (!page) {
+                err = -ENOMEM;
+                goto fail;
+        }
+        /* a cached, up-to-date page needs no read */
+        if (!PageUptodate(page)) {
+                BUG_ON(dn.data_blkaddr == NEW_ADDR);
+                BUG_ON(dn.data_blkaddr == NULL_ADDR);
+                err = f2fs_readpage(F2FS_SB(inode->i_sb), page,
+                                        dn.data_blkaddr, READ_SYNC);
+                if (err) {
+                        f2fs_put_page(page, 1);
+                        goto fail;
+                }
+        }
+        return page;
+fail:
+        return ERR_PTR(err);
+}
+/*
+ * Return a locked, up-to-date page cache page for block @index of @inode,
+ * reserving a block for it first when the slot is still NULL_ADDR.
+ *
+ * Caller ensures that this data page is never allocated.
+ * A new zero-filled data page is allocated in the page cache; if the slot
+ * already points at a real block, that block is read in instead.
+ *
+ * @new_i_size: when true, i_size is raised to the end of this page if it
+ *              is currently smaller, and the inode is marked dirty.
+ *
+ * Returns the page, or ERR_PTR() carrying the error of the node lookup,
+ * of reserve_new_block() (-EPERM or -ENOSPC), -ENOMEM, or of the read.
+ */
+struct page *get_new_data_page(struct inode *inode, pgoff_t index,
+                                                bool new_i_size)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        struct address_space *mapping = inode->i_mapping;
+        struct page *page;
+        struct dnode_of_data dn;
+        loff_t new_size;
+        int err;
+        set_new_dnode(&dn, inode, NULL, NULL, 0);
+        err = get_dnode_of_data(&dn, index, 0);
+        if (err)
+                return ERR_PTR(err);
+        if (dn.data_blkaddr == NULL_ADDR) {
+                /* hand the real reason back instead of a blanket -ENOSPC */
+                err = reserve_new_block(&dn);
+                if (err) {
+                        f2fs_put_dnode(&dn);
+                        return ERR_PTR(err);
+                }
+        }
+        f2fs_put_dnode(&dn);
+        page = grab_cache_page(mapping, index);
+        if (!page)
+                return ERR_PTR(-ENOMEM);
+        if (PageUptodate(page))
+                return page;
+        if (dn.data_blkaddr == NEW_ADDR) {
+                zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+        } else {
+                err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
+                if (err) {
+                        f2fs_put_page(page, 1);
+                        return ERR_PTR(err);
+                }
+        }
+        SetPageUptodate(page);
+        /*
+         * Widen before shifting: pgoff_t may be only 32 bits wide, in which
+         * case (index + 1) << PAGE_CACHE_SHIFT would wrap for large files.
+         */
+        new_size = (loff_t)(index + 1) << PAGE_CACHE_SHIFT;
+        if (new_i_size && i_size_read(inode) < new_size) {
+                i_size_write(inode, new_size);
+                mark_inode_dirty_sync(inode);
+        }
+        return page;
+}
+/*
+ * Completion handler for the read bios built by f2fs_readpage().
+ *
+ * Walks the bio's vectors from the last one down to the first.  Each page
+ * is marked up to date on success, or has its uptodate flag cleared and
+ * the error flag set on failure, and is then unlocked.  Finally the
+ * bi_private allocation and the bio itself are released.
+ * @err is not looked at; success is taken from the BIO_UPTODATE flag.
+ */
+static void read_end_io(struct bio *bio, int err)
+{
+        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+        do {
+                struct page *page = bvec->bv_page;
+                /* step to the previous vector and prefetch its page flags */
+                if (--bvec >= bio->bi_io_vec)
+                        prefetchw(&bvec->bv_page->flags);
+                if (uptodate) {
+                        SetPageUptodate(page);
+                } else {
+                        ClearPageUptodate(page);
+                        SetPageError(page);
+                }
+                unlock_page(page);
+        } while (bvec >= bio->bi_io_vec);
+        kfree(bio->bi_private);
+        bio_put(bio);
+}
+/*
+ * Fill the locked page with data located in the block address.
+ *
+ * @sbi:      f2fs superblock info
+ * @page:     locked page to read into
+ * @blk_addr: block address to read from
+ * @type:     passed to submit_bio(); READ_SYNC selects the synchronous mode
+ *
+ * With READ_SYNC the function waits for completion by taking the page lock
+ * again after submission, so the page is locked on return and the caller
+ * must unlock it; -EIO is returned if the page then has its error flag set.
+ * With any other @type the read is only submitted and the page is unlocked
+ * by read_end_io() on completion (or right here if it was already up to
+ * date).
+ *
+ * Returns 0 on success, -EFAULT if the page could not be added to the bio,
+ * or -EIO as described above.
+ */
+int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
+                                        block_t blk_addr, int type)
+{
+        struct block_device *bdev = sbi->sb->s_bdev;
+        bool sync = (type == READ_SYNC);
+        struct bio *bio;
+        /* This page can be already read by other threads */
+        if (PageUptodate(page)) {
+                if (!sync)
+                        unlock_page(page);
+                return 0;
+        }
+        down_read(&sbi->bio_sem);
+        /* Allocate a new bio */
+        /* NOTE(review): result is used unchecked -- confirm f2fs_bio_alloc() cannot return NULL */
+        bio = f2fs_bio_alloc(bdev, 1);
+        /* Initialize the bio */
+        bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
+        bio->bi_end_io = read_end_io;
+        if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+                /* nothing submitted: release what read_end_io() would have */
+                kfree(bio->bi_private);
+                bio_put(bio);
+                up_read(&sbi->bio_sem);
+                return -EFAULT;
+        }
+        submit_bio(type, bio);
+        up_read(&sbi->bio_sem);
+        /* wait for read completion if sync */
+        if (sync) {
+                lock_page(page);
+                if (PageError(page))
+                        return -EIO;
+        }
+        return 0;
+}
+/*
+ * Read-only get_block callback: map file block @iblock of @inode into
+ * @bh_result without ever allocating.
+ *
+ * This function should be used by the data read flow only where it
+ * does not check the "create" flag that indicates block allocation.
+ * The reason for this special functionality is to exploit VFS readahead
+ * mechanism.
+ *
+ * Holes, NEW_ADDR blocks and a missing node (-ENOENT) all leave
+ * @bh_result unmapped and return 0.  For a real block, @bh_result is
+ * mapped and b_size covers as many blocks as are physically consecutive in
+ * the same node page, limited by the b_size passed in.
+ * Returns 0 or a negative error from the node lookup.
+ */
+static int get_data_block_ro(struct inode *inode, sector_t iblock,
+                        struct buffer_head *bh_result, int create)
+{
+        unsigned int blkbits = inode->i_sb->s_blocksize_bits;
+        /* upper bound on the mapping length, taken from the caller's b_size */
+        unsigned maxblocks = bh_result->b_size >> blkbits;
+        struct dnode_of_data dn;
+        pgoff_t pgofs;
+        int err;
+        /* Get the page offset from the block offset(iblock) */
+        pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
+        /* fast path: answered from the inode's cached extent */
+        if (check_extent_cache(inode, pgofs, bh_result))
+                return 0;
+        /* When reading holes, we need its node page */
+        set_new_dnode(&dn, inode, NULL, NULL, 0);
+        err = get_dnode_of_data(&dn, pgofs, RDONLY_NODE);
+        if (err)
+                return (err == -ENOENT) ? 0 : err;
+        /* It does not support data allocation */
+        BUG_ON(create);
+        if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) {
+                int i;
+                unsigned int end_offset;
+                /* number of address slots in this kind of node page */
+                end_offset = IS_INODE(dn.node_page) ?
+                                ADDRS_PER_INODE :
+                                ADDRS_PER_BLOCK;
+                clear_buffer_new(bh_result);
+                /* Give more consecutive addresses for the read ahead */
+                for (i = 0; i < end_offset - dn.ofs_in_node; i++)
+                        if (((datablock_addr(dn.node_page,
+                                                        dn.ofs_in_node + i))
+                                != (dn.data_blkaddr + i)) || maxblocks == i)
+                                break;
+                map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
+                bh_result->b_size = (i << blkbits);
+        }
+        f2fs_put_dnode(&dn);
+        return 0;
+}
+/*
+ * ->readpage: read a single data page through the generic mpage helper,
+ * using get_data_block_ro() to map blocks.  @file is unused.
+ */
+static int f2fs_read_data_page(struct file *file, struct page *page)
+{
+        return mpage_readpage(page, get_data_block_ro);
+}
+/*
+ * ->readpages: read a list of @nr_pages data pages through the generic
+ * mpage helper, using get_data_block_ro() to map blocks.  @file is unused.
+ */
+static int f2fs_read_data_pages(struct file *file,
+                        struct address_space *mapping,
+                        struct list_head *pages, unsigned nr_pages)
+{
+        return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro);
+}
+/*
+ * Write one data page to disk.
+ *
+ * Looks up the current block address of @page.  A page whose slot is
+ * NULL_ADDR has been truncated and is skipped.  Otherwise the page is put
+ * under writeback and either rewritten in place or written to a new
+ * block, in which case the extent cache and the inode's data_version are
+ * updated too.
+ *
+ * Returns 0, or the error from get_dnode_of_data().
+ */
+int do_write_data_page(struct page *page)
+{
+        struct inode *inode = page->mapping->host;
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        struct dnode_of_data dn;
+        block_t old_addr, new_addr;
+        int err;
+        set_new_dnode(&dn, inode, NULL, NULL, 0);
+        err = get_dnode_of_data(&dn, page->index, RDONLY_NODE);
+        if (err)
+                return err;
+        old_addr = dn.data_blkaddr;
+        /* NULL_ADDR: this page is already truncated, nothing to write */
+        if (old_addr != NULL_ADDR) {
+                set_page_writeback(page);
+                /*
+                 * If current allocation needs SSR,
+                 * it had better in-place writes for updated data.
+                 * Fresh (NEW_ADDR) and cold pages always get a new block.
+                 */
+                if (old_addr == NEW_ADDR || is_cold_data(page) ||
+                                        !need_inplace_update(inode)) {
+                        write_data_page(inode, page, &dn,
+                                        old_addr, &new_addr);
+                        update_extent_cache(new_addr, &dn);
+                        F2FS_I(inode)->data_version =
+                                le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver);
+                } else {
+                        rewrite_data_page(sbi, page, old_addr);
+                }
+        }
+        f2fs_put_dnode(&dn);
+        return err;
+}
+/*
+ * ->writepage for data pages.
+ *
+ * Pages entirely beyond i_size are dropped without being written; the
+ * page straddling i_size has its tail zeroed first.  The write is
+ * postponed (page redirtied, AOP_WRITEPAGE_ACTIVATE returned with the page
+ * still locked) while sbi->por_doing is set, and for reclaim-driven
+ * writeback of regular-file pages that are not marked cold.
+ *
+ * Returns 0 when the page was handled (written, skipped or redirtied after
+ * a write error) and has been unlocked, or AOP_WRITEPAGE_ACTIVATE.
+ */
+static int f2fs_write_data_page(struct page *page,
+                                        struct writeback_control *wbc)
+{
+        struct inode *inode = page->mapping->host;
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        loff_t i_size = i_size_read(inode);
+        const pgoff_t end_index = ((unsigned long long) i_size)
+                                                        >> PAGE_CACHE_SHIFT;
+        unsigned offset;
+        int err = 0;
+        /* page lies completely inside i_size: write it as it is */
+        if (page->index < end_index)
+                goto out;
+        /*
+         * If the offset is out-of-range of file size,
+         * this page does not have to be written to disk.
+         */
+        offset = i_size & (PAGE_CACHE_SIZE - 1);
+        if ((page->index >= end_index + 1) || !offset) {
+                /* the dropped page no longer counts as a dirty dentry page */
+                if (S_ISDIR(inode->i_mode)) {
+                        dec_page_count(sbi, F2FS_DIRTY_DENTS);
+                        inode_dec_dirty_dents(inode);
+                }
+                goto unlock_out;
+        }
+        /* last, partial page: clear everything past i_size */
+        zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+out:
+        if (sbi->por_doing)
+                goto redirty_out;
+        if (wbc->for_reclaim && !S_ISDIR(inode->i_mode) && !is_cold_data(page))
+                goto redirty_out;
+        mutex_lock_op(sbi, DATA_WRITE);
+        if (S_ISDIR(inode->i_mode)) {
+                dec_page_count(sbi, F2FS_DIRTY_DENTS);
+                inode_dec_dirty_dents(inode);
+        }
+        err = do_write_data_page(page);
+        /* a real write error: keep the page dirty so it is retried */
+        if (err && err != -ENOENT) {
+                wbc->pages_skipped++;
+                set_page_dirty(page);
+        }
+        mutex_unlock_op(sbi, DATA_WRITE);
+        if (wbc->for_reclaim)
+                f2fs_submit_bio(sbi, DATA, true);
+        /* -ENOENT from the node lookup is not reported to the caller */
+        if (err == -ENOENT)
+                goto unlock_out;
+        clear_cold_data(page);
+        unlock_page(page);
+        if (!wbc->for_reclaim && !S_ISDIR(inode->i_mode))
+                f2fs_balance_fs(sbi);
+        return 0;
+unlock_out:
+        unlock_page(page);
+        return (err == -ENOENT) ? 0 : err;
+redirty_out:
+        /* the page stays locked on this path */
+        wbc->pages_skipped++;
+        set_page_dirty(page);
+        return AOP_WRITEPAGE_ACTIVATE;
+}
+/* Minimum number of pages one ->writepages call is allowed to write */
+#define MAX_DESIRED_PAGES_WP    4096
+/*
+ * ->writepages for data pages.
+ *
+ * Raises wbc->nr_to_write to at least MAX_DESIRED_PAGES_WP for the
+ * duration of the call and takes the artificial surplus off again
+ * afterwards.  Writeback of non-directory inodes is serialised with
+ * sbi->writepages.  Pending DATA bios are submitted at the end and the
+ * inode is removed from the dirty directory list.
+ *
+ * Returns the result of generic_writepages().
+ */
+static int f2fs_write_data_pages(struct address_space *mapping,
+                            struct writeback_control *wbc)
+{
+        struct inode *inode = mapping->host;
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        const bool is_dir = S_ISDIR(inode->i_mode);
+        long surplus = 0;
+        int ret;
+        if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) {
+                surplus = MAX_DESIRED_PAGES_WP - wbc->nr_to_write;
+                wbc->nr_to_write = MAX_DESIRED_PAGES_WP;
+        }
+        if (is_dir) {
+                ret = generic_writepages(mapping, wbc);
+        } else {
+                mutex_lock(&sbi->writepages);
+                ret = generic_writepages(mapping, wbc);
+                mutex_unlock(&sbi->writepages);
+        }
+        f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL));
+        remove_dirty_dir_inode(inode);
+        /* give back the pages that were only added to the budget above */
+        wbc->nr_to_write -= surplus;
+        return ret;
+}
+/*
+ * ->write_begin: get the page cache page covering @pos ready for a
+ * buffered write of @len bytes.
+ *
+ * Grabs and locks the page, makes sure a block is reserved for it (under
+ * the DATA_NEW operation lock) and brings the parts of the page that will
+ * not be overwritten up to date: nothing to do for a full-page write or an
+ * up-to-date page, zero-fill around the write for a page at or beyond
+ * i_size or for a NEW_ADDR block, otherwise a synchronous read.
+ *
+ * On success 0 is returned and *pagep holds the locked page.  On failure
+ * the page has been unlocked and released and a negative error is
+ * returned.  *fsdata is always set to NULL.
+ */
+static int f2fs_write_begin(struct file *file, struct address_space *mapping,
+                loff_t pos, unsigned len, unsigned flags,
+                struct page **pagep, void **fsdata)
+{
+        struct inode *inode = mapping->host;
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        struct page *page;
+        pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
+        struct dnode_of_data dn;
+        int err = 0;
+        /* for nobh_write_end */
+        *fsdata = NULL;
+        f2fs_balance_fs(sbi);
+        page = grab_cache_page_write_begin(mapping, index, flags);
+        if (!page)
+                return -ENOMEM;
+        *pagep = page;
+        mutex_lock_op(sbi, DATA_NEW);
+        set_new_dnode(&dn, inode, NULL, NULL, 0);
+        err = get_dnode_of_data(&dn, index, 0);
+        if (err) {
+                mutex_unlock_op(sbi, DATA_NEW);
+                f2fs_put_page(page, 1);
+                return err;
+        }
+        if (dn.data_blkaddr == NULL_ADDR) {
+                err = reserve_new_block(&dn);
+                if (err) {
+                        f2fs_put_dnode(&dn);
+                        mutex_unlock_op(sbi, DATA_NEW);
+                        f2fs_put_page(page, 1);
+                        return err;
+                }
+        }
+        f2fs_put_dnode(&dn);
+        mutex_unlock_op(sbi, DATA_NEW);
+        /* whole page overwritten, or contents already valid */
+        if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
+                return 0;
+        /*
+         * Round pos down to its page boundary with a 64-bit mask.
+         * PAGE_CACHE_MASK has the width of unsigned long, so masking a
+         * loff_t with it would clear the upper bits of pos where long is
+         * 32 bits wide and misjudge writes beyond 4GB.
+         */
+        if ((pos & ~((loff_t)PAGE_CACHE_SIZE - 1)) >= i_size_read(inode)) {
+                unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+                unsigned end = start + len;
+                /* Reading beyond i_size is simple: memset to zero */
+                zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
+                return 0;
+        }
+        if (dn.data_blkaddr == NEW_ADDR) {
+                /* reserved but never written: there is nothing to read */
+                zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+        } else {
+                err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
+                if (err) {
+                        f2fs_put_page(page, 1);
+                        return err;
+                }
+        }
+        SetPageUptodate(page);
+        clear_cold_data(page);
+        return 0;
+}
+/*
+ * ->direct_IO: direct reads go through blockdev_direct_IO() with the
+ * read-only block mapper; direct writes are not performed here and simply
+ * report 0 bytes.
+ * NOTE(review): returning 0 presumably makes the caller fall back to
+ * buffered writes -- confirm against the generic write path.
+ */
+static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
+                const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+{
+        struct inode *inode = iocb->ki_filp->f_mapping->host;
+        if (rw == WRITE)
+                return 0;
+        /* Needs synchronization with the cleaner */
+        return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+                                                  get_data_block_ro);
+}
+/*
+ * ->invalidatepage: a dirty page of a directory stops counting as a dirty
+ * dentry page, both globally and per inode; the private flag is cleared in
+ * every case.  @offset is unused.
+ */
+static void f2fs_invalidate_data_page(struct page *page, unsigned long offset)
+{
+        struct inode *inode = page->mapping->host;
+        if (PageDirty(page) && S_ISDIR(inode->i_mode)) {
+                dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
+                inode_dec_dirty_dents(inode);
+        }
+        ClearPagePrivate(page);
+}
+/*
+ * ->releasepage: clear the page's private flag and return 0.
+ * @wait is unused.
+ * NOTE(review): a 0 return from ->releasepage conventionally tells the VM
+ * the page could not be released -- confirm this is intended.
+ */
+static int f2fs_release_data_page(struct page *page, gfp_t wait)
+{
+        ClearPagePrivate(page);
+        return 0;
+}
+/*
+ * ->set_page_dirty: mark @page up to date and, unless it is dirty already,
+ * dirty it and hand it to set_dirty_dir_page() for the owning inode.
+ *
+ * Returns 1 if the page was newly dirtied, 0 if it was dirty before.
+ */
+static int f2fs_set_data_page_dirty(struct page *page)
+{
+        struct inode *inode = page->mapping->host;
+        SetPageUptodate(page);
+        if (PageDirty(page))
+                return 0;
+        __set_page_dirty_nobuffers(page);
+        set_dirty_dir_page(inode, page);
+        return 1;
+}
+/*
+ * Address space operations for f2fs data pages.  Everything except
+ * ->write_end is implemented in this file; ->write_end uses the generic
+ * nobh_write_end() helper (f2fs_write_begin() sets *fsdata to NULL for it).
+ */
+const struct address_space_operations f2fs_dblock_aops = {
+        .readpage       = f2fs_read_data_page,
+        .readpages      = f2fs_read_data_pages,
+        .writepage      = f2fs_write_data_page,
+        .writepages     = f2fs_write_data_pages,
+        .write_begin    = f2fs_write_begin,
+        .write_end      = nobh_write_end,
+        .set_page_dirty = f2fs_set_data_page_dirty,
+        .invalidatepage = f2fs_invalidate_data_page,
+        .releasepage    = f2fs_release_data_page,
+        .direct_IO      = f2fs_direct_IO,
+};
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
new file mode 100644
index 000000000000..0e0380a588ad
--- /dev/null
+++ b/fs/f2fs/debug.c
@@ -0,0 +1,361 @@
+/*
+ * f2fs debugging statistics
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ * Copyright (c) 2012 Linux Foundation
+ * Copyright (c) 2012 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/backing-dev.h>
+#include <linux/proc_fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/blkdev.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+#include "gc.h"
+/* List of struct f2fs_stat_info (linked via ->stat_list), walked by stat_show() */
+static LIST_HEAD(f2fs_stat_list);
+/* NOTE(review): presumably the debugfs directory/file created for f2fs -- confirm at the creation site */
+static struct dentry *debugfs_root;
+/*
+ * Refresh the general counters in sbi->stat_info from the live state of
+ * @sbi: extent cache hit counts, dirty page counts, segment/section usage,
+ * cached NAT/SIT/free-nid counts, utilisation figures and the position of
+ * every current segment.
+ */
+static void update_general_status(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_stat_info *si = sbi->stat_info;
+        int i;
+        /* valid check of the segment numbers */
+        si->hit_ext = sbi->read_hit_ext;
+        si->total_ext = sbi->total_hit_ext;
+        si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
+        si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
+        si->ndirty_dirs = sbi->n_dirty_dirs;
+        si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
+        si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
+        si->rsvd_segs = reserved_segments(sbi);
+        si->overp_segs = overprovision_segments(sbi);
+        si->valid_count = valid_user_blocks(sbi);
+        si->valid_node_count = valid_node_count(sbi);
+        si->valid_inode_count = valid_inode_count(sbi);
+        si->utilization = utilization(sbi);
+        si->free_segs = free_segments(sbi);
+        si->free_secs = free_sections(sbi);
+        si->prefree_count = prefree_segments(sbi);
+        si->dirty_count = dirty_segments(sbi);
+        si->node_pages = sbi->node_inode->i_mapping->nrpages;
+        si->meta_pages = sbi->meta_inode->i_mapping->nrpages;
+        si->nats = NM_I(sbi)->nat_cnt;
+        si->sits = SIT_I(sbi)->dirty_sentries;
+        si->fnids = NM_I(sbi)->fcnt;
+        si->bg_gc = sbi->bg_gc;
+        /*
+         * The three util_* values are computed in segments and scaled so
+         * that free + valid + invalid add up to 50, not 100.
+         * NOTE(review): divides by user_block_count expressed in segments;
+         * that is zero if user_block_count is below one segment.
+         */
+        si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
+                * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
+                / 2;
+        si->util_valid = (int)(written_block_count(sbi) >>
+                                                sbi->log_blocks_per_seg)
+                * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
+                / 2;
+        si->util_invalid = 50 - si->util_free - si->util_valid;
+        /* segment, section and zone number of each current segment */
+        for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) {
+                struct curseg_info *curseg = CURSEG_I(sbi, i);
+                si->curseg[i] = curseg->segno;
+                si->cursec[i] = curseg->segno / sbi->segs_per_sec;
+                si->curzone[i] = si->cursec[i] / sbi->secs_per_zone;
+        }
+        for (i = 0; i < 2; i++) {
+                si->segment_count[i] = sbi->segment_count[i];
+                si->block_count[i] = sbi->block_count[i];
+        }
+}
+/*
+ * This function calculates BDF of every segments
+ *
+ * Walks all sections under sit_i->sentry_lock and derives two figures for
+ * sbi->stat_info:
+ *  - bimodal:     sum over all sections of the squared distance between
+ *                 the section's valid block count and half a section,
+ *                 divided by (total_sections * half_section^2 / 100)
+ *  - avg_vblocks: average number of valid blocks over the sections that
+ *                 are neither empty nor completely full
+ *
+ * Both divisions are guarded, so a zero divisor yields 0 instead of a
+ * division fault.
+ * NOTE(review): the sums are kept in unsigned int and can wrap on large
+ * volumes; widening them needs a 64-bit division helper.
+ */
+static void update_sit_info(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_stat_info *si = sbi->stat_info;
+        unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist;
+        struct sit_info *sit_i = SIT_I(sbi);
+        unsigned int segno, vblocks;
+        int ndirty = 0;
+        bimodal = 0;
+        total_vblocks = 0;
+        blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
+        hblks_per_sec = blks_per_sec / 2;
+        mutex_lock(&sit_i->sentry_lock);
+        /* one iteration per section: segno is the section's first segment */
+        for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
+                vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+                dist = abs(vblocks - hblks_per_sec);
+                bimodal += dist * dist;
+                if (vblocks > 0 && vblocks < blks_per_sec) {
+                        total_vblocks += vblocks;
+                        ndirty++;
+                }
+        }
+        mutex_unlock(&sit_i->sentry_lock);
+        dist = sbi->total_sections * hblks_per_sec * hblks_per_sec / 100;
+        si->bimodal = dist ? bimodal / dist : 0;
+        /*
+         * si->dirty_count is a per-segment figure filled in elsewhere and
+         * need not agree with the per-section ndirty counted above, so the
+         * divisor itself has to be checked as well.
+         */
+        if (si->dirty_count && ndirty)
+                si->avg_vblocks = total_vblocks / ndirty;
+        else
+                si->avg_vblocks = 0;
+}
+/*
+ * This function calculates memory footprint.
+ *
+ * si->base_mem (the statically sized structures) is computed only while it
+ * is still zero; si->cache_mem (the dynamic caches) is recomputed on every
+ * call.
+ */
+static void update_mem_info(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_stat_info *si = sbi->stat_info;
+        unsigned npages;
+        if (!si->base_mem) {
+                si->base_mem = sizeof(struct f2fs_sb_info) +
+                                        sbi->sb->s_blocksize;
+                si->base_mem += 2 * sizeof(struct f2fs_inode_info);
+                si->base_mem += sizeof(*sbi->ckpt);
+                /* build sm */
+                si->base_mem += sizeof(struct f2fs_sm_info);
+                /* build sit */
+                si->base_mem += sizeof(struct sit_info);
+                si->base_mem += TOTAL_SEGS(sbi) * sizeof(struct seg_entry);
+                si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
+                si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi);
+                if (sbi->segs_per_sec > 1)
+                        si->base_mem += sbi->total_sections *
+                                sizeof(struct sec_entry);
+                si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
+                /* build free segmap */
+                si->base_mem += sizeof(struct free_segmap_info);
+                si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
+                si->base_mem += f2fs_bitmap_size(sbi->total_sections);
+                /* build curseg */
+                si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
+                si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE;
+                /* build dirty segmap */
+                si->base_mem += sizeof(struct dirty_seglist_info);
+                si->base_mem += NR_DIRTY_TYPE *
+                                f2fs_bitmap_size(TOTAL_SEGS(sbi));
+                si->base_mem += 2 * f2fs_bitmap_size(TOTAL_SEGS(sbi));
+                /* build nm */
+                si->base_mem += sizeof(struct f2fs_nm_info);
+                si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
+                /* build gc */
+                si->base_mem += sizeof(struct f2fs_gc_kthread);
+        }
+        /*
+         * free nids and cached NAT entries
+         * NOTE(review): these two terms add entry counts, not bytes, unlike
+         * the page and slab terms below -- confirm whether they should be
+         * scaled by the entry sizes.
+         */
+        si->cache_mem = NM_I(sbi)->fcnt;
+        si->cache_mem += NM_I(sbi)->nat_cnt;
+        /* node and meta page caches, in bytes */
+        npages = sbi->node_inode->i_mapping->nrpages;
+        si->cache_mem += npages << PAGE_CACHE_SHIFT;
+        npages = sbi->meta_inode->i_mapping->nrpages;
+        si->cache_mem += npages << PAGE_CACHE_SHIFT;
+        si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry);
+        si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
+}
+static int stat_show(struct seq_file *s, void *v)
+{
+        struct f2fs_stat_info *si, *next;
+        int i = 0;
+        int j;
+        list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) {
+                mutex_lock(&si->stat_lock);
+                if (!si->sbi) {
+                        mutex_unlock(&si->stat_lock);
+                        continue;
+                }
+                update_general_status(si->sbi);
+                seq_printf(s, "\n=====[ partition info. #%d ]=====\n", i++);
+                seq_printf(s, "[SB: 1] [CP: 2] [NAT: %d] [SIT: %d] ",
+                           si->nat_area_segs, si->sit_area_segs);
+                seq_printf(s, "[SSA: %d] [MAIN: %d",
+                           si->ssa_area_segs, si->main_area_segs);
+                seq_printf(s, "(OverProv:%d Resv:%d)]\n\n",
+                           si->overp_segs, si->rsvd_segs);
+                seq_printf(s, "Utilization: %d%% (%d valid blocks)\n",
+                           si->utilization, si->valid_count);
+                seq_printf(s, "  - Node: %u (Inode: %u, ",
+                           si->valid_node_count, si->valid_inode_count);
+                seq_printf(s, "Other: %u)\n  - Data: %u\n",
+                           si->valid_node_count - si->valid_inode_count,
+                           si->valid_count - si->valid_node_count);
+                seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
+                           si->main_area_segs, si->main_area_sections,
+                           si->main_area_zones);
+                seq_printf(s, "  - COLD  data: %d, %d, %d\n",
+                           si->curseg[CURSEG_COLD_DATA],
+                           si->cursec[CURSEG_COLD_DATA],
+                           si->curzone[CURSEG_COLD_DATA]);
+                seq_printf(s, "  - WARM  data: %d, %d, %d\n",
+                           si->curseg[CURSEG_WARM_DATA],
+                           si->cursec[CURSEG_WARM_DATA],
+                           si->curzone[CURSEG_WARM_DATA]);
+                seq_printf(s, "  - HOT   data: %d, %d, %d\n",
+                           si->curseg[CURSEG_HOT_DATA],
+                           si->cursec[CURSEG_HOT_DATA],
+                           si->curzone[CURSEG_HOT_DATA]);
+                seq_printf(s, "  - Dir   dnode: %d, %d, %d\n",
+                           si->curseg[CURSEG_HOT_NODE],
+                           si->cursec[CURSEG_HOT_NODE],
+                           si->curzone[CURSEG_HOT_NODE]);
+                seq_printf(s, "  - File   dnode: %d, %d, %d\n",
+                           si->curseg[CURSEG_WARM_NODE],
+                           si->cursec[CURSEG_WARM_NODE],
+                           si->curzone[CURSEG_WARM_NODE]);
+                seq_printf(s, "  - Indir nodes: %d, %d, %d\n",
+                           si->curseg[CURSEG_COLD_NODE],
+                           si->cursec[CURSEG_COLD_NODE],
+                           si->curzone[CURSEG_COLD_NODE]);
+                seq_printf(s, "\n  - Valid: %d\n  - Dirty: %d\n",
+                           si->main_area_segs - si->dirty_count -
+                           si->prefree_count - si->free_segs,
+                           si->dirty_count);
+                seq_printf(s, "  - Prefree: %d\n  - Free: %d (%d)\n\n",
+                           si->prefree_count, si->free_segs, si->free_secs);
+                seq_printf(s, "GC calls: %d (BG: %d)\n",
+                           si->call_count, si->bg_gc);
+                seq_printf(s, "  - data segments : %d\n", si->data_segs);
+                seq_printf(s, "  - node segments : %d\n", si->node_segs);
+                seq_printf(s, "Try to move %d blocks\n", si->tot_blks);
+                seq_printf(s, "  - data blocks : %d\n", si->data_blks);
+                seq_printf(s, "  - node blocks : %d\n", si->node_blks);
+                seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
+                           si->hit_ext, si->total_ext);
+                seq_printf(s, "\nBalancing F2FS Async:\n");
+                seq_printf(s, "  - nodes %4d in %4d\n",
+                           si->ndirty_node, si->node_pages);
+                seq_printf(s, "  - dents %4d in dirs:%4d\n",
+                           si->ndirty_dent, si->ndirty_dirs);
+                seq_printf(s, "  - meta %4d in %4d\n",
+                           si->ndirty_meta, si->meta_pages);
+                seq_printf(s, "  - NATs %5d > %lu\n",
+                           si->nats, NM_WOUT_THRESHOLD);
+                seq_printf(s, "  - SITs: %5d\n  - free_nids: %5d\n",
+                           si->sits, si->fnids);
+                seq_printf(s, "\nDistribution of User Blocks:");
+                seq_printf(s, " [ valid | invalid | free ]\n");
+                seq_printf(s, "  [");
+                for (j = 0; j < si->util_valid; j++)
+                        seq_printf(s, "-");
+                seq_printf(s, "|");
+                for (j = 0; j < si->util_invalid; j++)
+                        seq_printf(s, "-");
+                seq_printf(s, "|");
+                for (j = 0; j < si->util_free; j++)
+                        seq_printf(s, "-");
+                seq_printf(s, "]\n\n");
+                seq_printf(s, "SSR: %u blocks in %u segments\n",
+                           si->block_count[SSR], si->segment_count[SSR]);
+                seq_printf(s, "LFS: %u blocks in %u segments\n",
+                           si->block_count[LFS], si->segment_count[LFS]);
+                /* segment usage info */
+                update_sit_info(si->sbi);
+                seq_printf(s, "\nBDF: %u, avg. vblocks: %u\n",
+                           si->bimodal, si->avg_vblocks);
+                /* memory footprint */
+                update_mem_info(si->sbi);
+                seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n",
+                                (si->base_mem + si->cache_mem) >> 10,
+                                si->base_mem >> 10, si->cache_mem >> 10);
+                mutex_unlock(&si->stat_lock);
+        }
+        return 0;
+}
+/*
+ * open() handler of the statistics file: hands stat_show() to the
+ * single-record seq_file helper, passing along the inode's private pointer.
+ */
+static int stat_open(struct inode *inode, struct file *file)
+{
+        void *private = inode->i_private;
+        return single_open(file, stat_show, private);
+}
+/*
+ * File operations for the debugfs statistics file.  Opening goes through
+ * stat_open(); reading, seeking and release use the stock seq_file helpers
+ * that pair with single_open().
+ */
+static const struct file_operations stat_fops = {
+        .open = stat_open,
+        .read = seq_read,
+        .llseek = seq_lseek,
+        .release = single_release,
+};
+/*
+ * Allocate the zeroed statistics record of @sbi, link it onto the global
+ * f2fs_stat_list and seed it with the area sizes found in the raw
+ * superblock.
+ *
+ * Returns 0 on success or -ENOMEM when the allocation fails (in which case
+ * sbi->stat_info is left NULL).
+ */
+static int init_stats(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+        struct f2fs_stat_info *si = kzalloc(sizeof(*si), GFP_KERNEL);
+        sbi->stat_info = si;
+        if (si == NULL)
+                return -ENOMEM;
+        mutex_init(&si->stat_lock);
+        list_add_tail(&si->stat_list, &f2fs_stat_list);
+        /* per-area segment counts, converted from their little-endian form */
+        si->all_area_segs = le32_to_cpu(raw_super->segment_count);
+        si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat);
+        si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit);
+        si->ssa_area_segs = le32_to_cpu(raw_super->segment_count_ssa);
+        si->main_area_segs = le32_to_cpu(raw_super->segment_count_main);
+        /* the zone count is derived: sections divided by sections-per-zone */
+        si->main_area_sections = le32_to_cpu(raw_super->section_count);
+        si->main_area_zones = si->main_area_sections /
+                                le32_to_cpu(raw_super->secs_per_zone);
+        /* set the back pointer last */
+        si->sbi = sbi;
+        return 0;
+}
+/*
+ * Set up statistics for the filesystem instance @sbi.
+ *
+ * Allocates and registers the per-instance record via init_stats() and makes
+ * sure the shared debugfs entries exist.  Returns 0 on success or the error
+ * from init_stats().  Failure to create the debugfs entries is not treated
+ * as an error.
+ */
+int f2fs_build_stats(struct f2fs_sb_info *sbi)
+{
+        int retval;
+        retval = init_stats(sbi);
+        if (retval)
+                return retval;
+        /*
+         * The "f2fs" directory and its single "status" file are shared by all
+         * instances, so create both only once.  Re-creating the file on every
+         * call would fail with a name clash from the second call onwards, and
+         * creating it after a failed directory creation (NULL parent) would
+         * drop a stray "status" file into the debugfs top level.
+         */
+        if (!debugfs_root) {
+                debugfs_root = debugfs_create_dir("f2fs", NULL);
+                if (debugfs_root)
+                        debugfs_create_file("status", S_IRUGO, debugfs_root,
+                                                NULL, &stat_fops);
+        }
+        return 0;
+}
+/*
+ * Tear down the statistics record of @sbi: unlink it from the global list,
+ * clear its back pointer under stat_lock, then free it.
+ */
+void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_stat_info *si = sbi->stat_info;
+        list_del(&si->stat_list);
+        /* taking the lock waits for any holder of stat_lock to finish */
+        mutex_lock(&si->stat_lock);
+        si->sbi = NULL;
+        mutex_unlock(&si->stat_lock);
+        kfree(si);
+}
+/*
+ * Remove the shared debugfs directory together with everything below it and
+ * forget the cached dentry so that a later f2fs_build_stats() recreates it.
+ */
+void destroy_root_stats(void)
+{
+        debugfs_remove_recursive(debugfs_root);
+        debugfs_root = NULL;
+}
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
new file mode 100644
index 000000000000..b4e24f32b54e
--- /dev/null
+++ b/fs/f2fs/dir.c
@@ -0,0 +1,672 @@
+/*
+ * fs/f2fs/dir.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include "f2fs.h"
+#include "acl.h"
+/* Number of page-sized blocks needed to hold the directory's i_size bytes. */
+static unsigned long dir_blocks(struct inode *inode)
+{
+        unsigned long long rounded = i_size_read(inode) + PAGE_CACHE_SIZE - 1;
+        return rounded >> PAGE_CACHE_SHIFT;
+}
+/*
+ * Number of hash buckets at @level: doubles with every level until half of
+ * MAX_DIR_HASH_DEPTH is reached, then stays constant.
+ */
+static unsigned int dir_buckets(unsigned int level)
+{
+        if (level >= MAX_DIR_HASH_DEPTH / 2)
+                return 1 << ((MAX_DIR_HASH_DEPTH / 2) - 1);
+        return 1 << level;
+}
+/*
+ * Number of dentry blocks per bucket at @level: 2 in the lower half of the
+ * hash levels, 4 from MAX_DIR_HASH_DEPTH / 2 upwards.
+ */
+static unsigned int bucket_blocks(unsigned int level)
+{
+        return (level < MAX_DIR_HASH_DEPTH / 2) ? 2 : 4;
+}
+/*
+ * Maps the file type stored in a directory entry (F2FS_FT_*) to the DT_*
+ * value passed to filldir by f2fs_readdir().
+ */
+static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = {
+        [F2FS_FT_UNKNOWN]       = DT_UNKNOWN,
+        [F2FS_FT_REG_FILE]      = DT_REG,
+        [F2FS_FT_DIR]           = DT_DIR,
+        [F2FS_FT_CHRDEV]        = DT_CHR,
+        [F2FS_FT_BLKDEV]        = DT_BLK,
+        [F2FS_FT_FIFO]          = DT_FIFO,
+        [F2FS_FT_SOCK]          = DT_SOCK,
+        [F2FS_FT_SYMLINK]       = DT_LNK,
+};
+/* Shift that moves the S_IFMT file-type bits of a mode down to bit 0. */
+#define S_SHIFT 12
+/*
+ * Maps the file-type bits of an inode mode, shifted down by S_SHIFT, to the
+ * F2FS_FT_* value stored in a directory entry.  Slots without an initialiser
+ * are zero.
+ */
+static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
+        [S_IFREG >> S_SHIFT]    = F2FS_FT_REG_FILE,
+        [S_IFDIR >> S_SHIFT]    = F2FS_FT_DIR,
+        [S_IFCHR >> S_SHIFT]    = F2FS_FT_CHRDEV,
+        [S_IFBLK >> S_SHIFT]    = F2FS_FT_BLKDEV,
+        [S_IFIFO >> S_SHIFT]    = F2FS_FT_FIFO,
+        [S_IFSOCK >> S_SHIFT]   = F2FS_FT_SOCK,
+        [S_IFLNK >> S_SHIFT]    = F2FS_FT_SYMLINK,
+};
+/* Record in @de the F2FS_FT_* type that corresponds to @inode's mode. */
+static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode)
+{
+        de->file_type = f2fs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
+}
+/*
+ * Block index, within the directory file, of the first block of bucket @idx
+ * at hash level @level: all blocks of the lower levels plus the blocks of
+ * the preceding buckets on this level.
+ */
+static unsigned long dir_block_index(unsigned int level, unsigned int idx)
+{
+        unsigned long first_block = 0;
+        unsigned long lvl = 0;
+        while (lvl < level) {
+                first_block += dir_buckets(lvl) * bucket_blocks(lvl);
+                lvl++;
+        }
+        first_block += idx * bucket_blocks(level);
+        return first_block;
+}
+/*
+ * Cheap pre-filter before comparing names byte by byte: @de can only match
+ * if both its name length and its hash code agree with the wanted name.
+ */
+static bool early_match_name(const char *name, int namelen,
+                        f2fs_hash_t namehash, struct f2fs_dir_entry *de)
+{
+        return le16_to_cpu(de->name_len) == namelen &&
+                        de->hash_code == namehash;
+}
+/*
+ * Scan one dentry block for an entry named @name.
+ *
+ * @dentry_page: page holding the dentry block; it is kmap()ed here
+ * @name/@namelen: wanted name
+ * @max_slots:   in/out; raised to the largest run of free slots seen between
+ *               an entry and the next one (or the end of the block)
+ * @namehash:    hash of @name, used to reject candidates cheaply
+ * @res_page:    set to @dentry_page when the entry is found
+ *
+ * Returns the entry with the page left mapped, or NULL after unmapping the
+ * page again.
+ */
+static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
+                        const char *name, int namelen, int *max_slots,
+                        f2fs_hash_t namehash, struct page **res_page)
+{
+        struct f2fs_dir_entry *de;
+        unsigned long bit_pos, end_pos, next_pos;
+        struct f2fs_dentry_block *dentry_blk = kmap(dentry_page);
+        int slots;
+        bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+                                        NR_DENTRY_IN_BLOCK, 0);
+        while (bit_pos < NR_DENTRY_IN_BLOCK) {
+                de = &dentry_blk->dentry[bit_pos];
+                slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+                if (early_match_name(name, namelen, namehash, de)) {
+                        if (!memcmp(dentry_blk->filename[bit_pos],
+                                                        name, namelen)) {
+                                *res_page = dentry_page;
+                                goto found;
+                        }
+                }
+                /*
+                 * An entry that claims no slots (e.g. a damaged entry with a
+                 * zero name length) would make the search below return the
+                 * very same bit again and spin forever; always step forward
+                 * by at least one slot.
+                 */
+                if (slots < 1)
+                        slots = 1;
+                next_pos = bit_pos + slots;
+                /*
+                 * Nothing left to scan once the entry reaches the end of the
+                 * block.  This also keeps an oversized name length from
+                 * making the unsigned gap computation below wrap around.
+                 */
+                if (next_pos >= NR_DENTRY_IN_BLOCK)
+                        break;
+                bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+                                NR_DENTRY_IN_BLOCK, next_pos);
+                if (bit_pos >= NR_DENTRY_IN_BLOCK)
+                        end_pos = NR_DENTRY_IN_BLOCK;
+                else
+                        end_pos = bit_pos;
+                /* remember the widest free gap for the caller */
+                if (*max_slots < end_pos - next_pos)
+                        *max_slots = end_pos - next_pos;
+        }
+        de = NULL;
+        kunmap(dentry_page);
+found:
+        return de;
+}
+/*
+ * Look for @name in the one hash bucket that @namehash selects at @level.
+ *
+ * Every block of the bucket is searched with find_in_block().  On a hit the
+ * entry is returned, *res_page has been set by find_in_block() and the page
+ * reference is kept for the caller.  On a miss NULL is returned.
+ *
+ * Side effect: if nothing matched but the bucket appeared to have room for
+ * the name, the hash and level are cached in the directory's in-memory inode
+ * (chash/clevel); f2fs_add_link() reads them to choose its starting level.
+ */
+static struct f2fs_dir_entry *find_in_level(struct inode *dir,
+                unsigned int level, const char *name, int namelen,
+                        f2fs_hash_t namehash, struct page **res_page)
+{
+        /* number of dentry slots an entry for this name would occupy */
+        int s = GET_DENTRY_SLOTS(namelen);
+        unsigned int nbucket, nblock;
+        unsigned int bidx, end_block;
+        struct page *dentry_page;
+        struct f2fs_dir_entry *de = NULL;
+        bool room = false;
+        /* widest free gap seen so far; carried across the bucket's blocks */
+        int max_slots = 0;
+        BUG_ON(level > MAX_DIR_HASH_DEPTH);
+        nbucket = dir_buckets(level);
+        nblock = bucket_blocks(level);
+        bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket);
+        end_block = bidx + nblock;
+        for (; bidx < end_block; bidx++) {
+                /* no need to allocate new dentry pages to all the indices */
+                dentry_page = find_data_page(dir, bidx);
+                if (IS_ERR(dentry_page)) {
+                        /* any failure to get the block is counted as room */
+                        room = true;
+                        continue;
+                }
+                de = find_in_block(dentry_page, name, namelen,
+                                        &max_slots, namehash, res_page);
+                if (de)
+                        break;
+                if (max_slots >= s)
+                        room = true;
+                f2fs_put_page(dentry_page, 0);
+        }
+        if (!de && room && F2FS_I(dir)->chash != namehash) {
+                F2FS_I(dir)->chash = namehash;
+                F2FS_I(dir)->clevel = level;
+        }
+        return de;
+}
+/*
+ * Find an entry in the specified directory with the wanted name.
+ * It returns the page where the entry was found (as a parameter - res_page),
+ * and the entry itself. Page is returned mapped and unlocked.
+ * Entry is guaranteed to be valid.
+ *
+ * When no entry is found NULL is returned and *res_page is NULL; this also
+ * holds for a directory without any blocks.
+ *
+ * On a miss the hash of the name and the last level searched are cached in
+ * the directory's in-memory inode (chash/clevel), unless find_in_level()
+ * already cached that hash.
+ */
+struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
+                        struct qstr *child, struct page **res_page)
+{
+        const char *name = child->name;
+        int namelen = child->len;
+        unsigned long npages = dir_blocks(dir);
+        struct f2fs_dir_entry *de = NULL;
+        f2fs_hash_t name_hash;
+        unsigned int max_depth;
+        unsigned int level;
+        /*
+         * Initialise the out-parameter before the first return so that no
+         * path hands an indeterminate page pointer back to the caller.
+         */
+        *res_page = NULL;
+        if (npages == 0)
+                return NULL;
+        name_hash = f2fs_dentry_hash(name, namelen);
+        max_depth = F2FS_I(dir)->i_current_depth;
+        for (level = 0; level < max_depth; level++) {
+                de = find_in_level(dir, level, name,
+                                namelen, name_hash, res_page);
+                if (de)
+                        break;
+        }
+        if (!de && F2FS_I(dir)->chash != name_hash) {
+                F2FS_I(dir)->chash = name_hash;
+                F2FS_I(dir)->clevel = level - 1;
+        }
+        return de;
+}
+/*
+ * Return the second entry of @dir's first dentry block (f2fs_make_empty()
+ * stores ".." there).  The block's page is handed back through @p, still
+ * kmap()ed but unlocked.  Returns NULL if the page cannot be obtained.
+ */
+struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
+{
+        struct f2fs_dentry_block *first_blk;
+        struct page *first_page = get_lock_data_page(dir, 0);
+        if (IS_ERR(first_page))
+                return NULL;
+        first_blk = kmap(first_page);
+        *p = first_page;
+        unlock_page(first_page);
+        return &first_blk->dentry[1];
+}
+/*
+ * Look up @qstr in @dir and return the inode number stored in its entry,
+ * or 0 when there is no such entry.
+ */
+ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr)
+{
+        struct page *page;
+        struct f2fs_dir_entry *de = f2fs_find_entry(dir, qstr, &page);
+        ino_t ino;
+        if (!de)
+                return 0;
+        ino = le32_to_cpu(de->ino);
+        /* undo the mapping and page reference taken by the lookup */
+        kunmap(page);
+        f2fs_put_page(page, 0);
+        return ino;
+}
+/*
+ * Repoint the existing directory entry @de, which lives in @page of @dir,
+ * at @inode.
+ *
+ * The entry's inode number and file type are rewritten, the page is marked
+ * dirty, the directory's mtime/ctime are refreshed and @inode's parent inode
+ * number is updated.  @page is unmapped with kunmap() and released with
+ * f2fs_put_page(page, 1) here, so the caller must not use it afterwards.
+ * The whole update runs under the DENTRY_OPS operation lock.
+ */
+void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
+                struct page *page, struct inode *inode)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+        mutex_lock_op(sbi, DENTRY_OPS);
+        lock_page(page);
+        /* do not modify the block while it is under writeback */
+        wait_on_page_writeback(page);
+        de->ino = cpu_to_le32(inode->i_ino);
+        set_de_type(de, inode);
+        kunmap(page);
+        set_page_dirty(page);
+        dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+        mark_inode_dirty(dir);
+        /* update parent inode number before releasing dentry page */
+        F2FS_I(inode)->i_pino = dir->i_ino;
+        f2fs_put_page(page, 1);
+        mutex_unlock_op(sbi, DENTRY_OPS);
+}
+/*
+ * Copy the name of @dentry (length and bytes) into the inode block held in
+ * @ipage and mark the page dirty.  An error pointer in @ipage is ignored.
+ */
+void init_dent_inode(struct dentry *dentry, struct page *ipage)
+{
+        struct f2fs_node *node;
+        if (IS_ERR(ipage))
+                return;
+        /* do not touch the node block while it is under writeback */
+        wait_on_page_writeback(ipage);
+        node = (struct f2fs_node *)page_address(ipage);
+        /* copy dentry info. to this inode page */
+        node->i.i_namelen = cpu_to_le32(dentry->d_name.len);
+        memcpy(node->i.i_name, dentry->d_name.name, dentry->d_name.len);
+        set_page_dirty(ipage);
+}
+/*
+ * Prepare @inode for being linked under the name in @dentry.
+ *
+ * A new inode (FI_NEW_INODE) gets its inode page, an empty directory block
+ * if it is a directory, and its initial ACL; if either of the latter two
+ * fails the inode page is removed again.  An existing inode only has the
+ * dentry name copied into its inode page.  With FI_INC_LINK set the link
+ * count is bumped and the inode written.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int init_inode_metadata(struct inode *inode, struct dentry *dentry)
+{
+        struct inode *dir = dentry->d_parent->d_inode;
+        if (!is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
+                struct page *ipage;
+                ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
+                if (IS_ERR(ipage))
+                        return PTR_ERR(ipage);
+                init_dent_inode(dentry, ipage);
+                f2fs_put_page(ipage, 1);
+        } else {
+                int err = new_inode_page(inode, dentry);
+                if (err)
+                        return err;
+                if (S_ISDIR(inode->i_mode))
+                        err = f2fs_make_empty(inode, dir);
+                if (!err)
+                        err = f2fs_init_acl(inode, dir);
+                if (err) {
+                        /* single unwind point for both failures above */
+                        remove_inode_page(inode);
+                        return err;
+                }
+        }
+        if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
+                inc_nlink(inode);
+                f2fs_write_inode(inode, NULL);
+        }
+        return 0;
+}
+/*
+ * Bring the parent directory @dir up to date after an entry for @inode has
+ * been added: bump its link count for a new sub-directory, refresh its
+ * timestamps and record @current_depth.  The directory inode is written
+ * immediately when its link count or depth changed, otherwise only marked
+ * dirty.  The FI_NEW_INODE and FI_INC_LINK flags of @inode are cleared.
+ */
+static void update_parent_metadata(struct inode *dir, struct inode *inode,
+                                                unsigned int current_depth)
+{
+        bool write_dir_now = false;
+        if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
+                if (S_ISDIR(inode->i_mode)) {
+                        /* account for the new child's ".." entry */
+                        inc_nlink(dir);
+                        write_dir_now = true;
+                }
+                clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
+        }
+        dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+        if (F2FS_I(dir)->i_current_depth != current_depth) {
+                F2FS_I(dir)->i_current_depth = current_depth;
+                write_dir_now = true;
+        }
+        if (!write_dir_now)
+                mark_inode_dirty(dir);
+        else
+                f2fs_write_inode(dir, NULL);
+        if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
+                clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+}
+/*
+ * Find the first run of at least @slots consecutive free slots in the
+ * dentry bitmap of @dentry_blk.
+ *
+ * Returns the index of the first slot of that run, or NR_DENTRY_IN_BLOCK
+ * when the block has no run that is long enough.
+ */
+static int room_for_filename(struct f2fs_dentry_block *dentry_blk, int slots)
+{
+        int search_from = 0;
+        int free_start, free_end;
+        for (;;) {
+                free_start = find_next_zero_bit_le(&dentry_blk->dentry_bitmap,
+                                                NR_DENTRY_IN_BLOCK,
+                                                search_from);
+                if (free_start >= NR_DENTRY_IN_BLOCK)
+                        break;
+                /* the run of free slots ends at the next used slot */
+                free_end = find_next_bit_le(&dentry_blk->dentry_bitmap,
+                                                NR_DENTRY_IN_BLOCK,
+                                                free_start);
+                if (free_end - free_start >= slots)
+                        return free_start;
+                /* slot free_end is in use, continue right behind it */
+                search_from = free_end + 1;
+                if (search_from >= NR_DENTRY_IN_BLOCK)
+                        break;
+        }
+        return NR_DENTRY_IN_BLOCK;
+}
+/*
+ * Add a directory entry for @dentry, pointing at @inode, to the parent
+ * directory of @dentry.
+ *
+ * Starting at level 0 (or at the level cached by a preceding lookup of the
+ * same hash) the bucket selected by the name hash is searched block by block
+ * for a run of free slots large enough for the name; if a level has no room
+ * the next level is tried, growing the directory depth when needed.
+ *
+ * Returns 0 on success, -ENOSPC once the depth has reached
+ * MAX_DIR_HASH_DEPTH, or the error reported by get_new_data_page() or
+ * init_inode_metadata().
+ */
+int f2fs_add_link(struct dentry *dentry, struct inode *inode)
+{
+        unsigned int bit_pos;
+        unsigned int level;
+        unsigned int current_depth;
+        unsigned long bidx, block;
+        f2fs_hash_t dentry_hash;
+        struct f2fs_dir_entry *de;
+        unsigned int nbucket, nblock;
+        struct inode *dir = dentry->d_parent->d_inode;
+        struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+        const char *name = dentry->d_name.name;
+        int namelen = dentry->d_name.len;
+        struct page *dentry_page = NULL;
+        struct f2fs_dentry_block *dentry_blk = NULL;
+        int slots = GET_DENTRY_SLOTS(namelen);
+        int err = 0;
+        int i;
+        dentry_hash = f2fs_dentry_hash(name, dentry->d_name.len);
+        level = 0;
+        current_depth = F2FS_I(dir)->i_current_depth;
+        /*
+         * The lookup helpers cache the hash of a name they failed to find
+         * together with a level; use that level as the starting point and
+         * consume the hint.
+         */
+        if (F2FS_I(dir)->chash == dentry_hash) {
+                level = F2FS_I(dir)->clevel;
+                F2FS_I(dir)->chash = 0;
+        }
+start:
+        if (current_depth == MAX_DIR_HASH_DEPTH)
+                return -ENOSPC;
+        /* Increase the depth, if required */
+        if (level == current_depth)
+                ++current_depth;
+        nbucket = dir_buckets(level);
+        nblock = bucket_blocks(level);
+        bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
+        for (block = bidx; block <= (bidx + nblock - 1); block++) {
+                /*
+                 * The operation lock is taken per candidate block.  When a
+                 * block has room we jump out with the lock held and the page
+                 * mapped; both are released at the "fail" label.
+                 */
+                mutex_lock_op(sbi, DENTRY_OPS);
+                dentry_page = get_new_data_page(dir, block, true);
+                if (IS_ERR(dentry_page)) {
+                        mutex_unlock_op(sbi, DENTRY_OPS);
+                        return PTR_ERR(dentry_page);
+                }
+                dentry_blk = kmap(dentry_page);
+                bit_pos = room_for_filename(dentry_blk, slots);
+                if (bit_pos < NR_DENTRY_IN_BLOCK)
+                        goto add_dentry;
+                kunmap(dentry_page);
+                f2fs_put_page(dentry_page, 1);
+                mutex_unlock_op(sbi, DENTRY_OPS);
+        }
+        /* Move to next level to find the empty slot for new dentry */
+        ++level;
+        goto start;
+add_dentry:
+        err = init_inode_metadata(inode, dentry);
+        if (err)
+                goto fail;
+        /* do not modify the block while it is under writeback */
+        wait_on_page_writeback(dentry_page);
+        de = &dentry_blk->dentry[bit_pos];
+        de->hash_code = dentry_hash;
+        de->name_len = cpu_to_le16(namelen);
+        memcpy(dentry_blk->filename[bit_pos], name, namelen);
+        de->ino = cpu_to_le32(inode->i_ino);
+        set_de_type(de, inode);
+        /* claim every slot the name occupies */
+        for (i = 0; i < slots; i++)
+                test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+        set_page_dirty(dentry_page);
+        update_parent_metadata(dir, inode, current_depth);
+        /* update parent inode number before releasing dentry page */
+        F2FS_I(inode)->i_pino = dir->i_ino;
+fail:
+        /* common exit for success and failure once a block was selected */
+        kunmap(dentry_page);
+        f2fs_put_page(dentry_page, 1);
+        mutex_unlock_op(sbi, DENTRY_OPS);
+        return err;
+}
+/*
+ * Remove the directory entry @dentry, located in @page, from its directory.
+ *
+ * Only the slots of the entry are cleared in the dentry bitmap; the entry
+ * and its name bytes are left in place.  @page is expected to be mapped as
+ * returned by f2fs_find_entry(); it is unmapped and released here.
+ *
+ * @inode is the inode the entry referred to and may be NULL.  If given, its
+ * link count is dropped (twice for a directory, whose size is also set to
+ * zero) and it is added to the orphan list once the count reaches zero.
+ * A dentry block left without any entry is truncated from the directory.
+ */
+void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
+                                                struct inode *inode)
+{
+        struct  f2fs_dentry_block *dentry_blk;
+        unsigned int bit_pos;
+        struct address_space *mapping = page->mapping;
+        struct inode *dir = mapping->host;
+        struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+        int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
+        void *kaddr = page_address(page);
+        int i;
+        mutex_lock_op(sbi, DENTRY_OPS);
+        lock_page(page);
+        /* do not modify the block while it is under writeback */
+        wait_on_page_writeback(page);
+        dentry_blk = (struct f2fs_dentry_block *)kaddr;
+        /* slot index of the entry = its offset in the block's dentry array */
+        bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry;
+        for (i = 0; i < slots; i++)
+                test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+        /* Let's check and deallocate this dentry page */
+        bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+                        NR_DENTRY_IN_BLOCK,
+                        0);
+        kunmap(page); /* kunmap - pair of f2fs_find_entry */
+        set_page_dirty(page);
+        dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+        if (inode && S_ISDIR(inode->i_mode)) {
+                /* the removed sub-directory no longer links to its parent */
+                drop_nlink(dir);
+                f2fs_write_inode(dir, NULL);
+        } else {
+                mark_inode_dirty(dir);
+        }
+        if (inode) {
+                inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+                drop_nlink(inode);
+                if (S_ISDIR(inode->i_mode)) {
+                        /* second drop for a directory */
+                        drop_nlink(inode);
+                        i_size_write(inode, 0);
+                }
+                f2fs_write_inode(inode, NULL);
+                if (inode->i_nlink == 0)
+                        add_orphan_inode(sbi, inode->i_ino);
+        }
+        /* no bit left set: the block holds no entry any more, drop it */
+        if (bit_pos == NR_DENTRY_IN_BLOCK) {
+                truncate_hole(dir, page->index, page->index + 1);
+                clear_page_dirty_for_io(page);
+                ClearPageUptodate(page);
+                dec_page_count(sbi, F2FS_DIRTY_DENTS);
+                inode_dec_dirty_dents(dir);
+        }
+        f2fs_put_page(page, 1);
+        mutex_unlock_op(sbi, DENTRY_OPS);
+}
+/*
+ * Populate block 0 of the new directory @inode with its two initial
+ * entries: "." referring to @inode itself in slot 0 and ".." referring to
+ * @parent in slot 1.  Both entries take their file type from @inode.
+ *
+ * Returns 0 on success or the error from get_new_data_page().
+ */
+int f2fs_make_empty(struct inode *inode, struct inode *parent)
+{
+        struct f2fs_dentry_block *blk;
+        struct f2fs_dir_entry *dot, *dotdot;
+        struct page *first_page;
+        void *kaddr;
+        first_page = get_new_data_page(inode, 0, true);
+        if (IS_ERR(first_page))
+                return PTR_ERR(first_page);
+        kaddr = kmap_atomic(first_page);
+        blk = (struct f2fs_dentry_block *)kaddr;
+        dot = &blk->dentry[0];
+        dotdot = &blk->dentry[1];
+        /* "." */
+        dot->hash_code = 0;
+        dot->name_len = cpu_to_le16(1);
+        dot->ino = cpu_to_le32(inode->i_ino);
+        memcpy(blk->filename[0], ".", 1);
+        set_de_type(dot, inode);
+        /* ".." */
+        dotdot->hash_code = 0;
+        dotdot->name_len = cpu_to_le16(2);
+        dotdot->ino = cpu_to_le32(parent->i_ino);
+        memcpy(blk->filename[1], "..", 2);
+        set_de_type(dotdot, inode);
+        /* mark both slots as used */
+        test_and_set_bit_le(0, &blk->dentry_bitmap);
+        test_and_set_bit_le(1, &blk->dentry_bitmap);
+        kunmap_atomic(kaddr);
+        set_page_dirty(first_page);
+        f2fs_put_page(first_page, 1);
+        return 0;
+}
+/*
+ * Tell whether @dir holds no entry other than the two at the start of its
+ * first block.  Blocks that do not exist (-ENOENT) are skipped; any other
+ * failure to read a block makes the directory count as not empty.
+ */
+bool f2fs_empty_dir(struct inode *dir)
+{
+        unsigned long nblock = dir_blocks(dir);
+        unsigned long bidx;
+        for (bidx = 0; bidx < nblock; bidx++) {
+                struct f2fs_dentry_block *blk;
+                struct page *blk_page;
+                unsigned int used;
+                void *kaddr;
+                blk_page = get_lock_data_page(dir, bidx);
+                if (IS_ERR(blk_page)) {
+                        if (PTR_ERR(blk_page) != -ENOENT)
+                                return false;
+                        continue;
+                }
+                kaddr = kmap_atomic(blk_page);
+                blk = (struct f2fs_dentry_block *)kaddr;
+                /* in block 0 the first two slots are not counted */
+                used = find_next_bit_le(&blk->dentry_bitmap,
+                                                NR_DENTRY_IN_BLOCK,
+                                                bidx == 0 ? 2 : 0);
+                kunmap_atomic(kaddr);
+                f2fs_put_page(blk_page, 1);
+                if (used < NR_DENTRY_IN_BLOCK)
+                        return false;
+        }
+        return true;
+}
+/*
+ * readdir implementation for f2fs directories.
+ *
+ * The file position encodes a slot: block number times NR_DENTRY_IN_BLOCK
+ * plus the slot index inside that block.  Starting from that slot every
+ * used entry is passed to @filldir until it reports that its buffer is full
+ * or the directory ends.  Blocks that cannot be read are skipped.
+ *
+ * Always returns 0.
+ */
+static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir)
+{
+        unsigned long pos = file->f_pos;
+        struct inode *inode = file->f_dentry->d_inode;
+        unsigned long npages = dir_blocks(inode);
+        unsigned int bit_pos = 0, start_bit_pos = 0;
+        int over = 0;
+        struct f2fs_dentry_block *dentry_blk = NULL;
+        struct f2fs_dir_entry *de = NULL;
+        struct page *dentry_page = NULL;
+        unsigned int n = 0;
+        unsigned char d_type = DT_UNKNOWN;
+        int slots;
+        bit_pos = (pos % NR_DENTRY_IN_BLOCK);
+        n = (pos / NR_DENTRY_IN_BLOCK);
+        for ( ; n < npages; n++) {
+                dentry_page = get_lock_data_page(inode, n);
+                if (IS_ERR(dentry_page))
+                        continue;
+                start_bit_pos = bit_pos;
+                dentry_blk = kmap(dentry_page);
+                while (bit_pos < NR_DENTRY_IN_BLOCK) {
+                        d_type = DT_UNKNOWN;
+                        bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+                                                        NR_DENTRY_IN_BLOCK,
+                                                        bit_pos);
+                        if (bit_pos >= NR_DENTRY_IN_BLOCK)
+                                break;
+                        de = &dentry_blk->dentry[bit_pos];
+                        /*
+                         * An entry with a zero name length (damaged block)
+                         * occupies no slots, so the position would never
+                         * advance and the same empty name would be emitted
+                         * forever.  Skip the slot instead.
+                         */
+                        if (!de->name_len) {
+                                bit_pos++;
+                                continue;
+                        }
+                        /* unknown on-disk types are reported as DT_UNKNOWN */
+                        if (de->file_type < F2FS_FT_MAX)
+                                d_type = f2fs_filetype_table[de->file_type];
+                        over = filldir(dirent,
+                                        dentry_blk->filename[bit_pos],
+                                        le16_to_cpu(de->name_len),
+                                        (n * NR_DENTRY_IN_BLOCK) + bit_pos,
+                                        le32_to_cpu(de->ino), d_type);
+                        if (over) {
+                                /* resume at this very entry next time */
+                                file->f_pos += bit_pos - start_bit_pos;
+                                goto success;
+                        }
+                        slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+                        bit_pos += slots;
+                }
+                /* block finished: continue at slot 0 of the next block */
+                bit_pos = 0;
+                file->f_pos = (n + 1) * NR_DENTRY_IN_BLOCK;
+                kunmap(dentry_page);
+                f2fs_put_page(dentry_page, 1);
+                dentry_page = NULL;
+        }
+success:
+        /* only set when we left the loop in the middle of a block */
+        if (dentry_page && !IS_ERR(dentry_page)) {
+                kunmap(dentry_page);
+                f2fs_put_page(dentry_page, 1);
+        }
+        return 0;
+}
+/*
+ * File operations of f2fs directories.  Only readdir is implemented in this
+ * file; llseek and read use generic VFS helpers, fsync and ioctl use f2fs
+ * functions defined elsewhere.
+ */
+const struct file_operations f2fs_dir_operations = {
+        .llseek         = generic_file_llseek,
+        .read           = generic_read_dir,
+        .readdir        = f2fs_readdir,
+        .fsync          = f2fs_sync_file,
+        .unlocked_ioctl = f2fs_ioctl,
+};
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
new file mode 100644
index 000000000000..a18d63db2fb6
--- /dev/null
+++ b/fs/f2fs/f2fs.h
@@ -0,0 +1,1083 @@
+/*
+ * fs/f2fs/f2fs.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef _LINUX_F2FS_H
+#define _LINUX_F2FS_H
+#include <linux/types.h>
+#include <linux/page-flags.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/crc32.h>
+#include <linux/magic.h>
+/*
+ * Mount option bits kept in sbi->mount_opt.opt; use {clear,set,test}_opt().
+ */
+#define F2FS_MOUNT_BG_GC                0x00000001
+#define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000002
+#define F2FS_MOUNT_DISCARD              0x00000004
+#define F2FS_MOUNT_NOHEAP               0x00000008
+#define F2FS_MOUNT_XATTR_USER           0x00000010
+#define F2FS_MOUNT_POSIX_ACL            0x00000020
+#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040
+#define clear_opt(sbi, option)  ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option)
+#define set_opt(sbi, option)    ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option)
+#define test_opt(sbi, option)   ((sbi)->mount_opt.opt & F2FS_MOUNT_##option)
+#define ver_after(a, b) (typecheck(unsigned long long, a) &&            \
+                typecheck(unsigned long long, b) &&                     \
+                ((long long)((a) - (b)) > 0))
+typedef u64 block_t;
+typedef u32 nid_t;
+struct f2fs_mount_info {
+        unsigned int    opt;
+};
+static inline __u32 f2fs_crc32(void *buff, size_t len)
+{
+        return crc32_le(F2FS_SUPER_MAGIC, buff, len);
+}
+static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size)
+{
+        return f2fs_crc32(buff, buff_size) == blk_crc;
+}
+/*
+ * For checkpoint manager
+ */
+enum {
+        NAT_BITMAP,
+        SIT_BITMAP
+};
+/* for the list of orphan inodes */
+struct orphan_inode_entry {
+        struct list_head list;  /* list head */
+        nid_t ino;              /* inode number */
+};
+/* for the list of directory inodes */
+struct dir_inode_entry {
+        struct list_head list;  /* list head */
+        struct inode *inode;    /* vfs inode pointer */
+};
+/* for the list of fsync inodes, used only during recovery */
+struct fsync_inode_entry {
+        struct list_head list;  /* list head */
+        struct inode *inode;    /* vfs inode pointer */
+        block_t blkaddr;        /* block address locating the last inode */
+};
+#define nats_in_cursum(sum)      (le16_to_cpu((sum)->n_nats))   /* cpu-endian */
+#define sits_in_cursum(sum)      (le16_to_cpu((sum)->n_sits))   /* cpu-endian */
+#define nat_in_journal(sum, i)   ((sum)->nat_j.entries[i].ne)   /* lvalue */
+#define nid_in_journal(sum, i)   ((sum)->nat_j.entries[i].nid)  /* lvalue */
+#define sit_in_journal(sum, i)   ((sum)->sit_j.entries[i].se)   /* lvalue */
+#define segno_in_journal(sum, i) ((sum)->sit_j.entries[i].segno) /* lvalue */
+static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
+{       /* add i to rs->n_nats and report the value it had before */
+        int before = nats_in_cursum(rs);
+        rs->n_nats = cpu_to_le16(before + i);   /* stored little-endian */
+        return before;                          /* value prior to the update */
+}
+static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
+{       /* add i to rs->n_sits and report the value it had before */
+        int before = sits_in_cursum(rs);
+        rs->n_sits = cpu_to_le16(before + i);   /* stored little-endian */
+        return before;                          /* value prior to the update */
+}
+/*
+ * For INODE and NODE manager
+ */
+#define XATTR_NODE_OFFSET       (-1)    /*
+                                         * store xattrs to one node block per
+                                         * file keeping -1 as its node offset to
+                                         * distinguish from index node blocks.
+                                         */
+#define RDONLY_NODE             1       /*
+                                         * specify a read-only mode when getting
+                                         * a node block. 0 is read-write mode.
+                                         * used by get_dnode_of_data().
+                                         */
+#define F2FS_LINK_MAX           32000   /* maximum link count per file */
+/* for in-memory extent cache entry */
+struct extent_info {
+        rwlock_t ext_lock;      /* rwlock for consistency */
+        unsigned int fofs;      /* start offset in a file */
+        u32 blk_addr;           /* start block address of the extent */
+        unsigned int len;       /* length of the extent */
+};
+/*
+ * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
+ */
+#define FADVISE_COLD_BIT        0x01
+struct f2fs_inode_info {
+        struct inode vfs_inode;         /* serve a vfs inode */
+        unsigned long i_flags;          /* keep inode flags for ioctl */
+        unsigned char i_advise;         /* file attribute hints (FADVISE_*) */
+        unsigned int i_current_depth;   /* use only in directory structure */
+        unsigned int i_pino;            /* parent inode number */
+        umode_t i_acl_mode;             /* keep file acl mode temporarily */
+        /* Use below internally in f2fs */
+        unsigned long flags;            /* per-file FI_* bit flags */
+        unsigned long long data_version;/* latest version of data for fsync */
+        atomic_t dirty_dents;           /* # of dirty dentry pages */
+        f2fs_hash_t chash;              /* hash value of given file name */
+        unsigned int clevel;            /* maximum level of given file name */
+        nid_t i_xattr_nid;              /* node id that contains xattrs */
+        struct extent_info ext;         /* in-memory extent cache entry */
+};
+static inline void get_extent_info(struct extent_info *ext,
+                                        struct f2fs_extent i_ext)
+{       /* load the raw extent i_ext (passed by value) into ext */
+        write_lock(&ext->ext_lock);     /* writer side: ext is modified */
+        ext->fofs = le32_to_cpu(i_ext.fofs);    /* little-endian -> cpu order */
+        ext->blk_addr = le32_to_cpu(i_ext.blk_addr);
+        ext->len = le32_to_cpu(i_ext.len);
+        write_unlock(&ext->ext_lock);
+}
+static inline void set_raw_extent(struct extent_info *ext,
+                                        struct f2fs_extent *i_ext)
+{       /* store the in-memory extent ext into the raw extent *i_ext */
+        read_lock(&ext->ext_lock);      /* reader side: ext is only read */
+        i_ext->fofs = cpu_to_le32(ext->fofs);   /* cpu order -> little-endian */
+        i_ext->blk_addr = cpu_to_le32(ext->blk_addr);
+        i_ext->len = cpu_to_le32(ext->len);
+        read_unlock(&ext->ext_lock);
+}
+struct f2fs_nm_info {
+        block_t nat_blkaddr;            /* base disk address of NAT */
+        nid_t max_nid;                  /* node ids must be below this value */
+        nid_t init_scan_nid;            /* the first nid to be scanned */
+        nid_t next_scan_nid;            /* the next nid to be scanned */
+        /* NAT cache management */
+        struct radix_tree_root nat_root;/* root of the nat entry cache */
+        rwlock_t nat_tree_lock;         /* protect the nat entry cache tree */
+        unsigned int nat_cnt;           /* the # of cached nat entries */
+        struct list_head nat_entries;   /* cached nat entry list (clean) */
+        struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
+        /* free node ids management */
+        struct list_head free_nid_list; /* a list for free nids */
+        spinlock_t free_nid_list_lock;  /* protect free nid list */
+        unsigned int fcnt;              /* the number of free node id */
+        struct mutex build_lock;        /* lock for build free nids */
+        /* for checkpoint */
+        char *nat_bitmap;               /* NAT bitmap pointer */
+        int bitmap_size;                /* bitmap size */
+};
+/*
+ * this structure is used as one of function parameters.
+ * all the information are dedicated to a given direct node block determined
+ * by the data offset in a file.
+ */
+struct dnode_of_data {
+        struct inode *inode;            /* vfs inode pointer */
+        struct page *inode_page;        /* its inode page, NULL is possible */
+        struct page *node_page;         /* cached direct node page */
+        nid_t nid;                      /* node id of the direct node block */
+        unsigned int ofs_in_node;       /* data offset in the node page */
+        bool inode_page_locked;         /* inode page is locked or not */
+        block_t data_blkaddr;           /* block address of the node block */
+};
+static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode,
+                struct page *ipage, struct page *npage, nid_t nid)
+{       /* ofs_in_node and data_blkaddr are not initialised here */
+        dn->inode = inode;
+        dn->inode_page = ipage;
+        dn->node_page = npage;
+        dn->nid = nid;
+        dn->inode_page_locked = 0;      /* bool field: starts out false */
+}
+/*
+ * For SIT manager
+ *
+ * By default, there are 6 active log areas across the whole main area.
+ * When considering hot and cold data separation to reduce cleaning overhead,
+ * we split 3 for data logs and 3 for node logs as hot, warm, and cold types,
+ * respectively.
+ * In the current design, you should not change the numbers intentionally.
+ * Instead, as a mount option such as active_logs=x, you can use 2, 4, and 6
+ * logs individually according to the underlying devices. (default: 6)
+ * Just in case, on-disk layout covers maximum 16 logs that consist of 8 for
+ * data and 8 for node logs.
+ */
+#define NR_CURSEG_DATA_TYPE     (3)
+#define NR_CURSEG_NODE_TYPE     (3)
+#define NR_CURSEG_TYPE  (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE)
+enum {
+        CURSEG_HOT_DATA = 0,    /* directory entry blocks */
+        CURSEG_WARM_DATA,       /* data blocks */
+        CURSEG_COLD_DATA,       /* multimedia or GCed data blocks */
+        CURSEG_HOT_NODE,        /* direct node blocks of directory files */
+        CURSEG_WARM_NODE,       /* direct node blocks of normal files */
+        CURSEG_COLD_NODE,       /* indirect node blocks */
+        NO_CHECK_TYPE
+};
+struct f2fs_sm_info {
+        struct sit_info *sit_info;              /* whole segment information */
+        struct free_segmap_info *free_info;     /* free segment information */
+        struct dirty_seglist_info *dirty_info;  /* dirty segment information */
+        struct curseg_info *curseg_array;       /* active segment information */
+        struct list_head wblist_head;   /* list of under-writeback pages */
+        spinlock_t wblist_lock;         /* lock for checkpoint */
+        block_t seg0_blkaddr;           /* block address of 0'th segment */
+        block_t main_blkaddr;           /* start block address of main area */
+        block_t ssa_blkaddr;            /* start block address of SSA area */
+        unsigned int segment_count;     /* total # of segments */
+        unsigned int main_segments;     /* # of segments in main area */
+        unsigned int reserved_segments; /* # of reserved segments */
+        unsigned int ovp_segments;      /* # of overprovision segments */
+};
+/*
+ * For directory operation
+ */
+#define NODE_DIR1_BLOCK         (ADDRS_PER_INODE + 1)
+#define NODE_DIR2_BLOCK         (ADDRS_PER_INODE + 2)
+#define NODE_IND1_BLOCK         (ADDRS_PER_INODE + 3)
+#define NODE_IND2_BLOCK         (ADDRS_PER_INODE + 4)
+#define NODE_DIND_BLOCK         (ADDRS_PER_INODE + 5)
+/*
+ * For superblock
+ */
+/*
+ * COUNT_TYPE for monitoring
+ *
+ * f2fs monitors the number of several block types such as on-writeback,
+ * dirty dentry blocks, dirty node blocks, and dirty meta blocks.
+ */
+enum count_type {
+        F2FS_WRITEBACK,
+        F2FS_DIRTY_DENTS,
+        F2FS_DIRTY_NODES,
+        F2FS_DIRTY_META,
+        NR_COUNT_TYPE,
+};
+/*
+ * FS_LOCK nesting subclasses for the lock validator:
+ *
+ * The locking order between these classes is
+ * RENAME -> DENTRY_OPS -> DATA_WRITE -> DATA_NEW
+ *    -> DATA_TRUNC -> NODE_WRITE -> NODE_NEW -> NODE_TRUNC
+ */
+enum lock_type {
+        RENAME,         /* for renaming operations */
+        DENTRY_OPS,     /* for directory operations */
+        DATA_WRITE,     /* for data write */
+        DATA_NEW,       /* for data allocation */
+        DATA_TRUNC,     /* for data truncate */
+        NODE_NEW,       /* for node allocation */
+        NODE_TRUNC,     /* for node truncate */
+        NODE_WRITE,     /* for node write; NOTE(review): order comment lists it before NODE_NEW */
+        NR_LOCK_TYPE,   /* number of lock types; sizes fs_lock[] */
+};
+/*
+ * The below are the page types of bios used in submit_bio().
+ * The available types are:
+ * DATA                 User data pages. It operates in async mode.
+ * NODE                 Node pages. It operates in async mode.
+ * META                 FS metadata pages such as SIT, NAT, CP.
+ * NR_PAGE_TYPE         The number of page types.
+ * META_FLUSH           Make sure the previous pages are written
+ *                      by waiting for the bio's completion.
+ * ...                  Can only be used with META.
+ */
+enum page_type {
+        DATA,
+        NODE,
+        META,
+        NR_PAGE_TYPE,
+        META_FLUSH,
+};
+struct f2fs_sb_info {
+        struct super_block *sb;                 /* pointer to VFS super block */
+        struct buffer_head *raw_super_buf;      /* buffer head of raw sb */
+        struct f2fs_super_block *raw_super;     /* raw super block pointer */
+        int s_dirty;                            /* dirty flag for checkpoint */
+        /* for node-related operations */
+        struct f2fs_nm_info *nm_info;           /* node manager */
+        struct inode *node_inode;               /* cache node blocks */
+        /* for segment-related operations */
+        struct f2fs_sm_info *sm_info;           /* segment manager */
+        struct bio *bio[NR_PAGE_TYPE];          /* bios to merge */
+        sector_t last_block_in_bio[NR_PAGE_TYPE];       /* last block number */
+        struct rw_semaphore bio_sem;            /* IO semaphore */
+        /* for checkpoint */
+        struct f2fs_checkpoint *ckpt;           /* raw checkpoint pointer */
+        struct inode *meta_inode;               /* cache meta blocks */
+        struct mutex cp_mutex;                  /* for checkpoint procedure */
+        struct mutex fs_lock[NR_LOCK_TYPE];     /* for blocking FS operations */
+        struct mutex write_inode;               /* mutex for write inode */
+        struct mutex writepages;                /* mutex for writepages() */
+        int por_doing;                          /* recovery is doing or not */
+        /* for orphan inode management */
+        struct list_head orphan_inode_list;     /* orphan inode list */
+        struct mutex orphan_inode_mutex;        /* for orphan inode list */
+        unsigned int n_orphans;                 /* # of orphan inodes */
+        /* for directory inode management */
+        struct list_head dir_inode_list;        /* dir inode list */
+        spinlock_t dir_inode_lock;              /* for dir inode list lock */
+        unsigned int n_dirty_dirs;              /* # of dir inodes */
+        /* basic file system units */
+        unsigned int log_sectors_per_block;     /* log2 sectors per block */
+        unsigned int log_blocksize;             /* log2 block size */
+        unsigned int blocksize;                 /* block size */
+        unsigned int root_ino_num;              /* root inode number*/
+        unsigned int node_ino_num;              /* node inode number*/
+        unsigned int meta_ino_num;              /* meta inode number*/
+        unsigned int log_blocks_per_seg;        /* log2 blocks per segment */
+        unsigned int blocks_per_seg;            /* blocks per segment */
+        unsigned int segs_per_sec;              /* segments per section */
+        unsigned int secs_per_zone;             /* sections per zone */
+        unsigned int total_sections;            /* total section count */
+        unsigned int total_node_count;          /* total node block count */
+        unsigned int total_valid_node_count;    /* valid node block count */
+        unsigned int total_valid_inode_count;   /* valid inode count */
+        int active_logs;                        /* # of active logs */
+        block_t user_block_count;               /* # of user blocks */
+        block_t total_valid_block_count;        /* # of valid blocks */
+        block_t alloc_valid_block_count;        /* # of allocated blocks */
+        block_t last_valid_block_count;         /* for recovery */
+        u32 s_next_generation;                  /* for NFS support */
+        atomic_t nr_pages[NR_COUNT_TYPE];       /* # of pages, see count_type */
+        struct f2fs_mount_info mount_opt;       /* mount options */
+        /* for cleaning operations */
+        struct mutex gc_mutex;                  /* mutex for GC */
+        struct f2fs_gc_kthread  *gc_thread;     /* GC thread */
+        /*
+         * for stat information.
+         * one is for the LFS mode, and the other is for the SSR mode.
+         */
+        struct f2fs_stat_info *stat_info;       /* FS status information */
+        unsigned int segment_count[2];          /* # of allocated segments */
+        unsigned int block_count[2];            /* # of allocated blocks */
+        unsigned int last_victim[2];            /* last victim segment # */
+        int total_hit_ext, read_hit_ext;        /* extent cache hit ratio */
+        int bg_gc;                              /* background gc calls */
+        spinlock_t stat_lock;                   /* lock for stat operations */
+};
+/*
+ * Inline functions
+ */
+static inline struct f2fs_inode_info *F2FS_I(struct inode *inode)
+{
+        return container_of(inode, struct f2fs_inode_info, vfs_inode);
+}
+static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb)
+{
+        return sb->s_fs_info;
+}
+static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi)
+{
+        return (struct f2fs_super_block *)(sbi->raw_super);
+}
+static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi)
+{
+        return (struct f2fs_checkpoint *)(sbi->ckpt);
+}
+static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi)
+{
+        return (struct f2fs_nm_info *)(sbi->nm_info);
+}
+static inline struct f2fs_sm_info *SM_I(struct f2fs_sb_info *sbi)
+{
+        return (struct f2fs_sm_info *)(sbi->sm_info);
+}
+static inline struct sit_info *SIT_I(struct f2fs_sb_info *sbi)
+{
+        return (struct sit_info *)(SM_I(sbi)->sit_info);
+}
+static inline struct free_segmap_info *FREE_I(struct f2fs_sb_info *sbi)
+{
+        return (struct free_segmap_info *)(SM_I(sbi)->free_info);
+}
+static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi)
+{
+        return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info);
+}
+static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi)
+{
+        sbi->s_dirty = 1;
+}
+static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi)
+{
+        sbi->s_dirty = 0;
+}
+static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+{
+        unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
+        return ckpt_flags & f;
+}
+static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+{
+        unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
+        ckpt_flags |= f;
+        cp->ckpt_flags = cpu_to_le32(ckpt_flags);
+}
+static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+{
+        unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
+        ckpt_flags &= (~f);
+        cp->ckpt_flags = cpu_to_le32(ckpt_flags);
+}
+static inline void mutex_lock_op(struct f2fs_sb_info *sbi, enum lock_type t)
+{       /* take the fs_lock mutex selected by t */
+        mutex_lock_nested(&sbi->fs_lock[t], t); /* t doubles as the subclass */
+}
+static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, enum lock_type t)
+{
+        mutex_unlock(&sbi->fs_lock[t]);
+}
+/*
+ * Check whether the given nid is within node id range.
+ */
+static inline void check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
+{
+        BUG_ON((nid >= NM_I(sbi)->max_nid));
+}
+#define F2FS_DEFAULT_ALLOCATED_BLOCKS   1
+/*
+ * Check whether the inode has blocks or not
+ */
+static inline int F2FS_HAS_BLOCKS(struct inode *inode)
+{       /* nonzero when i_blocks exceeds the default allocation */
+        if (F2FS_I(inode)->i_xattr_nid) /* xattr nid set: threshold is one higher */
+                return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1);
+        else
+                return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS);
+}
+static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
+                                 struct inode *inode, blkcnt_t count)
+{       /* charge count blocks to inode and sbi; false if over the limit */
+        block_t valid_block_count;
+        spin_lock(&sbi->stat_lock);     /* all counters below need stat_lock */
+        valid_block_count =
+                sbi->total_valid_block_count + (block_t)count;
+        if (valid_block_count > sbi->user_block_count) {
+                spin_unlock(&sbi->stat_lock);
+                return false;           /* no counter has been modified */
+        }
+        inode->i_blocks += count;
+        sbi->total_valid_block_count = valid_block_count;
+        sbi->alloc_valid_block_count += (block_t)count; /* only on success */
+        spin_unlock(&sbi->stat_lock);
+        return true;
+}
+static inline int dec_valid_block_count(struct f2fs_sb_info *sbi,
+                                                struct inode *inode,
+                                                blkcnt_t count)
+{
+        spin_lock(&sbi->stat_lock);
+        BUG_ON(sbi->total_valid_block_count < (block_t) count);
+        BUG_ON(inode->i_blocks < count);
+        inode->i_blocks -= count;
+        sbi->total_valid_block_count -= (block_t)count;
+        spin_unlock(&sbi->stat_lock);
+        return 0;
+}
+static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
+{
+        atomic_inc(&sbi->nr_pages[count_type]);
+        F2FS_SET_SB_DIRT(sbi);
+}
+static inline void inode_inc_dirty_dents(struct inode *inode)
+{
+        atomic_inc(&F2FS_I(inode)->dirty_dents);
+}
+static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
+{
+        atomic_dec(&sbi->nr_pages[count_type]);
+}
+static inline void inode_dec_dirty_dents(struct inode *inode)
+{
+        atomic_dec(&F2FS_I(inode)->dirty_dents);
+}
+static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
+{
+        return atomic_read(&sbi->nr_pages[count_type]);
+}
+static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
+{
+        block_t ret;
+        spin_lock(&sbi->stat_lock);
+        ret = sbi->total_valid_block_count;
+        spin_unlock(&sbi->stat_lock);
+        return ret;
+}
+static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag)
+{
+        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+        /* return NAT or SIT bitmap */
+        if (flag == NAT_BITMAP)
+                return le32_to_cpu(ckpt->nat_ver_bitmap_bytesize);
+        else if (flag == SIT_BITMAP)
+                return le32_to_cpu(ckpt->sit_ver_bitmap_bytesize);
+        return 0;
+}
+static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
+{       /* start of the NAT or SIT version bitmap inside the checkpoint */
+        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+        int offset = (flag == NAT_BITMAP) ?     /* NAT bits follow the SIT bits */
+                        le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0;
+        return &ckpt->sit_nat_version_bitmap + offset; /* NOTE(review): steps by sizeof(member) */
+}
+static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
+{
+        block_t start_addr;
+        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+        unsigned long long ckpt_version = le64_to_cpu(ckpt->checkpoint_ver);
+        start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
+        /*
+         * An odd-numbered checkpoint version is located in cp segment 0,
+         * an even-numbered one in cp segment 1.
+         */
+        if (!(ckpt_version & 1))
+                start_addr += sbi->blocks_per_seg;
+        return start_addr;
+}
+static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
+{
+        return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
+}
+/* Account count new node blocks; returns false, changing nothing, if over a limit. */
+static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi,
+                                                struct inode *inode,
+                                                unsigned int count)
+{
+        block_t valid_block_count;
+        unsigned int valid_node_count;
+        spin_lock(&sbi->stat_lock);
+        valid_block_count = sbi->total_valid_block_count + (block_t)count;
+        valid_node_count = sbi->total_valid_node_count + count;
+        if (valid_block_count > sbi->user_block_count ||
+                        valid_node_count > sbi->total_node_count) {
+                spin_unlock(&sbi->stat_lock);
+                return false;
+        }
+        /* inode may be NULL, in which case no i_blocks is charged */
+        if (inode)
+                inode->i_blocks += count;
+        /* bumped only on success, as inc_valid_block_count() does */
+        sbi->alloc_valid_block_count += (block_t)count;
+        sbi->total_valid_node_count = valid_node_count;
+        sbi->total_valid_block_count = valid_block_count;
+        spin_unlock(&sbi->stat_lock);
+        return true;
+}
+static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
+                                                struct inode *inode,
+                                                unsigned int count)
+{       /* release count node blocks from inode and from the sbi totals */
+        spin_lock(&sbi->stat_lock);
+        BUG_ON(sbi->total_valid_block_count < count);   /* would underflow */
+        BUG_ON(sbi->total_valid_node_count < count);
+        BUG_ON(inode->i_blocks < count);        /* inode is dereferenced: not NULL */
+        inode->i_blocks -= count;
+        sbi->total_valid_node_count -= count;
+        sbi->total_valid_block_count -= (block_t)count; /* nodes count as blocks */
+        spin_unlock(&sbi->stat_lock);
+}
+static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
+{
+        unsigned int ret;
+        spin_lock(&sbi->stat_lock);
+        ret = sbi->total_valid_node_count;
+        spin_unlock(&sbi->stat_lock);
+        return ret;
+}
+static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
+{
+        spin_lock(&sbi->stat_lock);
+        BUG_ON(sbi->total_valid_inode_count == sbi->total_node_count);
+        sbi->total_valid_inode_count++;
+        spin_unlock(&sbi->stat_lock);
+}
+static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi)
+{
+        spin_lock(&sbi->stat_lock);
+        BUG_ON(!sbi->total_valid_inode_count);
+        sbi->total_valid_inode_count--;
+        spin_unlock(&sbi->stat_lock);
+        return 0;
+}
+static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
+{
+        unsigned int ret;
+        spin_lock(&sbi->stat_lock);
+        ret = sbi->total_valid_inode_count;
+        spin_unlock(&sbi->stat_lock);
+        return ret;
+}
+static inline void f2fs_put_page(struct page *page, int unlock)
+{       /* release page, unlocking it first when unlock is nonzero */
+        if (!page || IS_ERR(page))      /* NULL and error pointers are ignored */
+                return;
+        if (unlock) {
+                BUG_ON(!PageLocked(page));      /* page must be locked here */
+                unlock_page(page);
+        }
+        page_cache_release(page);
+}
+static inline void f2fs_put_dnode(struct dnode_of_data *dn)
+{       /* release the pages held by dn and clear both page pointers */
+        if (dn->node_page)
+                f2fs_put_page(dn->node_page, 1);        /* unlock and release */
+        if (dn->inode_page && dn->node_page != dn->inode_page)
+                f2fs_put_page(dn->inode_page, 0);       /* release, no unlock */
+        dn->node_page = NULL;
+        dn->inode_page = NULL;
+}
+static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name,
+                                        size_t size, void (*ctor)(void *))
+{
+        return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor);
+}
+#define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino)
+static inline bool IS_INODE(struct page *page)
+{
+        struct f2fs_node *p = (struct f2fs_node *)page_address(page);
+        return RAW_IS_INODE(p);
+}
+static inline __le32 *blkaddr_in_node(struct f2fs_node *node)
+{
+        return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr;
+}
+static inline block_t datablock_addr(struct page *node_page,
+                unsigned int offset)
+{
+        struct f2fs_node *raw_node;
+        __le32 *addr_array;
+        raw_node = (struct f2fs_node *)page_address(node_page);
+        addr_array = blkaddr_in_node(raw_node);
+        return le32_to_cpu(addr_array[offset]);
+}
+static inline int f2fs_test_bit(unsigned int nr, char *addr)
+{
+        int mask;
+        addr += (nr >> 3);                      /* byte that holds bit nr */
+        mask = 1 << (7 - (nr & 0x07));          /* bit 0 is the byte's MSB */
+        return mask & *addr;                    /* nonzero (not 1) when set */
+}
+static inline int f2fs_set_bit(unsigned int nr, char *addr)
+{
+        int mask;
+        int ret;
+        addr += (nr >> 3);                      /* byte that holds bit nr */
+        mask = 1 << (7 - (nr & 0x07));          /* bit 0 is the byte's MSB */
+        ret = mask & *addr;                     /* previous state of the bit */
+        *addr |= mask;                          /* plain read-modify-write */
+        return ret;                             /* nonzero if it was set */
+}
+static inline int f2fs_clear_bit(unsigned int nr, char *addr)
+{
+        int mask;
+        int ret;
+        addr += (nr >> 3);                      /* byte that holds bit nr */
+        mask = 1 << (7 - (nr & 0x07));          /* bit 0 is the byte's MSB */
+        ret = mask & *addr;                     /* previous state of the bit */
+        *addr &= ~mask;                         /* plain read-modify-write */
+        return ret;                             /* nonzero if it was set */
+}
+/* used for f2fs_inode_info->flags */
+enum {
+        FI_NEW_INODE,           /* indicate newly allocated inode */
+        FI_NEED_CP,             /* need to do checkpoint during fsync */
+        FI_INC_LINK,            /* need to increment i_nlink */
+        FI_ACL_MODE,            /* indicate acl mode */
+        FI_NO_ALLOC,            /* should not allocate any blocks */
+};
+static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
+{
+        set_bit(flag, &fi->flags);
+}
+static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag)
+{
+        return test_bit(flag, &fi->flags);
+}
+static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag)
+{
+        clear_bit(flag, &fi->flags);
+}
+static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode)
+{
+        fi->i_acl_mode = mode;
+        set_inode_flag(fi, FI_ACL_MODE);
+}
+/* Clear flag in fi->flags if it is set; return 1 if it was set, else 0. */
+static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag)
+{
+        if (!is_inode_flag_set(fi, flag))
+                return 0;
+        clear_inode_flag(fi, flag);     /* was hard-wired to FI_ACL_MODE */
+        return 1;
+}
+/*
+ * file.c
+ */
+int f2fs_sync_file(struct file *, loff_t, loff_t, int);
+void truncate_data_blocks(struct dnode_of_data *);
+void f2fs_truncate(struct inode *);
+int f2fs_setattr(struct dentry *, struct iattr *);
+int truncate_hole(struct inode *, pgoff_t, pgoff_t);
+long f2fs_ioctl(struct file *, unsigned int, unsigned long);
+/*
+ * inode.c
+ */
+void f2fs_set_inode_flags(struct inode *);
+struct inode *f2fs_iget_nowait(struct super_block *, unsigned long);
+struct inode *f2fs_iget(struct super_block *, unsigned long);
+void update_inode(struct inode *, struct page *);
+int f2fs_write_inode(struct inode *, struct writeback_control *);
+void f2fs_evict_inode(struct inode *);
+/*
+ * namei.c
+ */
+struct dentry *f2fs_get_parent(struct dentry *child);
+/*
+ * dir.c
+ */
+struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *,
+                                                        struct page **);
+struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
+ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
+void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
+                                struct page *, struct inode *);
+void init_dent_inode(struct dentry *, struct page *);
+int f2fs_add_link(struct dentry *, struct inode *);
+void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
+int f2fs_make_empty(struct inode *, struct inode *);
+bool f2fs_empty_dir(struct inode *);
+/*
+ * super.c
+ */
+int f2fs_sync_fs(struct super_block *, int);
+/*
+ * hash.c
+ */
+f2fs_hash_t f2fs_dentry_hash(const char *, int);
+/*
+ * node.c
+ */
+struct dnode_of_data;
+struct node_info;
+int is_checkpointed_node(struct f2fs_sb_info *, nid_t);
+void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
+int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
+int truncate_inode_blocks(struct inode *, pgoff_t);
+int remove_inode_page(struct inode *);
+int new_inode_page(struct inode *, struct dentry *);
+struct page *new_node_page(struct dnode_of_data *, unsigned int);
+void ra_node_page(struct f2fs_sb_info *, nid_t);
+struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
+struct page *get_node_page_ra(struct page *, int);
+void sync_inode_page(struct dnode_of_data *);
+int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *);
+bool alloc_nid(struct f2fs_sb_info *, nid_t *);
+void alloc_nid_done(struct f2fs_sb_info *, nid_t);
+void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
+void recover_node_page(struct f2fs_sb_info *, struct page *,
+                struct f2fs_summary *, struct node_info *, block_t);
+int recover_inode_page(struct f2fs_sb_info *, struct page *);
+int restore_node_summary(struct f2fs_sb_info *, unsigned int,
+                                struct f2fs_summary_block *);
+void flush_nat_entries(struct f2fs_sb_info *);
+int build_node_manager(struct f2fs_sb_info *);
+void destroy_node_manager(struct f2fs_sb_info *);
+int create_node_manager_caches(void);
+void destroy_node_manager_caches(void);
+/*
+ * segment.c
+ */
+void f2fs_balance_fs(struct f2fs_sb_info *);
+void invalidate_blocks(struct f2fs_sb_info *, block_t);
+void locate_dirty_segment(struct f2fs_sb_info *, unsigned int);
+void clear_prefree_segments(struct f2fs_sb_info *);
+int npages_for_summary_flush(struct f2fs_sb_info *);
+void allocate_new_segments(struct f2fs_sb_info *);
+struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
+struct bio *f2fs_bio_alloc(struct block_device *, int);
+void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync);
+int write_meta_page(struct f2fs_sb_info *, struct page *,
+                                        struct writeback_control *);
+void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int,
+                                        block_t, block_t *);
+void write_data_page(struct inode *, struct page *, struct dnode_of_data*,
+                                        block_t, block_t *);
+void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t);
+void recover_data_page(struct f2fs_sb_info *, struct page *,
+                                struct f2fs_summary *, block_t, block_t);
+void rewrite_node_page(struct f2fs_sb_info *, struct page *,
+                                struct f2fs_summary *, block_t, block_t);
+void write_data_summaries(struct f2fs_sb_info *, block_t);
+void write_node_summaries(struct f2fs_sb_info *, block_t);
+int lookup_journal_in_cursum(struct f2fs_summary_block *,
+                                        int, unsigned int, int);
+void flush_sit_entries(struct f2fs_sb_info *);
+int build_segment_manager(struct f2fs_sb_info *);
+void reset_victim_segmap(struct f2fs_sb_info *);
+void destroy_segment_manager(struct f2fs_sb_info *);
+/*
+ * checkpoint.c
+ */
+struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
+struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
+long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
+int check_orphan_space(struct f2fs_sb_info *);
+void add_orphan_inode(struct f2fs_sb_info *, nid_t);
+void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
+int recover_orphan_inodes(struct f2fs_sb_info *);
+int get_valid_checkpoint(struct f2fs_sb_info *);
+void set_dirty_dir_page(struct inode *, struct page *);
+void remove_dirty_dir_inode(struct inode *);
+void sync_dirty_dir_inodes(struct f2fs_sb_info *);
+void block_operations(struct f2fs_sb_info *);
+void write_checkpoint(struct f2fs_sb_info *, bool, bool);
+void init_orphan_info(struct f2fs_sb_info *);
+int create_checkpoint_caches(void);
+void destroy_checkpoint_caches(void);
+/*
+ * data.c
+ */
+int reserve_new_block(struct dnode_of_data *);
+void update_extent_cache(block_t, struct dnode_of_data *);
+struct page *find_data_page(struct inode *, pgoff_t);
+struct page *get_lock_data_page(struct inode *, pgoff_t);
+struct page *get_new_data_page(struct inode *, pgoff_t, bool);
+int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int);
+int do_write_data_page(struct page *);
+/*
+ * gc.c
+ */
+int start_gc_thread(struct f2fs_sb_info *);
+void stop_gc_thread(struct f2fs_sb_info *);
+block_t start_bidx_of_node(unsigned int);
+int f2fs_gc(struct f2fs_sb_info *, int);
+void build_gc_manager(struct f2fs_sb_info *);
+int create_gc_caches(void);
+void destroy_gc_caches(void);
+/*
+ * recovery.c
+ */
+void recover_fsync_data(struct f2fs_sb_info *);
+bool space_for_roll_forward(struct f2fs_sb_info *);
+/*
+ * debug.c - sbi->stat_info counters; no-ops without CONFIG_F2FS_STAT_FS
+ */
+#ifdef CONFIG_F2FS_STAT_FS
+struct f2fs_stat_info {
+        struct list_head stat_list;
+        struct f2fs_sb_info *sbi;
+        struct mutex stat_lock;
+        int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
+        int main_area_segs, main_area_sections, main_area_zones;
+        int hit_ext, total_ext;
+        int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
+        int nats, sits, fnids;
+        int total_count, utilization;
+        int bg_gc;
+        unsigned int valid_count, valid_node_count, valid_inode_count;
+        unsigned int bimodal, avg_vblocks;
+        int util_free, util_valid, util_invalid;
+        int rsvd_segs, overp_segs;
+        int dirty_count, node_pages, meta_pages;
+        int prefree_count, call_count;  /* call_count: bumped by stat_inc_call_count() */
+        int tot_segs, node_segs, data_segs, free_segs, free_secs; /* first three: stat_inc_seg_count() */
+        int tot_blks, data_blks, node_blks;     /* bumped by the stat_inc_*_blk_count() macros */
+        int curseg[NR_CURSEG_TYPE];
+        int cursec[NR_CURSEG_TYPE];
+        int curzone[NR_CURSEG_TYPE];
+        unsigned int segment_count[2];
+        unsigned int block_count[2];
+        unsigned base_mem, cache_mem;
+};
+#define stat_inc_call_count(si) ((si)->call_count++)
+#define stat_inc_seg_count(sbi, type)                                   \
+        do {                                                            \
+                struct f2fs_stat_info *si = (sbi)->stat_info;           \
+                (si)->tot_segs++;                                       \
+                if ((type) == SUM_TYPE_DATA)                            \
+                        si->data_segs++;                                \
+                else                                                    \
+                        si->node_segs++;                                \
+        } while (0)
+#define stat_inc_tot_blk_count(si, blks)                                \
+        ((si)->tot_blks += (blks))
+#define stat_inc_data_blk_count(sbi, blks)                              \
+        do {                                                            \
+                struct f2fs_stat_info *si = (sbi)->stat_info;           \
+                stat_inc_tot_blk_count(si, blks);                       \
+                si->data_blks += (blks);                                \
+        } while (0)
+#define stat_inc_node_blk_count(sbi, blks)                              \
+        do {                                                            \
+                struct f2fs_stat_info *si = (sbi)->stat_info;           \
+                stat_inc_tot_blk_count(si, blks);                       \
+                si->node_blks += (blks);                                \
+        } while (0)
+int f2fs_build_stats(struct f2fs_sb_info *);
+void f2fs_destroy_stats(struct f2fs_sb_info *);
+void destroy_root_stats(void);
+#else
+#define stat_inc_call_count(si)
+#define stat_inc_seg_count(si, type)
+#define stat_inc_tot_blk_count(si, blks)
+#define stat_inc_data_blk_count(si, blks)
+#define stat_inc_node_blk_count(sbi, blks)
+static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
+static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
+static inline void destroy_root_stats(void) { }
+#endif
+extern const struct file_operations f2fs_dir_operations;
+extern const struct file_operations f2fs_file_operations;
+extern const struct inode_operations f2fs_file_inode_operations;
+extern const struct address_space_operations f2fs_dblock_aops;
+extern const struct address_space_operations f2fs_node_aops;
+extern const struct address_space_operations f2fs_meta_aops;
+extern const struct inode_operations f2fs_dir_inode_operations;
+extern const struct inode_operations f2fs_symlink_inode_operations;
+extern const struct inode_operations f2fs_special_inode_operations;
+#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
new file mode 100644
index 000000000000..f9e085dfb1f0
--- /dev/null
+++ b/fs/f2fs/file.c
@@ -0,0 +1,636 @@
+/*
+ * fs/f2fs/file.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/stat.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/falloc.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/mount.h>
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+#include "xattr.h"
+#include "acl.h"
+/* vm_ops->page_mkwrite: reserve a block, then dirty the page being written */
+static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
+                                                struct vm_fault *vmf)
+{
+        struct page *page = vmf->page;
+        struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        struct dnode_of_data dn;
+        int err;
+        f2fs_balance_fs(sbi);
+        sb_start_pagefault(inode->i_sb);
+        mutex_lock_op(sbi, DATA_NEW);
+        /* block allocation, done under the DATA_NEW operation lock */
+        set_new_dnode(&dn, inode, NULL, NULL, 0);
+        err = get_dnode_of_data(&dn, page->index, 0);
+        if (err) {
+                mutex_unlock_op(sbi, DATA_NEW);
+                goto out;
+        }
+        /* a block is reserved only when this index has no address yet */
+        if (dn.data_blkaddr == NULL_ADDR) {
+                err = reserve_new_block(&dn);
+                if (err) {
+                        f2fs_put_dnode(&dn);
+                        mutex_unlock_op(sbi, DATA_NEW);
+                        goto out;
+                }
+        }
+        f2fs_put_dnode(&dn);
+        mutex_unlock_op(sbi, DATA_NEW);
+        lock_page(page);
+        if (page->mapping != inode->i_mapping ||
+                        page_offset(page) >= i_size_read(inode) ||
+                        !PageUptodate(page)) {
+                unlock_page(page);
+                err = -EFAULT;
+                goto out;
+        }
+        /*
+         * already mapped to disk (no hole): done, the page stays locked
+         */
+        if (PageMappedToDisk(page))
+                goto out;
+        /* fill the page */
+        wait_on_page_writeback(page);
+        /* page straddles EOF: zero the part beyond i_size (64-bit compare) */
+        if (((loff_t)(page->index + 1) << PAGE_CACHE_SHIFT) >
+                                                i_size_read(inode)) {
+                unsigned offset = i_size_read(inode) & ~PAGE_CACHE_MASK;
+                zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+        }
+        set_page_dirty(page);
+        SetPageUptodate(page);
+        file_update_time(vma->vm_file);
+out:
+        sb_end_pagefault(inode->i_sb);
+        return block_page_mkwrite_return(err);
+}
+static const struct vm_operations_struct f2fs_file_vm_ops = {  /* installed by f2fs_file_mmap() */
+        .fault        = filemap_fault,
+        .page_mkwrite = f2fs_vm_page_mkwrite,   /* the f2fs-specific hook in this table */
+};
+static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode)
+{
+        struct dentry *dentry;
+        nid_t pino;     /* inode number of the parent directory */
+        inode = igrab(inode);   /* NOTE(review): result is used without a NULL check - confirm */
+        dentry = d_find_any_alias(inode);
+        if (!dentry) {  /* no alias to reach a parent through: report 0 */
+                iput(inode);
+                return 0;
+        }
+        pino = dentry->d_parent->d_inode->i_ino;
+        dput(dentry);
+        iput(inode);
+        return !is_checkpointed_node(sbi, pino);        /* nonzero when the parent is not checkpointed */
+}
+/* fsync: write back data, then checkpoint or flush this inode's node pages */
+int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+{
+        struct inode *inode = file->f_mapping->host;
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        unsigned long long cur_version;
+        int ret = 0;
+        bool need_cp = false;   /* set when a full checkpoint is required */
+        struct writeback_control wbc = {
+                .sync_mode = WB_SYNC_ALL,
+                .nr_to_write = LONG_MAX,
+        };
+        if (inode->i_sb->s_flags & MS_RDONLY)
+                return 0;
+        ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+        if (ret)
+                return ret;
+        mutex_lock(&inode->i_mutex);
+        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+                goto out;
+        mutex_lock(&sbi->cp_mutex);
+        cur_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver);
+        mutex_unlock(&sbi->cp_mutex);
+        if (F2FS_I(inode)->data_version != cur_version &&
+                                        !(inode->i_state & I_DIRTY))
+                goto out;
+        F2FS_I(inode)->data_version--;
+        if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
+                need_cp = true;
+        if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP))
+                need_cp = true;
+        if (!space_for_roll_forward(sbi))
+                need_cp = true;
+        if (need_to_sync_dir(sbi, inode))
+                need_cp = true;
+        f2fs_write_inode(inode, NULL);
+        if (need_cp) {
+                /* all the dirty node pages should be flushed for POR */
+                ret = f2fs_sync_fs(inode->i_sb, 1);
+                clear_inode_flag(F2FS_I(inode), FI_NEED_CP);
+        } else {
+                while (sync_node_pages(sbi, inode->i_ino, &wbc) == 0)
+                        f2fs_write_inode(inode, NULL);
+                ret = filemap_fdatawait_range(sbi->node_inode->i_mapping,
+                                0, LONG_MAX);   /* report node writeback errors */
+        }
+out:
+        mutex_unlock(&inode->i_mutex);
+        return ret;
+}
+static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+        file_accessed(file);
+        vma->vm_ops = &f2fs_file_vm_ops;        /* this mapping uses the f2fs vm_ops table */
+        return 0;       /* no failure path */
+}
+static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
+{
+        int nr_free = 0, ofs = dn->ofs_in_node; /* ofs: saved so the cursor can be restored */
+        struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+        struct f2fs_node *raw_node;
+        __le32 *addr;
+        raw_node = page_address(dn->node_page);
+        addr = blkaddr_in_node(raw_node) + ofs; /* first of the 'count' address slots visited */
+        for ( ; count > 0; count--, addr++, dn->ofs_in_node++) {
+                block_t blkaddr = le32_to_cpu(*addr);
+                if (blkaddr == NULL_ADDR)       /* nothing allocated in this slot */
+                        continue;
+                update_extent_cache(NULL_ADDR, dn);
+                invalidate_blocks(sbi, blkaddr);
+                dec_valid_block_count(sbi, dn->inode, 1);
+                nr_free++;
+        }
+        if (nr_free) {  /* the node page is dirtied only if a block was released */
+                set_page_dirty(dn->node_page);
+                sync_inode_page(dn);
+        }
+        dn->ofs_in_node = ofs;  /* undo the loop's advance of dn->ofs_in_node */
+        return nr_free; /* number of non-NULL addresses released */
+}
+void truncate_data_blocks(struct dnode_of_data *dn)     /* ADDRS_PER_BLOCK slots from dn->ofs_in_node */
+{
+        truncate_data_blocks_range(dn, ADDRS_PER_BLOCK);        /* freed-block count is discarded */
+}
+static void truncate_partial_data_page(struct inode *inode, u64 from)
+{
+        unsigned offset = from & (PAGE_CACHE_SIZE - 1); /* byte offset of 'from' inside its page */
+        struct page *page;
+        if (!offset)    /* 'from' is page aligned: no partial page to zero */
+                return;
+        page = find_data_page(inode, from >> PAGE_CACHE_SHIFT);
+        if (IS_ERR(page))       /* lookup failed: the zeroing is skipped without an error */
+                return;
+        lock_page(page);
+        wait_on_page_writeback(page);
+        zero_user(page, offset, PAGE_CACHE_SIZE - offset);      /* clear from 'offset' to the page end */
+        set_page_dirty(page);
+        f2fs_put_page(page, 1);
+}
+static int truncate_blocks(struct inode *inode, u64 from)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        unsigned int blocksize = inode->i_sb->s_blocksize;
+        struct dnode_of_data dn;
+        pgoff_t free_from;      /* first block index to free */
+        int count = 0;
+        int err;
+        free_from = (pgoff_t)   /* 'from' rounded up to a block boundary, as a block index */
+                        ((from + blocksize - 1) >> (sbi->log_blocksize));
+        mutex_lock_op(sbi, DATA_TRUNC);
+        set_new_dnode(&dn, inode, NULL, NULL, 0);
+        err = get_dnode_of_data(&dn, free_from, RDONLY_NODE);
+        if (err) {
+                if (err == -ENOENT)     /* no dnode here: go straight to truncate_inode_blocks() */
+                        goto free_next;
+                mutex_unlock_op(sbi, DATA_TRUNC);
+                return err;
+        }
+        if (IS_INODE(dn.node_page))
+                count = ADDRS_PER_INODE;
+        else
+                count = ADDRS_PER_BLOCK;
+        count -= dn.ofs_in_node;        /* slots from ofs_in_node to the end of this node */
+        BUG_ON(count < 0);
+        if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
+                truncate_data_blocks_range(&dn, count);
+                free_from += count;
+        }
+        f2fs_put_dnode(&dn);
+free_next:
+        err = truncate_inode_blocks(inode, free_from);
+        mutex_unlock_op(sbi, DATA_TRUNC);
+        /* lastly zero the tail of the page that contains 'from' */
+        truncate_partial_data_page(inode, from);
+        return err;
+}
+void f2fs_truncate(struct inode *inode)
+{
+        umode_t mode = inode->i_mode;   /* file type decides whether anything is truncated */
+        if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode))
+                return; /* only regular files, directories and symlinks are handled */
+        if (truncate_blocks(inode, i_size_read(inode)) == 0) {
+                inode->i_mtime = inode->i_ctime = CURRENT_TIME; /* timestamps move only on success */
+                mark_inode_dirty(inode);
+        }
+        f2fs_balance_fs(F2FS_SB(inode->i_sb));  /* called whether or not truncation succeeded */
+}
+static int f2fs_getattr(struct vfsmount *mnt,
+                         struct dentry *dentry, struct kstat *stat)
+{
+        generic_fillattr(dentry->d_inode, stat);
+        /* the block count filled in above is reported multiplied by 8 */
+        stat->blocks *= 8;
+        return 0;
+}
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+static void __setattr_copy(struct inode *inode, const struct iattr *attr)
+{
+        struct f2fs_inode_info *fi = F2FS_I(inode);
+        unsigned int ia_valid = attr->ia_valid; /* bitmask selecting the attributes to copy */
+        if (ia_valid & ATTR_UID)
+                inode->i_uid = attr->ia_uid;
+        if (ia_valid & ATTR_GID)
+                inode->i_gid = attr->ia_gid;
+        if (ia_valid & ATTR_ATIME)      /* each timestamp is truncated to s_time_gran */
+                inode->i_atime = timespec_trunc(attr->ia_atime,
+                                                inode->i_sb->s_time_gran);
+        if (ia_valid & ATTR_MTIME)
+                inode->i_mtime = timespec_trunc(attr->ia_mtime,
+                                                inode->i_sb->s_time_gran);
+        if (ia_valid & ATTR_CTIME)
+                inode->i_ctime = timespec_trunc(attr->ia_ctime,
+                                                inode->i_sb->s_time_gran);
+        if (ia_valid & ATTR_MODE) {
+                umode_t mode = attr->ia_mode;
+                if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+                        mode &= ~S_ISGID;       /* setgid is stripped for such callers */
+                set_acl_inode(fi, mode);        /* mode is handed to set_acl_inode(), not stored in i_mode here */
+        }
+}
+#else
+#define __setattr_copy setattr_copy     /* without POSIX ACL support, setattr_copy is used as is */
+#endif
+int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+        struct inode *inode = dentry->d_inode;
+        struct f2fs_inode_info *fi = F2FS_I(inode);
+        int err;
+        err = inode_change_ok(inode, attr);
+        if (err)        /* change refused: nothing has been modified yet */
+                return err;
+        if ((attr->ia_valid & ATTR_SIZE) &&
+                        attr->ia_size != i_size_read(inode)) {  /* the size really changes */
+                truncate_setsize(inode, attr->ia_size);
+                f2fs_truncate(inode);
+        }
+        __setattr_copy(inode, attr);
+        if (attr->ia_valid & ATTR_MODE) {
+                err = f2fs_acl_chmod(inode);
+                if (err || is_inode_flag_set(fi, FI_ACL_MODE)) {
+                        inode->i_mode = fi->i_acl_mode; /* i_mode is taken from fi->i_acl_mode */
+                        clear_inode_flag(fi, FI_ACL_MODE);
+                }
+        }
+        mark_inode_dirty(inode);        /* dirtied even when f2fs_acl_chmod() failed */
+        return err;     /* 0, or the f2fs_acl_chmod() error */
+}
+const struct inode_operations f2fs_file_inode_operations = {   /* declared extern in f2fs.h */
+        .getattr        = f2fs_getattr,
+        .setattr        = f2fs_setattr,
+        .get_acl        = f2fs_get_acl,
+#ifdef CONFIG_F2FS_FS_XATTR     /* xattr hooks are present only with this option */
+        .setxattr       = generic_setxattr,
+        .getxattr       = generic_getxattr,
+        .listxattr      = f2fs_listxattr,       /* the only f2fs-specific xattr hook here */
+        .removexattr    = generic_removexattr,
+#endif
+};
+static void fill_zero(struct inode *inode, pgoff_t index,
+                                        loff_t start, loff_t len)
+{
+        struct page *page;
+        if (!len)       /* empty range: no page is fetched */
+                return;
+        page = get_new_data_page(inode, index, false);
+        if (IS_ERR(page))       /* failure is not reported to the caller */
+                return;
+        wait_on_page_writeback(page);
+        zero_user(page, start, len);    /* clear 'len' bytes at offset 'start' of the page */
+        set_page_dirty(page);
+        f2fs_put_page(page, 1);
+}
+int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
+{
+        pgoff_t index;
+        int err;
+        for (index = pg_start; index < pg_end; index++) {       /* pg_end itself is excluded */
+                struct dnode_of_data dn;
+                struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+                mutex_lock_op(sbi, DATA_TRUNC); /* taken and dropped once per page index */
+                set_new_dnode(&dn, inode, NULL, NULL, 0);
+                err = get_dnode_of_data(&dn, index, RDONLY_NODE);
+                if (err) {
+                        mutex_unlock_op(sbi, DATA_TRUNC);
+                        if (err == -ENOENT)     /* lookup found no node: move on to the next index */
+                                continue;
+                        return err;
+                }
+                if (dn.data_blkaddr != NULL_ADDR)
+                        truncate_data_blocks_range(&dn, 1);     /* release just this one block */
+                f2fs_put_dnode(&dn);
+                mutex_unlock_op(sbi, DATA_TRUNC);
+        }
+        return 0;       /* -ENOENT lookups are not reported as errors */
+}
+/* PUNCH_HOLE helper: zero the partial edge pages, free the pages between */
+static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode)
+{
+        pgoff_t pg_start, pg_end;
+        loff_t off_start, off_end;      /* in-page byte offsets of the range ends */
+        int ret = 0;
+        pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
+        pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+        off_start = offset & (PAGE_CACHE_SIZE - 1);
+        off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+        if (pg_start == pg_end) {
+                fill_zero(inode, pg_start, off_start, off_end - off_start);
+        } else {
+                if (off_start)
+                        fill_zero(inode, pg_start++, off_start,
+                                        PAGE_CACHE_SIZE - off_start);
+                if (off_end)
+                        fill_zero(inode, pg_end, 0, off_end);
+                if (pg_start < pg_end) {
+                        struct address_space *mapping = inode->i_mapping;
+                        /* widen before shifting: pgoff_t may be 32 bits */
+                        loff_t blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT;
+                        loff_t blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT;
+                        truncate_inode_pages_range(mapping, blk_start,
+                                        blk_end - 1);
+                        ret = truncate_hole(inode, pg_start, pg_end);
+                }
+        }
+        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+                i_size_read(inode) <= (offset + len)) {
+                i_size_write(inode, offset);
+                mark_inode_dirty(inode);
+        }
+        return ret;
+}
+/* Reserve a block for pages pg_start..pg_end; grow i_size unless KEEP_SIZE */
+static int expand_inode_data(struct inode *inode, loff_t offset,
+                                        loff_t len, int mode)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        pgoff_t index, pg_start, pg_end;
+        loff_t new_size = i_size_read(inode);
+        loff_t off_start, off_end;
+        int ret = inode_newsize_ok(inode, (len + offset));
+        if (ret)
+                return ret;
+        pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
+        pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+        off_start = offset & (PAGE_CACHE_SIZE - 1);
+        off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+        for (index = pg_start; index <= pg_end; index++) {
+                struct dnode_of_data dn;
+                mutex_lock_op(sbi, DATA_NEW);
+                set_new_dnode(&dn, inode, NULL, NULL, 0);
+                ret = get_dnode_of_data(&dn, index, 0);
+                if (ret) {
+                        mutex_unlock_op(sbi, DATA_NEW);
+                        break;
+                }
+                if (dn.data_blkaddr == NULL_ADDR) {
+                        ret = reserve_new_block(&dn);
+                        if (ret) {
+                                f2fs_put_dnode(&dn);
+                                mutex_unlock_op(sbi, DATA_NEW);
+                                break;
+                        }
+                }
+                f2fs_put_dnode(&dn);
+                mutex_unlock_op(sbi, DATA_NEW);
+                if (pg_start == pg_end)
+                        new_size = offset + len;
+                else if (index == pg_start && off_start)        /* widen before shifting */
+                        new_size = (loff_t)(index + 1) << PAGE_CACHE_SHIFT;
+                else if (index == pg_end)
+                        new_size = ((loff_t)index << PAGE_CACHE_SHIFT) + off_end;
+                else
+                        new_size += PAGE_CACHE_SIZE;
+        }
+        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+                i_size_read(inode) < new_size) {
+                i_size_write(inode, new_size);
+                mark_inode_dirty(inode);
+        }
+        return ret;
+}
+/* ->fallocate: punch a hole or reserve blocks, then call f2fs_balance_fs() */
+static long f2fs_fallocate(struct file *file, int mode,
+                                loff_t offset, loff_t len)
+{
+        struct inode *inode = file->f_path.dentry->d_inode;
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        long ret;
+        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+                return -EOPNOTSUPP;     /* any other mode bit is rejected up front */
+        ret = (mode & FALLOC_FL_PUNCH_HOLE) ?
+                        punch_hole(inode, offset, len, mode) :
+                        expand_inode_data(inode, offset, len, mode);
+        f2fs_balance_fs(sbi);
+        return ret;
+}
+#define F2FS_REG_FLMASK         (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))       /* regular files: all but these two */
+#define F2FS_OTHER_FLMASK       (FS_NODUMP_FL | FS_NOATIME_FL) /* other types: only these two */
+static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
+{
+        __u32 allowed = F2FS_OTHER_FLMASK;      /* default for non-directory, non-regular inodes */
+        if (S_ISDIR(mode))
+                allowed = ~(__u32)0;    /* directories keep every flag */
+        else if (S_ISREG(mode))
+                allowed = F2FS_REG_FLMASK;
+        return flags & allowed;
+}
+long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+        struct inode *inode = filp->f_dentry->d_inode;
+        struct f2fs_inode_info *fi = F2FS_I(inode);
+        unsigned int flags;
+        int ret;
+        switch (cmd) {
+        case FS_IOC_GETFLAGS:   /* copy the user-visible subset of fi->i_flags to *arg */
+                flags = fi->i_flags & FS_FL_USER_VISIBLE;
+                return put_user(flags, (int __user *) arg);
+        case FS_IOC_SETFLAGS:   /* replace the user-modifiable flags with those read from *arg */
+        {
+                unsigned int oldflags;
+                ret = mnt_want_write(filp->f_path.mnt);
+                if (ret)
+                        return ret;
+                if (!inode_owner_or_capable(inode)) {
+                        ret = -EACCES;
+                        goto out;
+                }
+                if (get_user(flags, (int __user *) arg)) {
+                        ret = -EFAULT;
+                        goto out;
+                }
+                flags = f2fs_mask_flags(inode->i_mode, flags);  /* drop flags not allowed for this file type */
+                mutex_lock(&inode->i_mutex);
+                oldflags = fi->i_flags;
+                if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {    /* append/immutable would change */
+                        if (!capable(CAP_LINUX_IMMUTABLE)) {
+                                mutex_unlock(&inode->i_mutex);
+                                ret = -EPERM;
+                                goto out;
+                        }
+                }
+                flags = flags & FS_FL_USER_MODIFIABLE;
+                flags |= oldflags & ~FS_FL_USER_MODIFIABLE;     /* non-modifiable bits keep their old value */
+                fi->i_flags = flags;
+                mutex_unlock(&inode->i_mutex);
+                f2fs_set_inode_flags(inode);
+                inode->i_ctime = CURRENT_TIME;
+                mark_inode_dirty(inode);
+out:    /* every exit after mnt_want_write() succeeded passes through here */
+                mnt_drop_write(filp->f_path.mnt);
+                return ret;     /* still 0 from mnt_want_write() on the success path */
+        }
+        default:
+                return -ENOTTY; /* no other command is handled */
+        }
+}
+const struct file_operations f2fs_file_operations = {  /* declared extern in f2fs.h */
+        .llseek         = generic_file_llseek,
+        .read           = do_sync_read,
+        .write          = do_sync_write,
+        .aio_read       = generic_file_aio_read,
+        .aio_write      = generic_file_aio_write,
+        .open           = generic_file_open,
+        .mmap           = f2fs_file_mmap,       /* the f2fs_* entries are defined in this file */
+        .fsync          = f2fs_sync_file,
+        .fallocate      = f2fs_fallocate,
+        .unlocked_ioctl = f2fs_ioctl,
+        .splice_read    = generic_file_splice_read,
+        .splice_write   = generic_file_splice_write,
+};
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
new file mode 100644
index 000000000000..644aa3808273
--- /dev/null
+++ b/fs/f2fs/gc.c
@@ -0,0 +1,742 @@
+/*
+ * fs/f2fs/gc.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/f2fs_fs.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/blkdev.h>
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+#include "gc.h"
+/* slab cache for struct inode_entry objects, set up by create_gc_caches() */
+static struct kmem_cache *winode_slab;
+/*
+ * Main loop of the background GC kernel thread.
+ * @data: the f2fs_sb_info this thread was started for.
+ *
+ * Every iteration sleeps on the thread's wait queue for up to wait_ms
+ * milliseconds (or until the thread is asked to stop), calls
+ * f2fs_balance_fs() and then, if the BG_GC mount option is set, gc_mutex
+ * could be taken without blocking and the device looks idle, runs
+ * f2fs_gc(sbi, 1).  wait_ms is adjusted after every pass.
+ *
+ * Returns 0 once kthread_should_stop() is true.
+ */
+static int gc_thread_func(void *data)
+{
+        struct f2fs_sb_info *sbi = data;
+        wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
+        long wait_ms;
+        wait_ms = GC_THREAD_MIN_SLEEP_TIME;
+        do {
+                /* after having been frozen, re-evaluate the loop condition */
+                if (try_to_freeze())
+                        continue;
+                else
+                        wait_event_interruptible_timeout(*wq,
+                                                kthread_should_stop(),
+                                                msecs_to_jiffies(wait_ms));
+                if (kthread_should_stop())
+                        break;
+                f2fs_balance_fs(sbi);
+                if (!test_opt(sbi, BG_GC))
+                        continue;
+                /*
+                 * [GC triggering condition]
+                 * 0. GC is not conducted currently.
+                 * 1. There are enough dirty segments.
+                 * 2. IO subsystem is idle by checking the # of writeback pages.
+                 * 3. IO subsystem is idle by checking the # of requests in
+                 *    bdev's request list.
+                 *
+                 * Note) We have to avoid triggering GCs too frequently,
+                 * because some segments may be invalidated soon afterwards
+                 * by user updates or deletions.
+                 * So we wait for some time to collect dirty segments.
+                 */
+                /* somebody else holds gc_mutex: skip this round */
+                if (!mutex_trylock(&sbi->gc_mutex))
+                        continue;
+                if (!is_idle(sbi)) {
+                        wait_ms = increase_sleep_time(wait_ms);
+                        mutex_unlock(&sbi->gc_mutex);
+                        continue;
+                }
+                if (has_enough_invalid_blocks(sbi))
+                        wait_ms = decrease_sleep_time(wait_ms);
+                else
+                        wait_ms = increase_sleep_time(wait_ms);
+                sbi->bg_gc++;
+                /* f2fs_gc() drops gc_mutex itself before it returns */
+                if (f2fs_gc(sbi, 1) == GC_NONE)
+                        wait_ms = GC_THREAD_NOGC_SLEEP_TIME;
+                else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME)
+                        wait_ms = GC_THREAD_MAX_SLEEP_TIME;
+        } while (!kthread_should_stop());
+        return 0;
+}
+/*
+ * Allocate the GC thread descriptor of @sbi and start the background GC
+ * kernel thread.
+ *
+ * Returns 0 on success, -ENOMEM if the descriptor cannot be allocated, or
+ * the error reported by kthread_run() if the thread cannot be created.
+ * When thread creation fails, sbi->gc_thread is reset to NULL so that a
+ * later stop_gc_thread() does not touch the freed descriptor.
+ */
+int start_gc_thread(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_gc_kthread *gc_th;
+        int err;
+        gc_th = kmalloc(sizeof(*gc_th), GFP_KERNEL);
+        if (!gc_th)
+                return -ENOMEM;
+        init_waitqueue_head(&gc_th->gc_wait_queue_head);
+        /* gc_thread_func() reads sbi->gc_thread, so publish it first */
+        sbi->gc_thread = gc_th;
+        gc_th->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
+                                GC_THREAD_NAME);
+        if (IS_ERR(gc_th->f2fs_gc_task)) {
+                err = PTR_ERR(gc_th->f2fs_gc_task);
+                /* do not leave a dangling pointer behind */
+                sbi->gc_thread = NULL;
+                kfree(gc_th);
+                return err;
+        }
+        return 0;
+}
+/*
+ * Stop the background GC thread of @sbi, if there is one, and release
+ * its descriptor.
+ */
+void stop_gc_thread(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_gc_kthread *th = sbi->gc_thread;
+        if (th) {
+                kthread_stop(th->f2fs_gc_task);
+                kfree(th);
+                sbi->gc_thread = NULL;
+        }
+}
+/* Pick the victim selection mode: GC_CB for BG_GC, GC_GREEDY otherwise. */
+static int select_gc_type(int gc_type)
+{
+        if (gc_type == BG_GC)
+                return GC_CB;
+        return GC_GREEDY;
+}
+/*
+ * Fill in the victim selection policy @p; p->alloc_mode must already be
+ * set by the caller.
+ *
+ * With a zero alloc_mode the mode follows @gc_type, the DIRTY bitmap is
+ * searched and victims are aligned to segs_per_sec segments.  Otherwise
+ * the search is greedy over the dirty bitmap of @type, one segment at a
+ * time.  The search resumes at the last victim recorded for the mode.
+ */
+static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
+                        int type, struct victim_sel_policy *p)
+{
+        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+        if (!p->alloc_mode) {
+                p->gc_mode = select_gc_type(gc_type);
+                p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
+                p->ofs_unit = sbi->segs_per_sec;
+        } else {
+                p->gc_mode = GC_GREEDY;
+                p->dirty_segmap = dirty_i->dirty_segmap[type];
+                p->ofs_unit = 1;
+        }
+        p->offset = sbi->last_victim[p->gc_mode];
+}
+/*
+ * Largest cost a candidate can have under policy @p: the number of blocks
+ * in ofs_unit segments for GC_GREEDY, UINT_MAX for GC_CB.
+ */
+static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
+                                struct victim_sel_policy *p)
+{
+        switch (p->gc_mode) {
+        case GC_GREEDY:
+                return (1 << sbi->log_blocks_per_seg) * p->ofs_unit;
+        case GC_CB:
+                return UINT_MAX;
+        default: /* No other gc_mode */
+                return 0;
+        }
+}
+/*
+ * Take the first segment marked in the BG_GC victim bitmap, if any.
+ *
+ * If the gc_type is FG_GC, we can reuse victim segments that background
+ * GC selected before; those segments are known to have few valid blocks.
+ * The bit of the returned segment is cleared.  Returns NULL_SEGNO when
+ * the bitmap is empty.
+ */
+static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
+{
+        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+        unsigned int victim;
+        victim = find_next_bit(dirty_i->victim_segmap[BG_GC],
+                                                TOTAL_SEGS(sbi), 0);
+        if (victim >= TOTAL_SEGS(sbi))
+                return NULL_SEGNO;
+        clear_bit(victim, dirty_i->victim_segmap[BG_GC]);
+        return victim;
+}
+/*
+ * Cost of the section containing @segno for the GC_CB mode.
+ *
+ * The mtimes of all segments of the section are averaged, and the valid
+ * block count obtained from get_valid_blocks() is divided by segs_per_sec
+ * (presumably giving a per-segment average; confirm against
+ * get_valid_blocks()).  From those, u is the valid-block percentage of
+ * one segment and age runs from 0 (mtime == max_mtime) to 100
+ * (mtime == min_mtime).  The return value is UINT_MAX minus the benefit
+ * term, so a larger benefit yields a smaller cost.
+ *
+ * Side effect: sit_i->min_mtime/max_mtime are widened when the averaged
+ * mtime lies outside the current range.
+ */
+static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+        struct sit_info *sit_i = SIT_I(sbi);
+        unsigned int secno = GET_SECNO(sbi, segno);
+        unsigned int start = secno * sbi->segs_per_sec;
+        unsigned long long mtime = 0;
+        unsigned int vblocks;
+        unsigned char age = 0;
+        unsigned char u;
+        unsigned int i;
+        /* sum up, then average, the mtime over the section's segments */
+        for (i = 0; i < sbi->segs_per_sec; i++)
+                mtime += get_seg_entry(sbi, start + i)->mtime;
+        vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+        mtime = div_u64(mtime, sbi->segs_per_sec);
+        vblocks = div_u64(vblocks, sbi->segs_per_sec);
+        /* utilization in percent of one segment's blocks */
+        u = (vblocks * 100) >> sbi->log_blocks_per_seg;
+        /* Handle if the system time is changed by user */
+        if (mtime < sit_i->min_mtime)
+                sit_i->min_mtime = mtime;
+        if (mtime > sit_i->max_mtime)
+                sit_i->max_mtime = mtime;
+        /* age stays 0 when the range is empty (avoids dividing by zero) */
+        if (sit_i->max_mtime != sit_i->min_mtime)
+                age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
+                                sit_i->max_mtime - sit_i->min_mtime);
+        return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
+}
+/*
+ * Cost of choosing @segno as a victim under policy @p; the candidate
+ * with the smallest cost wins in get_victim_by_default().
+ */
+static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno,
+                                        struct victim_sel_policy *p)
+{
+        /* SSR: the segment entry's ckpt_valid_blocks is the cost */
+        if (p->alloc_mode == SSR)
+                return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
+        /* alloc_mode == LFS: the cost depends on the selection mode */
+        if (p->gc_mode != GC_GREEDY)
+                return get_cb_cost(sbi, segno);
+        return get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+}
+/*
+ * This function is called from two paths.
+ * One is garbage collection and the other is SSR segment selection.
+ * When it is called during GC, it just gets a victim segment
+ * and it does not remove it from dirty seglist.
+ * When it is called from SSR segment selection, it finds a segment
+ * which has minimum valid blocks and removes it from dirty seglist.
+ *
+ * @result:     set to the first segment of the chosen victim unit
+ * @gc_type:    BG_GC or FG_GC; also selects which victim bitmap is marked
+ * @type:       dirty list type searched when @alloc_mode is non-zero
+ * @alloc_mode: LFS or SSR
+ *
+ * Returns 1 if a victim was found (and *result written), 0 otherwise.
+ * dirty_i->seglist_lock is held for the duration of the search.
+ */
+static int get_victim_by_default(struct f2fs_sb_info *sbi,
+                unsigned int *result, int gc_type, int type, char alloc_mode)
+{
+        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+        struct victim_sel_policy p;
+        unsigned int segno;
+        int nsearched = 0;
+        /* select_policy() reads p.alloc_mode, so set it first */
+        p.alloc_mode = alloc_mode;
+        select_policy(sbi, gc_type, type, &p);
+        p.min_segno = NULL_SEGNO;
+        p.min_cost = get_max_cost(sbi, &p);
+        mutex_lock(&dirty_i->seglist_lock);
+        /* foreground GC first reuses a victim chosen by background GC */
+        if (p.alloc_mode == LFS && gc_type == FG_GC) {
+                p.min_segno = check_bg_victims(sbi);
+                if (p.min_segno != NULL_SEGNO)
+                        goto got_it;
+        }
+        while (1) {
+                unsigned long cost;
+                segno = find_next_bit(p.dirty_segmap,
+                                                TOTAL_SEGS(sbi), p.offset);
+                if (segno >= TOTAL_SEGS(sbi)) {
+                        /*
+                         * End of the bitmap: if the search started at a
+                         * remembered position, wrap around once.
+                         */
+                        if (sbi->last_victim[p.gc_mode]) {
+                                sbi->last_victim[p.gc_mode] = 0;
+                                p.offset = 0;
+                                continue;
+                        }
+                        break;
+                }
+                /* the next lookup starts at the following ofs_unit boundary */
+                p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit;
+                /* skip segments already marked as victims */
+                if (test_bit(segno, dirty_i->victim_segmap[FG_GC]))
+                        continue;
+                if (gc_type == BG_GC &&
+                                test_bit(segno, dirty_i->victim_segmap[BG_GC]))
+                        continue;
+                if (IS_CURSEC(sbi, GET_SECNO(sbi, segno)))
+                        continue;
+                cost = get_gc_cost(sbi, segno, &p);
+                if (p.min_cost > cost) {
+                        p.min_segno = segno;
+                        p.min_cost = cost;
+                }
+                /* candidates with the maximum cost do not count as searched */
+                if (cost == get_max_cost(sbi, &p))
+                        continue;
+                /* bound the search and remember where to resume next time */
+                if (nsearched++ >= MAX_VICTIM_SEARCH) {
+                        sbi->last_victim[p.gc_mode] = segno;
+                        break;
+                }
+        }
+got_it:
+        if (p.min_segno != NULL_SEGNO) {
+                /* round down to the start of the victim unit */
+                *result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
+                if (p.alloc_mode == LFS) {
+                        int i;
+                        /* mark every segment of the unit as a victim */
+                        for (i = 0; i < p.ofs_unit; i++)
+                                set_bit(*result + i,
+                                        dirty_i->victim_segmap[gc_type]);
+                }
+        }
+        mutex_unlock(&dirty_i->seglist_lock);
+        return (p.min_segno == NULL_SEGNO) ? 0 : 1;
+}
+/* default victim selection operations, installed by build_gc_manager() */
+static const struct victim_selection default_v_ops = {
+        .get_victim = get_victim_by_default,
+};
+/*
+ * Look up the inode numbered @ino in the GC inode list @ilist.
+ * Returns the inode, or NULL if it is not on the list.
+ */
+static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist)
+{
+        struct inode_entry *entry;
+        list_for_each_entry(entry, ilist, list) {
+                if (entry->inode->i_ino == ino)
+                        return entry->inode;
+        }
+        return NULL;
+}
+/*
+ * Append @inode to the GC inode list @ilist.
+ *
+ * If the inode is already listed, the reference passed in is dropped
+ * with iput() and nothing is added.  Otherwise a new entry is allocated;
+ * the allocation is retried until it succeeds.
+ */
+static void add_gc_inode(struct inode *inode, struct list_head *ilist)
+{
+        struct inode_entry *entry, *new_entry;
+        list_for_each_entry(entry, ilist, list) {
+                if (entry->inode == inode) {
+                        iput(inode);
+                        return;
+                }
+        }
+        /* keep trying, giving other tasks a chance to run in between */
+        for (;;) {
+                new_entry = kmem_cache_alloc(winode_slab, GFP_NOFS);
+                if (new_entry)
+                        break;
+                cond_resched();
+        }
+        new_entry->inode = inode;
+        list_add_tail(&new_entry->list, ilist);
+}
+/*
+ * Empty the GC inode list @ilist: drop the inode reference of every
+ * entry, unlink the entry and return it to the slab cache.
+ */
+static void put_gc_inode(struct list_head *ilist)
+{
+        struct inode_entry *entry;
+        while (!list_empty(ilist)) {
+                entry = list_first_entry(ilist, struct inode_entry, list);
+                iput(entry->inode);
+                list_del(&entry->list);
+                kmem_cache_free(winode_slab, entry);
+        }
+}
+/*
+ * Test, under sentry_lock, the bit for block @offset in the current
+ * valid map of segment @segno.
+ * Returns GC_OK if the bit is set and GC_NEXT if it is clear.
+ */
+static int check_valid_map(struct f2fs_sb_info *sbi,
+                                unsigned int segno, int offset)
+{
+        struct sit_info *sit_i = SIT_I(sbi);
+        int valid;
+        mutex_lock(&sit_i->sentry_lock);
+        valid = f2fs_test_bit(offset,
+                        get_seg_entry(sbi, segno)->cur_valid_map);
+        mutex_unlock(&sit_i->sentry_lock);
+        if (!valid)
+                return GC_NEXT;
+        return GC_OK;
+}
+/*
+ * This function compares node address got in summary with that in NAT.
+ * On validity, copy that node with cold status, otherwise (invalid node)
+ * ignore that.
+ *
+ * @sum:     summary entries of the victim segment
+ * @segno:   victim segment number
+ * @gc_type: BG_GC or FG_GC; FG_GC additionally syncs the node pages
+ *
+ * The summary entries are walked twice: the first pass only issues
+ * readahead for the node pages of valid blocks, the second pass marks
+ * those node pages dirty.
+ *
+ * Returns GC_DONE when the segment has been processed, GC_ERROR if
+ * check_valid_map() reports it, or GC_BLOCKED when a checkpoint is
+ * needed.  In the GC_BLOCKED case cp_mutex is held and
+ * block_operations() has been called when this function returns.
+ */
+static int gc_node_segment(struct f2fs_sb_info *sbi,
+                struct f2fs_summary *sum, unsigned int segno, int gc_type)
+{
+        bool initial = true;
+        struct f2fs_summary *entry;
+        int off;
+next_step:
+        entry = sum;
+        for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
+                nid_t nid = le32_to_cpu(entry->nid);
+                struct page *node_page;
+                int err;
+                /*
+                 * It makes sure that free segments are able to write
+                 * all the dirty node pages before CP after this CP.
+                 * So let's check the space of dirty node pages.
+                 */
+                if (should_do_checkpoint(sbi)) {
+                        mutex_lock(&sbi->cp_mutex);
+                        block_operations(sbi);
+                        return GC_BLOCKED;
+                }
+                err = check_valid_map(sbi, segno, off);
+                if (err == GC_ERROR)
+                        return err;
+                else if (err == GC_NEXT)
+                        continue;
+                /* first pass: readahead only */
+                if (initial) {
+                        ra_node_page(sbi, nid);
+                        continue;
+                }
+                node_page = get_node_page(sbi, nid);
+                if (IS_ERR(node_page))
+                        continue;
+                /* set page dirty and write it */
+                if (!PageWriteback(node_page))
+                        set_page_dirty(node_page);
+                f2fs_put_page(node_page, 1);
+                stat_inc_node_blk_count(sbi, 1);
+        }
+        if (initial) {
+                initial = false;
+                goto next_step;
+        }
+        if (gc_type == FG_GC) {
+                struct writeback_control wbc = {
+                        .sync_mode = WB_SYNC_ALL,
+                        .nr_to_write = LONG_MAX,
+                        .for_reclaim = 0,
+                };
+                sync_node_pages(sbi, 0, &wbc);
+        }
+        return GC_DONE;
+}
+/*
+ * Calculate the start block index that the node page at @node_ofs
+ * contains.  Offset 0 maps to index 0; every other offset is first
+ * converted into a node-block ordinal, which is then scaled by
+ * ADDRS_PER_BLOCK and shifted by ADDRS_PER_INODE.
+ */
+block_t start_bidx_of_node(unsigned int node_ofs)
+{
+        unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
+        unsigned int bidx;
+        int dec;
+        if (node_ofs == 0)
+                return 0;
+        if (node_ofs <= 2) {
+                bidx = node_ofs - 1;
+        } else if (node_ofs <= indirect_blks) {
+                dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
+                bidx = node_ofs - 2 - dec;
+        } else {
+                dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
+                bidx = node_ofs - 5 - dec;
+        }
+        return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE;
+}
+/*
+ * Check whether the data block at @blkaddr is still referenced by the
+ * node recorded in its summary entry @sum.
+ *
+ * Fills @dni with the node info of that node and, when the versions
+ * match, stores the node page's offset in @nofs.
+ * Returns GC_OK if the node still points at @blkaddr; GC_NEXT if the
+ * node page cannot be read, the version differs or the address differs.
+ */
+static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+                struct node_info *dni, block_t blkaddr, unsigned int *nofs)
+{
+        nid_t nid = le32_to_cpu(sum->nid);
+        unsigned int ofs_in_node = le16_to_cpu(sum->ofs_in_node);
+        struct page *node_page;
+        block_t source_blkaddr;
+        int ret = GC_NEXT;
+        node_page = get_node_page(sbi, nid);
+        if (IS_ERR(node_page))
+                return GC_NEXT;
+        get_node_info(sbi, nid, dni);
+        if (sum->version == dni->version) {
+                *nofs = ofs_of_node(node_page);
+                source_blkaddr = datablock_addr(node_page, ofs_in_node);
+                if (source_blkaddr == blkaddr)
+                        ret = GC_OK;
+        }
+        f2fs_put_page(node_page, 1);
+        return ret;
+}
+/*
+ * Move one data page of @inode on behalf of GC.
+ *
+ * Pages that no longer belong to @inode or that are under writeback are
+ * left alone.  For BG_GC the page is only marked dirty and cold; for any
+ * other type it is written out right away under the DATA_WRITE lock.
+ * The page is released with f2fs_put_page(page, 1) in every case.
+ */
+static void move_data_page(struct inode *inode, struct page *page, int gc_type)
+{
+        struct f2fs_sb_info *sbi;
+        if (page->mapping != inode->i_mapping ||
+                        inode != page->mapping->host ||
+                        PageWriteback(page))
+                goto out;
+        if (gc_type == BG_GC) {
+                set_page_dirty(page);
+                set_cold_data(page);
+                goto out;
+        }
+        sbi = F2FS_SB(inode->i_sb);
+        mutex_lock_op(sbi, DATA_WRITE);
+        /* a dirty directory page leaves the dirty dentry accounting */
+        if (clear_page_dirty_for_io(page) && S_ISDIR(inode->i_mode)) {
+                dec_page_count(sbi, F2FS_DIRTY_DENTS);
+                inode_dec_dirty_dents(inode);
+        }
+        set_cold_data(page);
+        do_write_data_page(page);
+        mutex_unlock_op(sbi, DATA_WRITE);
+        clear_cold_data(page);
+out:
+        f2fs_put_page(page, 1);
+}
+/*
+ * This function tries to get parent node of victim data block, and identifies
+ * data block validity. If the block is valid, copy that with cold status and
+ * modify parent node.
+ * If the parent node is not valid or the data block address is different,
+ * the victim data block is ignored.
+ *
+ * @sum:     summary entries of the victim segment
+ * @ilist:   list collecting the inodes whose data is being moved
+ * @segno:   victim segment number
+ * @gc_type: BG_GC or FG_GC
+ *
+ * The summary entries are walked four times (phase 0..3):
+ *   0: readahead the node page of each valid block
+ *   1: validate the block with check_dnode() and readahead the node
+ *      page of the owning inode
+ *   2: get the inode, look the data page up and put the inode on @ilist
+ *   3: lock the data page of each listed inode and move it
+ *
+ * Returns GC_DONE, GC_ERROR, or GC_BLOCKED when a checkpoint is needed;
+ * in the GC_BLOCKED case cp_mutex is held and block_operations() has
+ * been called on return.
+ */
+static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+                struct list_head *ilist, unsigned int segno, int gc_type)
+{
+        struct super_block *sb = sbi->sb;
+        struct f2fs_summary *entry;
+        block_t start_addr;
+        int err, off;
+        int phase = 0;
+        start_addr = START_BLOCK(sbi, segno);
+next_step:
+        entry = sum;
+        for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
+                struct page *data_page;
+                struct inode *inode;
+                struct node_info dni; /* dnode info for the data */
+                unsigned int ofs_in_node, nofs;
+                block_t start_bidx;
+                /*
+                 * It makes sure that free segments are able to write
+                 * all the dirty node pages before CP after this CP.
+                 * So let's check the space of dirty node pages.
+                 */
+                if (should_do_checkpoint(sbi)) {
+                        mutex_lock(&sbi->cp_mutex);
+                        block_operations(sbi);
+                        err = GC_BLOCKED;
+                        goto stop;
+                }
+                err = check_valid_map(sbi, segno, off);
+                if (err == GC_ERROR)
+                        goto stop;
+                else if (err == GC_NEXT)
+                        continue;
+                if (phase == 0) {
+                        ra_node_page(sbi, le32_to_cpu(entry->nid));
+                        continue;
+                }
+                /* Get an inode by ino with checking validity */
+                err = check_dnode(sbi, entry, &dni, start_addr + off, &nofs);
+                if (err == GC_ERROR)
+                        goto stop;
+                else if (err == GC_NEXT)
+                        continue;
+                if (phase == 1) {
+                        ra_node_page(sbi, dni.ino);
+                        continue;
+                }
+                start_bidx = start_bidx_of_node(nofs);
+                ofs_in_node = le16_to_cpu(entry->ofs_in_node);
+                if (phase == 2) {
+                        inode = f2fs_iget_nowait(sb, dni.ino);
+                        if (IS_ERR(inode))
+                                continue;
+                        data_page = find_data_page(inode,
+                                        start_bidx + ofs_in_node);
+                        if (IS_ERR(data_page))
+                                goto next_iput;
+                        f2fs_put_page(data_page, 0);
+                        /* the inode reference is handed over to the list */
+                        add_gc_inode(inode, ilist);
+                } else {
+                        inode = find_gc_inode(dni.ino, ilist);
+                        if (inode) {
+                                data_page = get_lock_data_page(inode,
+                                                start_bidx + ofs_in_node);
+                                if (IS_ERR(data_page))
+                                        continue;
+                                move_data_page(inode, data_page, gc_type);
+                                stat_inc_data_blk_count(sbi, 1);
+                        }
+                }
+                continue;
+next_iput:
+                /* data page lookup failed in phase 2: drop the reference */
+                iput(inode);
+        }
+        if (++phase < 4)
+                goto next_step;
+        err = GC_DONE;
+stop:
+        if (gc_type == FG_GC)
+                f2fs_submit_bio(sbi, DATA, true);
+        return err;
+}
+/*
+ * Ask the installed victim selection operation for an LFS victim while
+ * holding sentry_lock.  Returns what get_victim() returns.
+ */
+static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
+                                                int gc_type, int type)
+{
+        struct sit_info *sit_i = SIT_I(sbi);
+        const struct victim_selection *ops = DIRTY_I(sbi)->v_ops;
+        int found;
+        mutex_lock(&sit_i->sentry_lock);
+        found = ops->get_victim(sbi, victim, gc_type, type, LFS);
+        mutex_unlock(&sit_i->sentry_lock);
+        return found;
+}
+/*
+ * Collect one victim segment: read its summary block and hand the
+ * entries to the node or data collector according to the summary type.
+ *
+ * Returns GC_ERROR if the summary page cannot be read, otherwise the
+ * status of the collector (GC_DONE if the type matches neither).
+ */
+static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
+                                struct list_head *ilist, int gc_type)
+{
+        struct f2fs_summary_block *sum;
+        struct page *sum_page;
+        int status = GC_DONE;
+        int sum_type;
+        /* read segment summary of victim */
+        sum_page = get_sum_page(sbi, segno);
+        if (IS_ERR(sum_page))
+                return GC_ERROR;
+        /*
+         * A checkpoint needs to lock sum_page.  GC does not need the lock:
+         * this summary page is not going anywhere and is not updated
+         * before GC is done.
+         */
+        unlock_page(sum_page);
+        sum = page_address(sum_page);
+        sum_type = GET_SUM_TYPE((&sum->footer));
+        if (sum_type == SUM_TYPE_NODE)
+                status = gc_node_segment(sbi, sum->entries, segno, gc_type);
+        else if (sum_type == SUM_TYPE_DATA)
+                status = gc_data_segment(sbi, sum->entries, ilist, segno,
+                                gc_type);
+        stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)));
+        stat_inc_call_count(sbi->stat_info);
+        f2fs_put_page(sum_page, 0);
+        return status;
+}
+/*
+ * Run garbage collection until more than @nGC sections have been gained
+ * or no victim can be found.
+ *
+ * sbi->gc_mutex is unlocked before returning, so the caller must hold it
+ * on entry (gc_thread_func() takes it with mutex_trylock()).
+ *
+ * GC starts as BG_GC and switches to FG_GC as soon as
+ * has_not_enough_free_secs() is true.  When collection stops with too
+ * few free sections, or because a collector returned GC_BLOCKED, a
+ * checkpoint is written and GC is retried if at least one segment was
+ * collected in this round.
+ *
+ * Returns the status of the last do_garbage_collect() call of the final
+ * round, or GC_NONE if that round collected nothing.
+ */
+int f2fs_gc(struct f2fs_sb_info *sbi, int nGC)
+{
+        unsigned int segno;
+        int old_free_secs, cur_free_secs;
+        int gc_status, nfree;
+        struct list_head ilist;
+        int gc_type = BG_GC;
+        INIT_LIST_HEAD(&ilist);
+gc_more:
+        nfree = 0;
+        gc_status = GC_NONE;
+        /* baseline against which the gain of this round is measured */
+        if (has_not_enough_free_secs(sbi))
+                old_free_secs = reserved_sections(sbi);
+        else
+                old_free_secs = free_sections(sbi);
+        while (sbi->sb->s_flags & MS_ACTIVE) {
+                int i;
+                if (has_not_enough_free_secs(sbi))
+                        gc_type = FG_GC;
+                /* nfree counts segments collected so far in this round */
+                cur_free_secs = free_sections(sbi) + nfree;
+                /* We got free space successfully. */
+                if (nGC < cur_free_secs - old_free_secs)
+                        break;
+                if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
+                        break;
+                for (i = 0; i < sbi->segs_per_sec; i++) {
+                        /*
+                         * do_garbage_collect will give us three gc_status:
+                         * GC_ERROR, GC_DONE, and GC_BLOCKED.
+                         * If GC is finished uncleanly, we have to return
+                         * the victim to dirty segment list.
+                         */
+                        gc_status = do_garbage_collect(sbi, segno + i,
+                                        &ilist, gc_type);
+                        if (gc_status != GC_DONE)
+                                goto stop;
+                        nfree++;
+                }
+        }
+stop:
+        if (has_not_enough_free_secs(sbi) || gc_status == GC_BLOCKED) {
+                /* the second argument tells whether GC already blocked ops */
+                write_checkpoint(sbi, (gc_status == GC_BLOCKED), false);
+                if (nfree)
+                        goto gc_more;
+        }
+        mutex_unlock(&sbi->gc_mutex);
+        /* drop the inode references collected by gc_data_segment() */
+        put_gc_inode(&ilist);
+        BUG_ON(!list_empty(&ilist));
+        return gc_status;
+}
+/* Install the default victim selection operations for @sbi. */
+void build_gc_manager(struct f2fs_sb_info *sbi)
+{
+        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+        dirty_i->v_ops = &default_v_ops;
+}
+/*
+ * Create the slab cache used for GC inode list entries.
+ * Returns 0 on success or -ENOMEM if the cache cannot be created.
+ */
+int create_gc_caches(void)
+{
+        winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes",
+                        sizeof(struct inode_entry), NULL);
+        return winode_slab ? 0 : -ENOMEM;
+}
+/* Destroy the slab cache created by create_gc_caches(). */
+void destroy_gc_caches(void)
+{
+        kmem_cache_destroy(winode_slab);
+}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
new file mode 100644
index 000000000000..b026d9354ccd
--- /dev/null
+++ b/fs/f2fs/gc.h
@@ -0,0 +1,117 @@
+/*
+ * fs/f2fs/gc.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+/* name passed to kthread_run() for the background GC thread */
+#define GC_THREAD_NAME  "f2fs_gc_task"
+#define GC_THREAD_MIN_WB_PAGES          1       /*
+                                                 * a threshold to determine
+                                                 * whether IO subsystem is idle
+                                                 * or not
+                                                 */
+/*
+ * Sleep times of the GC thread.  All three are used as values of the
+ * thread's wait_ms, which is converted with msecs_to_jiffies().
+ * MIN is the initial value and the step of increase/decrease_sleep_time(),
+ * MAX is the upper bound, NOGC is used after f2fs_gc() returned GC_NONE.
+ */
+#define GC_THREAD_MIN_SLEEP_TIME        10000 /* milliseconds */
+#define GC_THREAD_MAX_SLEEP_TIME        30000
+#define GC_THREAD_NOGC_SLEEP_TIME       10000
+#define LIMIT_INVALID_BLOCK     40 /* percentage over total user space */
+#define LIMIT_FREE_BLOCK        40 /* percentage over invalid + free space */
+/* Search max. number of dirty segments to select a victim segment */
+#define MAX_VICTIM_SEARCH       20
+/* status codes exchanged between the GC routines */
+enum {
+        GC_NONE = 0,    /* initial status in f2fs_gc(): nothing collected */
+        GC_ERROR,       /* e.g. the victim's summary page cannot be read */
+        GC_OK,          /* the checked block is valid: go on with it */
+        GC_NEXT,        /* the checked block is skipped: take the next one */
+        GC_BLOCKED,     /* stopped because should_do_checkpoint() was true */
+        GC_DONE,        /* the whole segment has been processed */
+};
+/* descriptor of the background GC thread, kept in sbi->gc_thread */
+struct f2fs_gc_kthread {
+        struct task_struct *f2fs_gc_task;       /* task from kthread_run() */
+        wait_queue_head_t gc_wait_queue_head;   /* the thread sleeps here */
+};
+/* one element of the list of inodes whose data blocks GC is moving */
+struct inode_entry {
+        struct list_head list;  /* link in the GC inode list */
+        struct inode *inode;    /* reference dropped by put_gc_inode() */
+};
+/*
+ * inline functions
+ */
+/*
+ * Number of blocks in the free segments beyond the overprovision
+ * segments, or 0 if there are fewer free than overprovision segments.
+ */
+static inline block_t free_user_blocks(struct f2fs_sb_info *sbi)
+{
+        if (free_segments(sbi) >= overprovision_segments(sbi))
+                return (free_segments(sbi) - overprovision_segments(sbi))
+                        << sbi->log_blocks_per_seg;
+        return 0;
+}
+/*
+ * LIMIT_INVALID_BLOCK percent of the user block count.
+ * Note that the cast applies to the product only; the division by 100
+ * is then carried out in long.
+ */
+static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi)
+{
+        return (long)(sbi->user_block_count * LIMIT_INVALID_BLOCK) / 100;
+}
+/*
+ * LIMIT_FREE_BLOCK percent of the blocks that are not written, i.e. of
+ * user_block_count minus written_block_count().
+ * The cast applies to the product only; the division is done in long.
+ */
+static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
+{
+        block_t reclaimable_user_blocks = sbi->user_block_count -
+                written_block_count(sbi);
+        return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100;
+}
+/*
+ * Lengthen the sleep time by GC_THREAD_MIN_SLEEP_TIME, capped at
+ * GC_THREAD_MAX_SLEEP_TIME.
+ */
+static inline long increase_sleep_time(long wait)
+{
+        long longer = wait + GC_THREAD_MIN_SLEEP_TIME;
+        if (longer > GC_THREAD_MAX_SLEEP_TIME)
+                return GC_THREAD_MAX_SLEEP_TIME;
+        return longer;
+}
+/*
+ * Shorten the sleep time by GC_THREAD_MIN_SLEEP_TIME, but never below
+ * GC_THREAD_MIN_SLEEP_TIME.
+ */
+static inline long decrease_sleep_time(long wait)
+{
+        long shorter = wait - GC_THREAD_MIN_SLEEP_TIME;
+        if (shorter > GC_THREAD_MIN_SLEEP_TIME)
+                return shorter;
+        return GC_THREAD_MIN_SLEEP_TIME;
+}
+/*
+ * Background GC is triggered with the following condition.
+ * 1. There are a number of invalid blocks.
+ * 2. There is not enough free space.
+ * Returns true only when both hold.
+ */
+static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
+{
+        block_t invalid_user_blocks = sbi->user_block_count -
+                                        written_block_count(sbi);
+        return invalid_user_blocks > limit_invalid_user_blocks(sbi) &&
+                        free_user_blocks(sbi) < limit_free_user_blocks(sbi);
+}
+/*
+ * Returns non-zero when the root request list of the backing device's
+ * queue counts neither sync nor async requests.
+ */
+static inline int is_idle(struct f2fs_sb_info *sbi)
+{
+        struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
+        struct request_list *rl = &q->root_rl;
+        return rl->count[BLK_RW_SYNC] == 0 && rl->count[BLK_RW_ASYNC] == 0;
+}
+/*
+ * Decide whether GC has to stop for a checkpoint.
+ *
+ * The dirty node and dirty dentry page counts are each converted into a
+ * number of sections; the result is true when the free sections do not
+ * exceed node_secs + 2 * dent_secs + 2.
+ */
+static inline bool should_do_checkpoint(struct f2fs_sb_info *sbi)
+{
+        unsigned int pages_per_sec = sbi->segs_per_sec *
+                                        (1 << sbi->log_blocks_per_seg);
+        /* pages -> segments (shift) -> sections (division) */
+        int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1)
+                        >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
+        int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1)
+                        >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
+        return free_sections(sbi) <= (node_secs + 2 * dent_secs + 2);
+}
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
new file mode 100644
index 000000000000..a60f04200f8b
--- /dev/null
+++ b/fs/f2fs/hash.c
@@ -0,0 +1,97 @@
+/*
+ * fs/f2fs/hash.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext3/hash.c
+ *
+ * Copyright (C) 2002 by Theodore Ts'o
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/cryptohash.h>
+#include <linux/pagemap.h>
+#include "f2fs.h"
+/*
+ * Hashing code copied from ext3
+ */
+#define DELTA 0x9E3779B9
+/*
+ * Mix the four key words of @in into buf[0] and buf[1] with 16 TEA
+ * rounds.  buf[2] and buf[3] are left untouched.
+ */
+static void TEA_transform(unsigned int buf[4], unsigned int const in[])
+{
+        const __u32 k0 = in[0], k1 = in[1], k2 = in[2], k3 = in[3];
+        __u32 x = buf[0], y = buf[1];
+        __u32 sum = 0;
+        int round;
+        for (round = 0; round < 16; round++) {
+                sum += DELTA;
+                x += ((y << 4) + k0) ^ (y + sum) ^ ((y >> 5) + k1);
+                y += ((x << 4) + k2) ^ (x + sum) ^ ((x >> 5) + k3);
+        }
+        buf[0] += x;
+        buf[1] += y;
+}
+/*
+ * Pack up to num * 4 bytes of @msg into @num 32-bit words of @buf.
+ *
+ * @msg: input bytes (need not be NUL terminated)
+ * @len: number of bytes available in @msg
+ * @buf: output array with room for @num words
+ * @num: number of words to produce
+ *
+ * Each word starts from a pad value built from the original @len and has
+ * four input bytes shifted in; a final partial word and all remaining
+ * words are filled from the pad, so exactly @num words are written.
+ */
+static void str2hashbuf(const char *msg, int len, unsigned int *buf, int num)
+{
+        unsigned pad, val;
+        int i;
+        /* pad: the low byte of len repeated in all four byte positions */
+        pad = (__u32)len | ((__u32)len << 8);
+        pad |= pad << 16;
+        val = pad;
+        if (len > num * 4)
+                len = num * 4;
+        for (i = 0; i < len; i++) {
+                if ((i % 4) == 0)
+                        val = pad;
+                /*
+                 * NOTE(review): msg[i] is a plain char, so bytes >= 0x80
+                 * contribute differently depending on whether char is
+                 * signed on the target.
+                 */
+                val = msg[i] + (val << 8);
+                if ((i % 4) == 3) {
+                        *buf++ = val;
+                        val = pad;
+                        num--;
+                }
+        }
+        /* flush the partial (or pad-only) word, then pad the rest */
+        if (--num >= 0)
+                *buf++ = val;
+        while (--num >= 0)
+                *buf++ = pad;
+}
+/*
+ * Compute the f2fs directory entry hash of @name.
+ *
+ * @name: file name bytes (need not be NUL terminated)
+ * @len:  length of @name in bytes
+ *
+ * The name is consumed in chunks of 16 bytes; each chunk is packed by
+ * str2hashbuf() and mixed into the state by TEA_transform().  The result
+ * is the first state word with F2FS_HASH_COL_BIT cleared, converted with
+ * cpu_to_le32().  For @len <= 0 no chunk is processed and the result is
+ * derived from the initial seed alone.
+ */
+f2fs_hash_t f2fs_dentry_hash(const char *name, int len)
+{
+        const char *p = name;
+        __u32 in[8], buf[4];
+        /* Initialize the default seed for the hash checksum functions */
+        buf[0] = 0x67452301;
+        buf[1] = 0xefcdab89;
+        buf[2] = 0x98badcfe;
+        buf[3] = 0x10325476;
+        while (len > 0) {
+                /* only the first four words of in[] are filled and used */
+                str2hashbuf(p, len, in, 4);
+                TEA_transform(buf, in);
+                len -= 16;
+                p += 16;
+        }
+        return cpu_to_le32(buf[0] & ~F2FS_HASH_COL_BIT);
+}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
new file mode 100644
index 000000000000..df5fb381ebf1
--- /dev/null
+++ b/fs/f2fs/inode.c
@@ -0,0 +1,268 @@
+/*
+ * fs/f2fs/inode.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include "f2fs.h"
+#include "node.h"
+/* Lookup arguments passed to f2fs_iget_test() through ilookup5(). */
+struct f2fs_iget_args {
+        u64 ino;        /* inode number being looked up */
+        int on_free;    /* set to 1 by f2fs_iget_test() if the match is being freed */
+};
+/*
+ * Mirror the FS_*_FL bits kept in the f2fs inode info into the matching
+ * S_* bits of the VFS inode.  All five S_* bits are cleared first.
+ */
+void f2fs_set_inode_flags(struct inode *inode)
+{
+        static const struct {
+                unsigned int fs_flag;
+                unsigned int vfs_flag;
+        } flag_map[] = {
+                { FS_SYNC_FL,           S_SYNC },
+                { FS_APPEND_FL,         S_APPEND },
+                { FS_IMMUTABLE_FL,      S_IMMUTABLE },
+                { FS_NOATIME_FL,        S_NOATIME },
+                { FS_DIRSYNC_FL,        S_DIRSYNC },
+        };
+        unsigned int fi_flags = F2FS_I(inode)->i_flags;
+        unsigned int i;
+        inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE |
+                        S_NOATIME | S_DIRSYNC);
+        for (i = 0; i < sizeof(flag_map) / sizeof(flag_map[0]); i++) {
+                if (fi_flags & flag_map[i].fs_flag)
+                        inode->i_flags |= flag_map[i].vfs_flag;
+        }
+}
+/*
+ * ilookup5() match callback.  Returns 1 for an inode whose number equals
+ * args->ino and which is not being freed.  A number match on an inode that
+ * is being freed is rejected, but recorded in args->on_free.
+ */
+static int f2fs_iget_test(struct inode *inode, void *data)
+{
+        struct f2fs_iget_args *args = data;
+        int being_freed;
+        if (inode->i_ino != args->ino)
+                return 0;
+        being_freed = (inode->i_state & (I_FREEING | I_WILL_FREE)) != 0;
+        if (being_freed)
+                args->on_free = 1;
+        return !being_freed;
+}
+/*
+ * Look up inode ino, returning a cached instance when ilookup5() finds one.
+ * If the only match is an inode that is being freed, fail with -ENOENT;
+ * otherwise fall back to f2fs_iget().
+ */
+struct inode *f2fs_iget_nowait(struct super_block *sb, unsigned long ino)
+{
+        struct f2fs_iget_args args = { .ino = ino, .on_free = 0 };
+        struct inode *cached;
+        cached = ilookup5(sb, ino, f2fs_iget_test, &args);
+        if (cached)
+                return cached;
+        if (args.on_free)
+                return ERR_PTR(-ENOENT);
+        return f2fs_iget(sb, ino);
+}
+/*
+ * Populate the in-core inode from the raw inode stored in its node page.
+ * Returns 0 on success or the error reported by get_node_page().
+ */
+static int do_read_inode(struct inode *inode)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        struct f2fs_inode_info *fi = F2FS_I(inode);
+        struct f2fs_inode *raw;
+        struct page *ipage;
+        /* Check if ino is within scope */
+        check_nid_range(sbi, inode->i_ino);
+        ipage = get_node_page(sbi, inode->i_ino);
+        if (IS_ERR(ipage))
+                return PTR_ERR(ipage);
+        raw = &((struct f2fs_node *)page_address(ipage))->i;
+        /* ownership, mode, link count and size */
+        inode->i_mode = le16_to_cpu(raw->i_mode);
+        i_uid_write(inode, le32_to_cpu(raw->i_uid));
+        i_gid_write(inode, le32_to_cpu(raw->i_gid));
+        set_nlink(inode, le32_to_cpu(raw->i_links));
+        inode->i_size = le64_to_cpu(raw->i_size);
+        inode->i_blocks = le64_to_cpu(raw->i_blocks);
+        /* timestamps: seconds and nanoseconds are stored separately */
+        inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime);
+        inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec);
+        inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
+        inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec);
+        inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
+        inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
+        inode->i_generation = le32_to_cpu(raw->i_generation);
+        /* f2fs-private state */
+        fi->i_current_depth = le32_to_cpu(raw->i_current_depth);
+        fi->i_xattr_nid = le32_to_cpu(raw->i_xattr_nid);
+        fi->i_flags = le32_to_cpu(raw->i_flags);
+        fi->flags = 0;
+        fi->data_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver) - 1;
+        fi->i_advise = raw->i_advise;
+        fi->i_pino = le32_to_cpu(raw->i_pino);
+        get_extent_info(&fi->ext, raw->i_ext);
+        f2fs_put_page(ipage, 1);
+        return 0;
+}
+/*
+ * Return the in-core inode for ino.  An inode that iget_locked() hands
+ * back without I_NEW is returned as is.  A new one is filled in by
+ * do_read_inode() (skipped for the node and meta inodes) and gets the
+ * operation tables that match its type.
+ *
+ * Returns an ERR_PTR on failure: -ENOMEM if no inode could be obtained,
+ * -ENOENT for an inode with zero links while sbi->por_doing is clear,
+ * -EIO for an unrecognised file type, or the error from do_read_inode().
+ */
+struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(sb);
+        struct inode *inode = iget_locked(sb, ino);
+        int is_node, is_meta;
+        int err;
+        if (!inode)
+                return ERR_PTR(-ENOMEM);
+        if (!(inode->i_state & I_NEW))
+                return inode;
+        is_node = (ino == F2FS_NODE_INO(sbi));
+        is_meta = (ino == F2FS_META_INO(sbi));
+        if (!is_node && !is_meta) {
+                err = do_read_inode(inode);
+                if (err)
+                        goto fail;
+                if (!sbi->por_doing && inode->i_nlink == 0) {
+                        err = -ENOENT;
+                        goto fail;
+                }
+        }
+        if (is_node) {
+                inode->i_mapping->a_ops = &f2fs_node_aops;
+                mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
+        } else if (is_meta) {
+                inode->i_mapping->a_ops = &f2fs_meta_aops;
+                mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
+        } else if (S_ISREG(inode->i_mode)) {
+                inode->i_op = &f2fs_file_inode_operations;
+                inode->i_fop = &f2fs_file_operations;
+                inode->i_mapping->a_ops = &f2fs_dblock_aops;
+        } else if (S_ISDIR(inode->i_mode)) {
+                inode->i_op = &f2fs_dir_inode_operations;
+                inode->i_fop = &f2fs_dir_operations;
+                inode->i_mapping->a_ops = &f2fs_dblock_aops;
+                mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE |
+                                __GFP_ZERO);
+        } else if (S_ISLNK(inode->i_mode)) {
+                inode->i_op = &f2fs_symlink_inode_operations;
+                inode->i_mapping->a_ops = &f2fs_dblock_aops;
+        } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+                        S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
+                inode->i_op = &f2fs_special_inode_operations;
+                init_special_inode(inode, inode->i_mode, inode->i_rdev);
+        } else {
+                err = -EIO;
+                goto fail;
+        }
+        unlock_new_inode(inode);
+        return inode;
+fail:
+        iget_failed(inode);
+        return ERR_PTR(err);
+}
+/*
+ * Copy the in-core inode fields into the raw inode held in node_page and
+ * mark the page dirty.  Waits for writeback of node_page to finish before
+ * touching its contents.
+ */
+void update_inode(struct inode *inode, struct page *node_page)
+{
+        struct f2fs_inode_info *fi = F2FS_I(inode);
+        struct f2fs_inode *raw;
+        wait_on_page_writeback(node_page);
+        raw = &((struct f2fs_node *)page_address(node_page))->i;
+        /* ownership, mode, link count and size */
+        raw->i_mode = cpu_to_le16(inode->i_mode);
+        raw->i_advise = fi->i_advise;
+        raw->i_uid = cpu_to_le32(i_uid_read(inode));
+        raw->i_gid = cpu_to_le32(i_gid_read(inode));
+        raw->i_links = cpu_to_le32(inode->i_nlink);
+        raw->i_size = cpu_to_le64(i_size_read(inode));
+        raw->i_blocks = cpu_to_le64(inode->i_blocks);
+        set_raw_extent(&fi->ext, &raw->i_ext);
+        /* timestamps: seconds and nanoseconds are stored separately */
+        raw->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
+        raw->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+        raw->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+        raw->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+        raw->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
+        raw->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+        /* f2fs-private state */
+        raw->i_current_depth = cpu_to_le32(fi->i_current_depth);
+        raw->i_xattr_nid = cpu_to_le32(fi->i_xattr_nid);
+        raw->i_flags = cpu_to_le32(fi->i_flags);
+        raw->i_pino = cpu_to_le32(fi->i_pino);
+        raw->i_generation = cpu_to_le32(inode->i_generation);
+        set_page_dirty(node_page);
+}
+/*
+ * Copy the in-core fields of inode into its node page via update_inode().
+ * The node and meta inodes are skipped and report success.  wbc is not
+ * used.  Returns 0 on success or the error from get_node_page().
+ */
+int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        struct page *node_page;
+        bool need_lock = false;
+        if (inode->i_ino == F2FS_NODE_INO(sbi) ||
+                        inode->i_ino == F2FS_META_INO(sbi))
+                return 0;
+        node_page = get_node_page(sbi, inode->i_ino);
+        if (IS_ERR(node_page))
+                return PTR_ERR(node_page);
+        /*
+         * A node page that is not dirty yet is released and looked up again
+         * with sbi->write_inode held; the mutex then stays held until the
+         * update below is finished.  An already dirty page is updated
+         * without taking the mutex.
+         */
+        if (!PageDirty(node_page)) {
+                need_lock = true;
+                f2fs_put_page(node_page, 1);
+                mutex_lock(&sbi->write_inode);
+                node_page = get_node_page(sbi, inode->i_ino);
+                if (IS_ERR(node_page)) {
+                        mutex_unlock(&sbi->write_inode);
+                        return PTR_ERR(node_page);
+                }
+        }
+        update_inode(inode, node_page);
+        f2fs_put_page(node_page, 1);
+        if (need_lock)
+                mutex_unlock(&sbi->write_inode);
+        return 0;
+}
+/*
+ * Tear down an in-core inode.  Its cached pages are always truncated.  The
+ * on-disk inode is removed only when the link count is zero and the inode
+ * is not marked bad; the node and meta inodes are never deleted here.
+ */
+void f2fs_evict_inode(struct inode *inode)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        truncate_inode_pages(&inode->i_data, 0);
+        if (inode->i_ino != F2FS_NODE_INO(sbi) &&
+                        inode->i_ino != F2FS_META_INO(sbi)) {
+                BUG_ON(atomic_read(&F2FS_I(inode)->dirty_dents));
+                remove_dirty_dir_inode(inode);
+                if (!inode->i_nlink && !is_bad_inode(inode)) {
+                        /* unlinked: release its blocks and its node page */
+                        set_inode_flag(F2FS_I(inode), FI_NO_ALLOC);
+                        i_size_write(inode, 0);
+                        if (F2FS_HAS_BLOCKS(inode))
+                                f2fs_truncate(inode);
+                        remove_inode_page(inode);
+                }
+        }
+        clear_inode(inode);
+}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
new file mode 100644
index 000000000000..89b7675dc377
--- /dev/null
+++ b/fs/f2fs/namei.c
@@ -0,0 +1,503 @@
+/*
+ * fs/f2fs/namei.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include <linux/ctype.h>
+#include "f2fs.h"
+#include "xattr.h"
+#include "acl.h"
+/*
+ * Allocate and initialise a new in-core inode for a child of dir.
+ *
+ * A fresh nid is reserved with alloc_nid() and used as the inode number.
+ * Ownership follows the usual rules visible below: uid is the caller's
+ * fsuid; gid is inherited from a setgid parent (and S_ISGID is propagated
+ * to new directories), otherwise it is the caller's fsgid.
+ *
+ * On success the inode is hashed, still locked as new, and marked dirty;
+ * the caller is expected to finish with alloc_nid_done() or
+ * alloc_nid_failed() as the other creators in this file do.
+ *
+ * Returns the inode, or ERR_PTR(-ENOMEM / -ENOSPC / -EINVAL).
+ */
+static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
+{
+        struct super_block *sb = dir->i_sb;
+        struct f2fs_sb_info *sbi = F2FS_SB(sb);
+        nid_t ino;
+        struct inode *inode;
+        bool nid_free = false;
+        int err;
+        inode = new_inode(sb);
+        if (!inode)
+                return ERR_PTR(-ENOMEM);
+        mutex_lock_op(sbi, NODE_NEW);
+        if (!alloc_nid(sbi, &ino)) {
+                mutex_unlock_op(sbi, NODE_NEW);
+                err = -ENOSPC;
+                goto fail;
+        }
+        mutex_unlock_op(sbi, NODE_NEW);
+        inode->i_uid = current_fsuid();
+        if (dir->i_mode & S_ISGID) {
+                inode->i_gid = dir->i_gid;
+                if (S_ISDIR(mode))
+                        mode |= S_ISGID;
+        } else {
+                inode->i_gid = current_fsgid();
+        }
+        inode->i_ino = ino;
+        inode->i_mode = mode;
+        inode->i_blocks = 0;
+        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+        inode->i_generation = sbi->s_next_generation++;
+        err = insert_inode_locked(inode);
+        if (err) {
+                /*
+                 * The inode was not hashed and I_NEW was never set, so it
+                 * must not be passed to unlock_new_inode().  The reserved
+                 * nid still has to be given back.
+                 */
+                err = -EINVAL;
+                nid_free = true;
+                goto fail;
+        }
+        mark_inode_dirty(inode);
+        return inode;
+fail:
+        /*
+         * Mark the inode bad so that f2fs_evict_inode() only clears it and
+         * does not try to remove a node page for an inode number this
+         * inode never owned on disk.
+         */
+        make_bad_inode(inode);
+        iput(inode);
+        if (nid_free)
+                alloc_nid_failed(sbi, ino);
+        return ERR_PTR(err);
+}
+/*
+ * Test whether the name s ends with the extension sub, ignoring case.
+ *
+ * Returns 0 when the tail of s matches sub and non-zero otherwise (the
+ * memcmp()-like sense that callers test with '!').  An empty sub never
+ * matches, so a blank extension slot cannot tag every file.
+ *
+ * The comparison is done byte by byte on the two strings themselves; no
+ * fixed-size scratch copy of sub is made, so its length is not limited by
+ * a local buffer.
+ */
+static int is_multimedia_file(const unsigned char *s, const char *sub)
+{
+        size_t slen = strlen((const char *)s);
+        size_t sublen = strlen(sub);
+        const unsigned char *tail;
+        size_t i;
+        if (sublen == 0 || sublen > slen)
+                return 1;
+        /* compare the last sublen bytes of the name with the extension */
+        tail = s + slen - sublen;
+        for (i = 0; i < sublen; i++) {
+                if (tolower(tail[i]) != tolower((unsigned char)sub[i]))
+                        return 1;
+        }
+        return 0;
+}
+/*
+ * Hot/cold data separation: set FADVISE_COLD_BIT on the inode when its
+ * name matches one of the extensions listed in the superblock.
+ */
+static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode,
+                const unsigned char *name)
+{
+        __u8 (*ext)[8] = sbi->raw_super->extension_list;
+        int nr_ext = le32_to_cpu(sbi->raw_super->extension_count);
+        int idx = 0;
+        while (idx < nr_ext) {
+                /* is_multimedia_file() returns 0 on a match */
+                if (is_multimedia_file(name, ext[idx]) == 0) {
+                        F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT;
+                        return;
+                }
+                idx++;
+        }
+}
+/*
+ * Create a regular file named by dentry in dir.  excl is not used.
+ * Returns 0 on success, or the error from f2fs_new_inode() or
+ * f2fs_add_link(); on a link failure the new inode is dropped and its nid
+ * is given back.
+ */
+static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+                                                bool excl)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+        struct inode *inode = f2fs_new_inode(dir, mode);
+        nid_t ino;
+        int err;
+        if (IS_ERR(inode))
+                return PTR_ERR(inode);
+        if (!test_opt(sbi, DISABLE_EXT_IDENTIFY))
+                set_cold_file(sbi, inode, dentry->d_name.name);
+        inode->i_op = &f2fs_file_inode_operations;
+        inode->i_fop = &f2fs_file_operations;
+        inode->i_mapping->a_ops = &f2fs_dblock_aops;
+        /* keep the nid: it is still needed after the inode is released */
+        ino = inode->i_ino;
+        err = f2fs_add_link(dentry, inode);
+        if (err) {
+                clear_nlink(inode);
+                unlock_new_inode(inode);
+                iput(inode);
+                alloc_nid_failed(sbi, ino);
+                return err;
+        }
+        alloc_nid_done(sbi, ino);
+        if (!sbi->por_doing)
+                d_instantiate(dentry, inode);
+        unlock_new_inode(inode);
+        f2fs_balance_fs(sbi);
+        return 0;
+}
+/*
+ * Create a hard link: add a directory entry for dentry in dir that refers
+ * to the inode behind old_dentry.
+ *
+ * An extra inode reference is taken for the new dentry and FI_INC_LINK is
+ * set around f2fs_add_link().  On failure the flag is cleared again and
+ * the reference is dropped.  Returns 0 or the error from f2fs_add_link().
+ */
+static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
+                struct dentry *dentry)
+{
+        struct inode *inode = old_dentry->d_inode;
+        struct super_block *sb = dir->i_sb;
+        struct f2fs_sb_info *sbi = F2FS_SB(sb);
+        int err;
+        inode->i_ctime = CURRENT_TIME;
+        /* take the reference through the VFS helper, not by poking i_count */
+        ihold(inode);
+        set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+        err = f2fs_add_link(dentry, inode);
+        if (err)
+                goto out;
+        d_instantiate(dentry, inode);
+        f2fs_balance_fs(sbi);
+        return 0;
+out:
+        clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+        iput(inode);
+        return err;
+}
+/*
+ * Find the parent of child by looking up ".." in it.  Returns a dentry
+ * from d_obtain_alias(), or ERR_PTR(-ENOENT) when ".." resolves to inode
+ * number 0.
+ */
+struct dentry *f2fs_get_parent(struct dentry *child)
+{
+        struct inode *child_inode = child->d_inode;
+        struct qstr dotdot = QSTR_INIT("..", 2);
+        unsigned long parent_ino;
+        struct inode *parent;
+        parent_ino = f2fs_inode_by_name(child_inode, &dotdot);
+        if (parent_ino == 0)
+                return ERR_PTR(-ENOENT);
+        parent = f2fs_iget(child_inode->i_sb, parent_ino);
+        return d_obtain_alias(parent);
+}
+/*
+ * Look up dentry's name in dir.  flags is not used.  A missing entry is
+ * spliced with a NULL inode.  Returns ERR_PTR(-ENAMETOOLONG) for names
+ * longer than F2FS_MAX_NAME_LEN, or the error from f2fs_iget().
+ */
+static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
+                unsigned int flags)
+{
+        struct f2fs_dir_entry *de;
+        struct inode *inode;
+        struct page *page;
+        nid_t ino;
+        if (dentry->d_name.len > F2FS_MAX_NAME_LEN)
+                return ERR_PTR(-ENAMETOOLONG);
+        de = f2fs_find_entry(dir, &dentry->d_name, &page);
+        if (!de)
+                return d_splice_alias(NULL, dentry);
+        /* copy the inode number out before the entry's page is released */
+        ino = le32_to_cpu(de->ino);
+        kunmap(page);
+        f2fs_put_page(page, 0);
+        inode = f2fs_iget(dir->i_sb, ino);
+        if (IS_ERR(inode))
+                return ERR_CAST(inode);
+        return d_splice_alias(inode, dentry);
+}
+/*
+ * Remove the directory entry for dentry from dir.  Returns 0 on success,
+ * -ENOENT when the entry is not found, or the error from
+ * check_orphan_space().
+ */
+static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+        struct inode *inode = dentry->d_inode;
+        struct f2fs_dir_entry *de;
+        struct page *page;
+        int err;
+        de = f2fs_find_entry(dir, &dentry->d_name, &page);
+        if (!de)
+                return -ENOENT;
+        err = check_orphan_space(sbi);
+        if (err) {
+                kunmap(page);
+                f2fs_put_page(page, 0);
+                return err;
+        }
+        f2fs_delete_entry(de, page, inode);
+        /* In order to evict this inode,  we set it dirty */
+        mark_inode_dirty(inode);
+        f2fs_balance_fs(sbi);
+        return 0;
+}
+/*
+ * Create a symbolic link named by dentry in dir that points at symname.
+ *
+ * Returns 0 on success, the error from f2fs_new_inode() or
+ * f2fs_add_link(), or the result of page_symlink().
+ *
+ * NOTE(review): a page_symlink() failure is returned to the caller, but
+ * the dentry is still instantiated and the nid is committed - confirm
+ * that this is intended.
+ */
+static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
+                                        const char *symname)
+{
+        struct super_block *sb = dir->i_sb;
+        struct f2fs_sb_info *sbi = F2FS_SB(sb);
+        struct inode *inode;
+        unsigned symlen = strlen(symname) + 1;
+        nid_t ino;
+        int err;
+        inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+        if (IS_ERR(inode))
+                return PTR_ERR(inode);
+        inode->i_op = &f2fs_symlink_inode_operations;
+        inode->i_mapping->a_ops = &f2fs_dblock_aops;
+        /* save the nid: the inode must not be touched after iput() */
+        ino = inode->i_ino;
+        err = f2fs_add_link(dentry, inode);
+        if (err)
+                goto out;
+        err = page_symlink(inode, symname, symlen);
+        alloc_nid_done(sbi, ino);
+        d_instantiate(dentry, inode);
+        unlock_new_inode(inode);
+        f2fs_balance_fs(sbi);
+        return err;
+out:
+        clear_nlink(inode);
+        unlock_new_inode(inode);
+        iput(inode);
+        alloc_nid_failed(sbi, ino);
+        return err;
+}
+/*
+ * Create a directory named by dentry in dir.  FI_INC_LINK is set on the
+ * new inode around f2fs_add_link() and cleared again if linking fails.
+ * Returns 0 on success or the error from f2fs_new_inode() or
+ * f2fs_add_link().
+ */
+static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+        struct inode *inode;
+        nid_t ino;
+        int err;
+        inode = f2fs_new_inode(dir, S_IFDIR | mode);
+        if (IS_ERR(inode))
+                return PTR_ERR(inode);
+        inode->i_op = &f2fs_dir_inode_operations;
+        inode->i_fop = &f2fs_dir_operations;
+        inode->i_mapping->a_ops = &f2fs_dblock_aops;
+        mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
+        /* save the nid: the inode must not be touched after iput() */
+        ino = inode->i_ino;
+        set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+        err = f2fs_add_link(dentry, inode);
+        if (err)
+                goto out_fail;
+        alloc_nid_done(sbi, ino);
+        d_instantiate(dentry, inode);
+        unlock_new_inode(inode);
+        f2fs_balance_fs(sbi);
+        return 0;
+out_fail:
+        clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+        clear_nlink(inode);
+        unlock_new_inode(inode);
+        iput(inode);
+        alloc_nid_failed(sbi, ino);
+        return err;
+}
+/*
+ * Remove the directory behind dentry.  Only empty directories can be
+ * removed; the removal itself is delegated to f2fs_unlink().
+ */
+static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+        if (!f2fs_empty_dir(dentry->d_inode))
+                return -ENOTEMPTY;
+        return f2fs_unlink(dir, dentry);
+}
+/*
+ * Create a special file (as set up by init_special_inode()) named by
+ * dentry in dir.  Returns 0 on success, -EINVAL when rdev fails
+ * new_valid_dev(), or the error from f2fs_new_inode() or f2fs_add_link().
+ */
+static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
+                                umode_t mode, dev_t rdev)
+{
+        struct super_block *sb = dir->i_sb;
+        struct f2fs_sb_info *sbi = F2FS_SB(sb);
+        struct inode *inode;
+        nid_t ino;
+        int err = 0;
+        if (!new_valid_dev(rdev))
+                return -EINVAL;
+        inode = f2fs_new_inode(dir, mode);
+        if (IS_ERR(inode))
+                return PTR_ERR(inode);
+        init_special_inode(inode, inode->i_mode, rdev);
+        inode->i_op = &f2fs_special_inode_operations;
+        /* save the nid: the inode must not be touched after iput() */
+        ino = inode->i_ino;
+        err = f2fs_add_link(dentry, inode);
+        if (err)
+                goto out;
+        alloc_nid_done(sbi, ino);
+        d_instantiate(dentry, inode);
+        unlock_new_inode(inode);
+        f2fs_balance_fs(sbi);
+        return 0;
+out:
+        clear_nlink(inode);
+        unlock_new_inode(inode);
+        iput(inode);
+        alloc_nid_failed(sbi, ino);
+        return err;
+}
+/*
+ * Rename old_dentry in old_dir to new_dentry in new_dir.
+ *
+ * If new_dentry already has an inode, its directory entry is repointed at
+ * old_inode and the replaced inode loses its link(s); otherwise a new
+ * entry is added to new_dir.  The old entry is then deleted.  When a
+ * directory is moved, the entry returned by f2fs_parent_dir() and the link
+ * counts of the directories involved are adjusted as well.
+ *
+ * Returns 0 on success, -ENOENT when a required entry is not found, -EIO
+ * when f2fs_parent_dir() fails, -ENOTEMPTY when a directory would replace
+ * a non-empty target, or the error from f2fs_add_link().
+ */
+static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
+                        struct inode *new_dir, struct dentry *new_dentry)
+{
+        struct super_block *sb = old_dir->i_sb;
+        struct f2fs_sb_info *sbi = F2FS_SB(sb);
+        struct inode *old_inode = old_dentry->d_inode;
+        struct inode *new_inode = new_dentry->d_inode;
+        struct page *old_dir_page;
+        struct page *old_page;
+        struct f2fs_dir_entry *old_dir_entry = NULL;
+        struct f2fs_dir_entry *old_entry;
+        struct f2fs_dir_entry *new_entry;
+        int err = -ENOENT;
+        old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
+        if (!old_entry)
+                goto out;
+        /* old_dir_entry stays NULL unless a directory is being renamed */
+        if (S_ISDIR(old_inode->i_mode)) {
+                err = -EIO;
+                old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page);
+                if (!old_dir_entry)
+                        goto out_old;
+        }
+        mutex_lock_op(sbi, RENAME);
+        if (new_inode) {
+                /* the target name exists: replace what it points to */
+                struct page *new_page;
+                err = -ENOTEMPTY;
+                if (old_dir_entry && !f2fs_empty_dir(new_inode))
+                        goto out_dir;
+                err = -ENOENT;
+                new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name,
+                                                &new_page);
+                if (!new_entry)
+                        goto out_dir;
+                /*
+                 * NOTE(review): new_page is not unmapped or put here, so
+                 * f2fs_set_link() presumably releases it - confirm.
+                 */
+                f2fs_set_link(new_dir, new_entry, new_page, old_inode);
+                new_inode->i_ctime = CURRENT_TIME;
+                /* a replaced directory loses two links, anything else one */
+                if (old_dir_entry)
+                        drop_nlink(new_inode);
+                drop_nlink(new_inode);
+                if (!new_inode->i_nlink)
+                        add_orphan_inode(sbi, new_inode->i_ino);
+                f2fs_write_inode(new_inode, NULL);
+        } else {
+                /* the target name is free: add a new entry for old_inode */
+                err = f2fs_add_link(new_dentry, old_inode);
+                if (err)
+                        goto out_dir;
+                if (old_dir_entry) {
+                        inc_nlink(new_dir);
+                        f2fs_write_inode(new_dir, NULL);
+                }
+        }
+        old_inode->i_ctime = CURRENT_TIME;
+        set_inode_flag(F2FS_I(old_inode), FI_NEED_CP);
+        mark_inode_dirty(old_inode);
+        /*
+         * NOTE(review): old_page is not unmapped or put on this path, so
+         * f2fs_delete_entry() presumably releases it - confirm.
+         */
+        f2fs_delete_entry(old_entry, old_page, NULL);
+        if (old_dir_entry) {
+                /* repoint the moved directory's parent entry if the parent changed */
+                if (old_dir != new_dir) {
+                        f2fs_set_link(old_inode, old_dir_entry,
+                                                old_dir_page, new_dir);
+                } else {
+                        kunmap(old_dir_page);
+                        f2fs_put_page(old_dir_page, 0);
+                }
+                drop_nlink(old_dir);
+                f2fs_write_inode(old_dir, NULL);
+        }
+        mutex_unlock_op(sbi, RENAME);
+        f2fs_balance_fs(sbi);
+        return 0;
+out_dir:
+        /* failure after the RENAME lock was taken */
+        if (old_dir_entry) {
+                kunmap(old_dir_page);
+                f2fs_put_page(old_dir_page, 0);
+        }
+        mutex_unlock_op(sbi, RENAME);
+out_old:
+        kunmap(old_page);
+        f2fs_put_page(old_page, 0);
+out:
+        return err;
+}
+/*
+ * Inode operations installed on directories (see f2fs_iget() and
+ * f2fs_mkdir()).  The xattr handlers are present only when
+ * CONFIG_F2FS_FS_XATTR is enabled.
+ */
+const struct inode_operations f2fs_dir_inode_operations = {
+        .create         = f2fs_create,
+        .lookup         = f2fs_lookup,
+        .link           = f2fs_link,
+        .unlink         = f2fs_unlink,
+        .symlink        = f2fs_symlink,
+        .mkdir          = f2fs_mkdir,
+        .rmdir          = f2fs_rmdir,
+        .mknod          = f2fs_mknod,
+        .rename         = f2fs_rename,
+        .setattr        = f2fs_setattr,
+        .get_acl        = f2fs_get_acl,
+#ifdef CONFIG_F2FS_FS_XATTR
+        .setxattr       = generic_setxattr,
+        .getxattr       = generic_getxattr,
+        .listxattr      = f2fs_listxattr,
+        .removexattr    = generic_removexattr,
+#endif
+};
+/*
+ * Inode operations installed on symbolic links (see f2fs_iget() and
+ * f2fs_symlink()).  The xattr handlers are present only when
+ * CONFIG_F2FS_FS_XATTR is enabled.
+ */
+const struct inode_operations f2fs_symlink_inode_operations = {
+        .readlink       = generic_readlink,
+        .follow_link    = page_follow_link_light,
+        .put_link       = page_put_link,
+        .setattr        = f2fs_setattr,
+#ifdef CONFIG_F2FS_FS_XATTR
+        .setxattr       = generic_setxattr,
+        .getxattr       = generic_getxattr,
+        .listxattr      = f2fs_listxattr,
+        .removexattr    = generic_removexattr,
+#endif
+};
+/*
+ * Inode operations installed on character/block devices, FIFOs and sockets
+ * (see f2fs_iget() and f2fs_mknod()).  The xattr handlers are present only
+ * when CONFIG_F2FS_FS_XATTR is enabled.
+ */
+const struct inode_operations f2fs_special_inode_operations = {
+        .setattr        = f2fs_setattr,
+        .get_acl        = f2fs_get_acl,
+#ifdef CONFIG_F2FS_FS_XATTR
+        .setxattr       = generic_setxattr,
+        .getxattr       = generic_getxattr,
+        .listxattr      = f2fs_listxattr,
+        .removexattr    = generic_removexattr,
+#endif
+};
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
new file mode 100644
index 000000000000..19870361497e
--- /dev/null
+++ b/fs/f2fs/node.c
@@ -0,0 +1,1764 @@
+/*
+ * fs/f2fs/node.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/mpage.h>
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/pagevec.h>
+#include <linux/swap.h>
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+static struct kmem_cache *nat_entry_slab;
+static struct kmem_cache *free_nid_slab;
+/*
+ * Drop the dirty state of a node page, if it has any, and then mark the
+ * page not up to date.  For a dirty page the dirty tag is cleared in the
+ * mapping's radix tree and the F2FS_DIRTY_NODES count is decremented.
+ */
+static void clear_node_page_dirty(struct page *page)
+{
+        struct address_space *mapping = page->mapping;
+        struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+        unsigned long irq_flags;
+        if (!PageDirty(page)) {
+                ClearPageUptodate(page);
+                return;
+        }
+        spin_lock_irqsave(&mapping->tree_lock, irq_flags);
+        radix_tree_tag_clear(&mapping->page_tree, page_index(page),
+                        PAGECACHE_TAG_DIRTY);
+        spin_unlock_irqrestore(&mapping->tree_lock, irq_flags);
+        clear_page_dirty_for_io(page);
+        dec_page_count(sbi, F2FS_DIRTY_NODES);
+        ClearPageUptodate(page);
+}
+/*
+ * Return the meta page at the address current_nat_addr() reports for nid.
+ */
+static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
+{
+        pgoff_t index = current_nat_addr(sbi, nid);
+        return get_meta_page(sbi, index);
+}
+/*
+ * Return the NAT page that updates for nid should go to.
+ *
+ * If the current NAT page is already dirty it is returned as is.
+ * Otherwise its contents are copied into the page at next_nat_addr(), that
+ * copy is marked dirty, the current page is released, set_to_next_nat() is
+ * called for nid, and the copy is returned.
+ */
+static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
+{
+        struct f2fs_nm_info *nm_i = NM_I(sbi);
+        pgoff_t cur_off = current_nat_addr(sbi, nid);
+        pgoff_t next_off = next_nat_addr(sbi, cur_off);
+        struct page *cur_page;
+        struct page *next_page;
+        /* get current nat block page with lock */
+        cur_page = get_meta_page(sbi, cur_off);
+        /* Dirty src_page means that it is already the new target NAT page. */
+        if (PageDirty(cur_page))
+                return cur_page;
+        next_page = grab_meta_page(sbi, next_off);
+        memcpy(page_address(next_page), page_address(cur_page),
+                        PAGE_CACHE_SIZE);
+        set_page_dirty(next_page);
+        f2fs_put_page(cur_page, 1);
+        set_to_next_nat(nm_i, nid);
+        return next_page;
+}
+/*
+ * Readahead NAT pages: issue reads for FREE_NID_PAGES NAT blocks, starting
+ * with the one for nid and stepping by NAT_ENTRY_PER_BLOCK nids, wrapping
+ * to nid 0 once max_nid is reached.  Pages that cannot be grabbed are
+ * skipped.
+ */
+static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
+{
+        struct f2fs_nm_info *nm_i = NM_I(sbi);
+        struct address_space *meta_mapping = sbi->meta_inode->i_mapping;
+        int i;
+        for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) {
+                struct page *page;
+                pgoff_t blk;
+                if (nid >= nm_i->max_nid)
+                        nid = 0;
+                blk = current_nat_addr(sbi, nid);
+                page = grab_cache_page(meta_mapping, blk);
+                if (!page)
+                        continue;
+                if (f2fs_readpage(sbi, page, blk, READ))
+                        f2fs_put_page(page, 1);
+                else
+                        page_cache_release(page);
+        }
+}
+/*
+ * Look up the cached NAT entry for nid n in nm_i->nat_root.  Returns
+ * whatever radix_tree_lookup() yields.  Every caller in this file holds
+ * nm_i->nat_tree_lock around the call.
+ */
+static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
+{
+        return radix_tree_lookup(&nm_i->nat_root, n);
+}
+/*
+ * Gather up to nr cached NAT entries with nid >= start into ep.  Returns
+ * the count reported by radix_tree_gang_lookup().
+ */
+static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
+                nid_t start, unsigned int nr, struct nat_entry **ep)
+{
+        return radix_tree_gang_lookup(&nm_i->nat_root, (void **)ep, start, nr);
+}
+/*
+ * Remove entry e from the NAT cache: unlink it from its list, delete it
+ * from the radix tree, decrement the cached-entry count and free it.
+ */
+static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
+{
+        unsigned long key = nat_get_nid(e);
+        list_del(&e->list);
+        radix_tree_delete(&nm_i->nat_root, key);
+        nm_i->nat_cnt--;
+        kmem_cache_free(nat_entry_slab, e);
+}
+/*
+ * Report whether node nid counts as checkpointed.  Returns 0 only when a
+ * cached NAT entry exists and its checkpointed flag is clear; a nid with
+ * no cached entry is reported as checkpointed (1).
+ */
+int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
+{
+        struct f2fs_nm_info *nm_i = NM_I(sbi);
+        struct nat_entry *e;
+        int checkpointed;
+        read_lock(&nm_i->nat_tree_lock);
+        e = __lookup_nat_cache(nm_i, nid);
+        checkpointed = (!e || e->checkpointed) ? 1 : 0;
+        read_unlock(&nm_i->nat_tree_lock);
+        return checkpointed;
+}
+/*
+ * Allocate a zeroed NAT cache entry for nid, insert it into the radix
+ * tree and append it to nm_i->nat_entries.  Returns NULL when the
+ * allocation or the radix tree insertion fails.
+ */
+static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
+{
+        struct nat_entry *entry;
+        entry = kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC);
+        if (entry == NULL)
+                return NULL;
+        if (radix_tree_insert(&nm_i->nat_root, nid, entry) != 0) {
+                kmem_cache_free(nat_entry_slab, entry);
+                return NULL;
+        }
+        memset(entry, 0, sizeof(*entry));
+        nat_set_nid(entry, nid);
+        list_add_tail(&entry->list, &nm_i->nat_entries);
+        nm_i->nat_cnt++;
+        return entry;
+}
+/*
+ * Make sure the NAT cache holds an entry for nid.  An existing entry is
+ * left alone.  A new one is filled from the raw entry ne and flagged as
+ * checkpointed.  If the new entry cannot be created, the lock is dropped
+ * and the whole attempt is repeated.
+ */
+static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
+                                                struct f2fs_nat_entry *ne)
+{
+        struct nat_entry *e;
+        for (;;) {
+                write_lock(&nm_i->nat_tree_lock);
+                e = __lookup_nat_cache(nm_i, nid);
+                if (e)
+                        break;
+                e = grab_nat_entry(nm_i, nid);
+                if (e) {
+                        nat_set_blkaddr(e, le32_to_cpu(ne->block_addr));
+                        nat_set_ino(e, le32_to_cpu(ne->ino));
+                        nat_set_version(e, ne->version);
+                        e->checkpointed = true;
+                        break;
+                }
+                /* could not create the entry: drop the lock and try again */
+                write_unlock(&nm_i->nat_tree_lock);
+        }
+        write_unlock(&nm_i->nat_tree_lock);
+}
+/*
+ * Record new_blkaddr as the block address of node ni->nid in the NAT cache
+ * and mark the cache entry dirty.
+ *
+ * A missing entry is created from *ni; if it cannot be created the lock is
+ * dropped and the lookup is retried.  ni must describe the entry's current
+ * state: the BUG_ON()s below check it against the cached entry and reject
+ * invalid address transitions.
+ */
+static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
+                        block_t new_blkaddr)
+{
+        struct f2fs_nm_info *nm_i = NM_I(sbi);
+        struct nat_entry *e;
+retry:
+        write_lock(&nm_i->nat_tree_lock);
+        e = __lookup_nat_cache(nm_i, ni->nid);
+        if (!e) {
+                e = grab_nat_entry(nm_i, ni->nid);
+                if (!e) {
+                        write_unlock(&nm_i->nat_tree_lock);
+                        goto retry;
+                }
+                e->ni = *ni;
+                e->checkpointed = true;
+                BUG_ON(ni->blk_addr == NEW_ADDR);
+        } else if (new_blkaddr == NEW_ADDR) {
+                /*
+                 * when nid is reallocated,
+                 * previous nat entry can be remained in nat cache.
+                 * So, reinitialize it with new information.
+                 */
+                e->ni = *ni;
+                BUG_ON(ni->blk_addr != NULL_ADDR);
+        }
+        /* a node that moves to NEW_ADDR is no longer checkpointed */
+        if (new_blkaddr == NEW_ADDR)
+                e->checkpointed = false;
+        /* sanity check */
+        BUG_ON(nat_get_blkaddr(e) != ni->blk_addr);
+        BUG_ON(nat_get_blkaddr(e) == NULL_ADDR &&
+                        new_blkaddr == NULL_ADDR);
+        BUG_ON(nat_get_blkaddr(e) == NEW_ADDR &&
+                        new_blkaddr == NEW_ADDR);
+        BUG_ON(nat_get_blkaddr(e) != NEW_ADDR &&
+                        nat_get_blkaddr(e) != NULL_ADDR &&
+                        new_blkaddr == NEW_ADDR);
+        /* increment version no as node is removed */
+        if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
+                unsigned char version = nat_get_version(e);
+                nat_set_version(e, inc_node_version(version));
+        }
+        /* change address */
+        nat_set_blkaddr(e, new_blkaddr);
+        __set_nat_cache_dirty(nm_i, e);
+        write_unlock(&nm_i->nat_tree_lock);
+}
+/*
+ * Drop up to nr_shrink entries from the head of nm_i->nat_entries.
+ * Nothing is done (and 0 is returned) while nat_cnt is below
+ * 2 * NM_WOUT_THRESHOLD. Otherwise the return value is the part of
+ * nr_shrink that could not be satisfied because the list ran empty.
+ */
+static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+        struct f2fs_nm_info *nm_i = NM_I(sbi);
+        struct nat_entry *victim;
+        if (nm_i->nat_cnt < 2 * NM_WOUT_THRESHOLD)
+                return 0;
+        write_lock(&nm_i->nat_tree_lock);
+        for (; nr_shrink; nr_shrink--) {
+                if (list_empty(&nm_i->nat_entries))
+                        break;
+                victim = list_first_entry(&nm_i->nat_entries,
+                                        struct nat_entry, list);
+                __del_from_nat_cache(nm_i, victim);
+        }
+        write_unlock(&nm_i->nat_tree_lock);
+        return nr_shrink;
+}
+/*
+ * Fill *ni with the node information of nid. This function has no
+ * failure return.
+ * The lookup order is: NAT cache, then the NAT journal held in the
+ * CURSEG_HOT_DATA summary block, then the current NAT page. A result
+ * that did not come from the cache is inserted into it before returning.
+ */
+void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
+{
+        struct f2fs_nm_info *nm_i = NM_I(sbi);
+        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+        struct f2fs_summary_block *sum = curseg->sum_blk;
+        struct f2fs_nat_entry raw;
+        struct nat_entry *cached;
+        int slot;
+        memset(&raw, 0, sizeof(struct f2fs_nat_entry));
+        ni->nid = nid;
+        /* 1. nat cache: a hit is returned directly */
+        read_lock(&nm_i->nat_tree_lock);
+        cached = __lookup_nat_cache(nm_i, nid);
+        if (cached) {
+                ni->ino = nat_get_ino(cached);
+                ni->blk_addr = nat_get_blkaddr(cached);
+                ni->version = nat_get_version(cached);
+                read_unlock(&nm_i->nat_tree_lock);
+                return;
+        }
+        read_unlock(&nm_i->nat_tree_lock);
+        /* 2. nat journal in the current segment summary */
+        mutex_lock(&curseg->curseg_mutex);
+        slot = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
+        if (slot >= 0) {
+                raw = nat_in_journal(sum, slot);
+                node_info_from_raw_nat(ni, &raw);
+        }
+        mutex_unlock(&curseg->curseg_mutex);
+        /* 3. fall back to the nat page when the journal had no entry */
+        if (slot < 0) {
+                nid_t start_nid = START_NID(nid);
+                struct page *page = get_current_nat_page(sbi, start_nid);
+                struct f2fs_nat_block *nat_blk =
+                                (struct f2fs_nat_block *)page_address(page);
+                raw = nat_blk->entries[nid - start_nid];
+                node_info_from_raw_nat(ni, &raw);
+                f2fs_put_page(page, 1);
+        }
+        /* remember the raw entry in the nat cache */
+        cache_nat_entry(nm_i, nid, &raw);
+}
+/*
+ * Translate a file block index into the path of node blocks leading to it.
+ * The maximum depth is four (the inode plus up to three node levels).
+ * offset[0] is the slot inside the inode: either the block index itself
+ * (level 0) or one of the NODE_*_BLOCK slots. offset[1..level] are the
+ * indices inside each following node block. noffset[k] receives the node
+ * offset number of the node at depth k (noffset[0] is always 0).
+ * Returns the level (0..3), i.e. how many node blocks below the inode are
+ * on the path. A block index beyond the double-indirect range hits BUG().
+ */
+static int get_node_path(long block, int offset[4], unsigned int noffset[4])
+{
+        const long direct_index = ADDRS_PER_INODE;
+        const long direct_blks = ADDRS_PER_BLOCK;
+        const long dptrs_per_blk = NIDS_PER_BLOCK;
+        const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
+        const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK;
+        int n = 0;
+        int level = 0;
+        noffset[0] = 0;
+        /* addressed directly from the inode */
+        if (block < direct_index) {
+                offset[n++] = block;
+                level = 0;
+                goto got;
+        }
+        /* first direct node */
+        block -= direct_index;
+        if (block < direct_blks) {
+                offset[n++] = NODE_DIR1_BLOCK;
+                noffset[n] = 1;
+                offset[n++] = block;
+                level = 1;
+                goto got;
+        }
+        /* second direct node */
+        block -= direct_blks;
+        if (block < direct_blks) {
+                offset[n++] = NODE_DIR2_BLOCK;
+                noffset[n] = 2;
+                offset[n++] = block;
+                level = 1;
+                goto got;
+        }
+        /* first indirect node */
+        block -= direct_blks;
+        if (block < indirect_blks) {
+                offset[n++] = NODE_IND1_BLOCK;
+                noffset[n] = 3;
+                offset[n++] = block / direct_blks;
+                noffset[n] = 4 + offset[n - 1];
+                offset[n++] = block % direct_blks;
+                level = 2;
+                goto got;
+        }
+        /* second indirect node */
+        block -= indirect_blks;
+        if (block < indirect_blks) {
+                offset[n++] = NODE_IND2_BLOCK;
+                noffset[n] = 4 + dptrs_per_blk;
+                offset[n++] = block / direct_blks;
+                noffset[n] = 5 + dptrs_per_blk + offset[n - 1];
+                offset[n++] = block % direct_blks;
+                level = 2;
+                goto got;
+        }
+        /* double indirect node */
+        block -= indirect_blks;
+        if (block < dindirect_blks) {
+                offset[n++] = NODE_DIND_BLOCK;
+                noffset[n] = 5 + (dptrs_per_blk * 2);
+                offset[n++] = block / indirect_blks;
+                noffset[n] = 6 + (dptrs_per_blk * 2) +
+                              offset[n - 1] * (dptrs_per_blk + 1);
+                offset[n++] = (block / direct_blks) % dptrs_per_blk;
+                noffset[n] = 7 + (dptrs_per_blk * 2) +
+                              offset[n - 2] * (dptrs_per_blk + 1) +
+                              offset[n - 1];
+                offset[n++] = block % direct_blks;
+                level = 3;
+                goto got;
+        } else {
+                /* block index is outside the addressable range */
+                BUG();
+        }
+got:
+        return level;
+}
+/*
+ * Walk from the inode page down to the node that holds the block address
+ * of file block 'index' and describe it in *dn (nid, ofs_in_node,
+ * node_page, data_blkaddr). When 'ro' is zero, missing nodes on the path
+ * are allocated; otherwise they are only read.
+ * Returns 0 on success or a negative errno; on failure dn->inode_page and
+ * dn->node_page are reset to NULL.
+ * Caller should call f2fs_put_dnode(dn).
+ */
+int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+        struct page *npage[4];
+        struct page *parent;
+        int offset[4];
+        unsigned int noffset[4];
+        nid_t nids[4];
+        int level, i;
+        int err = 0;
+        level = get_node_path(index, offset, noffset);
+        nids[0] = dn->inode->i_ino;
+        npage[0] = get_node_page(sbi, nids[0]);
+        if (IS_ERR(npage[0]))
+                return PTR_ERR(npage[0]);
+        parent = npage[0];
+        nids[1] = get_nid(parent, offset[0], true);
+        dn->inode_page = npage[0];
+        dn->inode_page_locked = true;
+        /* get indirect or direct nodes */
+        for (i = 1; i <= level; i++) {
+                /* set once npage[i] has been obtained by one of the branches */
+                bool done = false;
+                if (!nids[i] && !ro) {
+                        mutex_lock_op(sbi, NODE_NEW);
+                        /* alloc new node */
+                        if (!alloc_nid(sbi, &(nids[i]))) {
+                                mutex_unlock_op(sbi, NODE_NEW);
+                                err = -ENOSPC;
+                                goto release_pages;
+                        }
+                        dn->nid = nids[i];
+                        npage[i] = new_node_page(dn, noffset[i]);
+                        if (IS_ERR(npage[i])) {
+                                alloc_nid_failed(sbi, nids[i]);
+                                mutex_unlock_op(sbi, NODE_NEW);
+                                err = PTR_ERR(npage[i]);
+                                goto release_pages;
+                        }
+                        /* link the new node into its parent */
+                        set_nid(parent, offset[i - 1], nids[i], i == 1);
+                        alloc_nid_done(sbi, nids[i]);
+                        mutex_unlock_op(sbi, NODE_NEW);
+                        done = true;
+                } else if (ro && i == level && level > 1) {
+                        /* read-only lookup of the last node: read it with readahead */
+                        npage[i] = get_node_page_ra(parent, offset[i - 1]);
+                        if (IS_ERR(npage[i])) {
+                                err = PTR_ERR(npage[i]);
+                                goto release_pages;
+                        }
+                        done = true;
+                }
+                /*
+                 * The inode page is only unlocked (its reference is kept in
+                 * dn->inode_page); intermediate nodes are unlocked and released.
+                 */
+                if (i == 1) {
+                        dn->inode_page_locked = false;
+                        unlock_page(parent);
+                } else {
+                        f2fs_put_page(parent, 1);
+                }
+                if (!done) {
+                        npage[i] = get_node_page(sbi, nids[i]);
+                        if (IS_ERR(npage[i])) {
+                                err = PTR_ERR(npage[i]);
+                                /* parent is already dropped; release the inode page ref */
+                                f2fs_put_page(npage[0], 0);
+                                goto release_out;
+                        }
+                }
+                if (i < level) {
+                        parent = npage[i];
+                        nids[i + 1] = get_nid(parent, offset[i], false);
+                }
+        }
+        dn->nid = nids[level];
+        dn->ofs_in_node = offset[level];
+        dn->node_page = npage[level];
+        dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
+        return 0;
+release_pages:
+        /* parent is still locked here; for i > 1 the inode page is a separate ref */
+        f2fs_put_page(parent, 1);
+        if (i > 1)
+                f2fs_put_page(npage[0], 0);
+release_out:
+        dn->inode_page = NULL;
+        dn->node_page = NULL;
+        return err;
+}
+/*
+ * Free the node identified by dn->nid whose page is dn->node_page:
+ * invalidate its block, set its NAT address to NULL_ADDR and drop the
+ * node counters. The node page is cleaned, unlocked and released, and
+ * dn->node_page is reset to NULL.
+ */
+static void truncate_node(struct dnode_of_data *dn)
+{
+        struct inode *inode = dn->inode;
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        struct node_info ni;
+        get_node_info(sbi, dn->nid, &ni);
+        BUG_ON(ni.blk_addr == NULL_ADDR);
+        if (ni.blk_addr != NULL_ADDR)
+                invalidate_blocks(sbi, ni.blk_addr);
+        /* give the node address back */
+        dec_valid_node_count(sbi, inode, 1);
+        set_node_addr(sbi, &ni, NULL_ADDR);
+        if (dn->nid != inode->i_ino) {
+                /* a non-inode node went away: refresh the inode page */
+                sync_inode_page(dn);
+        } else {
+                /* the inode node itself is being removed */
+                remove_orphan_inode(sbi, dn->nid);
+                dec_valid_inode_count(sbi);
+        }
+        clear_node_page_dirty(dn->node_page);
+        F2FS_SET_SB_DIRT(sbi);
+        f2fs_put_page(dn->node_page, 1);
+        dn->node_page = NULL;
+}
+/*
+ * Truncate the direct node dn->nid: free its data blocks, then the node.
+ * Returns 1 when the node was freed, when dn->nid is 0, or when the node
+ * page lookup reports -ENOENT; any other lookup error is returned as a
+ * negative errno.
+ */
+static int truncate_dnode(struct dnode_of_data *dn)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+        struct page *node_page;
+        if (!dn->nid)
+                return 1;
+        /* get direct node */
+        node_page = get_node_page(sbi, dn->nid);
+        if (IS_ERR(node_page)) {
+                if (PTR_ERR(node_page) == -ENOENT)
+                        return 1;
+                return PTR_ERR(node_page);
+        }
+        /* set up dn so the helpers operate on this node from slot 0 */
+        dn->node_page = node_page;
+        dn->ofs_in_node = 0;
+        truncate_data_blocks(dn);
+        truncate_node(dn);
+        return 1;
+}
+/*
+ * Truncate the children of the indirect node dn->nid starting at slot
+ * 'ofs', and the indirect node itself when ofs is 0.
+ * With depth < 3 the children are treated as direct nodes; otherwise each
+ * child is handled by a recursive call with depth - 1. 'nofs' is the node
+ * offset of this node and is used to derive the children's node offsets.
+ * Returns a non-negative count on success (NIDS_PER_BLOCK + 1 when
+ * dn->nid is 0), or a negative errno.
+ */
+static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
+                                                int ofs, int depth)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+        /* private copy, so child nids do not overwrite the caller's dn */
+        struct dnode_of_data rdn = *dn;
+        struct page *page;
+        struct f2fs_node *rn;
+        nid_t child_nid;
+        unsigned int child_nofs;
+        int freed = 0;
+        int i, ret;
+        if (dn->nid == 0)
+                return NIDS_PER_BLOCK + 1;
+        page = get_node_page(sbi, dn->nid);
+        if (IS_ERR(page))
+                return PTR_ERR(page);
+        rn = (struct f2fs_node *)page_address(page);
+        if (depth < 3) {
+                /* freed advances for every slot visited, including empty ones */
+                for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
+                        child_nid = le32_to_cpu(rn->in.nid[i]);
+                        if (child_nid == 0)
+                                continue;
+                        rdn.nid = child_nid;
+                        ret = truncate_dnode(&rdn);
+                        if (ret < 0)
+                                goto out_err;
+                        set_nid(page, i, 0, false);
+                }
+        } else {
+                child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
+                for (i = ofs; i < NIDS_PER_BLOCK; i++) {
+                        child_nid = le32_to_cpu(rn->in.nid[i]);
+                        if (child_nid == 0) {
+                                /* skip the node offsets of an absent subtree */
+                                child_nofs += NIDS_PER_BLOCK + 1;
+                                continue;
+                        }
+                        rdn.nid = child_nid;
+                        ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
+                        if (ret == (NIDS_PER_BLOCK + 1)) {
+                                /* whole child subtree was freed: unlink it */
+                                set_nid(page, i, 0, false);
+                                child_nofs += ret;
+                        } else if (ret < 0 && ret != -ENOENT) {
+                                goto out_err;
+                        }
+                }
+                freed = child_nofs;
+        }
+        if (!ofs) {
+                /* remove current indirect node */
+                dn->node_page = page;
+                truncate_node(dn);
+                freed++;
+        } else {
+                /* partially truncated: keep the node, just release the page */
+                f2fs_put_page(page, 1);
+        }
+        return freed;
+out_err:
+        f2fs_put_page(page, 1);
+        return ret;
+}
+/*
+ * Truncate the direct nodes that hang off a partially covered indirect
+ * node on the path described by offset[].
+ * @dn:     dnode descriptor reused for every child that is truncated
+ * @ri:     raw inode, used to read the top-level nid of the path
+ * @offset: node path; offset[depth - 2] is advanced and
+ *          offset[depth - 1] reset to 0 on success
+ * @depth:  path depth; pages[] and nid[] are sized for 2 or 3
+ * Returns a negative errno on failure. Every indirect node page taken here
+ * is unlocked and released on all exit paths.
+ */
+static int truncate_partial_nodes(struct dnode_of_data *dn,
+                        struct f2fs_inode *ri, int *offset, int depth)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+        struct page *pages[2];
+        nid_t nid[3];
+        nid_t child_nid;
+        int err = 0;
+        int i;
+        /* index of the deepest indirect node; also the highest held page */
+        int idx = depth - 2;
+        nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
+        if (!nid[0])
+                return 0;
+        /* get indirect nodes in the path */
+        for (i = 0; i < idx + 1; i++) {
+                /* reference count'll be increased */
+                pages[i] = get_node_page(sbi, nid[i]);
+                if (IS_ERR(pages[i])) {
+                        err = PTR_ERR(pages[i]);
+                        /* only pages[0..i-1] are held: release exactly those */
+                        idx = i - 1;
+                        goto fail;
+                }
+                nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
+        }
+        /* free direct nodes linked to a partial indirect node */
+        for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
+                child_nid = get_nid(pages[idx], i, false);
+                if (!child_nid)
+                        continue;
+                dn->nid = child_nid;
+                err = truncate_dnode(dn);
+                /* on error pages[idx] is still held; fail: releases it too */
+                if (err < 0)
+                        goto fail;
+                set_nid(pages[idx], i, 0, false);
+        }
+        if (offset[idx + 1] == 0) {
+                /* every child is gone: free the indirect node (puts the page) */
+                dn->node_page = pages[idx];
+                dn->nid = nid[idx];
+                truncate_node(dn);
+        } else {
+                f2fs_put_page(pages[idx], 1);
+        }
+        offset[idx]++;
+        offset[idx + 1] = 0;
+        /* pages[idx] has been released above */
+        idx--;
+fail:
+        for (i = idx; i >= 0; i--)
+                f2fs_put_page(pages[i], 1);
+        return err;
+}
+/*
+ * Truncate the node blocks of 'inode' that cover file block 'from' and
+ * everything after it.
+ * All the block addresses of data and nodes should be nullified.
+ * Returns 0 on success (positive intermediate results are folded to 0)
+ * or a negative errno.
+ */
+int truncate_inode_blocks(struct inode *inode, pgoff_t from)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        int err = 0, cont = 1;
+        int level, offset[4], noffset[4];
+        unsigned int nofs;
+        struct f2fs_node *rn;
+        struct dnode_of_data dn;
+        struct page *page;
+        level = get_node_path(from, offset, noffset);
+        page = get_node_page(sbi, inode->i_ino);
+        if (IS_ERR(page))
+                return PTR_ERR(page);
+        set_new_dnode(&dn, inode, page, NULL, 0);
+        /* keep the reference but drop the lock; it is retaken only to edit i_nid[] */
+        unlock_page(page);
+        rn = page_address(page);
+        /* handle a partially covered indirect node first, if the path has one */
+        switch (level) {
+        case 0:
+        case 1:
+                nofs = noffset[1];
+                break;
+        case 2:
+                nofs = noffset[1];
+                if (!offset[level - 1])
+                        goto skip_partial;
+                err = truncate_partial_nodes(&dn, &rn->i, offset, level);
+                if (err < 0 && err != -ENOENT)
+                        goto fail;
+                nofs += 1 + NIDS_PER_BLOCK;
+                break;
+        case 3:
+                nofs = 5 + 2 * NIDS_PER_BLOCK;
+                if (!offset[level - 1])
+                        goto skip_partial;
+                err = truncate_partial_nodes(&dn, &rn->i, offset, level);
+                if (err < 0 && err != -ENOENT)
+                        goto fail;
+                break;
+        default:
+                BUG();
+        }
+skip_partial:
+        /* walk the remaining inode-level node slots; the loop ends after NODE_DIND_BLOCK */
+        while (cont) {
+                dn.nid = le32_to_cpu(rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]);
+                switch (offset[0]) {
+                case NODE_DIR1_BLOCK:
+                case NODE_DIR2_BLOCK:
+                        err = truncate_dnode(&dn);
+                        break;
+                case NODE_IND1_BLOCK:
+                case NODE_IND2_BLOCK:
+                        err = truncate_nodes(&dn, nofs, offset[1], 2);
+                        break;
+                case NODE_DIND_BLOCK:
+                        err = truncate_nodes(&dn, nofs, offset[1], 3);
+                        cont = 0;
+                        break;
+                default:
+                        BUG();
+                }
+                if (err < 0 && err != -ENOENT)
+                        goto fail;
+                /* the subtree was truncated from its first slot: clear the inode's nid */
+                if (offset[1] == 0 &&
+                                rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) {
+                        lock_page(page);
+                        wait_on_page_writeback(page);
+                        rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
+                        set_page_dirty(page);
+                        unlock_page(page);
+                }
+                offset[1] = 0;
+                offset[0]++;
+                nofs += err;
+        }
+fail:
+        /* the page was unlocked above, so only the reference is dropped */
+        f2fs_put_page(page, 0);
+        return err > 0 ? 0 : err;
+}
+/*
+ * Remove the inode's node page and, if present, its xattr node.
+ * Runs under the NODE_TRUNC operation lock. inode->i_blocks must be 1
+ * (the inode node exists and is truncated) or 0 (nothing was allocated);
+ * any other value hits BUG().
+ * Returns 0 on success or a negative errno when a node page cannot be
+ * read; the inode page is released on every path.
+ */
+int remove_inode_page(struct inode *inode)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        struct page *page;
+        nid_t ino = inode->i_ino;
+        struct dnode_of_data dn;
+        mutex_lock_op(sbi, NODE_TRUNC);
+        page = get_node_page(sbi, ino);
+        if (IS_ERR(page)) {
+                mutex_unlock_op(sbi, NODE_TRUNC);
+                return PTR_ERR(page);
+        }
+        if (F2FS_I(inode)->i_xattr_nid) {
+                nid_t nid = F2FS_I(inode)->i_xattr_nid;
+                struct page *npage = get_node_page(sbi, nid);
+                if (IS_ERR(npage)) {
+                        /* do not leak the locked inode page taken above */
+                        f2fs_put_page(page, 1);
+                        mutex_unlock_op(sbi, NODE_TRUNC);
+                        return PTR_ERR(npage);
+                }
+                F2FS_I(inode)->i_xattr_nid = 0;
+                set_new_dnode(&dn, inode, page, npage, nid);
+                /* the inode page is already locked by this function */
+                dn.inode_page_locked = 1;
+                truncate_node(&dn);
+        }
+        if (inode->i_blocks == 1) {
+                /* truncate_node() internally calls f2fs_put_page() */
+                set_new_dnode(&dn, inode, page, page, ino);
+                truncate_node(&dn);
+        } else if (inode->i_blocks == 0) {
+                struct node_info ni;
+                get_node_info(sbi, inode->i_ino, &ni);
+                /* called after f2fs_new_inode() is failed */
+                BUG_ON(ni.blk_addr != NULL_ADDR);
+                f2fs_put_page(page, 1);
+        } else {
+                BUG();
+        }
+        mutex_unlock_op(sbi, NODE_TRUNC);
+        return 0;
+}
+/*
+ * Allocate the node page for a new inode (node offset 0) under the
+ * NODE_NEW operation lock and pass it to init_dent_inode().
+ * Returns 0 on success or the negative errno from new_node_page().
+ */
+int new_inode_page(struct inode *inode, struct dentry *dentry)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        struct dnode_of_data dn;
+        struct page *ipage;
+        /* the new node uses the inode number as its nid */
+        set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
+        mutex_lock_op(sbi, NODE_NEW);
+        ipage = new_node_page(&dn, 0);
+        /*
+         * NOTE(review): init_dent_inode() is reached even when ipage is an
+         * ERR_PTR; presumably it checks for that itself - confirm.
+         */
+        init_dent_inode(dentry, ipage);
+        mutex_unlock_op(sbi, NODE_NEW);
+        if (!IS_ERR(ipage)) {
+                f2fs_put_page(ipage, 1);
+                return 0;
+        }
+        return PTR_ERR(ipage);
+}
+/*
+ * Create a fresh node page for dn->nid with node offset 'ofs'.
+ * Returns the locked, dirty page, or ERR_PTR(-EPERM) when the inode has
+ * FI_NO_ALLOC set, ERR_PTR(-ENOMEM) when no cache page can be grabbed,
+ * or ERR_PTR(-ENOSPC) when the valid node count cannot be increased.
+ * On success dn->node_page points at the new page.
+ */
+struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
+{
+        struct inode *inode = dn->inode;
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        struct address_space *mapping = sbi->node_inode->i_mapping;
+        struct node_info old_ni, new_ni;
+        struct page *npage;
+        if (is_inode_flag_set(F2FS_I(inode), FI_NO_ALLOC))
+                return ERR_PTR(-EPERM);
+        npage = grab_cache_page(mapping, dn->nid);
+        if (!npage)
+                return ERR_PTR(-ENOMEM);
+        get_node_info(sbi, dn->nid, &old_ni);
+        SetPageUptodate(npage);
+        fill_node_footer(npage, dn->nid, inode->i_ino, ofs, true);
+        /* the nid must not be mapped to a block yet */
+        BUG_ON(old_ni.blk_addr != NULL_ADDR);
+        new_ni = old_ni;
+        new_ni.ino = inode->i_ino;
+        if (!inc_valid_node_count(sbi, inode, 1)) {
+                f2fs_put_page(npage, 1);
+                return ERR_PTR(-ENOSPC);
+        }
+        set_node_addr(sbi, &new_ni, NEW_ADDR);
+        dn->node_page = npage;
+        sync_inode_page(dn);
+        set_page_dirty(npage);
+        set_cold_node(inode, npage);
+        /* node offset 0 is the inode itself */
+        if (ofs == 0)
+                inc_valid_inode_count(sbi);
+        return npage;
+}
+/*
+ * Read the node whose nid equals page->index into 'page'.
+ * Returns -ENOENT when the node has no block address, otherwise the
+ * result of f2fs_readpage() for the given request type.
+ */
+static int read_node_page(struct page *page, int type)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+        struct node_info ni;
+        get_node_info(sbi, page->index, &ni);
+        if (ni.blk_addr != NULL_ADDR)
+                return f2fs_readpage(sbi, page, ni.blk_addr, type);
+        return -ENOENT;
+}
+/*
+ * Readahead a node page: issue a READA read for nid unless an up-to-date
+ * page is already cached. No page reference is kept on return.
+ */
+void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
+{
+        struct address_space *mapping = sbi->node_inode->i_mapping;
+        struct page *apage;
+        apage = find_get_page(mapping, nid);
+        if (apage && PageUptodate(apage)) {
+                /* already cached and valid: nothing to read */
+                page_cache_release(apage);
+                return;
+        }
+        f2fs_put_page(apage, 0);
+        apage = grab_cache_page(mapping, nid);
+        if (!apage)
+                return;
+        /* when the read is not started, the page must be unlocked here */
+        if (read_node_page(apage, READA))
+                unlock_page(apage);
+        page_cache_release(apage);
+}
+/*
+ * Get the node page for nid with a synchronous read (READ_SYNC).
+ * Returns the page, ERR_PTR(-ENOMEM) when no cache page can be grabbed,
+ * or ERR_PTR() of the read_node_page() error.
+ */
+struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
+{
+        struct address_space *mapping = sbi->node_inode->i_mapping;
+        struct page *npage;
+        int err;
+        npage = grab_cache_page(mapping, nid);
+        if (!npage)
+                return ERR_PTR(-ENOMEM);
+        err = read_node_page(npage, READ_SYNC);
+        if (!err) {
+                /* the node footer must carry the nid that was asked for */
+                BUG_ON(nid != nid_of_node(npage));
+                mark_page_accessed(npage);
+                return npage;
+        }
+        f2fs_put_page(npage, 1);
+        return ERR_PTR(err);
+}
+/*
+ * Return a locked page for the desired node page.
+ * And, readahead MAX_RA_NODE number of node pages.
+ * @parent: node page holding the nid array
+ * @start:  slot in @parent of the desired node
+ * Returns the locked page, or ERR_PTR(-ENOENT) when the slot is empty,
+ * ERR_PTR(-ENOMEM) when no cache page can be grabbed, ERR_PTR(-EIO) when
+ * the page has its error flag set, or ERR_PTR() of a read_node_page()
+ * error.
+ */
+struct page *get_node_page_ra(struct page *parent, int start)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb);
+        struct address_space *mapping = sbi->node_inode->i_mapping;
+        int i, end;
+        int err = 0;
+        nid_t nid;
+        nid_t ra_nid;
+        struct page *page;
+        /* First, try getting the desired direct node. */
+        nid = get_nid(parent, start, false);
+        if (!nid)
+                return ERR_PTR(-ENOENT);
+        page = find_get_page(mapping, nid);
+        if (page && PageUptodate(page))
+                goto page_hit;
+        f2fs_put_page(page, 0);
+repeat:
+        page = grab_cache_page(mapping, nid);
+        if (!page)
+                return ERR_PTR(-ENOMEM);
+        err = read_node_page(page, READA);
+        if (err) {
+                f2fs_put_page(page, 1);
+                return ERR_PTR(err);
+        }
+        /*
+         * Then, try readahead for siblings of the desired node.
+         * A separate variable is used for the sibling nids so that 'nid'
+         * still names the desired node if we have to jump back to repeat.
+         */
+        end = start + MAX_RA_NODE;
+        end = min(end, NIDS_PER_BLOCK);
+        for (i = start + 1; i < end; i++) {
+                ra_nid = get_nid(parent, i, false);
+                if (!ra_nid)
+                        continue;
+                ra_node_page(sbi, ra_nid);
+        }
+page_hit:
+        lock_page(page);
+        if (PageError(page)) {
+                f2fs_put_page(page, 1);
+                return ERR_PTR(-EIO);
+        }
+        /* Has the page been truncated? */
+        if (page->mapping != mapping) {
+                f2fs_put_page(page, 1);
+                goto repeat;
+        }
+        return page;
+}
+/*
+ * Write the in-memory inode back to its node page.
+ * Uses dn->node_page when that page is the inode page, otherwise
+ * dn->inode_page (locking it if the caller has not), and falls back to
+ * f2fs_write_inode() when no inode page is attached to dn.
+ */
+void sync_inode_page(struct dnode_of_data *dn)
+{
+        if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) {
+                update_inode(dn->inode, dn->node_page);
+                return;
+        }
+        if (!dn->inode_page) {
+                f2fs_write_inode(dn->inode, NULL);
+                return;
+        }
+        if (dn->inode_page_locked) {
+                /* caller already holds the page lock */
+                update_inode(dn->inode, dn->inode_page);
+                return;
+        }
+        lock_page(dn->inode_page);
+        update_inode(dn->inode, dn->inode_page);
+        unlock_page(dn->inode_page);
+}
+/*
+ * Write back dirty node pages from the node inode's mapping.
+ * With ino == 0 the mapping is scanned three times (steps 0, 1, 2 below);
+ * with a non-zero ino only step 2 runs and only pages whose ino_of_node()
+ * equals ino are written (fsync mode).
+ * Returns the number of dnode pages written in fsync mode; it stays 0
+ * when ino is 0.
+ */
+int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
+                                        struct writeback_control *wbc)
+{
+        struct address_space *mapping = sbi->node_inode->i_mapping;
+        pgoff_t index, end;
+        struct pagevec pvec;
+        int step = ino ? 2 : 0;
+        int nwritten = 0, wrote = 0;
+        pagevec_init(&pvec, 0);
+next_step:
+        index = 0;
+        end = LONG_MAX;
+        while (index <= end) {
+                int i, nr_pages;
+                nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                                PAGECACHE_TAG_DIRTY,
+                                min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+                if (nr_pages == 0)
+                        break;
+                for (i = 0; i < nr_pages; i++) {
+                        struct page *page = pvec.pages[i];
+                        /*
+                         * flushing sequence with step:
+                         * 0. indirect nodes
+                         * 1. dentry dnodes
+                         * 2. file dnodes
+                         */
+                        if (step == 0 && IS_DNODE(page))
+                                continue;
+                        if (step == 1 && (!IS_DNODE(page) ||
+                                                is_cold_node(page)))
+                                continue;
+                        if (step == 2 && (!IS_DNODE(page) ||
+                                                !is_cold_node(page)))
+                                continue;
+                        /*
+                         * If an fsync mode,
+                         * we should not skip writing node pages.
+                         */
+                        if (ino && ino_of_node(page) == ino)
+                                lock_page(page);
+                        else if (!trylock_page(page))
+                                continue;
+                        /* the page left this mapping while we were locking it */
+                        if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+                                unlock_page(page);
+                                continue;
+                        }
+                        if (ino && ino_of_node(page) != ino)
+                                goto continue_unlock;
+                        if (!PageDirty(page)) {
+                                /* someone wrote it for us */
+                                goto continue_unlock;
+                        }
+                        if (!clear_page_dirty_for_io(page))
+                                goto continue_unlock;
+                        /* called by fsync() */
+                        if (ino && IS_DNODE(page)) {
+                                int mark = !is_checkpointed_node(sbi, ino);
+                                set_fsync_mark(page, 1);
+                                if (IS_INODE(page))
+                                        set_dentry_mark(page, mark);
+                                nwritten++;
+                        } else {
+                                set_fsync_mark(page, 0);
+                                set_dentry_mark(page, 0);
+                        }
+                        mapping->a_ops->writepage(page, wbc);
+                        wrote++;
+                        if (--wbc->nr_to_write == 0)
+                                break;
+                }
+                pagevec_release(&pvec);
+                cond_resched();
+                /* budget exhausted: force step to 2 so no further pass starts */
+                if (wbc->nr_to_write == 0) {
+                        step = 2;
+                        break;
+                }
+        }
+        if (step < 2) {
+                step++;
+                goto next_step;
+        }
+        /* submit the pending node bio only if something was written */
+        if (wrote)
+                f2fs_submit_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL);
+        return nwritten;
+}
+/*
+ * ->writepage for node pages.
+ * For reclaim the page is redirtied and AOP_WRITEPAGE_ACTIVATE is
+ * returned with the page still locked. Otherwise the node is written to
+ * a new block address under the NODE_WRITE operation lock, its NAT entry
+ * is updated, and the page is unlocked before returning 0.
+ * A node that has already been truncated (no block address) is not
+ * written, but the lock, the page lock and the dirty-node count are
+ * still released so this path cannot leave them held.
+ */
+static int f2fs_write_node_page(struct page *page,
+                                struct writeback_control *wbc)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+        nid_t nid;
+        unsigned int nofs;
+        block_t new_addr;
+        struct node_info ni;
+        if (wbc->for_reclaim) {
+                /* set_page_dirty() below counts the page again */
+                dec_page_count(sbi, F2FS_DIRTY_NODES);
+                wbc->pages_skipped++;
+                set_page_dirty(page);
+                return AOP_WRITEPAGE_ACTIVATE;
+        }
+        wait_on_page_writeback(page);
+        mutex_lock_op(sbi, NODE_WRITE);
+        /* get old block addr of this node page */
+        nid = nid_of_node(page);
+        nofs = ofs_of_node(page);
+        BUG_ON(page->index != nid);
+        get_node_info(sbi, nid, &ni);
+        /* This page is already truncated */
+        if (ni.blk_addr == NULL_ADDR) {
+                /* undo everything taken so far before bailing out */
+                dec_page_count(sbi, F2FS_DIRTY_NODES);
+                mutex_unlock_op(sbi, NODE_WRITE);
+                unlock_page(page);
+                return 0;
+        }
+        set_page_writeback(page);
+        /* insert node offset */
+        write_node_page(sbi, page, nid, ni.blk_addr, &new_addr);
+        set_node_addr(sbi, &ni, new_addr);
+        dec_page_count(sbi, F2FS_DIRTY_NODES);
+        mutex_unlock_op(sbi, NODE_WRITE);
+        unlock_page(page);
+        return 0;
+}
+/*
+ * ->writepages for node pages. Always returns 0.
+ * Nothing is written for kupdate writeback or when no node page is
+ * dirty. If try_to_free_nats() returns non-zero, a checkpoint is written
+ * instead of the node pages.
+ */
+static int f2fs_write_node_pages(struct address_space *mapping,
+                            struct writeback_control *wbc)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+        struct block_device *bdev = sbi->sb->s_bdev;
+        long budget = wbc->nr_to_write;
+        if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_NODES) == 0)
+                return 0;
+        if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) {
+                write_checkpoint(sbi, false, false);
+                return 0;
+        }
+        /* limit this pass to bio_get_nr_vecs() pages ... */
+        wbc->nr_to_write = bio_get_nr_vecs(bdev);
+        sync_node_pages(sbi, 0, wbc);
+        /* ... then charge what was consumed against the caller's budget */
+        wbc->nr_to_write = budget -
+                (bio_get_nr_vecs(bdev) - wbc->nr_to_write);
+        return 0;
+}
+/*
+ * ->set_page_dirty for node pages. The page is always marked up to date.
+ * Returns 1 when the page was newly dirtied (and counted in
+ * F2FS_DIRTY_NODES), 0 when it was dirty already.
+ */
+static int f2fs_set_node_page_dirty(struct page *page)
+{
+        struct address_space *mapping = page->mapping;
+        struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+        SetPageUptodate(page);
+        if (PageDirty(page))
+                return 0;
+        __set_page_dirty_nobuffers(page);
+        inc_page_count(sbi, F2FS_DIRTY_NODES);
+        SetPagePrivate(page);
+        return 1;
+}
+/*
+ * ->invalidatepage for node pages: a page that is still dirty is removed
+ * from the F2FS_DIRTY_NODES count, and the private flag is cleared.
+ * 'offset' is not used.
+ */
+static void f2fs_invalidate_node_page(struct page *page, unsigned long offset)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+        if (PageDirty(page))
+                dec_page_count(sbi, F2FS_DIRTY_NODES);
+        ClearPagePrivate(page);
+}
+/*
+ * ->releasepage for node pages: clears the private flag and returns 0.
+ * The 'wait' argument is not used.
+ */
+static int f2fs_release_node_page(struct page *page, gfp_t wait)
+{
+        ClearPagePrivate(page);
+        return 0;
+}
+/*
+ * Address-space operations used for f2fs node pages.
+ */
+const struct address_space_operations f2fs_node_aops = {
+        .set_page_dirty = f2fs_set_node_page_dirty,
+        .writepage      = f2fs_write_node_page,
+        .writepages     = f2fs_write_node_pages,
+        .releasepage    = f2fs_release_node_page,
+        .invalidatepage = f2fs_invalidate_node_page,
+};
+/*
+ * Find the free_nid with nid 'n' on the list 'head'.
+ * Returns the entry, or NULL when no entry matches.
+ */
+static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head)
+{
+        struct list_head *pos;
+        struct free_nid *cur;
+        list_for_each(pos, head) {
+                cur = list_entry(pos, struct free_nid, list);
+                if (cur->nid == n)
+                        return cur;
+        }
+        return NULL;
+}
+/*
+ * Unlink @i from the list it is on and return it to free_nid_slab.
+ * nm_i->fcnt is not touched here; callers adjust it when needed.
+ */
+static void __del_from_free_nid_list(struct free_nid *i)
+{
+        list_del(&i->list);
+        kmem_cache_free(free_nid_slab, i);
+}
+/*
+ * Append @nid to the free nid list in NID_NEW state.
+ * Returns 1 when a new entry was added, 0 when the list already holds more
+ * than 2 * MAX_FREE_NIDS entries or already contains @nid.
+ */
+static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
+{
+        struct free_nid *fnid;
+        int added = 0;
+        if (nm_i->fcnt > 2 * MAX_FREE_NIDS)
+                return 0;
+        /* keep trying until the slab allocation succeeds */
+        while (!(fnid = kmem_cache_alloc(free_nid_slab, GFP_NOFS)))
+                cond_resched();
+        fnid->nid = nid;
+        fnid->state = NID_NEW;
+        spin_lock(&nm_i->free_nid_list_lock);
+        if (!__lookup_free_nid_list(nid, &nm_i->free_nid_list)) {
+                list_add_tail(&fnid->list, &nm_i->free_nid_list);
+                nm_i->fcnt++;
+                added = 1;
+        }
+        spin_unlock(&nm_i->free_nid_list_lock);
+        /* nid was already on the list: drop the unused entry */
+        if (!added)
+                kmem_cache_free(free_nid_slab, fnid);
+        return added;
+}
+/*
+ * Drop @nid from the free nid list if it is present and still in NID_NEW
+ * state. Entries in any other state (e.g. NID_ALLOC) are left in place.
+ */
+static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
+{
+        struct free_nid *i;
+        spin_lock(&nm_i->free_nid_list_lock);
+        i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
+        if (i && i->state == NID_NEW) {
+                __del_from_free_nid_list(i);
+                nm_i->fcnt--;
+        }
+        spin_unlock(&nm_i->free_nid_list_lock);
+}
+/*
+ * Walk one NAT block from the slot of @start_nid to the end of the block and
+ * register every nid whose block address is NULL_ADDR as free.
+ * Returns the number of nids that add_free_nid() actually added.
+ */
+static int scan_nat_page(struct f2fs_nm_info *nm_i,
+                        struct page *nat_page, nid_t start_nid)
+{
+        struct f2fs_nat_block *nat_blk = page_address(nat_page);
+        /* 0 nid should not be used */
+        nid_t nid = start_nid ? start_nid : 1;
+        int idx = nid % NAT_ENTRY_PER_BLOCK;
+        int found = 0;
+        while (idx < NAT_ENTRY_PER_BLOCK) {
+                block_t addr = le32_to_cpu(nat_blk->entries[idx].block_addr);
+                BUG_ON(addr == NEW_ADDR);
+                if (addr == NULL_ADDR)
+                        found += add_free_nid(nm_i, nid);
+                idx++;
+                nid++;
+        }
+        return found;
+}
+/*
+ * Refill the free nid list. NAT blocks are scanned starting at
+ * nm_i->next_scan_nid; the result is then reconciled with the NAT journal
+ * kept in the hot-data current segment summary and with the NAT cache.
+ */
+static void build_free_nids(struct f2fs_sb_info *sbi)
+{
+        struct free_nid *fnid, *next_fnid;
+        struct f2fs_nm_info *nm_i = NM_I(sbi);
+        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+        struct f2fs_summary_block *sum = curseg->sum_blk;
+        nid_t nid = 0;
+        bool is_cycled = false;
+        int fcnt = 0;
+        int i;
+        /* remember where this scan started so a full wrap can be detected */
+        nid = nm_i->next_scan_nid;
+        nm_i->init_scan_nid = nid;
+        ra_nat_pages(sbi, nid);
+        while (1) {
+                struct page *page = get_current_nat_page(sbi, nid);
+                fcnt += scan_nat_page(nm_i, page, nid);
+                f2fs_put_page(page, 1);
+                /* advance to the first nid of the next NAT block */
+                nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
+                if (nid >= nm_i->max_nid) {
+                        nid = 0;
+                        is_cycled = true;
+                }
+                /* enough free nids collected */
+                if (fcnt > MAX_FREE_NIDS)
+                        break;
+                /* wrapped around and reached the starting point again */
+                if (is_cycled && nm_i->init_scan_nid <= nid)
+                        break;
+        }
+        /* the next call resumes from here */
+        nm_i->next_scan_nid = nid;
+        /* find free nids from current sum_pages */
+        mutex_lock(&curseg->curseg_mutex);
+        for (i = 0; i < nats_in_cursum(sum); i++) {
+                block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr);
+                nid = le32_to_cpu(nid_in_journal(sum, i));
+                /* the journal entry decides whether the nid is free */
+                if (addr == NULL_ADDR)
+                        add_free_nid(nm_i, nid);
+                else
+                        remove_free_nid(nm_i, nid);
+        }
+        mutex_unlock(&curseg->curseg_mutex);
+        /* remove the free nids from current allocated nids */
+        /*
+         * NOTE(review): this walk does not hold free_nid_list_lock itself;
+         * remove_free_nid() takes it per entry. Confirm that the callers'
+         * serialization makes the unlocked traversal safe.
+         */
+        list_for_each_entry_safe(fnid, next_fnid, &nm_i->free_nid_list, list) {
+                struct nat_entry *ne;
+                read_lock(&nm_i->nat_tree_lock);
+                ne = __lookup_nat_cache(nm_i, fnid->nid);
+                if (ne && nat_get_blkaddr(ne) != NULL_ADDR)
+                        remove_free_nid(nm_i, fnid->nid);
+                read_unlock(&nm_i->nat_tree_lock);
+        }
+}
+/*
+ * Allocate a node id.
+ * On success, returns true and stores the new nid in *nid; the matching
+ * free list entry switches to NID_ALLOC and stays on the list.
+ * Returns false when no free nid exists even after rebuilding the list.
+ * The returned nid can be used as an ino as well as a nid when an inode
+ * is created.
+ */
+bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
+{
+        struct f2fs_nm_info *nm_i = NM_I(sbi);
+        struct free_nid *i = NULL;
+        struct list_head *this;
+retry:
+        mutex_lock(&nm_i->build_lock);
+        if (!nm_i->fcnt) {
+                /* scan NAT in order to build free nid list */
+                build_free_nids(sbi);
+                if (!nm_i->fcnt) {
+                        mutex_unlock(&nm_i->build_lock);
+                        return false;
+                }
+        }
+        mutex_unlock(&nm_i->build_lock);
+        /*
+         * We check fcnt again since previous check is racy as
+         * we didn't hold free_nid_list_lock. So other thread
+         * could consume all of free nids.
+         */
+        spin_lock(&nm_i->free_nid_list_lock);
+        if (!nm_i->fcnt) {
+                spin_unlock(&nm_i->free_nid_list_lock);
+                goto retry;
+        }
+        BUG_ON(list_empty(&nm_i->free_nid_list));
+        /* take the first entry that has not been handed out yet */
+        list_for_each(this, &nm_i->free_nid_list) {
+                i = list_entry(this, struct free_nid, list);
+                if (i->state == NID_NEW)
+                        break;
+        }
+        /* fcnt was non-zero, so a NID_NEW entry is expected to exist */
+        BUG_ON(i->state != NID_NEW);
+        *nid = i->nid;
+        i->state = NID_ALLOC;
+        nm_i->fcnt--;
+        spin_unlock(&nm_i->free_nid_list_lock);
+        return true;
+}
+/*
+ * alloc_nid() should be called prior to this function.
+ * Completes the allocation by deleting @nid's entry from the free nid list.
+ * The entry must be in NID_ALLOC state (BUG otherwise); if no entry is
+ * found, nothing happens.
+ */
+void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
+{
+        struct f2fs_nm_info *nm_i = NM_I(sbi);
+        struct free_nid *i;
+        spin_lock(&nm_i->free_nid_list_lock);
+        i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
+        if (i) {
+                BUG_ON(i->state != NID_ALLOC);
+                __del_from_free_nid_list(i);
+        }
+        spin_unlock(&nm_i->free_nid_list_lock);
+}
+/*
+ * alloc_nid() should be called prior to this function.
+ * Undoes the allocation: the NID_ALLOC entry is dropped through
+ * alloc_nid_done() and @nid is offered back through add_free_nid().
+ */
+void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
+{
+        alloc_nid_done(sbi, nid);
+        add_free_nid(NM_I(sbi), nid);
+}
+/*
+ * Move a recovered node page to @new_blkaddr: rewrite it there via
+ * rewrite_node_page(), record the new address for the node described by
+ * @ni, and clear the page's dirty state.
+ */
+void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
+                struct f2fs_summary *sum, struct node_info *ni,
+                block_t new_blkaddr)
+{
+        rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
+        set_node_addr(sbi, ni, new_blkaddr);
+        clear_node_page_dirty(page);
+}
+/*
+ * Recreate the inode page for the inode that owns @page.
+ * A page for that ino is grabbed in the node mapping, the inode fields that
+ * precede i_ext are copied from @page, and size, block count, link count
+ * and xattr nid are reset. The node is then recorded at NEW_ADDR.
+ * Returns 0 on success or -ENOMEM when the page cannot be grabbed.
+ */
+int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
+{
+        struct address_space *mapping = sbi->node_inode->i_mapping;
+        struct f2fs_node *src, *dst;
+        nid_t ino = ino_of_node(page);
+        struct node_info old_ni, new_ni;
+        struct page *ipage;
+        ipage = grab_cache_page(mapping, ino);
+        if (!ipage)
+                return -ENOMEM;
+        /* Should not use this inode  from free nid list */
+        remove_free_nid(NM_I(sbi), ino);
+        get_node_info(sbi, ino, &old_ni);
+        SetPageUptodate(ipage);
+        /* reset == true: the whole node block is zeroed before the footer is set */
+        fill_node_footer(ipage, ino, ino, 0, true);
+        src = (struct f2fs_node *)page_address(page);
+        dst = (struct f2fs_node *)page_address(ipage);
+        /* copy only the part of the inode that lies before i_ext */
+        memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i);
+        dst->i.i_size = 0;
+        dst->i.i_blocks = cpu_to_le64(1);
+        dst->i.i_links = cpu_to_le32(1);
+        dst->i.i_xattr_nid = 0;
+        new_ni = old_ni;
+        new_ni.ino = ino;
+        set_node_addr(sbi, &new_ni, NEW_ADDR);
+        inc_valid_inode_count(sbi);
+        f2fs_put_page(ipage, 1);
+        return 0;
+}
+/*
+ * Rebuild the summary entries of node segment @segno.
+ * Every block of the segment is read into a temporary page and the nid
+ * stored in its node footer is copied into the matching entry of @sum;
+ * version and ofs_in_node are set to 0.
+ * Returns 0, or -ENOMEM when the temporary page cannot be allocated.
+ * NOTE(review): a failed f2fs_readpage() stops the scan but still returns 0,
+ * as before — confirm whether callers want that error reported.
+ */
+int restore_node_summary(struct f2fs_sb_info *sbi,
+                        unsigned int segno, struct f2fs_summary_block *sum)
+{
+        struct f2fs_node *rn;
+        struct f2fs_summary *sum_entry;
+        struct page *page;
+        block_t addr;
+        int i, last_offset;
+        /* alloc temporal page for read node */
+        page = alloc_page(GFP_NOFS | __GFP_ZERO);
+        /* alloc_page() signals failure with NULL, not with an ERR_PTR */
+        if (!page)
+                return -ENOMEM;
+        lock_page(page);
+        /* scan the node segment */
+        last_offset = sbi->blocks_per_seg;
+        addr = START_BLOCK(sbi, segno);
+        sum_entry = &sum->entries[0];
+        for (i = 0; i < last_offset; i++, sum_entry++) {
+                if (f2fs_readpage(sbi, page, addr, READ_SYNC))
+                        goto out;
+                rn = (struct f2fs_node *)page_address(page);
+                /* footer.nid is already little-endian, copy it as is */
+                sum_entry->nid = rn->footer.nid;
+                sum_entry->version = 0;
+                sum_entry->ofs_in_node = 0;
+                addr++;
+                /*
+                 * In order to read next node page,
+                 * we must clear PageUptodate flag.
+                 */
+                ClearPageUptodate(page);
+        }
+out:
+        unlock_page(page);
+        __free_pages(page, 0);
+        return 0;
+}
+/*
+ * If the NAT journal in the hot-data current segment summary is full
+ * (nats_in_cursum() has reached NAT_JOURNAL_ENTRIES), move every journal
+ * entry into the NAT cache as a dirty entry and empty the journal.
+ * Returns true when the journal was flushed, false when it still had room.
+ * curseg_mutex is taken and released inside this function in both cases.
+ */
+static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_nm_info *nm_i = NM_I(sbi);
+        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+        struct f2fs_summary_block *sum = curseg->sum_blk;
+        int i;
+        mutex_lock(&curseg->curseg_mutex);
+        if (nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) {
+                mutex_unlock(&curseg->curseg_mutex);
+                return false;
+        }
+        for (i = 0; i < nats_in_cursum(sum); i++) {
+                struct nat_entry *ne;
+                struct f2fs_nat_entry raw_ne;
+                nid_t nid = le32_to_cpu(nid_in_journal(sum, i));
+                raw_ne = nat_in_journal(sum, i);
+retry:
+                write_lock(&nm_i->nat_tree_lock);
+                ne = __lookup_nat_cache(nm_i, nid);
+                if (ne) {
+                        /* a cached entry exists: only mark it dirty, keep its values */
+                        __set_nat_cache_dirty(nm_i, ne);
+                        write_unlock(&nm_i->nat_tree_lock);
+                        continue;
+                }
+                ne = grab_nat_entry(nm_i, nid);
+                if (!ne) {
+                        /* could not get a cache entry: drop the lock and try again */
+                        write_unlock(&nm_i->nat_tree_lock);
+                        goto retry;
+                }
+                /* populate the new cache entry from the journal copy */
+                nat_set_blkaddr(ne, le32_to_cpu(raw_ne.block_addr));
+                nat_set_ino(ne, le32_to_cpu(raw_ne.ino));
+                nat_set_version(ne, raw_ne.version);
+                __set_nat_cache_dirty(nm_i, ne);
+                write_unlock(&nm_i->nat_tree_lock);
+        }
+        /* i equals the number of journal entries processed */
+        update_nats_in_cursum(sum, -i);
+        mutex_unlock(&curseg->curseg_mutex);
+        return true;
+}
+/*
+ * This function is called during the checkpointing process.
+ * It writes every dirty NAT cache entry either into the NAT journal of the
+ * hot-data current segment summary or into its NAT block page, then asks
+ * try_to_free_nats() to shrink the cache.
+ * When flush_nats_in_journal() has just emptied the journal, all entries go
+ * to NAT pages and curseg_mutex is not held for the walk; otherwise the
+ * mutex is held for the whole walk.
+ */
+void flush_nat_entries(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_nm_info *nm_i = NM_I(sbi);
+        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+        struct f2fs_summary_block *sum = curseg->sum_blk;
+        struct list_head *cur, *n;
+        struct page *page = NULL;
+        struct f2fs_nat_block *nat_blk = NULL;
+        nid_t start_nid = 0, end_nid = 0;
+        bool flushed;
+        flushed = flush_nats_in_journal(sbi);
+        if (!flushed)
+                mutex_lock(&curseg->curseg_mutex);
+        /* 1) flush dirty nat caches */
+        list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) {
+                struct nat_entry *ne;
+                nid_t nid;
+                struct f2fs_nat_entry raw_ne;
+                int offset = -1;
+                block_t old_blkaddr, new_blkaddr;
+                ne = list_entry(cur, struct nat_entry, list);
+                nid = nat_get_nid(ne);
+                /* entries still at NEW_ADDR are skipped and stay dirty */
+                if (nat_get_blkaddr(ne) == NEW_ADDR)
+                        continue;
+                if (flushed)
+                        goto to_nat_page;
+                /* if there is room for nat entries in curseg->sumpage */
+                offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1);
+                if (offset >= 0) {
+                        raw_ne = nat_in_journal(sum, offset);
+                        old_blkaddr = le32_to_cpu(raw_ne.block_addr);
+                        goto flush_now;
+                }
+to_nat_page:
+                /* switch pages when nid falls outside the currently held NAT block */
+                if (!page || (start_nid > nid || nid > end_nid)) {
+                        if (page) {
+                                f2fs_put_page(page, 1);
+                                page = NULL;
+                        }
+                        start_nid = START_NID(nid);
+                        end_nid = start_nid + NAT_ENTRY_PER_BLOCK - 1;
+                        /*
+                         * get nat block with dirty flag, increased reference
+                         * count, mapped and lock
+                         */
+                        page = get_next_nat_page(sbi, start_nid);
+                        nat_blk = page_address(page);
+                }
+                BUG_ON(!nat_blk);
+                raw_ne = nat_blk->entries[nid - start_nid];
+                old_blkaddr = le32_to_cpu(raw_ne.block_addr);
+flush_now:
+                new_blkaddr = nat_get_blkaddr(ne);
+                raw_ne.ino = cpu_to_le32(nat_get_ino(ne));
+                raw_ne.block_addr = cpu_to_le32(new_blkaddr);
+                raw_ne.version = nat_get_version(ne);
+                /* offset < 0 means the entry goes to the NAT page, not the journal */
+                if (offset < 0) {
+                        nat_blk->entries[nid - start_nid] = raw_ne;
+                } else {
+                        nat_in_journal(sum, offset) = raw_ne;
+                        nid_in_journal(sum, offset) = cpu_to_le32(nid);
+                }
+                if (nat_get_blkaddr(ne) == NULL_ADDR) {
+                        write_lock(&nm_i->nat_tree_lock);
+                        __del_from_nat_cache(nm_i, ne);
+                        write_unlock(&nm_i->nat_tree_lock);
+                        /* We can reuse this freed nid at this point */
+                        add_free_nid(NM_I(sbi), nid);
+                } else {
+                        write_lock(&nm_i->nat_tree_lock);
+                        __clear_nat_cache_dirty(nm_i, ne);
+                        ne->checkpointed = true;
+                        write_unlock(&nm_i->nat_tree_lock);
+                }
+        }
+        if (!flushed)
+                mutex_unlock(&curseg->curseg_mutex);
+        /* page is still NULL here if no entry was written to a NAT page */
+        f2fs_put_page(page, 1);
+        /* 2) shrink nat caches if necessary */
+        try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD);
+}
+/*
+ * Initialise the already allocated node manager info of @sbi from the raw
+ * super block and the checkpoint: NAT geometry, counters, lists, locks, scan
+ * positions and a private copy of the NAT version bitmap.
+ * Returns 0, -ENOMEM when the bitmap cannot be allocated, or -EFAULT when
+ * __bitmap_ptr() yields no bitmap. nm_i->nat_bitmap is not freed here on
+ * the -EFAULT path.
+ */
+static int init_node_manager(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
+        struct f2fs_nm_info *nm_i = NM_I(sbi);
+        unsigned char *version_bitmap;
+        unsigned int nat_segs, nat_blocks;
+        nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
+        /* segment_count_nat includes pair segment so divide by 2. */
+        nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
+        nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
+        nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
+        nm_i->fcnt = 0;
+        nm_i->nat_cnt = 0;
+        INIT_LIST_HEAD(&nm_i->free_nid_list);
+        INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
+        INIT_LIST_HEAD(&nm_i->nat_entries);
+        INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
+        mutex_init(&nm_i->build_lock);
+        spin_lock_init(&nm_i->free_nid_list_lock);
+        rwlock_init(&nm_i->nat_tree_lock);
+        nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
+        /* both scan positions start at the checkpoint's next_free_nid */
+        nm_i->init_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
+        nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
+        nm_i->nat_bitmap = kzalloc(nm_i->bitmap_size, GFP_KERNEL);
+        if (!nm_i->nat_bitmap)
+                return -ENOMEM;
+        version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
+        if (!version_bitmap)
+                return -EFAULT;
+        /* copy version bitmap */
+        memcpy(nm_i->nat_bitmap, version_bitmap, nm_i->bitmap_size);
+        return 0;
+}
+/*
+ * Allocate and initialise the node manager of @sbi, then build the first
+ * batch of free nids.
+ * Returns 0 on success, -ENOMEM when the allocation fails, or the error
+ * reported by init_node_manager(). sbi->nm_info is left allocated when
+ * initialisation fails.
+ */
+int build_node_manager(struct f2fs_sb_info *sbi)
+{
+        int ret;
+        sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL);
+        if (!sbi->nm_info)
+                return -ENOMEM;
+        ret = init_node_manager(sbi);
+        /* only scan for free nids once initialisation succeeded */
+        if (!ret)
+                build_free_nids(sbi);
+        return ret;
+}
+/*
+ * Tear down the node manager of @sbi: free every free nid entry and every
+ * NAT cache entry, release the NAT bitmap and the manager itself, and reset
+ * sbi->nm_info to NULL. Does nothing when no node manager is attached.
+ */
+void destroy_node_manager(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_nm_info *nm_i = NM_I(sbi);
+        struct free_nid *i, *next_i;
+        struct nat_entry *natvec[NATVEC_SIZE];
+        nid_t nid = 0;
+        unsigned int found;
+        if (!nm_i)
+                return;
+        /* destroy free nid list */
+        spin_lock(&nm_i->free_nid_list_lock);
+        list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
+                /* no allocation may still be in flight at this point */
+                BUG_ON(i->state == NID_ALLOC);
+                __del_from_free_nid_list(i);
+                nm_i->fcnt--;
+        }
+        BUG_ON(nm_i->fcnt);
+        spin_unlock(&nm_i->free_nid_list_lock);
+        /* destroy nat cache */
+        write_lock(&nm_i->nat_tree_lock);
+        /* look entries up in batches of at most NATVEC_SIZE */
+        while ((found = __gang_lookup_nat_cache(nm_i,
+                                        nid, NATVEC_SIZE, natvec))) {
+                unsigned idx;
+                for (idx = 0; idx < found; idx++) {
+                        struct nat_entry *e = natvec[idx];
+                        /* the next lookup starts right after this nid */
+                        nid = nat_get_nid(e) + 1;
+                        __del_from_nat_cache(nm_i, e);
+                }
+        }
+        BUG_ON(nm_i->nat_cnt);
+        write_unlock(&nm_i->nat_tree_lock);
+        kfree(nm_i->nat_bitmap);
+        sbi->nm_info = NULL;
+        kfree(nm_i);
+}
+/*
+ * Create the slab caches for struct nat_entry and struct free_nid.
+ * Returns 0 on success or -ENOMEM; on failure no cache is left behind.
+ */
+int create_node_manager_caches(void)
+{
+        nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
+                        sizeof(struct nat_entry), NULL);
+        if (!nat_entry_slab)
+                goto fail;
+        free_nid_slab = f2fs_kmem_cache_create("free_nid",
+                        sizeof(struct free_nid), NULL);
+        if (!free_nid_slab)
+                goto destroy_nat_entry;
+        return 0;
+destroy_nat_entry:
+        kmem_cache_destroy(nat_entry_slab);
+fail:
+        return -ENOMEM;
+}
+/*
+ * Destroy the two slab caches set up by create_node_manager_caches().
+ */
+void destroy_node_manager_caches(void)
+{
+        kmem_cache_destroy(free_nid_slab);
+        kmem_cache_destroy(nat_entry_slab);
+}
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
new file mode 100644
index 000000000000..afdb130f782e
--- /dev/null
+++ b/fs/f2fs/node.h
@@ -0,0 +1,353 @@
+/*
+ * fs/f2fs/node.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+/*
+ * start node id of a node block dedicated to the given node id
+ * (the argument is parenthesised so that expressions expand correctly)
+ */
+#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK)
+/* node block offset on the NAT area dedicated to the given start node id */
+#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK)
+/* # of pages to perform readahead before building free nids */
+#define FREE_NID_PAGES 4
+/* maximum # of free node ids to produce during build_free_nids */
+#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES)
+/* maximum readahead size for node during getting data blocks */
+#define MAX_RA_NODE             128
+/* maximum cached nat entries to manage memory footprint */
+#define NM_WOUT_THRESHOLD       (64 * NAT_ENTRY_PER_BLOCK)
+/* vector size for gang look-up from nat cache that consists of radix tree */
+#define NATVEC_SIZE     64
+/*
+ * For node information
+ * In-memory description of one node: its id, owning inode, block address
+ * and version.
+ */
+struct node_info {
+        nid_t nid;              /* node id */
+        nid_t ino;              /* inode number of the node's owner */
+        block_t blk_addr;       /* block address of the node */
+        unsigned char version;  /* version of the node */
+};
+/*
+ * One cached NAT entry. It is linked on either the clean list or the dirty
+ * list of the node manager through @list.
+ */
+struct nat_entry {
+        struct list_head list;  /* for clean or dirty nat list */
+        bool checkpointed;      /* whether it is checkpointed or not */
+        struct node_info ni;    /* in-memory node information */
+};
+/*
+ * Accessors for the node_info embedded in a struct nat_entry pointer.
+ * Arguments are parenthesised so that any pointer expression can be passed.
+ */
+#define nat_get_nid(nat)                ((nat)->ni.nid)
+#define nat_set_nid(nat, n)             ((nat)->ni.nid = (n))
+#define nat_get_blkaddr(nat)            ((nat)->ni.blk_addr)
+#define nat_set_blkaddr(nat, b)         ((nat)->ni.blk_addr = (b))
+#define nat_get_ino(nat)                ((nat)->ni.ino)
+#define nat_set_ino(nat, i)             ((nat)->ni.ino = (i))
+#define nat_get_version(nat)            ((nat)->ni.version)
+#define nat_set_version(nat, v)         ((nat)->ni.version = (v))
+/*
+ * Move a nat entry to the dirty or to the clean list of the node manager.
+ * The trailing semicolon is part of the expansion and is kept so existing
+ * users are unaffected.
+ */
+#define __set_nat_cache_dirty(nm_i, ne)                                 \
+        list_move_tail(&(ne)->list, &(nm_i)->dirty_nat_entries);
+#define __clear_nat_cache_dirty(nm_i, ne)                               \
+        list_move_tail(&(ne)->list, &(nm_i)->nat_entries);
+/* pre-increments the given lvalue and yields the new value */
+#define inc_node_version(version)       (++(version))
+/*
+ * Fill @ni from the on-disk NAT entry @raw_ne, converting ino and block
+ * address from little-endian. ni->nid is not set here.
+ */
+static inline void node_info_from_raw_nat(struct node_info *ni,
+                                                struct f2fs_nat_entry *raw_ne)
+{
+        ni->ino = le32_to_cpu(raw_ne->ino);
+        ni->blk_addr = le32_to_cpu(raw_ne->block_addr);
+        ni->version = raw_ne->version;
+}
+/*
+ * For free nid management
+ */
+enum nid_state {
+        NID_NEW,        /* newly added to free nid list */
+        NID_ALLOC       /* it is allocated */
+};
+/* One entry of the node manager's free nid list. */
+struct free_nid {
+        struct list_head list;  /* for free node id list */
+        nid_t nid;              /* node id */
+        int state;              /* in use or not: NID_NEW or NID_ALLOC */
+};
+/*
+ * Report the nid at the head of the free nid list without removing it.
+ * Returns 0 and stores the nid in *nid, or -1 (leaving *nid untouched)
+ * when no free nid is available.
+ */
+static inline int next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
+{
+        struct f2fs_nm_info *nm_i = NM_I(sbi);
+        struct free_nid *fnid;
+        int err = -1;
+        spin_lock(&nm_i->free_nid_list_lock);
+        /*
+         * Test under the lock: checked outside it, the list could be
+         * emptied before list_entry() runs, and the list head itself would
+         * then be read as a free_nid.
+         */
+        if (nm_i->fcnt > 0 && !list_empty(&nm_i->free_nid_list)) {
+                fnid = list_entry(nm_i->free_nid_list.next,
+                                        struct free_nid, list);
+                *nid = fnid->nid;
+                err = 0;
+        }
+        spin_unlock(&nm_i->free_nid_list_lock);
+        return err;
+}
+/*
+ * inline functions
+ */
+/*
+ * Copy the in-memory NAT bitmap (nm_i->bitmap_size bytes) to @addr.
+ * The caller provides a buffer of at least that size.
+ */
+static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr)
+{
+        struct f2fs_nm_info *nm_i = NM_I(sbi);
+        memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size);
+}
+/*
+ * Return the block address of the NAT block currently in use for the NAT
+ * block that starts at nid @start.
+ * The segment offset is doubled in the address calculation, and the NAT
+ * bitmap bit of the block decides whether blocks_per_seg is added on top.
+ */
+static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start)
+{
+        struct f2fs_nm_info *nm_i = NM_I(sbi);
+        pgoff_t block_off;
+        pgoff_t block_addr;
+        int seg_off;
+        block_off = NAT_BLOCK_OFFSET(start);
+        seg_off = block_off >> sbi->log_blocks_per_seg;
+        /* base + 2 * (segment offset in blocks) + offset inside the segment */
+        block_addr = (pgoff_t)(nm_i->nat_blkaddr +
+                (seg_off << sbi->log_blocks_per_seg << 1) +
+                (block_off & ((1 << sbi->log_blocks_per_seg) - 1)));
+        /* a set bit selects the block one segment further on */
+        if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
+                block_addr += sbi->blocks_per_seg;
+        return block_addr;
+}
+/*
+ * Given the address of a NAT block, return the address of its counterpart
+ * one segment away: one segment back when the block lies in an
+ * odd-numbered segment of the NAT area, one segment forward otherwise.
+ */
+static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi,
+                                                pgoff_t block_addr)
+{
+        struct f2fs_nm_info *nm_i = NM_I(sbi);
+        /* work on the offset relative to the start of the NAT area */
+        pgoff_t rel = block_addr - nm_i->nat_blkaddr;
+        bool odd_seg = (rel >> sbi->log_blocks_per_seg) % 2;
+        if (odd_seg)
+                rel -= sbi->blocks_per_seg;
+        else
+                rel += sbi->blocks_per_seg;
+        return rel + nm_i->nat_blkaddr;
+}
+/*
+ * Toggle the NAT bitmap bit of the NAT block that starts at @start_nid.
+ */
+static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid)
+{
+        unsigned int block_off = NAT_BLOCK_OFFSET(start_nid);
+        if (!f2fs_test_bit(block_off, nm_i->nat_bitmap))
+                f2fs_set_bit(block_off, nm_i->nat_bitmap);
+        else
+                f2fs_clear_bit(block_off, nm_i->nat_bitmap);
+}
+/*
+ * Write nid, ino and node offset into the footer of the node block held in
+ * @page. When @reset is true the whole node block is zeroed first.
+ * @ofs is stored shifted left by OFFSET_BIT_SHIFT, so the flag bits below
+ * that shift end up cleared.
+ */
+static inline void fill_node_footer(struct page *page, nid_t nid,
+                                nid_t ino, unsigned int ofs, bool reset)
+{
+        void *kaddr = page_address(page);
+        struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+        if (reset)
+                memset(rn, 0, sizeof(*rn));
+        rn->footer.nid = cpu_to_le32(nid);
+        rn->footer.ino = cpu_to_le32(ino);
+        rn->footer.flag = cpu_to_le32(ofs << OFFSET_BIT_SHIFT);
+}
+/*
+ * Copy the node footer of the node block in @src into the one in @dst.
+ */
+static inline void copy_node_footer(struct page *dst, struct page *src)
+{
+        struct f2fs_node *from = (struct f2fs_node *)page_address(src);
+        struct f2fs_node *to = (struct f2fs_node *)page_address(dst);
+        memcpy(&to->footer, &from->footer, sizeof(struct node_footer));
+}
+/*
+ * Stamp the footer of the node block in @page with the checkpoint version
+ * of the current checkpoint and with @blkaddr as the next block address.
+ */
+static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+        void *kaddr = page_address(page);
+        struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+        /* copied without byte-order conversion */
+        rn->footer.cp_ver = ckpt->checkpoint_ver;
+        rn->footer.next_blkaddr = cpu_to_le32(blkaddr);
+}
+/* Return the inode number recorded in the footer of @node_page. */
+static inline nid_t ino_of_node(struct page *node_page)
+{
+        struct f2fs_node *node = (struct f2fs_node *)page_address(node_page);
+        return le32_to_cpu(node->footer.ino);
+}
+/* Return the node id recorded in the footer of @node_page. */
+static inline nid_t nid_of_node(struct page *node_page)
+{
+        struct f2fs_node *node = (struct f2fs_node *)page_address(node_page);
+        return le32_to_cpu(node->footer.nid);
+}
+/*
+ * Return the node offset of @node_page, i.e. the footer flag word shifted
+ * right by OFFSET_BIT_SHIFT.
+ */
+static inline unsigned int ofs_of_node(struct page *node_page)
+{
+        struct f2fs_node *node = (struct f2fs_node *)page_address(node_page);
+        unsigned footer_flag = le32_to_cpu(node->footer.flag);
+        return footer_flag >> OFFSET_BIT_SHIFT;
+}
+/* Return the checkpoint version recorded in the footer of @node_page. */
+static inline unsigned long long cpver_of_node(struct page *node_page)
+{
+        struct f2fs_node *node = (struct f2fs_node *)page_address(node_page);
+        return le64_to_cpu(node->footer.cp_ver);
+}
+/* Return the next block address recorded in the footer of @node_page. */
+static inline block_t next_blkaddr_of_node(struct page *node_page)
+{
+        struct f2fs_node *node = (struct f2fs_node *)page_address(node_page);
+        return le32_to_cpu(node->footer.next_blkaddr);
+}
+/*
+ * f2fs assigns the following node offsets described as (num).
+ * N = NIDS_PER_BLOCK
+ *
+ *  Inode block (0)
+ *    |- direct node (1)
+ *    |- direct node (2)
+ *    |- indirect node (3)
+ *    |            `- direct node (4 => 4 + N - 1)
+ *    |- indirect node (4 + N)
+ *    |            `- direct node (5 + N => 5 + 2N - 1)
+ *    `- double indirect node (5 + 2N)
+ *                 `- indirect node (6 + 2N)
+ *                       `- direct node
+ *                 ......
+ *                 `- indirect node ((6 + 2N) + x(N + 1))
+ *                       `- direct node
+ *
+ * IS_DNODE() returns false for the indirect and double indirect offsets
+ * above and true for every other offset.
+ */
+static inline bool IS_DNODE(struct page *node_page)
+{
+        unsigned int ofs = ofs_of_node(node_page);
+        if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK ||
+                        ofs == 5 + 2 * NIDS_PER_BLOCK)
+                return false;
+        if (ofs >= 6 + 2 * NIDS_PER_BLOCK) {
+                ofs -= 6 + 2 * NIDS_PER_BLOCK;
+                /*
+                 * Below the double indirect node every (N + 1)-th offset,
+                 * starting with the first, is an indirect node; the N
+                 * offsets that follow each of them are direct nodes.
+                 */
+                if (!((long int)ofs % (NIDS_PER_BLOCK + 1)))
+                        return false;
+        }
+        return true;
+}
+/*
+ * Store @nid in slot @off of the node block in @p and mark the page dirty.
+ * When @i is true the block is treated as an inode and the slot is
+ * i_nid[off - NODE_DIR1_BLOCK]; otherwise it is in.nid[off].
+ * Waits for any writeback of the page to finish before modifying it.
+ */
+static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
+{
+        struct f2fs_node *rn = (struct f2fs_node *)page_address(p);
+        wait_on_page_writeback(p);
+        if (i)
+                rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid);
+        else
+                rn->in.nid[off] = cpu_to_le32(nid);
+        set_page_dirty(p);
+}
+/*
+ * Read the nid stored in slot @off of the node block in @p.
+ * When @i is true the block is treated as an inode and the slot is
+ * i_nid[off - NODE_DIR1_BLOCK]; otherwise it is in.nid[off].
+ */
+static inline nid_t get_nid(struct page *p, int off, bool i)
+{
+        struct f2fs_node *node = (struct f2fs_node *)page_address(p);
+        if (!i)
+                return le32_to_cpu(node->in.nid[off]);
+        return le32_to_cpu(node->i.i_nid[off - NODE_DIR1_BLOCK]);
+}
+/*
+ * Coldness identification:
+ *  - Mark cold files in f2fs_inode_info
+ *  - Mark cold node blocks in their node footer
+ *  - Mark cold data pages in page cache
+ */
+/* Non-zero when FADVISE_COLD_BIT is set in the inode's i_advise. */
+static inline int is_cold_file(struct inode *inode)
+{
+        return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT;
+}
+/* A data page is cold when its PageChecked flag is set. */
+static inline int is_cold_data(struct page *page)
+{
+        return PageChecked(page);
+}
+/* Mark a data page cold by setting its PageChecked flag. */
+static inline void set_cold_data(struct page *page)
+{
+        SetPageChecked(page);
+}
+/* Remove the cold mark of a data page by clearing its PageChecked flag. */
+static inline void clear_cold_data(struct page *page)
+{
+        ClearPageChecked(page);
+}
+/* Non-zero when the cold bit is set in the footer flag of the node page. */
+static inline int is_cold_node(struct page *page)
+{
+        struct f2fs_node *node = (struct f2fs_node *)page_address(page);
+        unsigned int footer_flag = le32_to_cpu(node->footer.flag);
+        return footer_flag & (0x1 << COLD_BIT_SHIFT);
+}
+/* Non-zero when the fsync bit is set in the footer flag of the node page. */
+static inline unsigned char is_fsync_dnode(struct page *page)
+{
+        struct f2fs_node *node = (struct f2fs_node *)page_address(page);
+        unsigned int footer_flag = le32_to_cpu(node->footer.flag);
+        return footer_flag & (0x1 << FSYNC_BIT_SHIFT);
+}
+/* Non-zero when the dentry bit is set in the footer flag of the node page. */
+static inline unsigned char is_dent_dnode(struct page *page)
+{
+        struct f2fs_node *node = (struct f2fs_node *)page_address(page);
+        unsigned int footer_flag = le32_to_cpu(node->footer.flag);
+        return footer_flag & (0x1 << DENT_BIT_SHIFT);
+}
+/*
+ * Update the cold bit in the footer flag of the node page: cleared when
+ * @inode is a directory, set for every other inode type.
+ */
+static inline void set_cold_node(struct inode *inode, struct page *page)
+{
+        struct f2fs_node *node = (struct f2fs_node *)page_address(page);
+        const unsigned int cold_bit = 0x1 << COLD_BIT_SHIFT;
+        unsigned int footer_flag = le32_to_cpu(node->footer.flag);
+        if (!S_ISDIR(inode->i_mode))
+                footer_flag |= cold_bit;
+        else
+                footer_flag &= ~cold_bit;
+        node->footer.flag = cpu_to_le32(footer_flag);
+}
+/*
+ * Set (@mark non-zero) or clear (@mark zero) the fsync bit in the footer
+ * flag of the node page.
+ */
+static inline void set_fsync_mark(struct page *page, int mark)
+{
+        struct f2fs_node *node = (struct f2fs_node *)page_address(page);
+        const unsigned int fsync_bit = 0x1 << FSYNC_BIT_SHIFT;
+        unsigned int footer_flag = le32_to_cpu(node->footer.flag);
+        footer_flag = mark ? (footer_flag | fsync_bit) :
+                                (footer_flag & ~fsync_bit);
+        node->footer.flag = cpu_to_le32(footer_flag);
+}
+/*
+ * Set (@mark non-zero) or clear (@mark zero) the dentry bit in the footer
+ * flag of the node page.
+ */
+static inline void set_dentry_mark(struct page *page, int mark)
+{
+        struct f2fs_node *node = (struct f2fs_node *)page_address(page);
+        const unsigned int dent_bit = 0x1 << DENT_BIT_SHIFT;
+        unsigned int footer_flag = le32_to_cpu(node->footer.flag);
+        footer_flag = mark ? (footer_flag | dent_bit) :
+                                (footer_flag & ~dent_bit);
+        node->footer.flag = cpu_to_le32(footer_flag);
+}
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
new file mode 100644
index 000000000000..b07e9b6ef376
--- /dev/null
+++ b/fs/f2fs/recovery.c
@@ -0,0 +1,375 @@
+/*
+ * fs/f2fs/recovery.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+static struct kmem_cache *fsync_entry_slab;     /* slab for struct fsync_inode_entry;
+                                                 * created and destroyed by
+                                                 * recover_fsync_data() */
+/*
+ * Returns true while last_valid_block_count + alloc_valid_block_count does
+ * not exceed user_block_count, false otherwise.
+ */
+bool space_for_roll_forward(struct f2fs_sb_info *sbi)
+{
+        return sbi->last_valid_block_count + sbi->alloc_valid_block_count
+                        <= sbi->user_block_count;
+}
+/*
+ * Look up the entry on @head whose inode number equals @ino.
+ * Returns the matching entry, or NULL when the list holds none.
+ */
+static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
+                                                                nid_t ino)
+{
+        struct fsync_inode_entry *cur;
+        list_for_each_entry(cur, head, list)
+                if (cur->inode->i_ino == ino)
+                        return cur;
+        return NULL;
+}
+/*
+ * Re-create the directory entry of @inode from the parent inode number and
+ * file name stored in its on-disk inode block @ipage.
+ *
+ * Nothing is done unless the node footer of @ipage carries the dentry mark.
+ * Otherwise the parent directory (i_pino) is opened and, if it has no entry
+ * with the recorded name yet, @inode is linked into it.
+ *
+ * Returns 0 on success, or the negative errno reported by f2fs_iget() or
+ * f2fs_add_link().
+ */
+static int recover_dentry(struct page *ipage, struct inode *inode)
+{
+        struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage);
+        struct f2fs_inode *raw_inode = &(raw_node->i);
+        struct dentry dent, parent;
+        struct f2fs_dir_entry *de;
+        struct page *page;
+        struct inode *dir;
+        int err = 0;
+        if (!is_dent_dnode(ipage))
+                goto out;
+        dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino));
+        if (IS_ERR(dir)) {
+                /* pass the real failure reason up instead of a fixed -EINVAL */
+                err = PTR_ERR(dir);
+                goto out;
+        }
+        /* on-stack dentry pair: only d_inode, d_parent and d_name are filled */
+        parent.d_inode = dir;
+        dent.d_parent = &parent;
+        dent.d_name.len = le32_to_cpu(raw_inode->i_namelen);
+        dent.d_name.name = raw_inode->i_name;
+        de = f2fs_find_entry(dir, &dent.d_name, &page);
+        if (de) {
+                /* entry already present: just drop the page we were handed */
+                kunmap(page);
+                f2fs_put_page(page, 0);
+        } else {
+                /* do not lose a failure to re-link the inode */
+                err = f2fs_add_link(&dent, inode);
+        }
+        iput(dir);
+out:
+        kunmap(ipage);
+        return err;
+}
+/*
+ * Copy mode, size and timestamps from the on-disk inode in @node_page into
+ * @inode, then recover its directory entry.
+ *
+ * The access time is filled from the on-disk modification time fields
+ * (i_mtime / i_mtime_nsec), the same source used for i_mtime itself.
+ *
+ * Returns the result of recover_dentry().
+ */
+static int recover_inode(struct inode *inode, struct page *node_page)
+{
+        struct f2fs_node *node_blk =
+                        (struct f2fs_node *)page_address(node_page);
+        struct f2fs_inode *ri = &node_blk->i;
+        inode->i_mode = le16_to_cpu(ri->i_mode);
+        i_size_write(inode, le64_to_cpu(ri->i_size));
+        /* seconds */
+        inode->i_atime.tv_sec = le64_to_cpu(ri->i_mtime);
+        inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime);
+        inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime);
+        /* nanoseconds */
+        inode->i_atime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
+        inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
+        inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
+        return recover_dentry(node_page, inode);
+}
+/*
+ * Step 1 of fsync recovery: collect the inodes that have fsync-marked node
+ * blocks in the warm node log.
+ *
+ * Starting at the next free block of the CURSEG_WARM_NODE segment, node blocks
+ * are read one by one, following next_blkaddr_of_node(). The walk stops at the
+ * first block that cannot be read or whose checkpoint version differs from the
+ * current checkpoint. For every fsync-marked block the owning inode gets an
+ * entry on @head (created on first sight) and entry->blkaddr is updated to the
+ * address of the block just seen. Inode blocks additionally refresh the
+ * in-memory inode via recover_inode().
+ *
+ * Returns 0 on success or a negative errno. Entries already linked on @head
+ * stay there on error; the caller is responsible for releasing them.
+ */
+static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
+{
+        unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver);
+        struct curseg_info *curseg;
+        struct page *page;
+        block_t blkaddr;
+        int err = 0;
+        /* get node pages in the current segment */
+        curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
+        blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff;
+        /* read node page */
+        page = alloc_page(GFP_F2FS_ZERO);
+        /* alloc_page() signals failure with NULL, never with an ERR_PTR */
+        if (!page)
+                return -ENOMEM;
+        lock_page(page);
+        while (1) {
+                struct fsync_inode_entry *entry;
+                if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC))
+                        goto out;
+                if (cp_ver != cpver_of_node(page))
+                        goto out;
+                if (!is_fsync_dnode(page))
+                        goto next;
+                entry = get_fsync_inode(head, ino_of_node(page));
+                if (entry) {
+                        entry->blkaddr = blkaddr;
+                        if (IS_INODE(page) && is_dent_dnode(page))
+                                set_inode_flag(F2FS_I(entry->inode),
+                                                        FI_INC_LINK);
+                } else {
+                        if (IS_INODE(page) && is_dent_dnode(page)) {
+                                if (recover_inode_page(sbi, page)) {
+                                        err = -ENOMEM;
+                                        goto out;
+                                }
+                        }
+                        /* add this fsync inode to the list */
+                        entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS);
+                        if (!entry) {
+                                err = -ENOMEM;
+                                goto out;
+                        }
+                        entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
+                        if (IS_ERR(entry->inode)) {
+                                /*
+                                 * Never link an entry without a valid inode:
+                                 * the cleanup path calls iput() on every
+                                 * listed entry.
+                                 */
+                                err = PTR_ERR(entry->inode);
+                                kmem_cache_free(fsync_entry_slab, entry);
+                                goto out;
+                        }
+                        entry->blkaddr = blkaddr;
+                        INIT_LIST_HEAD(&entry->list);
+                        list_add_tail(&entry->list, head);
+                }
+                if (IS_INODE(page)) {
+                        err = recover_inode(entry->inode, page);
+                        if (err)
+                                goto out;
+                }
+next:
+                /* check next segment */
+                blkaddr = next_blkaddr_of_node(page);
+                ClearPageUptodate(page);
+        }
+out:
+        unlock_page(page);
+        __free_pages(page, 0);
+        return err;
+}
+/*
+ * Release every entry left on @head: drop the inode reference, unlink the
+ * entry and return it to the fsync entry slab.
+ *
+ * The _safe iterator is required because the current entry is unlinked and
+ * freed inside the loop body.
+ */
+static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
+                                        struct list_head *head)
+{
+        struct fsync_inode_entry *entry, *tmp;
+        list_for_each_entry_safe(entry, tmp, head, list) {
+                iput(entry->inode);
+                list_del(&entry->list);
+                kmem_cache_free(fsync_entry_slab, entry);
+        }
+}
+/*
+ * If the block at @blkaddr is still marked valid in its segment's current
+ * bitmap, find the node slot that references it and punch that slot out.
+ *
+ * The owner is found through the block's summary entry, taken from the
+ * in-memory summary of a matching current data segment (CURSEG_WARM_DATA ..
+ * CURSEG_COLD_DATA) or, failing that, from the segment's summary page. The
+ * owning inode then has the single block index removed with truncate_hole().
+ *
+ * If the node page or the inode cannot be obtained the function returns
+ * without touching the old index rather than dereferencing an error pointer.
+ */
+static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
+                                                block_t blkaddr)
+{
+        struct seg_entry *sentry;
+        unsigned int segno = GET_SEGNO(sbi, blkaddr);
+        unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) &
+                                        (sbi->blocks_per_seg - 1);
+        struct f2fs_summary sum;
+        nid_t ino;
+        void *kaddr;
+        struct inode *inode;
+        struct page *node_page;
+        block_t bidx;
+        int i;
+        sentry = get_seg_entry(sbi, segno);
+        if (!f2fs_test_bit(blkoff, sentry->cur_valid_map))
+                return;
+        /* Get the previous summary */
+        for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
+                struct curseg_info *curseg = CURSEG_I(sbi, i);
+                if (curseg->segno == segno) {
+                        sum = curseg->sum_blk->entries[blkoff];
+                        break;
+                }
+        }
+        if (i > CURSEG_COLD_DATA) {
+                struct page *sum_page = get_sum_page(sbi, segno);
+                struct f2fs_summary_block *sum_node;
+                kaddr = page_address(sum_page);
+                sum_node = (struct f2fs_summary_block *)kaddr;
+                sum = sum_node->entries[blkoff];
+                f2fs_put_page(sum_page, 1);
+        }
+        /* Get the node page */
+        node_page = get_node_page(sbi, le32_to_cpu(sum.nid));
+        /* NOTE(review): assumes get_node_page() reports failure as ERR_PTR */
+        if (IS_ERR(node_page))
+                return;
+        bidx = start_bidx_of_node(ofs_of_node(node_page)) +
+                                le16_to_cpu(sum.ofs_in_node);
+        ino = ino_of_node(node_page);
+        f2fs_put_page(node_page, 1);
+        /* Deallocate previous index in the node page */
+        inode = f2fs_iget_nowait(sbi->sb, ino);
+        /* NOTE(review): assumes f2fs_iget_nowait() fails like f2fs_iget() */
+        if (IS_ERR(inode))
+                return;
+        truncate_hole(inode, bidx, bidx + 1);
+        iput(inode);
+}
+static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
+                                        struct page *page, block_t blkaddr)
+{       /*
+         * Replay the block addresses recorded in one logged node block.
+         * @page is the node block read from the log at @blkaddr. Each address
+         * slot of @page is compared with the same slot of the inode's current
+         * node page (dn.node_page); slots that differ and hold a real address
+         * are passed to recover_data_page() and update_extent_cache().
+         * Afterwards the current node page receives the footer of @page and
+         * is handed to recover_node_page() together with @blkaddr.
+         * Returns without doing anything if get_dnode_of_data() fails.
+         */
+        unsigned int start, end;
+        struct dnode_of_data dn;
+        struct f2fs_summary sum;
+        struct node_info ni;
+        start = start_bidx_of_node(ofs_of_node(page)); /* first index covered by @page */
+        if (IS_INODE(page)) /* an inode block has ADDRS_PER_INODE slots, others ADDRS_PER_BLOCK */
+                end = start + ADDRS_PER_INODE;
+        else
+                end = start + ADDRS_PER_BLOCK;
+        set_new_dnode(&dn, inode, NULL, NULL, 0);
+        if (get_dnode_of_data(&dn, start, 0))
+                return;
+        wait_on_page_writeback(dn.node_page);
+        get_node_info(sbi, dn.nid, &ni);
+        BUG_ON(ni.ino != ino_of_node(page)); /* both pages must belong to the same inode ... */
+        BUG_ON(ofs_of_node(dn.node_page) != ofs_of_node(page)); /* ... and the same node offset */
+        for (; start < end; start++) {
+                block_t src, dest;
+                src = datablock_addr(dn.node_page, dn.ofs_in_node); /* address in the current node page */
+                dest = datablock_addr(page, dn.ofs_in_node); /* address in the logged node block */
+                if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR) {
+                        if (src == NULL_ADDR) {
+                                int err = reserve_new_block(&dn);
+                                /* We should not get -ENOSPC */
+                                BUG_ON(err);
+                        }
+                        /* Check the previous node page having this index */
+                        check_index_in_prev_nodes(sbi, dest);
+                        set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
+                        /* write dummy data page */
+                        recover_data_page(sbi, NULL, &sum, src, dest);
+                        update_extent_cache(dest, &dn);
+                }
+                dn.ofs_in_node++; /* slot index advances in step with start */
+        }
+        /* write node page in place */
+        set_summary(&sum, dn.nid, 0, 0);
+        if (IS_INODE(dn.node_page))
+                sync_inode_page(&dn);
+        copy_node_footer(dn.node_page, page);
+        fill_node_footer(dn.node_page, dn.nid, ni.ino,
+                                        ofs_of_node(page), false);
+        set_page_dirty(dn.node_page);
+        recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
+        f2fs_put_dnode(&dn);
+}
+/*
+ * Step 2 of fsync recovery: replay the node blocks of the inodes on @head.
+ *
+ * Starting at the next free block of the @type current segment, node blocks
+ * are read one by one, following next_blkaddr_of_node(). The walk stops at the
+ * first block that cannot be read or whose checkpoint version differs from the
+ * current checkpoint. Blocks owned by a listed inode go through
+ * do_recover_data(); once the block at entry->blkaddr has been replayed the
+ * entry is released. allocate_new_segments() is called after the walk.
+ *
+ * If the scratch page cannot be allocated nothing is replayed and the entries
+ * stay on @head.
+ */
+static void recover_data(struct f2fs_sb_info *sbi,
+                                struct list_head *head, int type)
+{
+        unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver);
+        struct curseg_info *curseg;
+        struct page *page;
+        block_t blkaddr;
+        /* get node pages in the current segment */
+        curseg = CURSEG_I(sbi, type);
+        blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
+        /* read node page */
+        page = alloc_page(GFP_NOFS | __GFP_ZERO);
+        /* alloc_page() signals failure with NULL, never with an ERR_PTR */
+        if (!page)
+                return;
+        lock_page(page);
+        while (1) {
+                struct fsync_inode_entry *entry;
+                if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC))
+                        goto out;
+                if (cp_ver != cpver_of_node(page))
+                        goto out;
+                entry = get_fsync_inode(head, ino_of_node(page));
+                if (!entry)
+                        goto next;
+                do_recover_data(sbi, entry->inode, page, blkaddr);
+                /* last recorded block of this inode: the entry is done */
+                if (entry->blkaddr == blkaddr) {
+                        iput(entry->inode);
+                        list_del(&entry->list);
+                        kmem_cache_free(fsync_entry_slab, entry);
+                }
+next:
+                /* check next segment */
+                blkaddr = next_blkaddr_of_node(page);
+                ClearPageUptodate(page);
+        }
+out:
+        unlock_page(page);
+        __free_pages(page, 0);
+        allocate_new_segments(sbi);
+}
+/*
+ * Entry point of fsync data recovery.
+ *
+ * Creates the fsync entry slab, collects the fsynced inodes, replays their
+ * node blocks with sbi->por_doing set, then releases any remaining entries,
+ * destroys the slab and writes a checkpoint. If the slab cannot be created
+ * the function returns without doing anything.
+ */
+void recover_fsync_data(struct f2fs_sb_info *sbi)
+{
+        LIST_HEAD(inode_list);
+        fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
+                        sizeof(struct fsync_inode_entry), NULL);
+        if (unlikely(!fsync_entry_slab))
+                return;
+        /* step #1: find fsynced inode numbers */
+        if (!find_fsync_dnodes(sbi, &inode_list) &&
+                                        !list_empty(&inode_list)) {
+                /* step #2: recover data */
+                sbi->por_doing = 1;
+                recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
+                sbi->por_doing = 0;
+                BUG_ON(!list_empty(&inode_list));
+        }
+        destroy_fsync_dnodes(sbi, &inode_list);
+        kmem_cache_destroy(fsync_entry_slab);
+        write_checkpoint(sbi, false, false);
+}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
new file mode 100644
index 000000000000..1b26e4ea1016
--- /dev/null
+++ b/fs/f2fs/segment.c
@@ -0,0 +1,1791 @@
+/*
+ * fs/f2fs/segment.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include "f2fs.h"
+#include "segment.h"
+#include "node.h"
+/*
+ * Decide whether dirty node and dentry pages should be flushed.
+ *
+ * The dirty node and dentry page counts are rounded up to whole sections.
+ * Returns 1 when the free sections do not exceed node sections + twice the
+ * dentry sections + the reserved sections, 0 otherwise and always 0 while
+ * sbi->por_doing is set.
+ */
+static int need_to_flush(struct f2fs_sb_info *sbi)
+{
+        unsigned int pages_per_sec = (1 << sbi->log_blocks_per_seg) *
+                        sbi->segs_per_sec;
+        int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1)
+                >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
+        int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1)
+                >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
+        if (sbi->por_doing)
+                return 0;
+        return free_sections(sbi) <= (node_secs + 2 * dent_secs +
+                                                reserved_sections(sbi));
+}
+/*
+ * Balance dirty node and dentry pages, and trigger garbage collection when
+ * free sections run short. Does nothing while sbi->por_doing is set.
+ */
+void f2fs_balance_fs(struct f2fs_sb_info *sbi)
+{
+        struct writeback_control wbc = {
+                .sync_mode = WB_SYNC_ALL,
+                .nr_to_write = LONG_MAX,
+                .for_reclaim = 0,
+        };
+        if (sbi->por_doing)
+                return;
+        /*
+         * Too many dirty node/dentry pages for the remaining free space:
+         * write back dirty directory inodes, then the node pages.
+         */
+        if (need_to_flush(sbi)) {
+                sync_dirty_dir_inodes(sbi);
+                sync_node_pages(sbi, 0, &wbc);
+        }
+        if (!has_not_enough_free_secs(sbi))
+                return;
+        /*
+         * gc_mutex is taken here and not released in this function;
+         * presumably f2fs_gc() drops it -- confirm against f2fs_gc().
+         */
+        mutex_lock(&sbi->gc_mutex);
+        f2fs_gc(sbi, 1);
+}
+/*
+ * Set @segno in the @dirty_type bitmap and bump its counter if the bit was
+ * clear. For DIRTY the same is done in the bitmap indexed by the segment's
+ * own type. Current working segments are never added.
+ */
+static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
+                enum dirty_type dirty_type)
+{
+        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+        enum dirty_type seg_type;
+        /* need not be added */
+        if (IS_CURSEG(sbi, segno))
+                return;
+        if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
+                dirty_i->nr_dirty[dirty_type]++;
+        if (dirty_type != DIRTY)
+                return;
+        seg_type = get_seg_entry(sbi, segno)->type;
+        if (!test_and_set_bit(segno, dirty_i->dirty_segmap[seg_type]))
+                dirty_i->nr_dirty[seg_type]++;
+}
+/*
+ * Clear @segno in the @dirty_type bitmap and drop its counter if the bit was
+ * set. For DIRTY the same is done in the bitmap indexed by the segment's own
+ * type, and the segment is removed from both GC victim bitmaps.
+ */
+static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
+                enum dirty_type dirty_type)
+{
+        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+        enum dirty_type seg_type;
+        if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type]))
+                dirty_i->nr_dirty[dirty_type]--;
+        if (dirty_type != DIRTY)
+                return;
+        seg_type = get_seg_entry(sbi, segno)->type;
+        if (test_and_clear_bit(segno, dirty_i->dirty_segmap[seg_type]))
+                dirty_i->nr_dirty[seg_type]--;
+        clear_bit(segno, dirty_i->victim_segmap[FG_GC]);
+        clear_bit(segno, dirty_i->victim_segmap[BG_GC]);
+}
+/*
+ * Re-classify @segno in the dirty segment lists from its valid block count.
+ * This must not fail with errors such as -ENOMEM; adding a dirty entry to the
+ * seglist is not a critical operation.
+ * NULL_SEGNO and current working segments are ignored.
+ */
+void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+        unsigned short valid_blocks;
+        if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno))
+                return;
+        mutex_lock(&dirty_i->seglist_lock);
+        valid_blocks = get_valid_blocks(sbi, segno, 0);
+        if (valid_blocks != 0 && valid_blocks < sbi->blocks_per_seg) {
+                /* partially valid: belongs on the dirty list */
+                __locate_dirty_segment(sbi, segno, DIRTY);
+        } else {
+                /* no valid block left: becomes prefree */
+                if (valid_blocks == 0)
+                        __locate_dirty_segment(sbi, segno, PRE);
+                /* empty or full (recovery routine with SSR needs this) */
+                __remove_dirty_segment(sbi, segno, DIRTY);
+        }
+        mutex_unlock(&dirty_i->seglist_lock);
+}
+/*
+ * Call __set_test_and_free() on every segment set in the prefree bitmap.
+ * clear_prefree_segments() should be called after the checkpoint is done.
+ */
+static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
+{
+        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+        unsigned int total_segs = TOTAL_SEGS(sbi);
+        unsigned int segno;
+        mutex_lock(&dirty_i->seglist_lock);
+        for (segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, 0);
+                        segno < total_segs;
+                        segno = find_next_bit(dirty_i->dirty_segmap[PRE],
+                                                total_segs, segno + 1))
+                __set_test_and_free(sbi, segno);
+        mutex_unlock(&dirty_i->seglist_lock);
+}
+/*
+ * Empty the prefree bitmap: clear each set bit, drop the PRE counter and,
+ * when the DISCARD mount option is set, issue a discard for the segment.
+ */
+void clear_prefree_segments(struct f2fs_sb_info *sbi)
+{
+        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+        unsigned int total_segs = TOTAL_SEGS(sbi);
+        unsigned int segno;
+        mutex_lock(&dirty_i->seglist_lock);
+        for (segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, 0);
+                        segno < total_segs;
+                        segno = find_next_bit(dirty_i->dirty_segmap[PRE],
+                                                total_segs, segno + 1)) {
+                if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE]))
+                        dirty_i->nr_dirty[PRE]--;
+                if (!test_opt(sbi, DISCARD))
+                        continue;
+                /* Let's use trim */
+                blkdev_issue_discard(sbi->sb->s_bdev,
+                                START_BLOCK(sbi, segno) <<
+                                sbi->log_sectors_per_block,
+                                1 << (sbi->log_sectors_per_block +
+                                        sbi->log_blocks_per_seg),
+                                GFP_NOFS, 0);
+        }
+        mutex_unlock(&dirty_i->seglist_lock);
+}
+/*
+ * Flag the SIT entry of @segno as dirty; the dirty entry counter grows only
+ * when the bit was not set before.
+ */
+static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+        struct sit_info *sit_i = SIT_I(sbi);
+        if (__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap))
+                return;
+        sit_i->dirty_sentries++;
+}
+/*
+ * Record @type in the segment entry of @segno and, when @modified is
+ * non-zero, mark its SIT entry dirty.
+ */
+static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type,
+                                        unsigned int segno, int modified)
+{
+        get_seg_entry(sbi, segno)->type = type;
+        if (!modified)
+                return;
+        __mark_sit_entry_dirty(sbi, segno);
+}
+static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
+{       /*
+         * Apply a valid-block delta @del (callers in this file pass 1 or -1)
+         * to the segment containing @blkaddr: update the entry's valid block
+         * count and mtime, flip the block's bit in cur_valid_map, mark the
+         * SIT entry dirty and adjust the global and per-section totals.
+         * BUGs if the new count is out of range or the bitmap bit is not in
+         * the expected state.
+         */
+        struct seg_entry *se;
+        unsigned int segno, offset;
+        long int new_vblocks;
+        segno = GET_SEGNO(sbi, blkaddr);
+        se = get_seg_entry(sbi, segno);
+        new_vblocks = se->valid_blocks + del;
+        offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1); /* block offset inside the segment */
+        BUG_ON((new_vblocks >> (sizeof(unsigned short) << 3) || /* must fit in an unsigned short ... */
+                                (new_vblocks > sbi->blocks_per_seg))); /* ... and not exceed the segment size */
+        se->valid_blocks = new_vblocks;
+        se->mtime = get_mtime(sbi);
+        SIT_I(sbi)->max_mtime = se->mtime;
+        /* Update valid block bitmap */
+        if (del > 0) {
+                if (f2fs_set_bit(offset, se->cur_valid_map)) /* presumably returns the old bit: block was already valid */
+                        BUG();
+        } else {
+                if (!f2fs_clear_bit(offset, se->cur_valid_map)) /* presumably returns the old bit: block was not valid */
+                        BUG();
+        }
+        if (!f2fs_test_bit(offset, se->ckpt_valid_map)) /* bit not set in the checkpointed map */
+                se->ckpt_valid_blocks += del;
+        __mark_sit_entry_dirty(sbi, segno);
+        /* update total number of valid blocks to be written in ckpt area */
+        SIT_I(sbi)->written_valid_blocks += del;
+        if (sbi->segs_per_sec > 1) /* section counter is only touched for multi-segment sections */
+                get_sec_entry(sbi, segno)->valid_blocks += del;
+}
+/*
+ * Account a block move in the SIT: add one valid block at @new_blkaddr and,
+ * unless @old_blkaddr maps to NULL_SEGNO, remove one at @old_blkaddr.
+ */
+static void refresh_sit_entry(struct f2fs_sb_info *sbi,
+                        block_t old_blkaddr, block_t new_blkaddr)
+{
+        update_sit_entry(sbi, new_blkaddr, 1);
+        if (GET_SEGNO(sbi, old_blkaddr) == NULL_SEGNO)
+                return;
+        update_sit_entry(sbi, old_blkaddr, -1);
+}
+/*
+ * Invalidate the block at @addr: drop it from its SIT entry and re-classify
+ * the segment in the dirty lists, all under sentry_lock.
+ * NEW_ADDR is ignored; NULL_ADDR is a BUG.
+ */
+void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
+{
+        unsigned int segno = GET_SEGNO(sbi, addr);
+        struct sit_info *sit_i = SIT_I(sbi);
+        BUG_ON(addr == NULL_ADDR);
+        if (addr != NEW_ADDR) {
+                mutex_lock(&sit_i->sentry_lock);
+                /* add it into sit main buffer */
+                update_sit_entry(sbi, addr, -1);
+                /* add it into dirty seglist */
+                locate_dirty_segment(sbi, segno);
+                mutex_unlock(&sit_i->sentry_lock);
+        }
+}
+/*
+ * Copy @sum into slot @offset of the in-memory summary block of the @type
+ * current segment. The caller should hold the curseg_mutex lock.
+ */
+static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
+                struct f2fs_summary *sum, unsigned short offset)
+{
+        struct curseg_info *curseg = CURSEG_I(sbi, type);
+        char *slot = (char *)curseg->sum_blk +
+                                offset * sizeof(struct f2fs_summary);
+        memcpy(slot, sum, sizeof(*sum));
+}
+/*
+ * Calculate the number of summary pages needed for writing the current data
+ * segment summaries plus the NAT and SIT journals. Returns 1, 2 or 3.
+ */
+int npages_for_summary_flush(struct f2fs_sb_info *sbi)
+{
+        int valid_sum_count = 0;
+        int total_size_bytes;
+        int sum_space;
+        int type;
+        /* SSR segments count as full, others up to their write offset */
+        for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_DATA; type++) {
+                if (sbi->ckpt->alloc_type[type] == SSR)
+                        valid_sum_count += sbi->blocks_per_seg;
+                else
+                        valid_sum_count += curseg_blkoff(sbi, type);
+        }
+        total_size_bytes = valid_sum_count * (SUMMARY_SIZE + 1)
+                        + sizeof(struct nat_journal) + 2
+                        + sizeof(struct sit_journal) + 2;
+        sum_space = PAGE_CACHE_SIZE - SUM_FOOTER_SIZE;
+        if (total_size_bytes < sum_space)
+                return 1;
+        if (total_size_bytes < 2 * sum_space)
+                return 2;
+        return 3;
+}
+/*
+ * Fetch the meta page holding the summary block of @segno.
+ * The caller should put this summary page.
+ */
+struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+        block_t sum_blkaddr = GET_SUM_BLOCK(sbi, segno);
+        return get_meta_page(sbi, sum_blkaddr);
+}
+/*
+ * Copy the summary block @sum_blk into the meta page at @blk_addr, mark the
+ * page dirty and release it.
+ */
+static void write_sum_page(struct f2fs_sb_info *sbi,
+                        struct f2fs_summary_block *sum_blk, block_t blk_addr)
+{
+        struct page *meta_page = grab_meta_page(sbi, blk_addr);
+        memcpy(page_address(meta_page), sum_blk, PAGE_CACHE_SIZE);
+        set_page_dirty(meta_page);
+        f2fs_put_page(meta_page, 1);
+}
+/*
+ * Look for a run of @ofs_unit prefree segments, aligned to @ofs_unit, that
+ * can be reused right away.
+ *
+ * A candidate is accepted only if all @ofs_unit segments are prefree and none
+ * of them had valid blocks at the last checkpoint. Nothing is returned for
+ * node segment types or when free sections are short.
+ *
+ * Returns the first segment number of the run, or NULL_SEGNO.
+ */
+static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi,
+                                        int ofs_unit, int type)
+{
+        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+        unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE];
+        unsigned int segno, next_segno, i;
+        int ofs = 0;
+        /*
+         * If there is not enough reserved sections,
+         * we should not reuse prefree segments.
+         */
+        if (has_not_enough_free_secs(sbi))
+                return NULL_SEGNO;
+        /*
+         * NODE page should not reuse prefree segment,
+         * since those information is used for SPOR.
+         */
+        if (IS_NODESEG(type))
+                return NULL_SEGNO;
+next:
+        segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs);
+        /* the next search resumes at the following unit boundary */
+        ofs = ((segno / ofs_unit) * ofs_unit) + ofs_unit;
+        if (segno < TOTAL_SEGS(sbi)) {
+                /* skip intermediate segments in a section */
+                if (segno % ofs_unit)
+                        goto next;
+                /* skip if whole section is not prefree */
+                next_segno = find_next_zero_bit(prefree_segmap,
+                                                TOTAL_SEGS(sbi), segno + 1);
+                if (next_segno - segno < ofs_unit)
+                        goto next;
+                /*
+                 * skip if whole section was not free at the last checkpoint;
+                 * every segment of the unit is checked, not just the first
+                 */
+                for (i = 0; i < ofs_unit; i++)
+                        if (get_seg_entry(sbi, segno + i)->ckpt_valid_blocks)
+                                goto next;
+                return segno;
+        }
+        return NULL_SEGNO;
+}
+/*
+ * Find a new segment in the free segment bitmap, mark it in use and return
+ * it through @newseg. This function must succeed; otherwise it BUGs.
+ *
+ * @newseg:  in: the current segment, used as the search hint;
+ *           out: the newly chosen segment
+ * @new_sec: when false, the next free segment after *newseg is tried first,
+ *           unless *newseg is the last segment of its section
+ * @dir:     ALLOC_RIGHT wraps the section search around to section 0; any
+ *           other value makes the search continue leftwards from the hint
+ *
+ * When a section in a different zone is picked and that zone is already used
+ * by a current segment, one more search is made from a neighbouring zone.
+ * The whole search runs under free_i->segmap_lock.
+ */
+static void get_new_segment(struct f2fs_sb_info *sbi,
+                        unsigned int *newseg, bool new_sec, int dir)
+{
+        struct free_segmap_info *free_i = FREE_I(sbi);
+        unsigned int total_secs = sbi->total_sections;
+        unsigned int segno, secno, zoneno;
+        unsigned int total_zones = sbi->total_sections / sbi->secs_per_zone;
+        unsigned int hint = *newseg / sbi->segs_per_sec; /* section of the current segment */
+        unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
+        unsigned int left_start = hint;
+        bool init = true; /* false once a second zone search has been started */
+        int go_left = 0;
+        int i;
+        write_lock(&free_i->segmap_lock);
+        if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { /* room left in the current section */
+                segno = find_next_zero_bit(free_i->free_segmap,
+                                        TOTAL_SEGS(sbi), *newseg + 1);
+                if (segno < TOTAL_SEGS(sbi))
+                        goto got_it;
+        }
+find_other_zone:
+        secno = find_next_zero_bit(free_i->free_secmap, total_secs, hint);
+        if (secno >= total_secs) { /* nothing free at or after the hint */
+                if (dir == ALLOC_RIGHT) {
+                        secno = find_next_zero_bit(free_i->free_secmap,
+                                                total_secs, 0);
+                        BUG_ON(secno >= total_secs);
+                } else {
+                        go_left = 1;
+                        left_start = hint - 1; /* NOTE(review): hint is unsigned; this wraps if hint is 0 -- confirm that cannot happen */
+                }
+        }
+        if (go_left == 0)
+                goto skip_left;
+        while (test_bit(left_start, free_i->free_secmap)) { /* walk left over sections that are in use */
+                if (left_start > 0) {
+                        left_start--;
+                        continue;
+                }
+                left_start = find_next_zero_bit(free_i->free_secmap,
+                                                total_secs, 0);
+                BUG_ON(left_start >= total_secs);
+                break;
+        }
+        secno = left_start;
+skip_left:
+        hint = secno;
+        segno = secno * sbi->segs_per_sec; /* first segment of the chosen section */
+        zoneno = secno / sbi->secs_per_zone;
+        /* give up on finding another zone */
+        if (!init)
+                goto got_it;
+        if (sbi->secs_per_zone == 1)
+                goto got_it;
+        if (zoneno == old_zoneno)
+                goto got_it;
+        if (dir == ALLOC_LEFT) {
+                if (!go_left && zoneno + 1 >= total_zones)
+                        goto got_it;
+                if (go_left && zoneno == 0)
+                        goto got_it;
+        }
+        for (i = 0; i < NR_CURSEG_TYPE; i++)
+                if (CURSEG_I(sbi, i)->zone == zoneno)
+                        break;
+        if (i < NR_CURSEG_TYPE) {
+                /* zone is in use, try another */
+                if (go_left)
+                        hint = zoneno * sbi->secs_per_zone - 1; /* NOTE(review): unsigned; wraps if zoneno is 0 -- confirm unreachable */
+                else if (zoneno + 1 >= total_zones)
+                        hint = 0;
+                else
+                        hint = (zoneno + 1) * sbi->secs_per_zone;
+                init = false;
+                goto find_other_zone;
+        }
+got_it:
+        /* set it as dirty segment in free segmap */
+        BUG_ON(test_bit(segno, free_i->free_segmap));
+        __set_inuse(sbi, segno);
+        *newseg = segno;
+        write_unlock(&free_i->segmap_lock);
+}
+/*
+ * Switch the @type current segment over to curseg->next_segno: reset the
+ * write offset, clear the summary footer and set its type, and record @type
+ * in the segment's SIT entry (marked dirty when @modified is non-zero).
+ */
+static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
+{
+        struct curseg_info *curseg = CURSEG_I(sbi, type);
+        struct summary_footer *sum_footer = &curseg->sum_blk->footer;
+        /* adopt the segment chosen beforehand */
+        curseg->segno = curseg->next_segno;
+        curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno);
+        curseg->next_blkoff = 0;
+        curseg->next_segno = NULL_SEGNO;
+        /* start from an empty footer carrying only the summary type */
+        memset(sum_footer, 0, sizeof(*sum_footer));
+        if (IS_DATASEG(type))
+                SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA);
+        if (IS_NODESEG(type))
+                SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE);
+        __set_sit_entry_type(sbi, type, curseg->segno, modified);
+}
+/*
+ * Allocate a current working segment.
+ * The summary of the old segment is written out first; the replacement is
+ * always a free segment, allocated in LFS manner.
+ */
+static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
+{
+        struct curseg_info *curseg = CURSEG_I(sbi, type);
+        unsigned int segno = curseg->segno;
+        int dir;
+        write_sum_page(sbi, curseg->sum_blk,
+                                GET_SUM_BLOCK(sbi, curseg->segno));
+        /* warm/cold data and the NOHEAP option search to the right */
+        if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA ||
+                                        test_opt(sbi, NOHEAP))
+                dir = ALLOC_RIGHT;
+        else
+                dir = ALLOC_LEFT;
+        get_new_segment(sbi, &segno, new_sec, dir);
+        curseg->next_segno = segno;
+        reset_curseg(sbi, type, 1);
+        curseg->alloc_type = LFS;
+}
+/*
+ * Set seg->next_blkoff to the first block offset at or after @start that is
+ * set in neither the checkpointed nor the current valid bitmap of the
+ * segment, or to blocks_per_seg when there is none.
+ */
+static void __next_free_blkoff(struct f2fs_sb_info *sbi,
+                        struct curseg_info *seg, block_t start)
+{
+        struct seg_entry *se = get_seg_entry(sbi, seg->segno);
+        block_t ofs = start;
+        while (ofs < sbi->blocks_per_seg &&
+                        (f2fs_test_bit(ofs, se->ckpt_valid_map) ||
+                         f2fs_test_bit(ofs, se->cur_valid_map)))
+                ofs++;
+        seg->next_blkoff = ofs;
+}
+/*
+ * Advance the next block offset of @seg.
+ * A segment written in LFS manner simply moves to the following offset; a
+ * segment written in SSR manner gets its next offset from
+ * __next_free_blkoff(), searching from the following offset.
+ */
+static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
+                                struct curseg_info *seg)
+{
+        if (seg->alloc_type != SSR) {
+                seg->next_blkoff++;
+                return;
+        }
+        __next_free_blkoff(sbi, seg, seg->next_blkoff + 1);
+}
+/*
+ * Switch the @type current segment to curseg->next_segno, a segment that is
+ * already in use, and write into it in SSR manner.
+ * The summary of the old segment is written out, the new segment leaves the
+ * prefree/dirty lists, and with @reuse set its existing summary entries are
+ * loaded so the information about its valid blocks is kept.
+ */
+static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
+{
+        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+        struct curseg_info *curseg = CURSEG_I(sbi, type);
+        unsigned int target_segno = curseg->next_segno;
+        struct page *old_sum_page;
+        write_sum_page(sbi, curseg->sum_blk,
+                                GET_SUM_BLOCK(sbi, curseg->segno));
+        __set_test_and_inuse(sbi, target_segno);
+        mutex_lock(&dirty_i->seglist_lock);
+        __remove_dirty_segment(sbi, target_segno, PRE);
+        __remove_dirty_segment(sbi, target_segno, DIRTY);
+        mutex_unlock(&dirty_i->seglist_lock);
+        reset_curseg(sbi, type, 1);
+        curseg->alloc_type = SSR;
+        __next_free_blkoff(sbi, curseg, 0);
+        if (!reuse)
+                return;
+        /* bring the segment's stored summary entries back into memory */
+        old_sum_page = get_sum_page(sbi, target_segno);
+        memcpy(curseg->sum_blk, page_address(old_sum_page), SUM_ENTRY_SIZE);
+        f2fs_put_page(old_sum_page, 1);
+}
+/*
+ * Flush out the current segment of @type and replace it with another one,
+ * then account the switch in sbi->segment_count[] by allocation type.
+ * @force: go straight to new_curseg() with new_sec set.
+ * Otherwise the order of preference is:
+ *   1. a segment returned by check_prefree_segments() (SSR, no summary reuse)
+ *   2. unless @type is CURSEG_WARM_NODE: a segment picked by
+ *      get_ssr_segment() when need_SSR() holds (SSR, summary reused)
+ *   3. new_curseg() without forcing a new section
+ * This function should be returned with success, otherwise BUG.
+ */
+static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
+                                                int type, bool force)
+{
+        struct curseg_info *curseg = CURSEG_I(sbi, type);
+        if (force) {
+                new_curseg(sbi, type, true);
+        } else {
+                unsigned int ofs_unit = need_SSR(sbi) ? 1 : sbi->segs_per_sec;
+                curseg->next_segno = check_prefree_segments(sbi, ofs_unit,
+                                                                type);
+                if (curseg->next_segno != NULL_SEGNO)
+                        change_curseg(sbi, type, false);
+                else if (type != CURSEG_WARM_NODE &&
+                                need_SSR(sbi) && get_ssr_segment(sbi, type))
+                        change_curseg(sbi, type, true);
+                else
+                        new_curseg(sbi, type, false);
+        }
+        sbi->segment_count[curseg->alloc_type]++;
+}
+/*
+ * Force every log from CURSEG_HOT_DATA to CURSEG_COLD_DATA onto another
+ * segment through the installed allocate_segment hook, and pass each segment
+ * that was left behind to locate_dirty_segment().
+ */
+void allocate_new_segments(struct f2fs_sb_info *sbi)
+{
+        int type = CURSEG_HOT_DATA;
+        while (type <= CURSEG_COLD_DATA) {
+                struct curseg_info *seg = CURSEG_I(sbi, type);
+                unsigned int prev_segno = seg->segno;
+                SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true);
+                locate_dirty_segment(sbi, prev_segno);
+                type++;
+        }
+}
+/* Segment allocation hooks; installed as sit_i->s_ops by build_sit_info(). */
+static const struct segment_allocation default_salloc_ops = {
+        .allocate_segment = allocate_segment_by_default,
+};
+/*
+ * Completion handler installed on write bios by do_submit_bio().
+ * Walks the pages of @bio from the last vector back to the first, ends
+ * writeback on each one and drops the F2FS_WRITEBACK page count.  If the bio
+ * is not BIO_UPTODATE, the page, its mapping (when present) and the
+ * checkpoint are flagged with an error and the page is marked dirty again.
+ * Afterwards a waiting submitter is completed, the bio_private is freed and
+ * the bio reference is dropped.
+ * @err is not consulted; the outcome is taken from the BIO_UPTODATE flag.
+ */
+static void f2fs_end_io_write(struct bio *bio, int err)
+{
+        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+        struct bio_private *p = bio->bi_private;
+        do {
+                struct page *page = bvec->bv_page;
+                /* step to the previous vector and prefetch its page flags */
+                if (--bvec >= bio->bi_io_vec)
+                        prefetchw(&bvec->bv_page->flags);
+                if (!uptodate) {
+                        SetPageError(page);
+                        if (page->mapping)
+                                set_bit(AS_EIO, &page->mapping->flags);
+                        set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG);
+                        set_page_dirty(page);
+                }
+                end_page_writeback(page);
+                dec_page_count(p->sbi, F2FS_WRITEBACK);
+        } while (bvec >= bio->bi_io_vec);
+        /* wake the submitter that waits in do_submit_bio() */
+        if (p->is_sync)
+                complete(p->wait);
+        kfree(p);
+        bio_put(bio);
+}
+/*
+ * Allocate a bio for @bdev sized for @npages vectors and attach a newly
+ * allocated struct bio_private to it.  The allocation of the private data is
+ * retried, with cond_resched() in between, until it succeeds.
+ */
+struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages)
+{
+        struct bio_private *priv;
+        struct bio *bio;
+        for (;;) {
+                priv = kmalloc(sizeof(struct bio_private), GFP_NOFS);
+                if (priv)
+                        break;
+                cond_resched();
+        }
+        /* No failure on bio allocation */
+        bio = bio_alloc(GFP_NOIO, npages);
+        bio->bi_private = priv;
+        bio->bi_bdev = bdev;
+        return bio;
+}
+/*
+ * Submit the bio that is currently being built for @type, if there is one,
+ * and clear the slot.  Both callers visible in this file hold sbi->bio_sem
+ * for writing around this call.
+ * Page types above META share the META bio slot.  Types at or above
+ * META_FLUSH are issued with WRITE_FLUSH_FUA; otherwise @sync selects
+ * WRITE_SYNC or WRITE.  For META_FLUSH this function blocks until the bio
+ * has completed.
+ */
+static void do_submit_bio(struct f2fs_sb_info *sbi,
+                                enum page_type type, bool sync)
+{
+        int rw = sync ? WRITE_SYNC : WRITE;
+        enum page_type btype = type > META ? META : type;
+        if (type >= META_FLUSH)
+                rw = WRITE_FLUSH_FUA;
+        if (sbi->bio[btype]) {
+                struct bio_private *p = sbi->bio[btype]->bi_private;
+                /* the end_io handler is attached only now, at submission */
+                p->sbi = sbi;
+                sbi->bio[btype]->bi_end_io = f2fs_end_io_write;
+                if (type == META_FLUSH) {
+                        /* f2fs_end_io_write() completes this on-stack waiter */
+                        DECLARE_COMPLETION_ONSTACK(wait);
+                        p->is_sync = true;
+                        p->wait = &wait;
+                        submit_bio(rw, sbi->bio[btype]);
+                        wait_for_completion(&wait);
+                } else {
+                        p->is_sync = false;
+                        submit_bio(rw, sbi->bio[btype]);
+                }
+                sbi->bio[btype] = NULL;
+        }
+}
+/*
+ * Locked wrapper around do_submit_bio(): submits the pending bio of @type
+ * while holding sbi->bio_sem for writing.
+ */
+void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync)
+{
+        down_write(&sbi->bio_sem);
+        do_submit_bio(sbi, type, sync);
+        up_write(&sbi->bio_sem);
+}
+/*
+ * Add @page, to be written at @blk_addr, to the bio being built for @type.
+ * Runs under sbi->bio_sem held for writing and counts the page in
+ * F2FS_WRITEBACK.  A pending bio whose last block is not @blk_addr - 1 is
+ * submitted first so that each bio covers consecutive blocks.  If the page
+ * cannot be added in full, the current bio is submitted and a new one is
+ * allocated for the page.
+ */
+static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page,
+                                block_t blk_addr, enum page_type type)
+{
+        struct block_device *bdev = sbi->sb->s_bdev;
+        verify_block_addr(sbi, blk_addr);
+        down_write(&sbi->bio_sem);
+        inc_page_count(sbi, F2FS_WRITEBACK);
+        /* not contiguous with the pending bio: send that one out first */
+        if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1)
+                do_submit_bio(sbi, type, false);
+alloc_new:
+        if (sbi->bio[type] == NULL) {
+                sbi->bio[type] = f2fs_bio_alloc(bdev, bio_get_nr_vecs(bdev));
+                sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
+                /*
+                 * The end_io will be assigned at the submission phase.
+                 * Until then, let bio_add_page() merge consecutive IOs as much
+                 * as possible.
+                 */
+        }
+        /* bio cannot take the whole page: submit it and retry on a new bio */
+        if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) <
+                                                        PAGE_CACHE_SIZE) {
+                do_submit_bio(sbi, type, false);
+                goto alloc_new;
+        }
+        sbi->last_block_in_bio[type] = blk_addr;
+        up_write(&sbi->bio_sem);
+}
+/*
+ * Return true while next_blkoff of the current segment of @type is still
+ * below sbi->blocks_per_seg.
+ */
+static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
+{
+        struct curseg_info *seg = CURSEG_I(sbi, type);
+        return seg->next_blkoff < sbi->blocks_per_seg;
+}
+/*
+ * Log selection used when two logs are active: DATA pages go to
+ * CURSEG_HOT_DATA and every other page type goes to CURSEG_HOT_NODE.
+ */
+static int __get_segment_type_2(struct page *page, enum page_type p_type)
+{
+        return (p_type == DATA) ? CURSEG_HOT_DATA : CURSEG_HOT_NODE;
+}
+/*
+ * Log selection used when four logs are active.
+ * DATA pages: hot if the owning inode is a directory, cold otherwise.
+ * Other pages: hot if IS_DNODE() holds and is_cold_node() does not, cold
+ * otherwise.
+ */
+static int __get_segment_type_4(struct page *page, enum page_type p_type)
+{
+        if (p_type != DATA) {
+                if (IS_DNODE(page) && !is_cold_node(page))
+                        return CURSEG_HOT_NODE;
+                return CURSEG_COLD_NODE;
+        }
+        if (S_ISDIR(page->mapping->host->i_mode))
+                return CURSEG_HOT_DATA;
+        return CURSEG_COLD_DATA;
+}
+/*
+ * Log selection used when six logs are active.
+ * DATA pages: hot for directories, cold when is_cold_data() or
+ * is_cold_file() holds, warm otherwise.
+ * Other pages: cold unless IS_DNODE() holds; then warm when is_cold_node()
+ * holds and hot otherwise.
+ */
+static int __get_segment_type_6(struct page *page, enum page_type p_type)
+{
+        struct inode *inode;
+        if (p_type != DATA) {
+                if (!IS_DNODE(page))
+                        return CURSEG_COLD_NODE;
+                if (is_cold_node(page))
+                        return CURSEG_WARM_NODE;
+                return CURSEG_HOT_NODE;
+        }
+        inode = page->mapping->host;
+        if (S_ISDIR(inode->i_mode))
+                return CURSEG_HOT_DATA;
+        if (is_cold_data(page) || is_cold_file(inode))
+                return CURSEG_COLD_DATA;
+        return CURSEG_WARM_DATA;
+}
+/*
+ * Choose the current-segment type for @page according to the number of
+ * active logs of its filesystem.  Only 2, 4 and 6 are handled; any other
+ * value ends in BUG().
+ */
+static int __get_segment_type(struct page *page, enum page_type p_type)
+{
+        struct f2fs_sb_info *fsi = F2FS_SB(page->mapping->host->i_sb);
+        switch (fsi->active_logs) {
+        case 6:
+                return __get_segment_type_6(page, p_type);
+        case 4:
+                return __get_segment_type_4(page, p_type);
+        case 2:
+                return __get_segment_type_2(page, p_type);
+        default:
+                BUG();
+        }
+}
+/*
+ * Take the next free block of the log selected for @page, record @sum for
+ * it, update the SIT for the old and new addresses, and queue the page for
+ * writing at the new address.
+ * @old_blkaddr: address being replaced; passed to refresh_sit_entry()
+ * @new_blkaddr: output; receives the block address the page is written to
+ * @sum: summary entry stored for the new block
+ * @p_type: NODE or DATA for the callers in this file
+ * curseg_mutex is held for the whole operation; sentry_lock is taken inside
+ * it around the SIT update and the possible segment switch.
+ */
+static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
+                        block_t old_blkaddr, block_t *new_blkaddr,
+                        struct f2fs_summary *sum, enum page_type p_type)
+{
+        struct sit_info *sit_i = SIT_I(sbi);
+        struct curseg_info *curseg;
+        unsigned int old_cursegno;
+        int type;
+        type = __get_segment_type(page, p_type);
+        curseg = CURSEG_I(sbi, type);
+        mutex_lock(&curseg->curseg_mutex);
+        *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
+        old_cursegno = curseg->segno;
+        /*
+         * __add_sum_entry should be resided under the curseg_mutex
+         * because, this function updates a summary entry in the
+         * current summary block.
+         */
+        __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+        mutex_lock(&sit_i->sentry_lock);
+        __refresh_next_blkoff(sbi, curseg);
+        sbi->block_count[curseg->alloc_type]++;
+        /*
+         * SIT information should be updated before segment allocation,
+         * since SSR needs latest valid block information.
+         */
+        refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
+        /* the log ran out of block offsets: switch to another segment */
+        if (!__has_curseg_space(sbi, type))
+                sit_i->s_ops->allocate_segment(sbi, type, false);
+        locate_dirty_segment(sbi, old_cursegno);
+        locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
+        mutex_unlock(&sit_i->sentry_lock);
+        /* node pages get the log's next free block address in their footer */
+        if (p_type == NODE)
+                fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
+        /* writeout dirty page into bdev */
+        submit_write_page(sbi, page, *new_blkaddr, p_type);
+        mutex_unlock(&curseg->curseg_mutex);
+}
+/*
+ * Queue a meta page for writing at the block address given by page->index.
+ * Returns AOP_WRITEPAGE_ACTIVATE without writing when the request comes from
+ * reclaim (wbc->for_reclaim), otherwise 0 after the page has been put under
+ * writeback and handed to submit_write_page().
+ */
+int write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
+                        struct writeback_control *wbc)
+{
+        if (wbc->for_reclaim)
+                return AOP_WRITEPAGE_ACTIVATE;
+        set_page_writeback(page);
+        submit_write_page(sbi, page, page->index, META);
+        return 0;
+}
+/*
+ * Write a node page to a newly allocated block via do_write_page().
+ * The summary carries @nid; the remaining two summary arguments are zero.
+ * The chosen block address is returned through @new_blkaddr.
+ */
+void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
+                unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr)
+{
+        struct f2fs_summary sum;
+        set_summary(&sum, nid, 0, 0);
+        do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE);
+}
+/*
+ * Write a data page to a newly allocated block via do_write_page().
+ * The summary is built from dn->nid, dn->ofs_in_node and the version found
+ * in the node info of dn->nid.  @old_blkaddr must not be NULL_ADDR (BUG_ON).
+ * The chosen block address is returned through @new_blkaddr.
+ */
+void write_data_page(struct inode *inode, struct page *page,
+                struct dnode_of_data *dn, block_t old_blkaddr,
+                block_t *new_blkaddr)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        struct f2fs_summary sum;
+        struct node_info ni;
+        BUG_ON(old_blkaddr == NULL_ADDR);
+        get_node_info(sbi, dn->nid, &ni);
+        set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
+        do_write_page(sbi, page, old_blkaddr,
+                        new_blkaddr, &sum, DATA);
+}
+/*
+ * Queue a data page for writing at its existing block address
+ * @old_blk_addr.  No block is allocated and no summary or SIT update is
+ * made here.
+ */
+void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page,
+                                        block_t old_blk_addr)
+{
+        submit_write_page(sbi, page, old_blk_addr, DATA);
+}
+/*
+ * Make the in-memory segment state reflect that a block lives at
+ * @new_blkaddr instead of @old_blkaddr, without issuing any I/O here.
+ * The log is chosen from the type of the segment containing @new_blkaddr.
+ * If that segment has no valid blocks and is not a current segment, cold
+ * data is used when @old_blkaddr is NULL_ADDR and warm data otherwise.
+ * The chosen log is switched to that segment if needed, its next_blkoff is
+ * set to the block's offset, @sum is recorded and the SIT is refreshed.
+ * @page is not used by this function.
+ */
+void recover_data_page(struct f2fs_sb_info *sbi,
+                        struct page *page, struct f2fs_summary *sum,
+                        block_t old_blkaddr, block_t new_blkaddr)
+{
+        struct sit_info *sit_i = SIT_I(sbi);
+        struct curseg_info *curseg;
+        unsigned int segno, old_cursegno;
+        struct seg_entry *se;
+        int type;
+        segno = GET_SEGNO(sbi, new_blkaddr);
+        se = get_seg_entry(sbi, segno);
+        type = se->type;
+        if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) {
+                if (old_blkaddr == NULL_ADDR)
+                        type = CURSEG_COLD_DATA;
+                else
+                        type = CURSEG_WARM_DATA;
+        }
+        curseg = CURSEG_I(sbi, type);
+        /* same lock order as do_write_page(): curseg_mutex, then sentry_lock */
+        mutex_lock(&curseg->curseg_mutex);
+        mutex_lock(&sit_i->sentry_lock);
+        old_cursegno = curseg->segno;
+        /* change the current segment */
+        if (segno != curseg->segno) {
+                curseg->next_segno = segno;
+                change_curseg(sbi, type, true);
+        }
+        /* offset of new_blkaddr inside its segment */
+        curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
+                                        (sbi->blocks_per_seg - 1);
+        __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+        refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
+        locate_dirty_segment(sbi, old_cursegno);
+        locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
+        mutex_unlock(&sit_i->sentry_lock);
+        mutex_unlock(&curseg->curseg_mutex);
+}
+/*
+ * Write a node page at the given address @new_blkaddr using the
+ * CURSEG_WARM_NODE log.
+ * The log is first pointed at @new_blkaddr so that @sum is recorded for that
+ * block, and then moved on to the address returned by
+ * next_blkaddr_of_node() for @page.  The page is submitted and the NODE bio
+ * is flushed synchronously before the SIT is refreshed.
+ * Both curseg_mutex and sentry_lock are held across the whole sequence,
+ * including the bio submission.
+ */
+void rewrite_node_page(struct f2fs_sb_info *sbi,
+                        struct page *page, struct f2fs_summary *sum,
+                        block_t old_blkaddr, block_t new_blkaddr)
+{
+        struct sit_info *sit_i = SIT_I(sbi);
+        int type = CURSEG_WARM_NODE;
+        struct curseg_info *curseg;
+        unsigned int segno, old_cursegno;
+        block_t next_blkaddr = next_blkaddr_of_node(page);
+        unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr);
+        curseg = CURSEG_I(sbi, type);
+        mutex_lock(&curseg->curseg_mutex);
+        mutex_lock(&sit_i->sentry_lock);
+        segno = GET_SEGNO(sbi, new_blkaddr);
+        old_cursegno = curseg->segno;
+        /* change the current segment */
+        if (segno != curseg->segno) {
+                curseg->next_segno = segno;
+                change_curseg(sbi, type, true);
+        }
+        /* offset of new_blkaddr inside its segment */
+        curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
+                                        (sbi->blocks_per_seg - 1);
+        __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+        /* change the current log to the next block addr in advance */
+        if (next_segno != segno) {
+                curseg->next_segno = next_segno;
+                change_curseg(sbi, type, true);
+        }
+        curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) &
+                                        (sbi->blocks_per_seg - 1);
+        /* rewrite node page */
+        set_page_writeback(page);
+        submit_write_page(sbi, page, new_blkaddr, NODE);
+        f2fs_submit_bio(sbi, NODE, true);
+        refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
+        locate_dirty_segment(sbi, old_cursegno);
+        locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
+        mutex_unlock(&sit_i->sentry_lock);
+        mutex_unlock(&curseg->curseg_mutex);
+}
+/*
+ * Restore the data logs from the compacted summary area that begins at
+ * start_sum_block(sbi).
+ * Layout read from the first page: SUM_JOURNAL_SIZE bytes copied to the hot
+ * data summary block starting at n_nats, the next SUM_JOURNAL_SIZE bytes
+ * copied to the cold data summary block starting at n_sits, followed by the
+ * summary entries of the logs CURSEG_HOT_DATA..CURSEG_COLD_DATA packed back
+ * to back.  Entries continue at offset 0 of the following meta page once
+ * another entry would no longer fit in front of the footer area.
+ * Always returns 0.
+ */
+static int read_compacted_summaries(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+        struct curseg_info *seg_i;
+        unsigned char *kaddr;
+        struct page *page;
+        block_t start;
+        int i, j, offset;
+        start = start_sum_block(sbi);
+        page = get_meta_page(sbi, start++);
+        kaddr = (unsigned char *)page_address(page);
+        /* Step 1: restore nat cache */
+        seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
+        memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE);
+        /* Step 2: restore sit cache */
+        seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
+        memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE,
+                                                SUM_JOURNAL_SIZE);
+        offset = 2 * SUM_JOURNAL_SIZE;
+        /* Step 3: restore summary entries */
+        for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+                unsigned short blk_off;
+                unsigned int segno;
+                seg_i = CURSEG_I(sbi, i);
+                segno = le32_to_cpu(ckpt->cur_data_segno[i]);
+                blk_off = le16_to_cpu(ckpt->cur_data_blkoff[i]);
+                seg_i->next_segno = segno;
+                reset_curseg(sbi, i, 0);
+                seg_i->alloc_type = ckpt->alloc_type[i];
+                seg_i->next_blkoff = blk_off;
+                /* an SSR log has entries for the whole segment stored */
+                if (seg_i->alloc_type == SSR)
+                        blk_off = sbi->blocks_per_seg;
+                for (j = 0; j < blk_off; j++) {
+                        struct f2fs_summary *s;
+                        s = (struct f2fs_summary *)(kaddr + offset);
+                        seg_i->sum_blk->entries[j] = *s;
+                        offset += SUMMARY_SIZE;
+                        /* stay on this page while one more entry fits */
+                        if (offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
+                                                SUM_FOOTER_SIZE)
+                                continue;
+                        f2fs_put_page(page, 1);
+                        page = NULL;
+                        page = get_meta_page(sbi, start++);
+                        kaddr = (unsigned char *)page_address(page);
+                        offset = 0;
+                }
+        }
+        f2fs_put_page(page, 1);
+        return 0;
+}
+/*
+ * Restore the current segment of @type from a full summary block.
+ * The segment number and block offset come from the checkpoint.  The block
+ * address of the summary depends on the log kind and on CP_UMOUNT_FLAG:
+ *   data log: sum_blk_addr() with NR_CURSEG_TYPE (flag set) or
+ *             NR_CURSEG_DATA_TYPE (flag clear) as base
+ *   node log: sum_blk_addr() with NR_CURSEG_NODE_TYPE (flag set) or the
+ *             segment's own summary block GET_SUM_BLOCK() (flag clear)
+ * For node logs the entries are then either reset (version and ofs_in_node
+ * zeroed, flag set) or rebuilt by restore_node_summary() (flag clear).
+ * Returns 0 on success and -EINVAL when restore_node_summary() fails.
+ */
+static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
+{
+        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+        struct f2fs_summary_block *sum;
+        struct curseg_info *curseg;
+        struct page *new;
+        unsigned short blk_off;
+        unsigned int segno = 0;
+        block_t blk_addr = 0;
+        /* get segment number and block addr */
+        if (IS_DATASEG(type)) {
+                segno = le32_to_cpu(ckpt->cur_data_segno[type]);
+                blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type -
+                                                        CURSEG_HOT_DATA]);
+                if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
+                        blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type);
+                else
+                        blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type);
+        } else {
+                segno = le32_to_cpu(ckpt->cur_node_segno[type -
+                                                        CURSEG_HOT_NODE]);
+                blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type -
+                                                        CURSEG_HOT_NODE]);
+                if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
+                        blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE,
+                                                        type - CURSEG_HOT_NODE);
+                else
+                        blk_addr = GET_SUM_BLOCK(sbi, segno);
+        }
+        new = get_meta_page(sbi, blk_addr);
+        sum = (struct f2fs_summary_block *)page_address(new);
+        if (IS_NODESEG(type)) {
+                if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) {
+                        struct f2fs_summary *ns = &sum->entries[0];
+                        int i;
+                        /* clear version and ofs_in_node of every entry */
+                        for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
+                                ns->version = 0;
+                                ns->ofs_in_node = 0;
+                        }
+                } else {
+                        if (restore_node_summary(sbi, segno, sum)) {
+                                f2fs_put_page(new, 1);
+                                return -EINVAL;
+                        }
+                }
+        }
+        /* set uncompleted segment to curseg */
+        curseg = CURSEG_I(sbi, type);
+        mutex_lock(&curseg->curseg_mutex);
+        memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE);
+        curseg->next_segno = segno;
+        reset_curseg(sbi, type, 0);
+        curseg->alloc_type = ckpt->alloc_type[type];
+        curseg->next_blkoff = blk_off;
+        mutex_unlock(&curseg->curseg_mutex);
+        f2fs_put_page(new, 1);
+        return 0;
+}
+/*
+ * Load the summaries of all current segments.
+ * With CP_COMPACT_SUM_FLAG set, the data logs are restored by
+ * read_compacted_summaries() and only the logs from CURSEG_HOT_NODE on are
+ * read individually; otherwise every log from CURSEG_HOT_DATA to
+ * CURSEG_COLD_NODE is read individually.
+ * Returns 0 on success or -EINVAL as soon as a helper reports failure.
+ */
+static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
+{
+        int first = CURSEG_HOT_DATA;
+        int type;
+        if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
+                /* restore for compacted data summary */
+                if (read_compacted_summaries(sbi))
+                        return -EINVAL;
+                first = CURSEG_HOT_NODE;
+        }
+        for (type = first; type <= CURSEG_COLD_NODE; type++) {
+                if (read_normal_summaries(sbi, type))
+                        return -EINVAL;
+        }
+        return 0;
+}
+/*
+ * Write the data logs in compacted form to meta pages starting at @blkaddr.
+ * The first page begins with SUM_JOURNAL_SIZE bytes taken from the hot data
+ * summary block at n_nats, followed by SUM_JOURNAL_SIZE bytes taken from the
+ * cold data summary block at n_sits.  The summary entries of the logs
+ * CURSEG_HOT_DATA..CURSEG_COLD_DATA follow back to back; a new meta page is
+ * started at offset 0 once another entry would no longer fit in front of
+ * the footer area.  Pages are marked dirty and released, not submitted here.
+ */
+static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+        struct page *page;
+        unsigned char *kaddr;
+        struct f2fs_summary *summary;
+        struct curseg_info *seg_i;
+        int written_size = 0;
+        int i, j;
+        page = grab_meta_page(sbi, blkaddr++);
+        kaddr = (unsigned char *)page_address(page);
+        /* Step 1: write nat cache */
+        seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
+        memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE);
+        written_size += SUM_JOURNAL_SIZE;
+        /* Step 2: write sit cache */
+        seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
+        memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits,
+                                                SUM_JOURNAL_SIZE);
+        written_size += SUM_JOURNAL_SIZE;
+        set_page_dirty(page);
+        /* Step 3: write summary entries */
+        for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+                unsigned short blkoff;
+                seg_i = CURSEG_I(sbi, i);
+                /* SSR logs store entries for the whole segment */
+                if (sbi->ckpt->alloc_type[i] == SSR)
+                        blkoff = sbi->blocks_per_seg;
+                else
+                        blkoff = curseg_blkoff(sbi, i);
+                for (j = 0; j < blkoff; j++) {
+                        /* previous page was full and released: get the next */
+                        if (!page) {
+                                page = grab_meta_page(sbi, blkaddr++);
+                                kaddr = (unsigned char *)page_address(page);
+                                written_size = 0;
+                        }
+                        summary = (struct f2fs_summary *)(kaddr + written_size);
+                        *summary = seg_i->sum_blk->entries[j];
+                        written_size += SUMMARY_SIZE;
+                        set_page_dirty(page);
+                        /* keep filling this page while one more entry fits */
+                        if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
+                                                        SUM_FOOTER_SIZE)
+                                continue;
+                        f2fs_put_page(page, 1);
+                        page = NULL;
+                }
+        }
+        if (page)
+                f2fs_put_page(page, 1);
+}
+/*
+ * Write the full summary blocks of consecutive logs, starting with @type,
+ * to consecutive block addresses starting at @blkaddr.  The number of logs
+ * is NR_CURSEG_DATA_TYPE when @type is a data log and NR_CURSEG_NODE_TYPE
+ * otherwise.  Each summary block is written under its curseg_mutex.
+ */
+static void write_normal_summaries(struct f2fs_sb_info *sbi,
+                                        block_t blkaddr, int type)
+{
+        int nr = IS_DATASEG(type) ? NR_CURSEG_DATA_TYPE : NR_CURSEG_NODE_TYPE;
+        int ofs;
+        for (ofs = 0; ofs < nr; ofs++) {
+                struct curseg_info *seg = CURSEG_I(sbi, type + ofs);
+                mutex_lock(&seg->curseg_mutex);
+                write_sum_page(sbi, seg->sum_blk, blkaddr + ofs);
+                mutex_unlock(&seg->curseg_mutex);
+        }
+}
+/*
+ * Write the summaries of the data logs starting at @start_blk, in compacted
+ * form when CP_COMPACT_SUM_FLAG is set in the checkpoint and as full summary
+ * blocks otherwise.
+ */
+void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
+{
+        if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
+                write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA);
+                return;
+        }
+        write_compacted_summaries(sbi, start_blk);
+}
+/*
+ * Write the full summary blocks of the node logs starting at @start_blk.
+ * Nothing is written unless CP_UMOUNT_FLAG is set in the checkpoint.
+ */
+void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
+{
+        if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG))
+                return;
+        write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
+}
+/*
+ * Look up @val in the journal of summary block @sum.
+ * @type: NAT_JOURNAL (compare against nids) or SIT_JOURNAL (compare against
+ *        segment numbers); any other value yields -1
+ * @alloc: when non-zero and no entry matches, a slot is requested with
+ *         update_nats_in_cursum()/update_sits_in_cursum() as long as the
+ *         journal holds fewer than its maximum number of entries
+ * Returns the index of the matching entry, the value returned by the update
+ * helper when a slot was requested, or -1.
+ */
+int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
+                                        unsigned int val, int alloc)
+{
+        int idx;
+        if (type == NAT_JOURNAL) {
+                for (idx = 0; idx < nats_in_cursum(sum); idx++)
+                        if (le32_to_cpu(nid_in_journal(sum, idx)) == val)
+                                return idx;
+                if (!alloc || nats_in_cursum(sum) >= NAT_JOURNAL_ENTRIES)
+                        return -1;
+                return update_nats_in_cursum(sum, 1);
+        }
+        if (type == SIT_JOURNAL) {
+                for (idx = 0; idx < sits_in_cursum(sum); idx++)
+                        if (le32_to_cpu(segno_in_journal(sum, idx)) == val)
+                                return idx;
+                if (!alloc || sits_in_cursum(sum) >= SIT_JOURNAL_ENTRIES)
+                        return -1;
+                return update_sits_in_cursum(sum, 1);
+        }
+        return -1;
+}
+/*
+ * Return the meta page holding the SIT block that covers @segno.
+ * The block address is sit_base_addr plus the block offset of @segno; when
+ * the bit for that offset is set in sit_bitmap, sit_blocks is added on top.
+ */
+static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
+                                        unsigned int segno)
+{
+        struct sit_info *sit = SIT_I(sbi);
+        unsigned int blk_ofs = SIT_BLOCK_OFFSET(sit, segno);
+        block_t addr;
+        check_seg_range(sbi, segno);
+        /* calculate sit block address */
+        addr = sit->sit_base_addr + blk_ofs;
+        if (f2fs_test_bit(blk_ofs, sit->sit_bitmap))
+                addr += sit->sit_blocks;
+        return get_meta_page(sbi, addr);
+}
+/*
+ * Prepare the alternate copy of the SIT block that covers segment @start.
+ * The current block (current_sit_addr()) is read, its contents are copied to
+ * the page at next_sit_addr(), the destination page is marked dirty, and
+ * set_to_next_sit() is called for @start.  The source page must not be dirty
+ * (BUG_ON).  Returns the destination page; the caller releases it.
+ */
+static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
+                                        unsigned int start)
+{
+        struct sit_info *sit_i = SIT_I(sbi);
+        struct page *src_page, *dst_page;
+        pgoff_t src_off, dst_off;
+        void *src_addr, *dst_addr;
+        src_off = current_sit_addr(sbi, start);
+        dst_off = next_sit_addr(sbi, src_off);
+        /* get current sit block page without lock */
+        src_page = get_meta_page(sbi, src_off);
+        dst_page = grab_meta_page(sbi, dst_off);
+        BUG_ON(PageDirty(src_page));
+        src_addr = page_address(src_page);
+        dst_addr = page_address(dst_page);
+        memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
+        set_page_dirty(dst_page);
+        f2fs_put_page(src_page, 1);
+        set_to_next_sit(sit_i, start);
+        return dst_page;
+}
+/*
+ * If the SIT journal in the cold data summary block is full, mark every
+ * journalled segment as having a dirty SIT entry and empty the journal.
+ * Returns true when the journal was emptied, false when it was left alone.
+ */
+static bool flush_sits_in_journal(struct f2fs_sb_info *sbi)
+{
+        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
+        struct f2fs_summary_block *sum = curseg->sum_blk;
+        int idx;
+        /*
+         * If the journal area in the current summary is full of sit entries,
+         * all the sit entries will be flushed. Otherwise the sit entries
+         * are not able to replace with newly hot sit entries.
+         */
+        if (sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES)
+                return false;
+        /* walk the journal from its last entry down to the first */
+        idx = sits_in_cursum(sum);
+        while (--idx >= 0) {
+                unsigned int segno = le32_to_cpu(segno_in_journal(sum, idx));
+                __mark_sit_entry_dirty(sbi, segno);
+        }
+        update_sits_in_cursum(sum, -sits_in_cursum(sum));
+        return true;
+}
+/*
+ * CP calls this function, which flushes SIT entries including sit_journal,
+ * and moves prefree segs to free segs.
+ * Every segment set in dirty_sentries_bitmap is written either into the SIT
+ * journal of the cold data summary block or, when the journal was just
+ * emptied or has no slot left, into the next copy of its SIT block.
+ * Runs under the cold data curseg_mutex and sentry_lock.
+ */
+void flush_sit_entries(struct f2fs_sb_info *sbi)
+{
+        struct sit_info *sit_i = SIT_I(sbi);
+        unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
+        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
+        struct f2fs_summary_block *sum = curseg->sum_blk;
+        unsigned long nsegs = TOTAL_SEGS(sbi);
+        struct page *page = NULL;
+        struct f2fs_sit_block *raw_sit = NULL;
+        unsigned int start = 0, end = 0;
+        unsigned int segno = -1;
+        bool flushed;
+        mutex_lock(&curseg->curseg_mutex);
+        mutex_lock(&sit_i->sentry_lock);
+        /*
+         * "flushed" indicates whether sit entries in journal are flushed
+         * to the SIT area or not.
+         */
+        flushed = flush_sits_in_journal(sbi);
+        /* segno starts at -1 so that the first search begins at bit 0 */
+        while ((segno = find_next_bit(bitmap, nsegs, segno + 1)) < nsegs) {
+                struct seg_entry *se = get_seg_entry(sbi, segno);
+                int sit_offset, offset;
+                sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
+                /* journal was just emptied: write straight to the SIT block */
+                if (flushed)
+                        goto to_sit_page;
+                offset = lookup_journal_in_cursum(sum, SIT_JOURNAL, segno, 1);
+                if (offset >= 0) {
+                        segno_in_journal(sum, offset) = cpu_to_le32(segno);
+                        seg_info_to_raw_sit(se, &sit_in_journal(sum, offset));
+                        goto flush_done;
+                }
+to_sit_page:
+                /* switch pages when segno lies outside the cached SIT block */
+                if (!page || (start > segno) || (segno > end)) {
+                        if (page) {
+                                f2fs_put_page(page, 1);
+                                page = NULL;
+                        }
+                        start = START_SEGNO(sit_i, segno);
+                        end = start + SIT_ENTRY_PER_BLOCK - 1;
+                        /* read sit block that will be updated */
+                        page = get_next_sit_page(sbi, start);
+                        raw_sit = page_address(page);
+                }
+                /* update entry in SIT block */
+                seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]);
+flush_done:
+                __clear_bit(segno, bitmap);
+                sit_i->dirty_sentries--;
+        }
+        mutex_unlock(&sit_i->sentry_lock);
+        mutex_unlock(&curseg->curseg_mutex);
+        /* writeout last modified SIT block */
+        /* NOTE(review): page may still be NULL here; presumably f2fs_put_page() tolerates that - confirm */
+        f2fs_put_page(page, 1);
+        set_prefree_as_free_segments(sbi);
+}
+/*
+ * Allocate and initialise the in-memory SIT information of @sbi.
+ * Allocates the per-segment entries with their two valid-block bitmaps, the
+ * dirty-entry bitmap, the per-section entries (only when a section spans
+ * more than one segment) and a private copy of the SIT bitmap taken from the
+ * checkpoint.
+ * Returns 0 on success or -ENOMEM.  Nothing is freed on the error paths in
+ * this function; sit_i is already linked into SM_I(sbi) at that point, so
+ * cleanup is presumably left to the caller - TODO confirm.
+ */
+static int build_sit_info(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+        struct sit_info *sit_i;
+        unsigned int sit_segs, start;
+        char *src_bitmap, *dst_bitmap;
+        unsigned int bitmap_size;
+        /* allocate memory for SIT information */
+        sit_i = kzalloc(sizeof(struct sit_info), GFP_KERNEL);
+        if (!sit_i)
+                return -ENOMEM;
+        SM_I(sbi)->sit_info = sit_i;
+        sit_i->sentries = vzalloc(TOTAL_SEGS(sbi) * sizeof(struct seg_entry));
+        if (!sit_i->sentries)
+                return -ENOMEM;
+        bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+        sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+        if (!sit_i->dirty_sentries_bitmap)
+                return -ENOMEM;
+        /* two valid-block bitmaps per segment: current and checkpointed */
+        for (start = 0; start < TOTAL_SEGS(sbi); start++) {
+                sit_i->sentries[start].cur_valid_map
+                        = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+                sit_i->sentries[start].ckpt_valid_map
+                        = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+                if (!sit_i->sentries[start].cur_valid_map
+                                || !sit_i->sentries[start].ckpt_valid_map)
+                        return -ENOMEM;
+        }
+        if (sbi->segs_per_sec > 1) {
+                sit_i->sec_entries = vzalloc(sbi->total_sections *
+                                        sizeof(struct sec_entry));
+                if (!sit_i->sec_entries)
+                        return -ENOMEM;
+        }
+        /* get information related with SIT */
+        /* half of segment_count_sit is used as the size of one SIT copy */
+        sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1;
+        /* setup SIT bitmap from checkpoint pack */
+        bitmap_size = __bitmap_size(sbi, SIT_BITMAP);
+        src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP);
+        dst_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+        if (!dst_bitmap)
+                return -ENOMEM;
+        memcpy(dst_bitmap, src_bitmap, bitmap_size);
+        /* init SIT information */
+        sit_i->s_ops = &default_salloc_ops;
+        sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
+        sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
+        sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count);
+        sit_i->sit_bitmap = dst_bitmap;
+        sit_i->bitmap_size = bitmap_size;
+        sit_i->dirty_sentries = 0;
+        sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
+        sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
+        sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec;
+        mutex_init(&sit_i->sentry_lock);
+        return 0;
+}
+/*
+ * Set up the free segment/section bitmaps of the segment manager.
+ * Every bit starts out set, i.e. nothing is marked free yet, and both free
+ * counters start at zero. Returns 0 on success or -ENOMEM; on failure the
+ * partially built state stays attached to the segment manager.
+ */
+static int build_free_segmap(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_sm_info *sm_info = SM_I(sbi);
+        unsigned int seg_bytes = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+        unsigned int sec_bytes = f2fs_bitmap_size(sbi->total_sections);
+        struct free_segmap_info *info;
+        info = kzalloc(sizeof(*info), GFP_KERNEL);
+        if (info == NULL)
+                return -ENOMEM;
+        /* attach right away; destroy_free_segmap() releases it from here */
+        sm_info->free_info = info;
+        info->free_segmap = kmalloc(seg_bytes, GFP_KERNEL);
+        if (info->free_segmap == NULL)
+                return -ENOMEM;
+        info->free_secmap = kmalloc(sec_bytes, GFP_KERNEL);
+        if (info->free_secmap == NULL)
+                return -ENOMEM;
+        /* a set bit means "in use": start with everything in use */
+        memset(info->free_segmap, 0xff, seg_bytes);
+        memset(info->free_secmap, 0xff, sec_bytes);
+        info->free_segments = 0;
+        info->free_sections = 0;
+        info->start_segno =
+                (unsigned int) GET_SEGNO_FROM_SEG0(sbi, sm_info->main_blkaddr);
+        rwlock_init(&info->segmap_lock);
+        return 0;
+}
+/*
+ * Allocate the array of NR_CURSEG_TYPE current-segment descriptors, give
+ * each one a page-sized summary block buffer, and then restore the summaries
+ * through restore_curseg_summaries(). Returns 0 or a negative errno; on
+ * -ENOMEM the partially built array stays attached to the segment manager.
+ */
+static int build_curseg(struct f2fs_sb_info *sbi)
+{
+        struct curseg_info *segs, *cur;
+        segs = kzalloc(sizeof(*segs) * NR_CURSEG_TYPE, GFP_KERNEL);
+        if (segs == NULL)
+                return -ENOMEM;
+        SM_I(sbi)->curseg_array = segs;
+        for (cur = segs; cur < segs + NR_CURSEG_TYPE; cur++) {
+                mutex_init(&cur->curseg_mutex);
+                cur->sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+                if (cur->sum_blk == NULL)
+                        return -ENOMEM;
+                /* no segment is assigned to this log yet */
+                cur->next_blkoff = 0;
+                cur->segno = NULL_SEGNO;
+        }
+        return restore_curseg_summaries(sbi);
+}
+/*
+ * Fill the in-memory segment entries from the stored SIT.
+ * For each segment the raw entry is taken from the SIT journal kept in the
+ * cold-data current segment's summary block when that journal has an entry
+ * for the segment, otherwise from the segment's current SIT block. Each raw
+ * entry is checked with check_block_count() before being converted, and when
+ * a section holds more than one segment its valid block count is also added
+ * to the owning section entry.
+ */
+static void build_sit_entries(struct f2fs_sb_info *sbi)
+{
+        struct sit_info *sit_i = SIT_I(sbi);
+        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
+        struct f2fs_summary_block *sum = curseg->sum_blk;
+        unsigned int start;
+        for (start = 0; start < TOTAL_SEGS(sbi); start++) {
+                struct seg_entry *se = &sit_i->sentries[start];
+                struct f2fs_sit_block *sit_blk;
+                struct f2fs_sit_entry sit;
+                struct page *page;
+                int i;
+                /* first look for this segment in the journal */
+                mutex_lock(&curseg->curseg_mutex);
+                for (i = 0; i < sits_in_cursum(sum); i++) {
+                        if (le32_to_cpu(segno_in_journal(sum, i)) == start) {
+                                sit = sit_in_journal(sum, i);
+                                /* drop the lock before leaving the search */
+                                mutex_unlock(&curseg->curseg_mutex);
+                                goto got_it;
+                        }
+                }
+                mutex_unlock(&curseg->curseg_mutex);
+                /* not journaled: copy the entry out of its SIT block */
+                page = get_current_sit_page(sbi, start);
+                sit_blk = (struct f2fs_sit_block *)page_address(page);
+                sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
+                /* NOTE(review): the 1 presumably asks for an unlock - confirm */
+                f2fs_put_page(page, 1);
+got_it:
+                check_block_count(sbi, start, &sit);
+                seg_info_from_raw_sit(se, &sit);
+                if (sbi->segs_per_sec > 1) {
+                        struct sec_entry *e = get_sec_entry(sbi, start);
+                        e->valid_blocks += se->valid_blocks;
+                }
+        }
+}
+/*
+ * Derive the free segment map from the loaded segment entries: segments
+ * with no valid blocks are marked free, then the segments of the current
+ * logs (CURSEG_HOT_DATA through CURSEG_COLD_NODE) are marked in use.
+ */
+static void init_free_segmap(struct f2fs_sb_info *sbi)
+{
+        unsigned int segno;
+        int log;
+        for (segno = 0; segno < TOTAL_SEGS(sbi); segno++) {
+                if (get_seg_entry(sbi, segno)->valid_blocks == 0)
+                        __set_free(sbi, segno);
+        }
+        /* mark the current segments as in use */
+        for (log = CURSEG_HOT_DATA; log <= CURSEG_COLD_NODE; log++)
+                __set_test_and_inuse(sbi, CURSEG_I(sbi, log)->segno);
+}
+/*
+ * Walk every in-use segment of the free segment map and register those
+ * that are only partially valid (neither empty nor completely full) as
+ * DIRTY.
+ */
+static void init_dirty_segmap(struct f2fs_sb_info *sbi)
+{
+        struct free_segmap_info *free_i = FREE_I(sbi);
+        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+        unsigned int segno = 0, next = 0;
+        unsigned short vblocks;
+        while (segno < TOTAL_SEGS(sbi)) {
+                /* next in-use segment at or after the scan position */
+                segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), next);
+                if (segno >= TOTAL_SEGS(sbi))
+                        break;
+                next = segno + 1;
+                vblocks = get_valid_blocks(sbi, segno, 0);
+                if (vblocks && vblocks < sbi->blocks_per_seg) {
+                        mutex_lock(&dirty_i->seglist_lock);
+                        __locate_dirty_segment(sbi, segno, DIRTY);
+                        mutex_unlock(&dirty_i->seglist_lock);
+                }
+        }
+}
+/*
+ * Allocate the zeroed victim bitmaps for foreground and background GC,
+ * one bit per main-area segment. Both allocations are always attempted;
+ * returns -ENOMEM if either one failed, 0 otherwise.
+ */
+static int init_victim_segmap(struct f2fs_sb_info *sbi)
+{
+        unsigned long **maps = DIRTY_I(sbi)->victim_segmap;
+        unsigned int nbytes = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+        maps[FG_GC] = kzalloc(nbytes, GFP_KERNEL);
+        maps[BG_GC] = kzalloc(nbytes, GFP_KERNEL);
+        if (maps[FG_GC] && maps[BG_GC])
+                return 0;
+        return -ENOMEM;
+}
+/*
+ * Allocate the dirty segment list information: one zeroed bitmap per dirty
+ * type, each with one bit per main-area segment, then populate the DIRTY
+ * map via init_dirty_segmap() and allocate the victim bitmaps.
+ * Returns 0 or -ENOMEM. Nothing is freed here on failure; the structure is
+ * already attached to the segment manager, where destroy_dirty_segmap()
+ * can release it.
+ */
+static int build_dirty_segmap(struct f2fs_sb_info *sbi)
+{
+        struct dirty_seglist_info *dirty_i;
+        unsigned int bitmap_size, i;
+        /* allocate memory for dirty segments list information */
+        dirty_i = kzalloc(sizeof(struct dirty_seglist_info), GFP_KERNEL);
+        if (!dirty_i)
+                return -ENOMEM;
+        SM_I(sbi)->dirty_info = dirty_i;
+        mutex_init(&dirty_i->seglist_lock);
+        bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+        for (i = 0; i < NR_DIRTY_TYPE; i++) {
+                dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL);
+                dirty_i->nr_dirty[i] = 0;
+                if (!dirty_i->dirty_segmap[i])
+                        return -ENOMEM;
+        }
+        init_dirty_segmap(sbi);
+        return init_victim_segmap(sbi);
+}
+/*
+ * Update min, max modified time for cost-benefit GC algorithm
+ *
+ * min_mtime becomes the smallest per-section mtime, where a section's mtime
+ * is the average of the mtimes of its segs_per_sec segments; max_mtime is
+ * set to the current value of get_mtime(). Runs under sentry_lock.
+ */
+static void init_min_max_mtime(struct f2fs_sb_info *sbi)
+{
+        struct sit_info *sit_i = SIT_I(sbi);
+        unsigned int segno;
+        mutex_lock(&sit_i->sentry_lock);
+        /* start from the largest value so any section average lowers it */
+        sit_i->min_mtime = LLONG_MAX;
+        /*
+         * NOTE(review): the inner loop reads segno + i without a bound
+         * check, so this assumes TOTAL_SEGS is a multiple of segs_per_sec -
+         * confirm.
+         */
+        for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
+                unsigned int i;
+                unsigned long long mtime = 0;
+                for (i = 0; i < sbi->segs_per_sec; i++)
+                        mtime += get_seg_entry(sbi, segno + i)->mtime;
+                mtime = div_u64(mtime, sbi->segs_per_sec);
+                if (sit_i->min_mtime > mtime)
+                        sit_i->min_mtime = mtime;
+        }
+        sit_i->max_mtime = get_mtime(sbi);
+        mutex_unlock(&sit_i->sentry_lock);
+}
+/*
+ * Build the in-memory segment manager for a mounted filesystem.
+ * Allocates the f2fs_sm_info, copies the layout values from the raw super
+ * block and the checkpoint, then builds in order: SIT information, the free
+ * segment map, the current segments, the SIT entries, the dirty segment
+ * maps, and finally the min/max mtime used by cost-benefit GC.
+ * Returns 0 on success or the first error from a build step. Nothing is
+ * released here on failure; whatever was built stays attached to sbi.
+ * NOTE(review): presumably the caller runs destroy_segment_manager() on
+ * error - confirm.
+ */
+int build_segment_manager(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+        struct f2fs_sm_info *sm_info;
+        int err;
+        sm_info = kzalloc(sizeof(struct f2fs_sm_info), GFP_KERNEL);
+        if (!sm_info)
+                return -ENOMEM;
+        /* init sm info */
+        sbi->sm_info = sm_info;
+        INIT_LIST_HEAD(&sm_info->wblist_head);
+        spin_lock_init(&sm_info->wblist_lock);
+        sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr);
+        sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr);
+        sm_info->segment_count = le32_to_cpu(raw_super->segment_count);
+        sm_info->reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count);
+        sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
+        sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main);
+        sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
+        /* the steps below read the fields set above, e.g. main_segments */
+        err = build_sit_info(sbi);
+        if (err)
+                return err;
+        err = build_free_segmap(sbi);
+        if (err)
+                return err;
+        err = build_curseg(sbi);
+        if (err)
+                return err;
+        /* reinit free segmap based on SIT */
+        build_sit_entries(sbi);
+        init_free_segmap(sbi);
+        err = build_dirty_segmap(sbi);
+        if (err)
+                return err;
+        init_min_max_mtime(sbi);
+        return 0;
+}
+/*
+ * Free the dirty bitmap of one dirty type and reset its counter, under
+ * seglist_lock. The freed pointer is left in dirty_segmap[]; the only caller
+ * visible here, destroy_dirty_segmap(), frees the whole structure afterwards.
+ */
+static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
+                enum dirty_type dirty_type)
+{
+        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+        mutex_lock(&dirty_i->seglist_lock);
+        kfree(dirty_i->dirty_segmap[dirty_type]);
+        dirty_i->nr_dirty[dirty_type] = 0;
+        mutex_unlock(&dirty_i->seglist_lock);
+}
+/* Clear every bit of the foreground-GC victim bitmap. */
+void reset_victim_segmap(struct f2fs_sb_info *sbi)
+{
+        unsigned long *fg_map = DIRTY_I(sbi)->victim_segmap[FG_GC];
+        memset(fg_map, 0, f2fs_bitmap_size(TOTAL_SEGS(sbi)));
+}
+/* Release the foreground and background GC victim bitmaps. */
+static void destroy_victim_segmap(struct f2fs_sb_info *sbi)
+{
+        unsigned long **maps = DIRTY_I(sbi)->victim_segmap;
+        kfree(maps[FG_GC]);
+        kfree(maps[BG_GC]);
+}
+/*
+ * Release the dirty segment list information: every per-type dirty bitmap,
+ * both victim bitmaps, and the structure itself. Does nothing when no dirty
+ * information is attached.
+ */
+static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
+{
+        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+        int i;
+        if (!dirty_i)
+                return;
+        /* discard pre-free/dirty segments list */
+        for (i = 0; i < NR_DIRTY_TYPE; i++)
+                discard_dirty_segmap(sbi, i);
+        destroy_victim_segmap(sbi);
+        /* detach before freeing so no stale pointer remains in sm_info */
+        SM_I(sbi)->dirty_info = NULL;
+        kfree(dirty_i);
+}
+/*
+ * Release the current-segment array together with the summary block buffer
+ * of each entry. Does nothing when no array is attached.
+ */
+static void destroy_curseg(struct f2fs_sb_info *sbi)
+{
+        struct curseg_info *segs = SM_I(sbi)->curseg_array;
+        struct curseg_info *cur;
+        if (segs == NULL)
+                return;
+        SM_I(sbi)->curseg_array = NULL;
+        for (cur = segs; cur < segs + NR_CURSEG_TYPE; cur++)
+                kfree(cur->sum_blk);
+        kfree(segs);
+}
+/*
+ * Release the free segment map information and both of its bitmaps.
+ * Does nothing when no free segment information is attached.
+ */
+static void destroy_free_segmap(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_sm_info *sm = SM_I(sbi);
+        struct free_segmap_info *info = sm->free_info;
+        if (info == NULL)
+                return;
+        sm->free_info = NULL;
+        kfree(info->free_segmap);
+        kfree(info->free_secmap);
+        kfree(info);
+}
+/*
+ * Release the SIT information: the two validity bitmaps of every segment
+ * entry, the segment and section entry arrays, the dirty-entry bitmap, the
+ * SIT bitmap copy, and the structure itself. Does nothing when no SIT
+ * information is attached.
+ */
+static void destroy_sit_info(struct f2fs_sb_info *sbi)
+{
+        struct sit_info *sit_i = SIT_I(sbi);
+        unsigned int start;
+        if (!sit_i)
+                return;
+        /* the entry array may be missing if setup failed early */
+        if (sit_i->sentries) {
+                for (start = 0; start < TOTAL_SEGS(sbi); start++) {
+                        kfree(sit_i->sentries[start].cur_valid_map);
+                        kfree(sit_i->sentries[start].ckpt_valid_map);
+                }
+        }
+        vfree(sit_i->sentries);
+        vfree(sit_i->sec_entries);
+        kfree(sit_i->dirty_sentries_bitmap);
+        SM_I(sbi)->sit_info = NULL;
+        kfree(sit_i->sit_bitmap);
+        kfree(sit_i);
+}
+/*
+ * Tear down the segment manager: release the dirty segment maps, the current
+ * segments, the free segment map and the SIT information, then the
+ * f2fs_sm_info itself. Safe to call when no segment manager is attached,
+ * e.g. after build_segment_manager() failed to allocate it.
+ */
+void destroy_segment_manager(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_sm_info *sm_info = SM_I(sbi);
+        /* the helpers below dereference the segment manager, so bail out */
+        if (!sm_info)
+                return;
+        destroy_dirty_segmap(sbi);
+        destroy_curseg(sbi);
+        destroy_free_segmap(sbi);
+        destroy_sit_info(sbi);
+        sbi->sm_info = NULL;
+        kfree(sm_info);
+}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
new file mode 100644
index 000000000000..0948405af6f5
--- /dev/null
+++ b/fs/f2fs/segment.h
@@ -0,0 +1,618 @@
+/*
+ * fs/f2fs/segment.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+/* constant macro */
+/* segment number meaning "no segment": all bits set */
+#define NULL_SEGNO                      ((unsigned int)(~0))
+/* V: Logical segment # in volume, R: Relative segment # in main area */
+/* arguments are parenthesized so expression arguments expand safely */
+#define GET_L2R_SEGNO(free_i, segno)    ((segno) - (free_i)->start_segno)
+#define GET_R2L_SEGNO(free_i, segno)    ((segno) + (free_i)->start_segno)
+/* true if current-segment type t is one of the three data logs */
+#define IS_DATASEG(t)                                                   \
+        (((t) == CURSEG_HOT_DATA) || ((t) == CURSEG_COLD_DATA) ||       \
+        ((t) == CURSEG_WARM_DATA))
+/* true if current-segment type t is one of the three node logs */
+#define IS_NODESEG(t)                                                   \
+        (((t) == CURSEG_HOT_NODE) || ((t) == CURSEG_COLD_NODE) ||       \
+        ((t) == CURSEG_WARM_NODE))
+/* true if segno is the segment of any of the six current logs */
+#define IS_CURSEG(sbi, segno)                                           \
+        (((segno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) ||  \
+         ((segno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \
+         ((segno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \
+         ((segno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) ||  \
+         ((segno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \
+         ((segno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
+/*
+ * True if section number secno is the section holding the segment of any
+ * of the six current logs. The definition must not end in a line
+ * continuation, or the next preprocessor line becomes part of this macro.
+ */
+#define IS_CURSEC(sbi, secno)                                           \
+        (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno /            \
+          (sbi)->segs_per_sec) ||       \
+         ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno /           \
+          (sbi)->segs_per_sec) ||       \
+         ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno /           \
+          (sbi)->segs_per_sec) ||       \
+         ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno /            \
+          (sbi)->segs_per_sec) ||       \
+         ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno /           \
+          (sbi)->segs_per_sec) ||       \
+         ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno /           \
+          (sbi)->segs_per_sec))
+/*
+ * Address and index helpers. Macro arguments are parenthesized so that
+ * expression arguments expand with the intended precedence.
+ */
+/* first block address of main-area segment segno */
+#define START_BLOCK(sbi, segno)                                         \
+        (SM_I(sbi)->seg0_blkaddr +                                      \
+         (GET_R2L_SEGNO(FREE_I(sbi), (segno)) << (sbi)->log_blocks_per_seg))
+/* block address of the next block to write in curseg */
+#define NEXT_FREE_BLKADDR(sbi, curseg)                                  \
+        (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff)
+#define MAIN_BASE_BLOCK(sbi)    (SM_I(sbi)->main_blkaddr)
+/* block offset of blk_addr from the start of segment 0 */
+#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr)                             \
+        ((blk_addr) - SM_I(sbi)->seg0_blkaddr)
+/* logical segment number (counted from segment 0) containing blk_addr */
+#define GET_SEGNO_FROM_SEG0(sbi, blk_addr)                              \
+        (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg)
+/* main-area segment number of blk_addr, NULL_SEGNO for NULL/NEW_ADDR */
+#define GET_SEGNO(sbi, blk_addr)                                        \
+        ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ?      \
+        NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi),                 \
+                GET_SEGNO_FROM_SEG0(sbi, blk_addr)))
+#define GET_SECNO(sbi, segno)                                   \
+        ((segno) / (sbi)->segs_per_sec)
+#define GET_ZONENO_FROM_SEGNO(sbi, segno)                               \
+        (((segno) / (sbi)->segs_per_sec) / (sbi)->secs_per_zone)
+/* block address of the summary block of segment segno */
+#define GET_SUM_BLOCK(sbi, segno)                               \
+        (((sbi)->sm_info->ssa_blkaddr) + (segno))
+#define GET_SUM_TYPE(footer) ((footer)->entry_type)
+#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type))
+/* index of segno's entry inside its SIT block */
+#define SIT_ENTRY_OFFSET(sit_i, segno)                                  \
+        ((segno) % (sit_i)->sents_per_block)
+/* index of the SIT block holding segno's entry */
+#define SIT_BLOCK_OFFSET(sit_i, segno)                                  \
+        ((segno) / SIT_ENTRY_PER_BLOCK)
+/* first segment number covered by the SIT block holding segno */
+#define START_SEGNO(sit_i, segno)               \
+        (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK)
+/* bytes needed for a bitmap of nr bits, rounded up to whole longs */
+#define f2fs_bitmap_size(nr)                    \
+        (BITS_TO_LONGS(nr) * sizeof(unsigned long))
+/* number of segments in the main area */
+#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments)
+#define SECTOR_FROM_BLOCK(sbi, blk_addr)                                \
+        ((blk_addr) << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE))
+/* during checkpoint, bio_private is used to synchronize the last bio */
+struct bio_private {
+        struct f2fs_sb_info *sbi;       /* filesystem the bio belongs to */
+        bool is_sync;                   /* NOTE(review): presumably set when a waiter exists - confirm */
+        void *wait;                     /* opaque wait object; type not visible here */
+};
+/*
+ * indicate a block allocation direction: RIGHT and LEFT.
+ * RIGHT means allocating new sections towards the end of volume.
+ * LEFT means the opposite direction.
+ */
+enum {
+        ALLOC_RIGHT = 0,        /* towards the end of the volume */
+        ALLOC_LEFT              /* towards the start of the volume */
+};
+/*
+ * In the victim_sel_policy->alloc_mode, there are two block allocation modes.
+ * LFS writes data sequentially with cleaning operations.
+ * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations.
+ */
+enum {
+        LFS = 0,        /* sequential, log-structured allocation */
+        SSR             /* slack space recycling */
+};
+/*
+ * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes.
+ * GC_CB is based on cost-benefit algorithm.
+ * GC_GREEDY is based on greedy algorithm.
+ */
+enum {
+        GC_CB = 0,      /* cost-benefit victim selection */
+        GC_GREEDY       /* greedy victim selection */
+};
+/*
+ * BG_GC means the background cleaning job.
+ * FG_GC means the on-demand cleaning job.
+ * The values also index dirty_seglist_info->victim_segmap[].
+ */
+enum {
+        BG_GC = 0,      /* background cleaning */
+        FG_GC           /* on-demand (foreground) cleaning */
+};
+/*
+ * for a function parameter to select a victim segment:
+ * bundles the selection policy with the running state of a victim search.
+ */
+struct victim_sel_policy {
+        int alloc_mode;                 /* LFS or SSR */
+        int gc_mode;                    /* GC_CB or GC_GREEDY */
+        unsigned long *dirty_segmap;    /* dirty segment bitmap */
+        unsigned int offset;            /* last scanned bitmap offset */
+        unsigned int ofs_unit;          /* bitmap search unit */
+        unsigned int min_cost;          /* minimum cost */
+        unsigned int min_segno;         /* segment # having min. cost */
+};
+/*
+ * In-memory copy of one segment's SIT entry. Both bitmaps are separately
+ * allocated buffers of SIT_VBLOCK_MAP_SIZE bytes.
+ */
+struct seg_entry {
+        unsigned short valid_blocks;    /* # of valid blocks */
+        unsigned char *cur_valid_map;   /* validity bitmap of blocks */
+        /*
+         * # of valid blocks and the validity bitmap stored in the last
+         * checkpoint pack. This information is used by the SSR mode.
+         */
+        unsigned short ckpt_valid_blocks;
+        unsigned char *ckpt_valid_map;
+        unsigned char type;             /* segment type like CURSEG_XXX_TYPE */
+        unsigned long long mtime;       /* modification time of the segment */
+};
+/* Per-section counter; only allocated when a section spans several segments. */
+struct sec_entry {
+        unsigned int valid_blocks;      /* # of valid blocks in a section */
+};
+/* Operations table for segment allocation, referenced by sit_info->s_ops. */
+struct segment_allocation {
+        /* NOTE(review): int is presumably the CURSEG type, bool a force flag - confirm */
+        void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
+};
+/*
+ * In-memory state of the segment information table (SIT): location of the
+ * SIT area, the cached segment/section entries, and the time bookkeeping
+ * used by the cost-benefit cleaning algorithm.
+ */
+struct sit_info {
+        const struct segment_allocation *s_ops;
+        block_t sit_base_addr;          /* start block address of SIT area */
+        block_t sit_blocks;             /* # of blocks used by SIT area */
+        block_t written_valid_blocks;   /* # of valid blocks in main area */
+        char *sit_bitmap;               /* SIT bitmap pointer */
+        unsigned int bitmap_size;       /* SIT bitmap size */
+        unsigned long *dirty_sentries_bitmap;   /* bitmap for dirty sentries */
+        unsigned int dirty_sentries;            /* # of dirty sentries */
+        unsigned int sents_per_block;           /* # of SIT entries per block */
+        struct mutex sentry_lock;               /* to protect SIT cache */
+        struct seg_entry *sentries;             /* SIT segment-level cache */
+        struct sec_entry *sec_entries;          /* SIT section-level cache */
+        /* for cost-benefit algorithm in cleaning procedure */
+        unsigned long long elapsed_time;        /* elapsed time after mount */
+        unsigned long long mounted_time;        /* mount time */
+        unsigned long long min_mtime;           /* min. modification time */
+        unsigned long long max_mtime;           /* max. modification time */
+};
+/*
+ * Free space bookkeeping. In both bitmaps a set bit marks an in-use
+ * segment/section and a clear bit a free one.
+ */
+struct free_segmap_info {
+        unsigned int start_segno;       /* start segment number logically */
+        unsigned int free_segments;     /* # of free segments */
+        unsigned int free_sections;     /* # of free sections */
+        rwlock_t segmap_lock;           /* free segmap lock */
+        unsigned long *free_segmap;     /* free segment bitmap */
+        unsigned long *free_secmap;     /* free section bitmap */
+};
+/*
+ * Notice: The order of dirty type is same with CURSEG_XXX in f2fs.h
+ * The values index dirty_segmap[] and nr_dirty[] in dirty_seglist_info.
+ */
+enum dirty_type {
+        DIRTY_HOT_DATA,         /* dirty segments assigned as hot data logs */
+        DIRTY_WARM_DATA,        /* dirty segments assigned as warm data logs */
+        DIRTY_COLD_DATA,        /* dirty segments assigned as cold data logs */
+        DIRTY_HOT_NODE,         /* dirty segments assigned as hot node logs */
+        DIRTY_WARM_NODE,        /* dirty segments assigned as warm node logs */
+        DIRTY_COLD_NODE,        /* dirty segments assigned as cold node logs */
+        DIRTY,                  /* to count # of dirty segments */
+        PRE,                    /* to count # of entirely obsolete segments */
+        NR_DIRTY_TYPE           /* number of dirty types; not a type itself */
+};
+/* Dirty segment bookkeeping: one bitmap and one counter per dirty type. */
+struct dirty_seglist_info {
+        const struct victim_selection *v_ops;   /* victim selection operation */
+        unsigned long *dirty_segmap[NR_DIRTY_TYPE];
+        struct mutex seglist_lock;              /* lock for segment bitmaps */
+        int nr_dirty[NR_DIRTY_TYPE];            /* # of dirty segments */
+        unsigned long *victim_segmap[2];        /* BG_GC, FG_GC */
+};
+/* victim selection function for cleaning and SSR */
+struct victim_selection {
+        /*
+         * As called by get_ssr_segment(): (sbi, where to store the chosen
+         * segment number, gc type, current-segment type, allocation mode).
+         */
+        int (*get_victim)(struct f2fs_sb_info *, unsigned int *,
+                                                        int, int, char);
+};
+/* for active log information: one instance per current-segment type */
+struct curseg_info {
+        struct mutex curseg_mutex;              /* lock for consistency */
+        struct f2fs_summary_block *sum_blk;     /* cached summary block */
+        unsigned char alloc_type;               /* current allocation type */
+        unsigned int segno;                     /* current segment number */
+        unsigned short next_blkoff;             /* next block offset to write */
+        unsigned int zone;                      /* current zone number */
+        unsigned int next_segno;                /* preallocated segment */
+};
+/*
+ * inline functions
+ */
+/* Return the current-segment descriptor of the given log type. */
+static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
+{
+        struct f2fs_sm_info *sm = SM_I(sbi);
+        return (struct curseg_info *)(sm->curseg_array + type);
+}
+/* Return the cached SIT entry of segment segno (no range check). */
+static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi,
+                                                unsigned int segno)
+{
+        return SIT_I(sbi)->sentries + segno;
+}
+/* Return the section entry of the section that contains segment segno. */
+static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi,
+                                                unsigned int segno)
+{
+        return SIT_I(sbi)->sec_entries + GET_SECNO(sbi, segno);
+}
+/*
+ * Return the number of valid blocks for segno. f2fs keeps segment-level and
+ * section-level counters separately so a section total is available without
+ * summing its segments: when section is greater than 1 the counter of the
+ * enclosing section is returned, otherwise the counter of the segment.
+ */
+static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi,
+                                unsigned int segno, int section)
+{
+        if (section <= 1)
+                return get_seg_entry(sbi, segno)->valid_blocks;
+        return get_sec_entry(sbi, segno)->valid_blocks;
+}
+/*
+ * Load an in-memory segment entry from a raw SIT entry. The current and the
+ * checkpointed state both start out as copies of the raw entry.
+ */
+static inline void seg_info_from_raw_sit(struct seg_entry *se,
+                                        struct f2fs_sit_entry *rs)
+{
+        unsigned short vblocks = GET_SIT_VBLOCKS(rs);
+        se->valid_blocks = vblocks;
+        se->ckpt_valid_blocks = vblocks;
+        memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
+        memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
+        se->mtime = le64_to_cpu(rs->mtime);
+        se->type = GET_SIT_TYPE(rs);
+}
+/*
+ * Store an in-memory segment entry into a raw SIT entry.
+ * Besides filling rs, this also updates se: its checkpointed bitmap and
+ * block count are brought in line with the current ones just written out.
+ */
+static inline void seg_info_to_raw_sit(struct seg_entry *se,
+                                        struct f2fs_sit_entry *rs)
+{
+        /* segment type goes in the bits above the valid block count */
+        unsigned short raw_vblocks = (se->type << SIT_VBLOCKS_SHIFT) |
+                                        se->valid_blocks;
+        rs->vblocks = cpu_to_le16(raw_vblocks);
+        memcpy(rs->valid_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE);
+        /* rs->valid_map now equals the current map; make it the ckpt map */
+        memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
+        se->ckpt_valid_blocks = se->valid_blocks;
+        rs->mtime = cpu_to_le64(se->mtime);
+}
+/*
+ * Return the number of the first in-use segment (set bit in free_segmap) at
+ * or after segno, searching below max. The result is whatever find_next_bit()
+ * reports when no such bit exists; callers in this file treat a value >= max
+ * as "none". The search runs under the read side of segmap_lock.
+ */
+static inline unsigned int find_next_inuse(struct free_segmap_info *free_i,
+                unsigned int max, unsigned int segno)
+{
+        unsigned int ret;
+        read_lock(&free_i->segmap_lock);
+        ret = find_next_bit(free_i->free_segmap, max, segno);
+        read_unlock(&free_i->segmap_lock);
+        return ret;
+}
+/*
+ * Mark segment segno free: clear its bit and increment free_segments
+ * unconditionally. If no in-use segment remains inside its section, the
+ * section bit is cleared and free_sections incremented as well.
+ * Takes segmap_lock for writing.
+ */
+static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+        struct free_segmap_info *free_i = FREE_I(sbi);
+        unsigned int secno = segno / sbi->segs_per_sec;
+        unsigned int start_segno = secno * sbi->segs_per_sec;
+        unsigned int next;
+        write_lock(&free_i->segmap_lock);
+        clear_bit(segno, free_i->free_segmap);
+        free_i->free_segments++;
+        /* first in-use segment at or after the start of this section */
+        next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), start_segno);
+        /* it lies beyond the section: the whole section is free */
+        if (next >= start_segno + sbi->segs_per_sec) {
+                clear_bit(secno, free_i->free_secmap);
+                free_i->free_sections++;
+        }
+        write_unlock(&free_i->segmap_lock);
+}
+/*
+ * Mark segment segno in use: set its bit and decrement free_segments
+ * unconditionally; free_sections is decremented only if the section bit was
+ * not already set.
+ * NOTE(review): unlike the other __set_* helpers this one does not take
+ * segmap_lock itself - presumably the caller holds it; confirm.
+ */
+static inline void __set_inuse(struct f2fs_sb_info *sbi,
+                unsigned int segno)
+{
+        struct free_segmap_info *free_i = FREE_I(sbi);
+        unsigned int secno = segno / sbi->segs_per_sec;
+        set_bit(segno, free_i->free_segmap);
+        free_i->free_segments--;
+        if (!test_and_set_bit(secno, free_i->free_secmap))
+                free_i->free_sections--;
+}
+/*
+ * Mark segment segno free only if it is currently in use, so the counters
+ * are adjusted at most once per transition. When that leaves no in-use
+ * segment inside its section, the section is marked free too, again only
+ * if it was in use. Takes segmap_lock for writing.
+ */
+static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
+                unsigned int segno)
+{
+        struct free_segmap_info *free_i = FREE_I(sbi);
+        unsigned int secno = segno / sbi->segs_per_sec;
+        unsigned int start_segno = secno * sbi->segs_per_sec;
+        unsigned int next;
+        write_lock(&free_i->segmap_lock);
+        if (test_and_clear_bit(segno, free_i->free_segmap)) {
+                free_i->free_segments++;
+                /* first in-use segment at or after the section start */
+                next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi),
+                                                                start_segno);
+                if (next >= start_segno + sbi->segs_per_sec) {
+                        if (test_and_clear_bit(secno, free_i->free_secmap))
+                                free_i->free_sections++;
+                }
+        }
+        write_unlock(&free_i->segmap_lock);
+}
+/*
+ * Mark segment segno in use only if it is currently free, so the counters
+ * are adjusted at most once per transition; its section is marked in use
+ * the same way. Takes segmap_lock for writing.
+ */
+static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
+                unsigned int segno)
+{
+        struct free_segmap_info *free_i = FREE_I(sbi);
+        unsigned int secno = segno / sbi->segs_per_sec;
+        write_lock(&free_i->segmap_lock);
+        if (!test_and_set_bit(segno, free_i->free_segmap)) {
+                free_i->free_segments--;
+                if (!test_and_set_bit(secno, free_i->free_secmap))
+                        free_i->free_sections--;
+        }
+        write_unlock(&free_i->segmap_lock);
+}
+/*
+ * Copy the in-memory SIT bitmap into dst_addr, which must have room for
+ * sit_info->bitmap_size bytes.
+ */
+static inline void get_sit_bitmap(struct f2fs_sb_info *sbi,
+                void *dst_addr)
+{
+        memcpy(dst_addr, SIT_I(sbi)->sit_bitmap, SIT_I(sbi)->bitmap_size);
+}
+/* Return written_valid_blocks, read under sentry_lock. */
+static inline block_t written_block_count(struct f2fs_sb_info *sbi)
+{
+        struct sit_info *sit_i = SIT_I(sbi);
+        block_t vblocks;
+        mutex_lock(&sit_i->sentry_lock);
+        vblocks = sit_i->written_valid_blocks;
+        mutex_unlock(&sit_i->sentry_lock);
+        return vblocks;
+}
+/* Return the free segment counter, read under the read side of segmap_lock. */
+static inline unsigned int free_segments(struct f2fs_sb_info *sbi)
+{
+        struct free_segmap_info *free_i = FREE_I(sbi);
+        unsigned int free_segs;
+        read_lock(&free_i->segmap_lock);
+        free_segs = free_i->free_segments;
+        read_unlock(&free_i->segmap_lock);
+        return free_segs;
+}
+/* Return the reserved segment count recorded in the segment manager. */
+static inline int reserved_segments(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_sm_info *sm = SM_I(sbi);
+        return sm->reserved_segments;
+}
+/* Return the free section counter, read under the read side of segmap_lock. */
+static inline unsigned int free_sections(struct f2fs_sb_info *sbi)
+{
+        struct free_segmap_info *free_i = FREE_I(sbi);
+        unsigned int free_secs;
+        read_lock(&free_i->segmap_lock);
+        free_secs = free_i->free_sections;
+        read_unlock(&free_i->segmap_lock);
+        return free_secs;
+}
+/* Return the PRE counter, i.e. the number of entirely obsolete segments. */
+static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
+{
+        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+        return dirty_i->nr_dirty[PRE];
+}
+/*
+ * Return the sum of the dirty counters of the six log types
+ * (DIRTY_HOT_DATA through DIRTY_COLD_NODE). Read without taking a lock.
+ */
+static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
+{
+        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+        int total = 0;
+        int type;
+        for (type = DIRTY_HOT_DATA; type <= DIRTY_COLD_NODE; type++)
+                total += dirty_i->nr_dirty[type];
+        return total;
+}
+/* Return the overprovision segment count recorded in the segment manager. */
+static inline int overprovision_segments(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_sm_info *sm = SM_I(sbi);
+        return sm->ovp_segments;
+}
+/* Overprovision segments expressed in whole sections (rounded down). */
+static inline int overprovision_sections(struct f2fs_sb_info *sbi)
+{
+        unsigned int ovp_segs = overprovision_segments(sbi);
+        return ovp_segs / sbi->segs_per_sec;
+}
+/* Reserved segments expressed in whole sections (rounded down). */
+static inline int reserved_sections(struct f2fs_sb_info *sbi)
+{
+        unsigned int rsvd_segs = reserved_segments(sbi);
+        return rsvd_segs / sbi->segs_per_sec;
+}
+/* True when fewer sections are free than are set aside as overprovision. */
+static inline bool need_SSR(struct f2fs_sb_info *sbi)
+{
+        if (free_sections(sbi) < overprovision_sections(sbi))
+                return true;
+        return false;
+}
+/*
+ * Ask the victim selection operation for a segment to reuse in SSR mode for
+ * the given log type; the selection is stored in that log's next_segno.
+ * Returns whatever get_victim() returns.
+ */
+static inline int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
+{
+        const struct victim_selection *ops = DIRTY_I(sbi)->v_ops;
+        unsigned int *result = &CURSEG_I(sbi, type)->next_segno;
+        return ops->get_victim(sbi, result, BG_GC, type, SSR);
+}
+/* True when the free sections do not exceed the reserved sections. */
+static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi)
+{
+        if (free_sections(sbi) <= reserved_sections(sbi))
+                return true;
+        return false;
+}
+/*
+ * Percentage of the user blocks that are currently valid:
+ * valid_user_blocks * 100 / user_block_count, computed in long arithmetic.
+ * NOTE(review): where long is 32 bits the multiplication by 100 looks like
+ * it can overflow for large block counts, and a zero user_block_count would
+ * divide by zero - confirm both against the callers.
+ */
+static inline int utilization(struct f2fs_sb_info *sbi)
+{
+        return (long int)valid_user_blocks(sbi) * 100 /
+                        (long int)sbi->user_block_count;
+}
+/*
+ * Out-of-place updates are not always the better policy. Once filesystem
+ * utilization exceeds MIN_IPU_UTIL percent while SSR is needed, f2fs writes
+ * data back to its original location like traditional filesystems do.
+ * The threshold is currently 100, and utilization must be strictly greater,
+ * so in-place update is effectively disabled.
+ */
+#define MIN_IPU_UTIL            100
+static inline bool need_inplace_update(struct inode *inode)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        /* directories are never updated in place */
+        if (S_ISDIR(inode->i_mode))
+                return false;
+        return need_SSR(sbi) && utilization(sbi) > MIN_IPU_UTIL;
+}
+/* Return the segment number currently assigned to the given log type. */
+static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi,
+                int type)
+{
+        return CURSEG_I(sbi, type)->segno;
+}
+/* Return the allocation type currently used by the given log type. */
+static inline unsigned char curseg_alloc_type(struct f2fs_sb_info *sbi,
+                int type)
+{
+        return CURSEG_I(sbi, type)->alloc_type;
+}
+/* Return the next block offset to be written in the given log type. */
+static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
+{
+        return CURSEG_I(sbi, type)->next_blkoff;
+}
+/*
+ * BUG if segno is greater than segment_count - 1.
+ * NOTE(review): the bound uses segment_count rather than the main-area
+ * count behind TOTAL_SEGS - confirm that is the intended limit.
+ */
+static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+        unsigned int end_segno = SM_I(sbi)->segment_count - 1;
+        BUG_ON(segno > end_segno);
+}
+/*
+ * This function is used for only debugging.
+ * NOTE: In future, we have to remove this function.
+ *
+ * BUG unless blk_addr lies within the segment_count segments that start at
+ * seg0_blkaddr.
+ */
+static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
+{
+        struct f2fs_sm_info *sm_info = SM_I(sbi);
+        block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg;
+        block_t start_addr = sm_info->seg0_blkaddr;
+        /* last valid block address, inclusive */
+        block_t end_addr = start_addr + total_blks - 1;
+        BUG_ON(blk_addr < start_addr);
+        BUG_ON(blk_addr > end_addr);
+}
+/*
+ * Summary block is always treated as invalid block
+ *
+ * Sanity-check a raw SIT entry. BUGs when the stored valid block count
+ * exceeds blocks_per_seg, when segno is beyond segment_count - 1, or when
+ * the stored count differs from the number of bits set in the validity map.
+ */
+static inline void check_block_count(struct f2fs_sb_info *sbi,
+                int segno, struct f2fs_sit_entry *raw_sit)
+{
+        struct f2fs_sm_info *sm_info = SM_I(sbi);
+        unsigned int end_segno = sm_info->segment_count - 1;
+        int valid_blocks = 0;
+        int i;
+        /* check segment usage */
+        BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg);
+        /* check boundary of a given segment number */
+        BUG_ON(segno > end_segno);
+        /* check bitmap with valid block count */
+        for (i = 0; i < sbi->blocks_per_seg; i++)
+                if (f2fs_test_bit(i, raw_sit->valid_map))
+                        valid_blocks++;
+        BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
+}
+/*
+ * Return the block address of the SIT block that currently holds the entry
+ * of segment start. The address is sit_base_addr plus the SIT block offset;
+ * when the block's bit in sit_bitmap is set, the block sit_blocks further
+ * on is used instead. BUGs (via check_seg_range) on an out-of-range
+ * segment number.
+ */
+static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
+                                                unsigned int start)
+{
+        struct sit_info *sit_i = SIT_I(sbi);
+        unsigned int offset = SIT_BLOCK_OFFSET(sit_i, start);
+        block_t blk_addr = sit_i->sit_base_addr + offset;
+        check_seg_range(sbi, start);
+        /* calculate sit block address */
+        if (f2fs_test_bit(offset, sit_i->sit_bitmap))
+                blk_addr += sit_i->sit_blocks;
+        return blk_addr;
+}
+/*
+ * Given the address of a SIT block, return the address of its counterpart
+ * in the other half of the SIT area: an offset below sit_blocks is moved up
+ * by sit_blocks, any other offset is moved down by sit_blocks.
+ */
+static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi,
+                                                pgoff_t block_addr)
+{
+        struct sit_info *sit_i = SIT_I(sbi);
+        /* work with the offset relative to the start of the SIT area */
+        block_addr -= sit_i->sit_base_addr;
+        if (block_addr < sit_i->sit_blocks)
+                block_addr += sit_i->sit_blocks;
+        else
+                block_addr -= sit_i->sit_blocks;
+        return block_addr + sit_i->sit_base_addr;
+}
+/*
+ * Flip the sit_bitmap bit of the SIT block covering segment start, which
+ * switches the block that current_sit_addr() selects for it.
+ */
+static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
+{
+        unsigned int block_off = SIT_BLOCK_OFFSET(sit_i, start);
+        if (f2fs_test_bit(block_off, sit_i->sit_bitmap))
+                f2fs_clear_bit(block_off, sit_i->sit_bitmap);
+        else
+                f2fs_set_bit(block_off, sit_i->sit_bitmap);
+}
+/*
+ * Return the filesystem's running time: the elapsed time loaded at mount
+ * plus the seconds that have passed since mounted_time was recorded.
+ */
+static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi)
+{
+        struct sit_info *sit_i = SIT_I(sbi);
+        unsigned long long now = CURRENT_TIME_SEC.tv_sec;
+        return sit_i->elapsed_time + now - sit_i->mounted_time;
+}
+/*
+ * Fill a summary entry with the given node id, offset within the node and
+ * version; nid and ofs_in_node are stored little-endian.
+ */
+static inline void set_summary(struct f2fs_summary *sum, nid_t nid,
+                        unsigned int ofs_in_node, unsigned char version)
+{
+        sum->version = version;
+        sum->ofs_in_node = cpu_to_le16(ofs_in_node);
+        sum->nid = cpu_to_le32(nid);
+}
+/*
+ * Return the checkpoint start address advanced by the checkpoint's
+ * cp_pack_start_sum field.
+ */
+static inline block_t start_sum_block(struct f2fs_sb_info *sbi)
+{
+        block_t cp_start = __start_cp_addr(sbi);
+        return cp_start + le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
+}
+/*
+ * Return a summary block address counted back from the end of the
+ * checkpoint pack: checkpoint start + cp_pack_total_block_count
+ * - (base + 1) + type.
+ * NOTE(review): base is presumably the number of summary blocks stored at
+ * the end of the pack and type the index among them - confirm with callers.
+ */
+static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type)
+{
+        return __start_cp_addr(sbi) +
+                le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count)
+                                - (base + 1) + type;
+}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
new file mode 100644
index 000000000000..13867322cf5a
--- /dev/null
+++ b/fs/f2fs/super.c
@@ -0,0 +1,657 @@
+/*
+ * fs/f2fs/super.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/statfs.h>
+#include <linux/proc_fs.h>
+#include <linux/buffer_head.h>
+#include <linux/backing-dev.h>
+#include <linux/kthread.h>
+#include <linux/parser.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/random.h>
+#include <linux/exportfs.h>
+#include <linux/f2fs_fs.h>
+#include "f2fs.h"
+#include "node.h"
+#include "xattr.h"
+/* Slab cache for struct f2fs_inode_info; set up by init_inodecache(). */
+static struct kmem_cache *f2fs_inode_cachep;
+/* Mount option token ids; each one is paired with a pattern in f2fs_tokens. */
+enum {
+        Opt_gc_background_off,
+        Opt_disable_roll_forward,
+        Opt_discard,
+        Opt_noheap,
+        Opt_nouser_xattr,
+        Opt_noacl,
+        Opt_active_logs,
+        Opt_disable_ext_identify,
+        Opt_err,
+};
+/*
+ * Mount option patterns handed to match_token() by parse_options().
+ * "active_logs" is the only option that takes an argument (%u); the entry
+ * with a NULL pattern ends the table.
+ */
+static match_table_t f2fs_tokens = {
+        {Opt_gc_background_off, "background_gc_off"},
+        {Opt_disable_roll_forward, "disable_roll_forward"},
+        {Opt_discard, "discard"},
+        {Opt_noheap, "no_heap"},
+        {Opt_nouser_xattr, "nouser_xattr"},
+        {Opt_noacl, "noacl"},
+        {Opt_active_logs, "active_logs=%u"},
+        {Opt_disable_ext_identify, "disable_ext_identify"},
+        {Opt_err, NULL},
+};
+/* Run inode_init_once() on the VFS inode embedded in the object at @foo. */
+static void init_once(void *foo)
+{
+        struct f2fs_inode_info *info = foo;
+        inode_init_once(&info->vfs_inode);
+}
+/*
+ * Allocate a zeroed f2fs_inode_info from the inode cache, initialize it and
+ * return its embedded VFS inode, or NULL when the allocation fails.
+ */
+static struct inode *f2fs_alloc_inode(struct super_block *sb)
+{
+        struct f2fs_inode_info *info =
+                kmem_cache_alloc(f2fs_inode_cachep, GFP_NOFS | __GFP_ZERO);
+        if (info == NULL)
+                return NULL;
+        init_once(info);
+        /* Initialize f2fs-specific inode info */
+        info->vfs_inode.i_version = 1;
+        info->i_current_depth = 1;
+        info->i_advise = 0;
+        atomic_set(&info->dirty_dents, 0);
+        rwlock_init(&info->ext.ext_lock);
+        set_inode_flag(info, FI_NEW_INODE);
+        return &info->vfs_inode;
+}
+/*
+ * RCU callback queued by f2fs_destroy_inode(): hand the f2fs_inode_info that
+ * contains @head back to the inode cache.
+ */
+static void f2fs_i_callback(struct rcu_head *head)
+{
+        struct inode *vfs_inode = container_of(head, struct inode, i_rcu);
+        struct f2fs_inode_info *info = F2FS_I(vfs_inode);
+        kmem_cache_free(f2fs_inode_cachep, info);
+}
+/*
+ * ->destroy_inode: the inode is not freed here; f2fs_i_callback() is queued
+ * through call_rcu() and performs the actual kmem_cache_free().
+ */
+static void f2fs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, f2fs_i_callback);
+}
+/*
+ * ->put_super: tear down everything f2fs_fill_super() attached to @sb.
+ * Stats and the GC thread are stopped first, a checkpoint is written, then
+ * the node/meta inodes, the node and segment managers, the checkpoint copy,
+ * the super block buffer and finally the f2fs_sb_info itself are released.
+ */
+static void f2fs_put_super(struct super_block *sb)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(sb);
+        f2fs_destroy_stats(sbi);
+        stop_gc_thread(sbi);
+        /*
+         * NOTE(review): the last argument differs from the one used by
+         * f2fs_sync_fs(); presumably it marks an unmount checkpoint - confirm
+         * against write_checkpoint().
+         */
+        write_checkpoint(sbi, false, true);
+        iput(sbi->node_inode);
+        iput(sbi->meta_inode);
+        /* destroy f2fs internal modules */
+        destroy_node_manager(sbi);
+        destroy_segment_manager(sbi);
+        kfree(sbi->ckpt);
+        sb->s_fs_info = NULL;
+        /* sbi->raw_super points into this buffer, so it goes last */
+        brelse(sbi->raw_super_buf);
+        kfree(sbi);
+}
+/*
+ * ->sync_fs: write a checkpoint when @sync is set and the filesystem is
+ * marked dirty or has dirty node pages.  Always returns 0.
+ */
+int f2fs_sync_fs(struct super_block *sb, int sync)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(sb);
+        if (sbi->s_dirty || get_pages(sbi, F2FS_DIRTY_NODES)) {
+                if (sync)
+                        write_checkpoint(sbi, false, false);
+        }
+        return 0;
+}
+/*
+ * ->statfs: fill @buf with block, node and identification data for the
+ * filesystem that owns @dentry.  Always returns 0.
+ */
+static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+        struct super_block *sb = dentry->d_sb;
+        struct f2fs_sb_info *sbi = F2FS_SB(sb);
+        u64 fsid = huge_encode_dev(sb->s_bdev->bd_dev);
+        block_t nr_total, nr_user, seg0_blkaddr, nr_ovp;
+        nr_total = le64_to_cpu(sbi->raw_super->block_count);
+        seg0_blkaddr = le32_to_cpu(sbi->raw_super->segment0_blkaddr);
+        nr_user = sbi->user_block_count;
+        nr_ovp = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg;
+        /* identification */
+        buf->f_type = F2FS_SUPER_MAGIC;
+        buf->f_bsize = sbi->blocksize;
+        buf->f_namelen = F2FS_MAX_NAME_LEN;
+        /* blocks: everything from segment0 on, minus used and overprovision */
+        buf->f_blocks = nr_total - seg0_blkaddr;
+        buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - nr_ovp;
+        buf->f_bavail = nr_user - valid_user_blocks(sbi);
+        /* nodes */
+        buf->f_files = valid_inode_count(sbi);
+        buf->f_ffree = sbi->total_node_count - valid_node_count(sbi);
+        /* the 64-bit device id is split over the two fsid words */
+        buf->f_fsid.val[0] = (u32)fsid;
+        buf->f_fsid.val[1] = (u32)(fsid >> 32);
+        return 0;
+}
+/*
+ * ->show_options: print the active mount options to @seq, each one prefixed
+ * with a comma.  Option names are spelled exactly like the patterns in
+ * f2fs_tokens so that the output can be fed back to parse_options().
+ * Always returns 0.
+ */
+static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
+        /*
+         * NOTE(review): f2fs_tokens has no pattern for "background_gc_on";
+         * the string is kept as-is for compatibility with existing output.
+         */
+        if (test_opt(sbi, BG_GC))
+                seq_puts(seq, ",background_gc_on");
+        else
+                seq_puts(seq, ",background_gc_off");
+        if (test_opt(sbi, DISABLE_ROLL_FORWARD))
+                seq_puts(seq, ",disable_roll_forward");
+        if (test_opt(sbi, DISCARD))
+                seq_puts(seq, ",discard");
+        /* was ",no_heap_alloc", which parse_options() does not recognize */
+        if (test_opt(sbi, NOHEAP))
+                seq_puts(seq, ",no_heap");
+#ifdef CONFIG_F2FS_FS_XATTR
+        if (test_opt(sbi, XATTR_USER))
+                seq_puts(seq, ",user_xattr");
+        else
+                seq_puts(seq, ",nouser_xattr");
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+        if (test_opt(sbi, POSIX_ACL))
+                seq_puts(seq, ",acl");
+        else
+                seq_puts(seq, ",noacl");
+#endif
+        /* was misspelled ",disable_ext_indentify" */
+        if (test_opt(sbi, DISABLE_EXT_IDENTIFY))
+                seq_puts(seq, ",disable_ext_identify");
+        seq_printf(seq, ",active_logs=%u", sbi->active_logs);
+        return 0;
+}
+/* Super block operations; installed as sb->s_op by f2fs_fill_super(). */
+static struct super_operations f2fs_sops = {
+        .alloc_inode    = f2fs_alloc_inode,
+        .destroy_inode  = f2fs_destroy_inode,
+        .write_inode    = f2fs_write_inode,
+        .show_options   = f2fs_show_options,
+        .evict_inode    = f2fs_evict_inode,
+        .put_super      = f2fs_put_super,
+        .sync_fs        = f2fs_sync_fs,
+        .statfs         = f2fs_statfs,
+};
+/*
+ * Look up the inode for an NFS file handle.
+ * Returns the inode, ERR_PTR(-ESTALE) when @ino is below the root inode
+ * number or a non-zero @generation does not match, or the error reported by
+ * f2fs_iget().
+ */
+static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
+                u64 ino, u32 generation)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(sb);
+        struct inode *found;
+        if (ino < F2FS_ROOT_INO(sbi))
+                return ERR_PTR(-ESTALE);
+        /*
+         * f2fs_iget is not strictly correct for an inode that is currently
+         * unallocated, but its own checks deal with stale inodes, so the
+         * result can be trusted here.
+         */
+        found = f2fs_iget(sb, ino);
+        if (IS_ERR(found))
+                return ERR_CAST(found);
+        /* a zero generation matches anything */
+        if (!generation || found->i_generation == generation)
+                return found;
+        /* wrong incarnation of this inode number */
+        iput(found);
+        return ERR_PTR(-ESTALE);
+}
+/* ->fh_to_dentry: generic_fh_to_dentry() with f2fs_nfs_get_inode() as lookup. */
+static struct dentry *f2fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+                int fh_len, int fh_type)
+{
+        return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+                                    f2fs_nfs_get_inode);
+}
+/* ->fh_to_parent: generic_fh_to_parent() with f2fs_nfs_get_inode() as lookup. */
+static struct dentry *f2fs_fh_to_parent(struct super_block *sb, struct fid *fid,
+                int fh_len, int fh_type)
+{
+        return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+                                    f2fs_nfs_get_inode);
+}
+/* Export operations; installed as sb->s_export_op by f2fs_fill_super(). */
+static const struct export_operations f2fs_export_ops = {
+        .fh_to_dentry = f2fs_fh_to_dentry,
+        .fh_to_parent = f2fs_fh_to_parent,
+        .get_parent = f2fs_get_parent,
+};
+/*
+ * Apply the comma-separated mount option string @options to @sbi.
+ * A NULL string and empty items are accepted.  Returns 0 on success and
+ * -EINVAL for an unrecognized option or an active_logs value other than
+ * 2, 4 or 6.  @options is modified in place by strsep().
+ */
+static int parse_options(struct f2fs_sb_info *sbi, char *options)
+{
+        substring_t args[MAX_OPT_ARGS];
+        char *opt;
+        int nr_logs = 0;
+        if (options == NULL)
+                return 0;
+        for (opt = strsep(&options, ","); opt != NULL;
+                                opt = strsep(&options, ",")) {
+                if (*opt == '\0')
+                        continue;
+                /*
+                 * Clear the first argument slot so that a missing argument
+                 * can be told apart after matching.
+                 */
+                args[0].from = NULL;
+                args[0].to = NULL;
+                switch (match_token(opt, f2fs_tokens, args)) {
+                case Opt_gc_background_off:
+                        clear_opt(sbi, BG_GC);
+                        break;
+                case Opt_disable_roll_forward:
+                        set_opt(sbi, DISABLE_ROLL_FORWARD);
+                        break;
+                case Opt_discard:
+                        set_opt(sbi, DISCARD);
+                        break;
+                case Opt_noheap:
+                        set_opt(sbi, NOHEAP);
+                        break;
+                case Opt_nouser_xattr:
+#ifdef CONFIG_F2FS_FS_XATTR
+                        clear_opt(sbi, XATTR_USER);
+#else
+                        pr_info("nouser_xattr options not supported\n");
+#endif
+                        break;
+                case Opt_noacl:
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+                        clear_opt(sbi, POSIX_ACL);
+#else
+                        pr_info("noacl options not supported\n");
+#endif
+                        break;
+                case Opt_active_logs:
+                        if (args->from && match_int(args, &nr_logs))
+                                return -EINVAL;
+                        /* only 2, 4 or 6 active logs are accepted */
+                        if (nr_logs != 2 && nr_logs != 4 && nr_logs != 6)
+                                return -EINVAL;
+                        sbi->active_logs = nr_logs;
+                        break;
+                case Opt_disable_ext_identify:
+                        set_opt(sbi, DISABLE_EXT_IDENTIFY);
+                        break;
+                default:
+                        pr_err("Unrecognized mount option \"%s\" or missing value\n",
+                                        opt);
+                        return -EINVAL;
+                }
+        }
+        return 0;
+}
+/*
+ * Return the largest file size in bytes for a block size of 1 << @bits:
+ * the number of blocks addressable from one inode, shifted left by @bits.
+ */
+static loff_t max_file_size(unsigned bits)
+{
+        loff_t per_node = ADDRS_PER_BLOCK;
+        /* addresses stored in the inode itself */
+        loff_t nr_blocks = ADDRS_PER_INODE;
+        /* two direct node blocks */
+        nr_blocks += 2 * per_node;
+        /* two indirect node blocks */
+        per_node *= NIDS_PER_BLOCK;
+        nr_blocks += 2 * per_node;
+        /* one double indirect node block */
+        per_node *= NIDS_PER_BLOCK;
+        nr_blocks += per_node;
+        return nr_blocks << bits;
+}
+/*
+ * Validate the on-disk super block.
+ * Returns 0 when the magic number, block size, sector size and
+ * sectors-per-block values are all supported, 1 otherwise.
+ */
+static int sanity_check_raw_super(struct f2fs_super_block *raw_super)
+{
+        unsigned int log_blocksize;
+        unsigned int blocksize;
+        if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic))
+                return 1;
+        /*
+         * log_blocksize comes straight from disk: reject values that would
+         * make the shift below undefined before using it as a shift count.
+         */
+        log_blocksize = le32_to_cpu(raw_super->log_blocksize);
+        if (log_blocksize >= 8 * sizeof(blocksize))
+                return 1;
+        /* Currently, support only 4KB block size */
+        blocksize = 1U << log_blocksize;
+        if (blocksize != PAGE_CACHE_SIZE)
+                return 1;
+        if (le32_to_cpu(raw_super->log_sectorsize) !=
+                                        F2FS_LOG_SECTOR_SIZE)
+                return 1;
+        if (le32_to_cpu(raw_super->log_sectors_per_block) !=
+                                        F2FS_LOG_SECTORS_PER_BLOCK)
+                return 1;
+        return 0;
+}
+/*
+ * Check that the metadata segments (checkpoint, SIT, NAT, SSA and the
+ * checkpoint's reserved segments) leave room in the total segment count.
+ * Returns 1 when they add up to the total or more, 0 otherwise.
+ */
+static int sanity_check_ckpt(struct f2fs_super_block *raw_super,
+                                struct f2fs_checkpoint *ckpt)
+{
+        unsigned int nr_total = le32_to_cpu(raw_super->segment_count);
+        unsigned int nr_meta = le32_to_cpu(raw_super->segment_count_ckpt);
+        nr_meta += le32_to_cpu(raw_super->segment_count_sit);
+        nr_meta += le32_to_cpu(raw_super->segment_count_nat);
+        nr_meta += le32_to_cpu(ckpt->rsvd_segment_count);
+        nr_meta += le32_to_cpu(raw_super->segment_count_ssa);
+        return nr_meta >= nr_total ? 1 : 0;
+}
+/*
+ * Copy geometry and reserved inode numbers from sbi->raw_super into the
+ * CPU-endian fields of @sbi and zero the per-type page counters.
+ */
+static void init_sb_info(struct f2fs_sb_info *sbi)
+{
+        struct f2fs_super_block *raw = sbi->raw_super;
+        int type;
+        /* reserved inode numbers */
+        sbi->root_ino_num = le32_to_cpu(raw->root_ino);
+        sbi->node_ino_num = le32_to_cpu(raw->node_ino);
+        sbi->meta_ino_num = le32_to_cpu(raw->meta_ino);
+        /* block geometry */
+        sbi->log_sectors_per_block = le32_to_cpu(raw->log_sectors_per_block);
+        sbi->log_blocksize = le32_to_cpu(raw->log_blocksize);
+        sbi->blocksize = 1 << sbi->log_blocksize;
+        /* segment, section and zone geometry */
+        sbi->log_blocks_per_seg = le32_to_cpu(raw->log_blocks_per_seg);
+        sbi->blocks_per_seg = 1 << sbi->log_blocks_per_seg;
+        sbi->segs_per_sec = le32_to_cpu(raw->segs_per_sec);
+        sbi->secs_per_zone = le32_to_cpu(raw->secs_per_zone);
+        sbi->total_sections = le32_to_cpu(raw->section_count);
+        /* only half of the NAT segments are counted towards node capacity */
+        sbi->total_node_count =
+                (le32_to_cpu(raw->segment_count_nat) / 2)
+                        * sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK;
+        for (type = 0; type < NR_COUNT_TYPE; type++)
+                atomic_set(&sbi->nr_pages[type], 0);
+}
+/*
+ * Fill @sb for a mount: read and validate the raw super block, parse the
+ * mount options in @data, load a valid checkpoint, build the segment and
+ * node managers, recover orphan inodes and fsynced data when the last
+ * checkpoint was not written at unmount, set up the root dentry and start
+ * the GC thread and stats.
+ *
+ * Returns 0 on success or a negative errno; on failure everything acquired
+ * so far is released through the label chain at the end of the function.
+ */
+static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
+{
+        struct f2fs_sb_info *sbi;
+        struct f2fs_super_block *raw_super;
+        struct buffer_head *raw_super_buf;
+        struct inode *root;
+        long err = -EINVAL;
+        int i;
+        /* allocate memory for f2fs-specific super block info */
+        sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
+        if (!sbi)
+                return -ENOMEM;
+        /* set a temporary block size */
+        if (!sb_set_blocksize(sb, F2FS_BLKSIZE))
+                goto free_sbi;
+        /* read f2fs raw super block */
+        raw_super_buf = sb_bread(sb, 0);
+        if (!raw_super_buf) {
+                err = -EIO;
+                goto free_sbi;
+        }
+        raw_super = (struct f2fs_super_block *)
+                        ((char *)raw_super_buf->b_data + F2FS_SUPER_OFFSET);
+        /* init some FS parameters */
+        sbi->active_logs = NR_CURSEG_TYPE;
+        set_opt(sbi, BG_GC);
+#ifdef CONFIG_F2FS_FS_XATTR
+        set_opt(sbi, XATTR_USER);
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+        set_opt(sbi, POSIX_ACL);
+#endif
+        /* parse mount options */
+        if (parse_options(sbi, (char *)data))
+                goto free_sb_buf;
+        /* sanity checking of raw super */
+        if (sanity_check_raw_super(raw_super))
+                goto free_sb_buf;
+        sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
+        sb->s_max_links = F2FS_LINK_MAX;
+        get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+        sb->s_op = &f2fs_sops;
+        sb->s_xattr = f2fs_xattr_handlers;
+        sb->s_export_op = &f2fs_export_ops;
+        sb->s_magic = F2FS_SUPER_MAGIC;
+        sb->s_fs_info = sbi;
+        sb->s_time_gran = 1;
+        sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+                (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+        memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
+        /* init f2fs-specific super block info */
+        sbi->sb = sb;
+        sbi->raw_super = raw_super;
+        sbi->raw_super_buf = raw_super_buf;
+        mutex_init(&sbi->gc_mutex);
+        mutex_init(&sbi->write_inode);
+        mutex_init(&sbi->writepages);
+        mutex_init(&sbi->cp_mutex);
+        for (i = 0; i < NR_LOCK_TYPE; i++)
+                mutex_init(&sbi->fs_lock[i]);
+        sbi->por_doing = 0;
+        spin_lock_init(&sbi->stat_lock);
+        init_rwsem(&sbi->bio_sem);
+        init_sb_info(sbi);
+        /* get an inode for meta space */
+        sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
+        if (IS_ERR(sbi->meta_inode)) {
+                err = PTR_ERR(sbi->meta_inode);
+                goto free_sb_buf;
+        }
+        err = get_valid_checkpoint(sbi);
+        if (err)
+                goto free_meta_inode;
+        /* sanity checking of checkpoint */
+        err = -EINVAL;
+        if (sanity_check_ckpt(raw_super, sbi->ckpt))
+                goto free_cp;
+        sbi->total_valid_node_count =
+                                le32_to_cpu(sbi->ckpt->valid_node_count);
+        sbi->total_valid_inode_count =
+                                le32_to_cpu(sbi->ckpt->valid_inode_count);
+        sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count);
+        sbi->total_valid_block_count =
+                                le64_to_cpu(sbi->ckpt->valid_block_count);
+        sbi->last_valid_block_count = sbi->total_valid_block_count;
+        sbi->alloc_valid_block_count = 0;
+        INIT_LIST_HEAD(&sbi->dir_inode_list);
+        spin_lock_init(&sbi->dir_inode_lock);
+        /* init super block; err is still -EINVAL here */
+        if (!sb_set_blocksize(sb, sbi->blocksize))
+                goto free_cp;
+        init_orphan_info(sbi);
+        /* setup f2fs internal modules */
+        err = build_segment_manager(sbi);
+        if (err)
+                goto free_sm;
+        err = build_node_manager(sbi);
+        if (err)
+                goto free_nm;
+        build_gc_manager(sbi);
+        /* get an inode for node space */
+        sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi));
+        if (IS_ERR(sbi->node_inode)) {
+                err = PTR_ERR(sbi->node_inode);
+                goto free_nm;
+        }
+        /* if there are any orphan nodes, free them */
+        err = -EINVAL;
+        if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) &&
+                                recover_orphan_inodes(sbi))
+                goto free_node_inode;
+        /* read root inode and dentry */
+        root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
+        if (IS_ERR(root)) {
+                err = PTR_ERR(root);
+                goto free_node_inode;
+        }
+        if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
+                /*
+                 * No dentry owns the root inode yet, so the reference taken
+                 * by f2fs_iget() has to be dropped here; the dput() under
+                 * free_root_inode would not release it.
+                 */
+                iput(root);
+                err = -EINVAL;
+                goto free_node_inode;
+        }
+        sb->s_root = d_make_root(root); /* allocate root dentry */
+        if (!sb->s_root) {
+                err = -ENOMEM;
+                goto free_root_inode;
+        }
+        /* recover fsynced data; the return value is not checked here */
+        if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) &&
+                                !test_opt(sbi, DISABLE_ROLL_FORWARD))
+                recover_fsync_data(sbi);
+        /* After POR, we can run background GC thread */
+        err = start_gc_thread(sbi);
+        if (err)
+                goto fail;
+        err = f2fs_build_stats(sbi);
+        if (err)
+                goto fail;
+        return 0;
+fail:
+        stop_gc_thread(sbi);
+free_root_inode:
+        dput(sb->s_root);
+        sb->s_root = NULL;
+free_node_inode:
+        iput(sbi->node_inode);
+free_nm:
+        destroy_node_manager(sbi);
+free_sm:
+        destroy_segment_manager(sbi);
+free_cp:
+        kfree(sbi->ckpt);
+free_meta_inode:
+        make_bad_inode(sbi->meta_inode);
+        iput(sbi->meta_inode);
+free_sb_buf:
+        brelse(raw_super_buf);
+free_sbi:
+        kfree(sbi);
+        return err;
+}
+/* ->mount: mount a block device, using f2fs_fill_super() to set up the sb. */
+static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags,
+                        const char *dev_name, void *data)
+{
+        return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super);
+}
+/* Filesystem type registered by init_f2fs_fs() under the name "f2fs". */
+static struct file_system_type f2fs_fs_type = {
+        .owner          = THIS_MODULE,
+        .name           = "f2fs",
+        .mount          = f2fs_mount,
+        .kill_sb        = kill_block_super,
+        .fs_flags       = FS_REQUIRES_DEV,
+};
+/*
+ * Create the slab cache for struct f2fs_inode_info.
+ * Returns 0 on success or -ENOMEM when the cache cannot be created.
+ */
+static int init_inodecache(void)
+{
+        f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
+                        sizeof(struct f2fs_inode_info), NULL);
+        return f2fs_inode_cachep ? 0 : -ENOMEM;
+}
+/* Destroy the inode slab cache created by init_inodecache(). */
+static void destroy_inodecache(void)
+{
+        /*
+         * Make sure all delayed rcu free inodes are flushed before we
+         * destroy cache.
+         */
+        rcu_barrier();
+        kmem_cache_destroy(f2fs_inode_cachep);
+}
+/*
+ * Module init: create the slab caches used by f2fs and register the
+ * filesystem type.  Returns 0 on success or a negative errno; on failure
+ * every cache created so far is destroyed again, in reverse order, so that
+ * a failed load does not leak them.
+ */
+static int __init init_f2fs_fs(void)
+{
+        int err;
+        err = init_inodecache();
+        if (err)
+                goto fail;
+        err = create_node_manager_caches();
+        if (err)
+                goto free_inodecache;
+        err = create_gc_caches();
+        if (err)
+                goto free_node_manager_caches;
+        err = create_checkpoint_caches();
+        if (err)
+                goto free_gc_caches;
+        err = register_filesystem(&f2fs_fs_type);
+        if (err)
+                goto free_checkpoint_caches;
+        return 0;
+free_checkpoint_caches:
+        destroy_checkpoint_caches();
+free_gc_caches:
+        destroy_gc_caches();
+free_node_manager_caches:
+        destroy_node_manager_caches();
+free_inodecache:
+        destroy_inodecache();
+fail:
+        return err;
+}
+/*
+ * Module exit: remove the stats root, unregister the filesystem type and
+ * destroy the caches in the reverse order of their creation in
+ * init_f2fs_fs().
+ */
+static void __exit exit_f2fs_fs(void)
+{
+        destroy_root_stats();
+        unregister_filesystem(&f2fs_fs_type);
+        destroy_checkpoint_caches();
+        destroy_gc_caches();
+        destroy_node_manager_caches();
+        destroy_inodecache();
+}
+/* Module entry/exit points and module metadata. */
+module_init(init_f2fs_fs)
+module_exit(exit_f2fs_fs)
+MODULE_AUTHOR("Samsung Electronics's Praesto Team");
+MODULE_DESCRIPTION("Flash Friendly File System");
+MODULE_LICENSE("GPL");
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
new file mode 100644
index 000000000000..7d52e8dc0c59
--- /dev/null
+++ b/fs/f2fs/xattr.c
@@ -0,0 +1,440 @@
+/*
+ * fs/f2fs/xattr.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext2/xattr.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher <agruen@suse.de>
+ *
+ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
+ * Extended attributes for symlinks and special files added per
+ *  suggestion of Luka Renko <luka.renko@hermes.si>.
+ * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
+ *  Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/rwsem.h>
+#include <linux/f2fs_fs.h>
+#include "f2fs.h"
+#include "xattr.h"
+/*
+ * ->list handler for the "user." and "trusted." namespaces.
+ *
+ * Writes "<prefix><name>\0" to @list when @list is non-NULL and the string
+ * fits in @list_size, and returns the length of that string including the
+ * terminating NUL.
+ *
+ * Returns 0 (the entry contributes nothing to the listing) when user xattrs
+ * are disabled by mount option, when the caller lacks CAP_SYS_ADMIN for a
+ * trusted attribute, or for an unknown @type.  The return type is size_t and
+ * f2fs_listxattr() adds it to the running total, so a negative errno must
+ * not be returned here: it would be seen as a huge length.
+ */
+static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
+                size_t list_size, const char *name, size_t name_len, int type)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+        size_t total_len, prefix_len = 0;
+        const char *prefix = NULL;
+        switch (type) {
+        case F2FS_XATTR_INDEX_USER:
+                if (!test_opt(sbi, XATTR_USER))
+                        return 0;
+                prefix = XATTR_USER_PREFIX;
+                prefix_len = XATTR_USER_PREFIX_LEN;
+                break;
+        case F2FS_XATTR_INDEX_TRUSTED:
+                if (!capable(CAP_SYS_ADMIN))
+                        return 0;
+                prefix = XATTR_TRUSTED_PREFIX;
+                prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+                break;
+        default:
+                return 0;
+        }
+        total_len = prefix_len + name_len + 1;
+        if (list && total_len <= list_size) {
+                memcpy(list, prefix, prefix_len);
+                memcpy(list+prefix_len, name, name_len);
+                list[prefix_len + name_len] = '\0';
+        }
+        return total_len;
+}
+/*
+ * ->get handler for the "user." and "trusted." namespaces.
+ * Returns -EOPNOTSUPP when user xattrs are disabled, -EPERM for a trusted
+ * attribute without CAP_SYS_ADMIN, -EINVAL for an unknown @type or an empty
+ * @name, and otherwise the result of f2fs_getxattr().
+ */
+static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
+                void *buffer, size_t size, int type)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+        if (type == F2FS_XATTR_INDEX_USER) {
+                if (!test_opt(sbi, XATTR_USER))
+                        return -EOPNOTSUPP;
+        } else if (type == F2FS_XATTR_INDEX_TRUSTED) {
+                if (!capable(CAP_SYS_ADMIN))
+                        return -EPERM;
+        } else {
+                return -EINVAL;
+        }
+        /* the part after the namespace prefix must not be empty */
+        if (*name == '\0')
+                return -EINVAL;
+        return f2fs_getxattr(dentry->d_inode, type, name,
+                        buffer, size);
+}
+/*
+ * ->set handler for the "user." and "trusted." namespaces.
+ * Returns -EOPNOTSUPP when user xattrs are disabled, -EPERM for a trusted
+ * attribute without CAP_SYS_ADMIN, -EINVAL for an unknown @type or an empty
+ * @name, and otherwise the result of f2fs_setxattr().  @flags is not used.
+ */
+static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
+                const void *value, size_t size, int flags, int type)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+        if (type == F2FS_XATTR_INDEX_USER) {
+                if (!test_opt(sbi, XATTR_USER))
+                        return -EOPNOTSUPP;
+        } else if (type == F2FS_XATTR_INDEX_TRUSTED) {
+                if (!capable(CAP_SYS_ADMIN))
+                        return -EPERM;
+        } else {
+                return -EINVAL;
+        }
+        /* the part after the namespace prefix must not be empty */
+        if (*name == '\0')
+                return -EINVAL;
+        return f2fs_setxattr(dentry->d_inode, type, name, value, size);
+}
+/*
+ * ->list handler for the advise attribute.
+ * Returns 0 for any @type other than F2FS_XATTR_INDEX_ADVISE; otherwise
+ * copies the NUL-terminated attribute name to @list when it fits and returns
+ * its length including the NUL.
+ */
+static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
+                size_t list_size, const char *name, size_t name_len, int type)
+{
+        /* sizeof a string literal already counts the terminating NUL */
+        size_t len = sizeof(F2FS_SYSTEM_ADVISE_PREFIX);
+        if (type != F2FS_XATTR_INDEX_ADVISE)
+                return 0;
+        if (list && list_size >= len)
+                memcpy(list, F2FS_SYSTEM_ADVISE_PREFIX, len);
+        return len;
+}
+/*
+ * ->get handler for the advise attribute.
+ * Returns -EINVAL unless @name is empty; otherwise stores the inode's
+ * i_advise byte in @buffer and returns its size (1).
+ *
+ * @buffer may be NULL when the caller only asks for the value size (the
+ * other get path in this file, f2fs_getxattr(), treats a NULL buffer the
+ * same way), so it is only written when present.
+ */
+static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name,
+                void *buffer, size_t size, int type)
+{
+        struct inode *inode = dentry->d_inode;
+        if (strcmp(name, "") != 0)
+                return -EINVAL;
+        if (buffer)
+                *((char *)buffer) = F2FS_I(inode)->i_advise;
+        return sizeof(char);
+}
+/*
+ * ->set handler for the advise attribute.
+ * ORs the first byte of @value into the inode's i_advise.
+ * Returns 0 on success, -EINVAL for a non-empty @name or a NULL @value and
+ * -EPERM when the caller is neither the owner nor capable.
+ */
+static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
+                const void *value, size_t size, int flags, int type)
+{
+        struct inode *inode = dentry->d_inode;
+        if (*name != '\0')
+                return -EINVAL;
+        if (!inode_owner_or_capable(inode))
+                return -EPERM;
+        if (!value)
+                return -EINVAL;
+        /* bits are only ever added, never cleared */
+        F2FS_I(inode)->i_advise |= *(const char *)value;
+        return 0;
+}
+/* Handler for the "user." namespace; .flags carries the f2fs name index. */
+const struct xattr_handler f2fs_xattr_user_handler = {
+        .prefix = XATTR_USER_PREFIX,
+        .flags  = F2FS_XATTR_INDEX_USER,
+        .list   = f2fs_xattr_generic_list,
+        .get    = f2fs_xattr_generic_get,
+        .set    = f2fs_xattr_generic_set,
+};
+/* Handler for the "trusted." namespace; .flags carries the f2fs name index. */
+const struct xattr_handler f2fs_xattr_trusted_handler = {
+        .prefix = XATTR_TRUSTED_PREFIX,
+        .flags  = F2FS_XATTR_INDEX_TRUSTED,
+        .list   = f2fs_xattr_generic_list,
+        .get    = f2fs_xattr_generic_get,
+        .set    = f2fs_xattr_generic_set,
+};
+/* Handler for the advise attribute, which is backed by the inode's i_advise. */
+const struct xattr_handler f2fs_xattr_advise_handler = {
+        .prefix = F2FS_SYSTEM_ADVISE_PREFIX,
+        .flags  = F2FS_XATTR_INDEX_ADVISE,
+        .list   = f2fs_xattr_advise_list,
+        .get    = f2fs_xattr_advise_get,
+        .set    = f2fs_xattr_advise_set,
+};
+/*
+ * Handler lookup table indexed by the name index stored in an xattr entry
+ * (see f2fs_xattr_handler()).  Indexes without an initializer are NULL.
+ */
+static const struct xattr_handler *f2fs_xattr_handler_map[] = {
+        [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+        [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &f2fs_xattr_acl_access_handler,
+        [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler,
+#endif
+        [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler,
+        [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler,
+};
+/* NULL-terminated handler list; installed as sb->s_xattr at mount time. */
+const struct xattr_handler *f2fs_xattr_handlers[] = {
+        &f2fs_xattr_user_handler,
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+        &f2fs_xattr_acl_access_handler,
+        &f2fs_xattr_acl_default_handler,
+#endif
+        &f2fs_xattr_trusted_handler,
+        &f2fs_xattr_advise_handler,
+        NULL,
+};
+/*
+ * Return the handler registered for @name_index in f2fs_xattr_handler_map,
+ * or NULL when the index is not positive, is past the end of the map, or
+ * has no handler.
+ */
+static inline const struct xattr_handler *f2fs_xattr_handler(int name_index)
+{
+        if (name_index <= 0)
+                return NULL;
+        if (name_index >= ARRAY_SIZE(f2fs_xattr_handler_map))
+                return NULL;
+        return f2fs_xattr_handler_map[name_index];
+}
+/*
+ * Read the extended attribute (@name_index, @name) of @inode.
+ *
+ * With a non-NULL @buffer the value is copied into it; with a NULL @buffer
+ * only the value length is reported.
+ *
+ * Returns the value length on success, -EINVAL for a NULL @name, -ENODATA
+ * when the inode has no xattr block or the attribute is not found, -ERANGE
+ * when the value does not fit in @buffer_size, or the error from
+ * get_node_page() when the xattr block cannot be read.
+ */
+int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
+                void *buffer, size_t buffer_size)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        struct f2fs_inode_info *fi = F2FS_I(inode);
+        struct f2fs_xattr_entry *entry;
+        struct page *page;
+        void *base_addr;
+        int error = 0, found = 0;
+        int value_len, name_len;
+        if (name == NULL)
+                return -EINVAL;
+        name_len = strlen(name);
+        if (!fi->i_xattr_nid)
+                return -ENODATA;
+        page = get_node_page(sbi, fi->i_xattr_nid);
+        /* get_node_page() reports failure as an ERR_PTR, as in f2fs_setxattr() */
+        if (IS_ERR(page))
+                return PTR_ERR(page);
+        base_addr = page_address(page);
+        list_for_each_xattr(entry, base_addr) {
+                if (entry->e_name_index != name_index)
+                        continue;
+                if (entry->e_name_len != name_len)
+                        continue;
+                if (!memcmp(entry->e_name, name, name_len)) {
+                        found = 1;
+                        break;
+                }
+        }
+        if (!found) {
+                error = -ENODATA;
+                goto cleanup;
+        }
+        value_len = le16_to_cpu(entry->e_value_size);
+        if (buffer && value_len > buffer_size) {
+                error = -ERANGE;
+                goto cleanup;
+        }
+        if (buffer) {
+                /* the value is stored right behind the name */
+                char *pval = entry->e_name + entry->e_name_len;
+                memcpy(buffer, pval, value_len);
+        }
+        error = value_len;
+cleanup:
+        f2fs_put_page(page, 1);
+        return error;
+}
+/*
+ * List the extended attribute names of @dentry's inode.
+ *
+ * Each entry of the xattr block is passed to the ->list method of the
+ * handler for its name index; entries without a handler are skipped.  With a
+ * NULL @buffer only the required size is computed.
+ *
+ * Returns the number of bytes used (or needed), 0 when the inode has no
+ * xattr block, -ERANGE when @buffer is too small, or the error from
+ * get_node_page() when the xattr block cannot be read.
+ */
+ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+        struct inode *inode = dentry->d_inode;
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        struct f2fs_inode_info *fi = F2FS_I(inode);
+        struct f2fs_xattr_entry *entry;
+        struct page *page;
+        void *base_addr;
+        int error = 0;
+        size_t rest = buffer_size;
+        if (!fi->i_xattr_nid)
+                return 0;
+        page = get_node_page(sbi, fi->i_xattr_nid);
+        /* get_node_page() reports failure as an ERR_PTR, as in f2fs_setxattr() */
+        if (IS_ERR(page))
+                return PTR_ERR(page);
+        base_addr = page_address(page);
+        list_for_each_xattr(entry, base_addr) {
+                const struct xattr_handler *handler =
+                        f2fs_xattr_handler(entry->e_name_index);
+                size_t size;
+                if (!handler)
+                        continue;
+                size = handler->list(dentry, buffer, rest, entry->e_name,
+                                entry->e_name_len, handler->flags);
+                if (buffer && size > rest) {
+                        error = -ERANGE;
+                        goto cleanup;
+                }
+                if (buffer)
+                        buffer += size;
+                /*
+                 * Without a buffer this may wrap below zero; the unsigned
+                 * difference taken after the loop still yields the total.
+                 */
+                rest -= size;
+        }
+        error = buffer_size - rest;
+cleanup:
+        f2fs_put_page(page, 1);
+        return error;
+}
+/*
+ * Create, replace or remove the extended attribute (@name_index, @name) of
+ * @inode.  A NULL @value requests removal.
+ *
+ * The xattr node block is allocated on first use.  An existing entry with
+ * the same index and name is removed by moving the following entries down
+ * over it; the new entry, if any, is then written after the last remaining
+ * entry.  The whole operation runs under the NODE_NEW operation lock.
+ *
+ * Returns 0 on success, -EINVAL for a NULL @name, -ERANGE when the name is
+ * longer than 255 bytes or the value longer than MAX_VALUE_LEN, -ENOSPC
+ * when no node id or not enough room in the block is available, -EIO when
+ * the block's magic number is wrong, or the error from new_node_page() /
+ * get_node_page().  Removing an attribute that does not exist returns 0.
+ */
+int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
+                                        const void *value, size_t value_len)
+{
+        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+        struct f2fs_inode_info *fi = F2FS_I(inode);
+        struct f2fs_xattr_header *header = NULL;
+        struct f2fs_xattr_entry *here, *last;
+        struct page *page;
+        void *base_addr;
+        int error, found, free, name_len, newsize;
+        char *pval;
+        if (name == NULL)
+                return -EINVAL;
+        name_len = strlen(name);
+        if (value == NULL)
+                value_len = 0;
+        if (name_len > 255 || value_len > MAX_VALUE_LEN)
+                return -ERANGE;
+        mutex_lock_op(sbi, NODE_NEW);
+        if (!fi->i_xattr_nid) {
+                /* Allocate new attribute block */
+                struct dnode_of_data dn;
+                if (!alloc_nid(sbi, &fi->i_xattr_nid)) {
+                        mutex_unlock_op(sbi, NODE_NEW);
+                        return -ENOSPC;
+                }
+                set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid);
+                mark_inode_dirty(inode);
+                page = new_node_page(&dn, XATTR_NODE_OFFSET);
+                if (IS_ERR(page)) {
+                        /* give the node id back and forget it again */
+                        alloc_nid_failed(sbi, fi->i_xattr_nid);
+                        fi->i_xattr_nid = 0;
+                        mutex_unlock_op(sbi, NODE_NEW);
+                        return PTR_ERR(page);
+                }
+                alloc_nid_done(sbi, fi->i_xattr_nid);
+                base_addr = page_address(page);
+                header = XATTR_HDR(base_addr);
+                header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC);
+                header->h_refcount = cpu_to_le32(1);
+        } else {
+                /* The inode already has an extended attribute block. */
+                page = get_node_page(sbi, fi->i_xattr_nid);
+                if (IS_ERR(page)) {
+                        mutex_unlock_op(sbi, NODE_NEW);
+                        return PTR_ERR(page);
+                }
+                base_addr = page_address(page);
+                header = XATTR_HDR(base_addr);
+        }
+        if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) {
+                error = -EIO;
+                goto cleanup;
+        }
+        /* find entry with wanted name. */
+        found = 0;
+        list_for_each_xattr(here, base_addr) {
+                if (here->e_name_index != name_index)
+                        continue;
+                if (here->e_name_len != name_len)
+                        continue;
+                if (!memcmp(here->e_name, name, name_len)) {
+                        found = 1;
+                        break;
+                }
+        }
+        /* continue from the match (or loop end) to the terminating entry */
+        last = here;
+        while (!IS_XATTR_LAST_ENTRY(last))
+                last = XATTR_NEXT_ENTRY(last);
+        newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) +
+                        name_len + value_len);
+        /* 1. Check space */
+        if (value) {
+                /* If value is NULL, it is remove operation.
+                 * In case of update operation, we calculate free.
+                 */
+                free = MIN_OFFSET - ((char *)last - (char *)header);
+                /* the space of the entry being replaced becomes available */
+                if (found)
+                        free = free - ENTRY_SIZE(here);
+                if (free < newsize) {
+                        error = -ENOSPC;
+                        goto cleanup;
+                }
+        }
+        /* 2. Remove old entry */
+        if (found) {
+                /* If entry is found, remove old entry.
+                 * If not found, remove operation is not needed.
+                 */
+                struct f2fs_xattr_entry *next = XATTR_NEXT_ENTRY(here);
+                int oldsize = ENTRY_SIZE(here);
+                /* slide the following entries down over the old one */
+                memmove(here, next, (char *)last - (char *)next);
+                last = (struct f2fs_xattr_entry *)((char *)last - oldsize);
+                memset(last, 0, oldsize);
+        }
+        /* 3. Write new entry */
+        if (value) {
+                /* Before we come here, old entry is removed.
+                 * We just write new entry. */
+                memset(last, 0, newsize);
+                last->e_name_index = name_index;
+                last->e_name_len = name_len;
+                memcpy(last->e_name, name, name_len);
+                /* the value is stored right behind the name */
+                pval = last->e_name + name_len;
+                memcpy(pval, value, value_len);
+                last->e_value_size = cpu_to_le16(value_len);
+        }
+        set_page_dirty(page);
+        f2fs_put_page(page, 1);
+        /* apply a mode change that was stashed in i_acl_mode, if flagged */
+        if (is_inode_flag_set(fi, FI_ACL_MODE)) {
+                inode->i_mode = fi->i_acl_mode;
+                inode->i_ctime = CURRENT_TIME;
+                clear_inode_flag(fi, FI_ACL_MODE);
+        }
+        f2fs_write_inode(inode, NULL);
+        mutex_unlock_op(sbi, NODE_NEW);
+        return 0;
+cleanup:
+        f2fs_put_page(page, 1);
+        mutex_unlock_op(sbi, NODE_NEW);
+        return error;
+}
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
new file mode 100644
index 000000000000..49c9558305e3
--- /dev/null
+++ b/fs/f2fs/xattr.h
@@ -0,0 +1,145 @@
+/*
+ * fs/f2fs/xattr.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext2/xattr.h
+ *
+ * On-disk format of extended attributes for the ext2 filesystem.
+ *
+ * (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __F2FS_XATTR_H__
+#define __F2FS_XATTR_H__
+#include <linux/init.h>
+#include <linux/xattr.h>
+/* Magic value in attribute blocks */
+#define F2FS_XATTR_MAGIC                0xF2F52011
+/* Maximum number of references to one attribute block */
+#define F2FS_XATTR_REFCOUNT_MAX         1024
+/*
+ * Name indexes.  These are presumably the name_index values handed to
+ * f2fs_getxattr()/f2fs_setxattr() and recorded in
+ * f2fs_xattr_entry.e_name_index -- confirm against xattr.c.
+ * F2FS_SYSTEM_ADVISE_PREFIX is a name string, not an index.
+ */
+#define F2FS_SYSTEM_ADVISE_PREFIX               "system.advise"
+#define F2FS_XATTR_INDEX_USER                   1
+#define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS       2
+#define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT      3
+#define F2FS_XATTR_INDEX_TRUSTED                4
+#define F2FS_XATTR_INDEX_LUSTRE                 5
+#define F2FS_XATTR_INDEX_SECURITY               6
+#define F2FS_XATTR_INDEX_ADVISE                 7
+struct f2fs_xattr_header {
+        __le32  h_magic;        /* magic number for identification */
+        __le32  h_refcount;     /* reference count */
+        __u32   h_reserved[4];  /* zero right now */
+};      /* the first entry directly follows, see XATTR_FIRST_ENTRY() */
+struct f2fs_xattr_entry {
+        __u8    e_name_index;   /* name index (presumably F2FS_XATTR_INDEX_*) */
+        __u8    e_name_len;     /* length of e_name in bytes */
+        __le16  e_value_size;   /* size of attribute value */
+        char    e_name[0];      /* attribute name; the value bytes follow it */
+};
+#define XATTR_HDR(ptr)          ((struct f2fs_xattr_header *)(ptr))
+#define XATTR_ENTRY(ptr)        ((struct f2fs_xattr_entry *)(ptr))
+#define XATTR_FIRST_ENTRY(ptr)  (XATTR_ENTRY(XATTR_HDR(ptr)+1))
+#define XATTR_ROUND             (3)     /* mask used to round sizes up to 4 */
+#define XATTR_ALIGN(size)       (((size) + XATTR_ROUND) & ~XATTR_ROUND)
+/* Bytes taken by an entry: its header, name and value, rounded up */
+#define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \
+                        (entry)->e_name_len + \
+                        le16_to_cpu((entry)->e_value_size)))
+#define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\
+                        ENTRY_SIZE(entry)))
+/* The entry list ends at the first entry whose leading 32 bits are zero */
+#define IS_XATTR_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
+#define list_for_each_xattr(entry, addr) \
+                for (entry = XATTR_FIRST_ENTRY(addr);\
+                                !IS_XATTR_LAST_ENTRY(entry);\
+                                entry = XATTR_NEXT_ENTRY(entry))
+/* Offset within the page at which the space usable for xattrs ends */
+#define MIN_OFFSET      XATTR_ALIGN(PAGE_SIZE - \
+                        sizeof(struct node_footer) - \
+                        sizeof(__u32))
+#define MAX_VALUE_LEN   (MIN_OFFSET - sizeof(struct f2fs_xattr_header) - \
+                        sizeof(struct f2fs_xattr_entry))
+/*
+ * On-disk structure of f2fs_xattr
+ * We use only 1 block for xattr.
+ *
+ * +--------------------+
+ * | f2fs_xattr_header  |
+ * |                    |
+ * +--------------------+
+ * | f2fs_xattr_entry   |
+ * | .e_name_index = 1  |
+ * | .e_name_len = 3    |
+ * | .e_value_size = 14 |
+ * | .e_name = "foo"    |
+ * | "value_of_xattr"   |<- value_offs = e_name + e_name_len
+ * +--------------------+
+ * | f2fs_xattr_entry   |
+ * | .e_name_index = 4  |
+ * | .e_name = "bar"    |
+ * +--------------------+
+ * |                    |
+ * |        Free        |
+ * |                    |
+ * +--------------------+<- MIN_OFFSET
+ * |   node_footer      |
+ * | (nid, ino, offset) |
+ * +--------------------+
+ *
+ **/
+#ifdef CONFIG_F2FS_FS_XATTR
+extern const struct xattr_handler f2fs_xattr_user_handler;
+extern const struct xattr_handler f2fs_xattr_trusted_handler;
+extern const struct xattr_handler f2fs_xattr_acl_access_handler;
+extern const struct xattr_handler f2fs_xattr_acl_default_handler;
+extern const struct xattr_handler f2fs_xattr_advise_handler;
+extern const struct xattr_handler *f2fs_xattr_handlers[];
+extern int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
+                const void *value, size_t value_len);
+extern int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
+                void *buffer, size_t buffer_size);
+extern ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
+                size_t buffer_size);
+#else   /* !CONFIG_F2FS_FS_XATTR: no handler table, every call below fails */
+#define f2fs_xattr_handlers     NULL
+static inline int f2fs_setxattr(struct inode *inode, int name_index,
+        const char *name, const void *value, size_t value_len)
+{
+        return -EOPNOTSUPP;     /* all arguments are ignored */
+}
+static inline int f2fs_getxattr(struct inode *inode, int name_index,
+                const char *name, void *buffer, size_t buffer_size)
+{
+        return -EOPNOTSUPP;     /* buffer is left untouched */
+}
+static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
+                size_t buffer_size)
+{
+        return -EOPNOTSUPP;     /* buffer is left untouched */
+}
+#endif  /* CONFIG_F2FS_FS_XATTR */
+#endif /* __F2FS_XATTR_H__ */
diff --git a/fs/fhandle.c b/fs/fhandle.c
index cccdc874bb55..999ff5c3cab0 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -52,7 +52,7 @@ static long do_sys_name_to_handle(struct path *path,
        handle_bytes = handle_dwords * sizeof(u32);
        handle->handle_bytes = handle_bytes;
        if ((handle->handle_bytes > f_handle.handle_bytes) ||
-            (retval == 255) || (retval == -ENOSPC)) {
+            (retval == FILEID_INVALID) || (retval == -ENOSPC)) {
                /* As per old exportfs_encode_fh documentation
                 * we could return ENOSPC to indicate overflow
                 * But file system returned 255 always. So handle
diff --git a/fs/file_table.c b/fs/file_table.c
index a72bf9ddd0d2..de9e9653d611 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -458,8 +458,8 @@ void mark_files_ro(struct super_block *sb)
                spin_unlock(&f->f_lock);
                if (file_check_writeable(f) != 0)
                        continue;
+                __mnt_drop_write(f->f_path.mnt);
                file_release_write(f);
-                mnt_drop_write_file(f);
        } while_file_list_for_each_entry;
        lg_global_unlock(&files_lglock);
 }
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index 6a3c48abd677..b52aed1dca97 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -314,10 +314,10 @@ EXPORT_SYMBOL(fscache_add_cache);
 */
 void fscache_io_error(struct fscache_cache *cache)
 {
-        set_bit(FSCACHE_IOERROR, &cache->flags);
+        if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags)) /* log only when the flag was not already set */
+                printk(KERN_ERR "FS-Cache:"
-        printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n",
+                       " Cache '%s' stopped due to I/O error\n",
-               cache->ops->name);
+                       cache->ops->name);
 }
 EXPORT_SYMBOL(fscache_io_error);
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 990535071a8a..8dcb114758e3 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -370,6 +370,66 @@ cant_attach_object:
 }
 /*
+ * Invalidate an object.  Callable with spinlocks held.
+ *
+ * The cookie must be a data file cookie whose definition supplies get_aux;
+ * both conditions are asserted.  If the cookie has a backing object, the
+ * FSCACHE_COOKIE_INVALIDATING flag is set on the cookie and, provided the
+ * first backing object has not yet reached FSCACHE_OBJECT_DYING, that object
+ * is sent FSCACHE_OBJECT_EV_INVALIDATE.  A cookie that is already flagged as
+ * invalidating, or that has no backing object, is left alone.
+ *
+ * NOTE(review): when the object is already dying the flag stays set and no
+ * event is raised; presumably it is cleared by the
+ * fscache_invalidation_complete() call in fscache_withdraw_object() --
+ * confirm.
+ */
+void __fscache_invalidate(struct fscache_cookie *cookie)
+{
+        struct fscache_object *object;
+        _enter("{%s}", cookie->def->name);
+        fscache_stat(&fscache_n_invalidates);
+        /* Only permit invalidation of data files.  Invalidating an index will
+         * require the caller to release all its attachments to the tree rooted
+         * there, and if it's doing that, it may as well just retire the
+         * cookie.
+         */
+        ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE);
+        /* We will be updating the cookie too. */
+        BUG_ON(!cookie->def->get_aux);
+        /* If there's an object, we tell the object state machine to handle the
+         * invalidation on our behalf, otherwise there's nothing to do.
+         *
+         * The list is tested once without the lock and then again with
+         * cookie->lock held before the invalidating flag is set.
+         */
+        if (!hlist_empty(&cookie->backing_objects)) {
+                spin_lock(&cookie->lock);
+                if (!hlist_empty(&cookie->backing_objects) &&
+                    !test_and_set_bit(FSCACHE_COOKIE_INVALIDATING,
+                                      &cookie->flags)) {
+                        object = hlist_entry(cookie->backing_objects.first,
+                                             struct fscache_object,
+                                             cookie_link);
+                        if (object->state < FSCACHE_OBJECT_DYING)
+                                fscache_raise_event(
+                                        object, FSCACHE_OBJECT_EV_INVALIDATE);
+                }
+                spin_unlock(&cookie->lock);
+        }
+        _leave("");
+}
+EXPORT_SYMBOL(__fscache_invalidate);
+/*
+ * Wait for object invalidation to complete.
+ *
+ * Waits on the FSCACHE_COOKIE_INVALIDATING bit in cookie->flags; that bit is
+ * cleared, and waiters woken, by fscache_invalidation_complete().
+ *
+ * NOTE(review): the wait action is named fscache_wait_bit_interruptible but
+ * the sleep mode passed is TASK_UNINTERRUPTIBLE, whereas the wait in
+ * __fscache_relinquish_cookie() pairs fscache_wait_bit with the same mode.
+ * Confirm which pairing is intended.
+ */
+void __fscache_wait_on_invalidate(struct fscache_cookie *cookie)
+{
+        _enter("%p", cookie);
+        wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING,
+                    fscache_wait_bit_interruptible,
+                    TASK_UNINTERRUPTIBLE);
+        _leave("");
+}
+EXPORT_SYMBOL(__fscache_wait_on_invalidate);
+/*
 * update the index entries backing a cookie
 */
 void __fscache_update_cookie(struct fscache_cookie *cookie)
@@ -442,16 +502,34 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
        event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
+try_again:
        spin_lock(&cookie->lock);
        /* break links with all the active objects */
        while (!hlist_empty(&cookie->backing_objects)) {
+                int n_reads;
                object = hlist_entry(cookie->backing_objects.first,
                                     struct fscache_object,
                                     cookie_link);
                _debug("RELEASE OBJ%x", object->debug_id);
+                set_bit(FSCACHE_COOKIE_WAITING_ON_READS, &cookie->flags);
+                n_reads = atomic_read(&object->n_reads);
+                if (n_reads) {
+                        int n_ops = object->n_ops;
+                        int n_in_progress = object->n_in_progress;
+                        spin_unlock(&cookie->lock);
+                        printk(KERN_ERR "FS-Cache:"
+                               " Cookie '%s' still has %d outstanding reads (%d,%d)\n",
+                               cookie->def->name,
+                               n_reads, n_ops, n_in_progress);
+                        wait_on_bit(&cookie->flags, FSCACHE_COOKIE_WAITING_ON_READS,
+                                    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+                        printk("Wait finished\n");
+                        goto try_again;
+                }
                /* detach each cache object from the object cookie */
                spin_lock(&object->lock);
                hlist_del_init(&object->cookie_link);
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index f6aad48d38a8..ee38fef4be51 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -121,12 +121,19 @@ extern int fscache_submit_exclusive_op(struct fscache_object *,
                                       struct fscache_operation *);
 extern int fscache_submit_op(struct fscache_object *,
                             struct fscache_operation *);
-extern int fscache_cancel_op(struct fscache_operation *);
+extern int fscache_cancel_op(struct fscache_operation *,
+                             void (*)(struct fscache_operation *));
+extern void fscache_cancel_all_ops(struct fscache_object *);
 extern void fscache_abort_object(struct fscache_object *);
 extern void fscache_start_operations(struct fscache_object *);
 extern void fscache_operation_gc(struct work_struct *);
 /*
+ * page.c
+ */
+extern void fscache_invalidate_writes(struct fscache_cookie *);
+/*
 * proc.c
 */
 #ifdef CONFIG_PROC_FS
@@ -194,6 +201,7 @@ extern atomic_t fscache_n_store_vmscan_not_storing;
 extern atomic_t fscache_n_store_vmscan_gone;
 extern atomic_t fscache_n_store_vmscan_busy;
 extern atomic_t fscache_n_store_vmscan_cancelled;
+extern atomic_t fscache_n_store_vmscan_wait;
 extern atomic_t fscache_n_marks;
 extern atomic_t fscache_n_uncaches;
@@ -205,6 +213,9 @@ extern atomic_t fscache_n_acquires_ok;
 extern atomic_t fscache_n_acquires_nobufs;
 extern atomic_t fscache_n_acquires_oom;
+extern atomic_t fscache_n_invalidates;
+extern atomic_t fscache_n_invalidates_run;
 extern atomic_t fscache_n_updates;
 extern atomic_t fscache_n_updates_null;
 extern atomic_t fscache_n_updates_run;
@@ -237,6 +248,7 @@ extern atomic_t fscache_n_cop_alloc_object;
 extern atomic_t fscache_n_cop_lookup_object;
 extern atomic_t fscache_n_cop_lookup_complete;
 extern atomic_t fscache_n_cop_grab_object;
+extern atomic_t fscache_n_cop_invalidate_object;
 extern atomic_t fscache_n_cop_update_object;
 extern atomic_t fscache_n_cop_drop_object;
 extern atomic_t fscache_n_cop_put_object;
@@ -278,6 +290,7 @@ extern const struct file_operations fscache_stats_fops;
 static inline void fscache_raise_event(struct fscache_object *object,
                                       unsigned event)
 {
+        BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS);
        if (!test_and_set_bit(event, &object->events) &&
            test_bit(event, &object->event_mask))
                fscache_enqueue_object(object);
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index ebe29c581380..f27c89d17885 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -245,7 +245,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
                   obj->n_in_progress,
                   obj->n_exclusive,
                   atomic_read(&obj->n_reads),
-                   obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK,
+                   obj->event_mask,
                   obj->events,
                   obj->flags,
                   work_busy(&obj->work));
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index b6b897c550ac..50d41c180211 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -14,6 +14,7 @@
 #define FSCACHE_DEBUG_LEVEL COOKIE
 #include <linux/module.h>
+#include <linux/slab.h>
 #include "internal.h"
 const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
@@ -22,6 +23,7 @@ const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
        [FSCACHE_OBJECT_CREATING]       = "OBJECT_CREATING",
        [FSCACHE_OBJECT_AVAILABLE]      = "OBJECT_AVAILABLE",
        [FSCACHE_OBJECT_ACTIVE]         = "OBJECT_ACTIVE",
+        [FSCACHE_OBJECT_INVALIDATING]   = "OBJECT_INVALIDATING",
        [FSCACHE_OBJECT_UPDATING]       = "OBJECT_UPDATING",
        [FSCACHE_OBJECT_DYING]          = "OBJECT_DYING",
        [FSCACHE_OBJECT_LC_DYING]       = "OBJECT_LC_DYING",
@@ -39,6 +41,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
        [FSCACHE_OBJECT_CREATING]       = "CRTN",
        [FSCACHE_OBJECT_AVAILABLE]      = "AVBL",
        [FSCACHE_OBJECT_ACTIVE]         = "ACTV",
+        [FSCACHE_OBJECT_INVALIDATING]   = "INVL",
        [FSCACHE_OBJECT_UPDATING]       = "UPDT",
        [FSCACHE_OBJECT_DYING]          = "DYNG",
        [FSCACHE_OBJECT_LC_DYING]       = "LCDY",
@@ -54,6 +57,7 @@ static void fscache_put_object(struct fscache_object *);
 static void fscache_initialise_object(struct fscache_object *);
 static void fscache_lookup_object(struct fscache_object *);
 static void fscache_object_available(struct fscache_object *);
+static void fscache_invalidate_object(struct fscache_object *);
 static void fscache_release_object(struct fscache_object *);
 static void fscache_withdraw_object(struct fscache_object *);
 static void fscache_enqueue_dependents(struct fscache_object *);
@@ -79,6 +83,15 @@ static inline void fscache_done_parent_op(struct fscache_object *object)
 }
 /*
+ * Mark invalidation of a cookie as finished: clear
+ * FSCACHE_COOKIE_INVALIDATING and, only if it had actually been set, wake
+ * whoever is sleeping on that bit.
+ */
+static inline void fscache_invalidation_complete(struct fscache_cookie *cookie)
+{
+        /* Nothing to announce unless an invalidation was outstanding. */
+        if (!test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+                return;
+        wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
+}
+/*
 * process events that have been sent to an object's state machine
 * - initiates parent lookup
 * - does object lookup
@@ -90,6 +103,7 @@ static void fscache_object_state_machine(struct fscache_object *object)
 {
        enum fscache_object_state new_state;
        struct fscache_cookie *cookie;
+        int event;
        ASSERT(object != NULL);
@@ -101,7 +115,8 @@ static void fscache_object_state_machine(struct fscache_object *object)
                /* wait for the parent object to become ready */
        case FSCACHE_OBJECT_INIT:
                object->event_mask =
-                        ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
+                        FSCACHE_OBJECT_EVENTS_MASK &
+                        ~(1 << FSCACHE_OBJECT_EV_CLEARED);
                fscache_initialise_object(object);
                goto done;
@@ -125,6 +140,16 @@ static void fscache_object_state_machine(struct fscache_object *object)
        case FSCACHE_OBJECT_ACTIVE:
                goto active_transit;
+                /* Invalidate an object on disk */
+        case FSCACHE_OBJECT_INVALIDATING:
+                clear_bit(FSCACHE_OBJECT_EV_INVALIDATE, &object->events);
+                fscache_stat(&fscache_n_invalidates_run);
+                fscache_stat(&fscache_n_cop_invalidate_object);
+                fscache_invalidate_object(object);
+                fscache_stat_d(&fscache_n_cop_invalidate_object);
+                fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
+                goto active_transit;
                /* update the object metadata on disk */
        case FSCACHE_OBJECT_UPDATING:
                clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
@@ -251,13 +276,17 @@ static void fscache_object_state_machine(struct fscache_object *object)
        /* determine the transition from a lookup state */
 lookup_transit:
-        switch (fls(object->events & object->event_mask) - 1) {
+        event = fls(object->events & object->event_mask) - 1;
+        switch (event) {
        case FSCACHE_OBJECT_EV_WITHDRAW:
        case FSCACHE_OBJECT_EV_RETIRE:
        case FSCACHE_OBJECT_EV_RELEASE:
        case FSCACHE_OBJECT_EV_ERROR:
                new_state = FSCACHE_OBJECT_LC_DYING;
                goto change_state;
+        case FSCACHE_OBJECT_EV_INVALIDATE:
+                new_state = FSCACHE_OBJECT_INVALIDATING;
+                goto change_state;
        case FSCACHE_OBJECT_EV_REQUEUE:
                goto done;
        case -1:
@@ -268,13 +297,17 @@ lookup_transit:
        /* determine the transition from an active state */
 active_transit:
-        switch (fls(object->events & object->event_mask) - 1) {
+        event = fls(object->events & object->event_mask) - 1;
+        switch (event) {
        case FSCACHE_OBJECT_EV_WITHDRAW:
        case FSCACHE_OBJECT_EV_RETIRE:
        case FSCACHE_OBJECT_EV_RELEASE:
        case FSCACHE_OBJECT_EV_ERROR:
                new_state = FSCACHE_OBJECT_DYING;
                goto change_state;
+        case FSCACHE_OBJECT_EV_INVALIDATE:
+                new_state = FSCACHE_OBJECT_INVALIDATING;
+                goto change_state;
        case FSCACHE_OBJECT_EV_UPDATE:
                new_state = FSCACHE_OBJECT_UPDATING;
                goto change_state;
@@ -287,7 +320,8 @@ active_transit:
        /* determine the transition from a terminal state */
 terminal_transit:
-        switch (fls(object->events & object->event_mask) - 1) {
+        event = fls(object->events & object->event_mask) - 1;
+        switch (event) {
        case FSCACHE_OBJECT_EV_WITHDRAW:
                new_state = FSCACHE_OBJECT_WITHDRAWING;
                goto change_state;
@@ -320,8 +354,8 @@ done:
 unsupported_event:
        printk(KERN_ERR "FS-Cache:"
-               " Unsupported event %lx [mask %lx] in state %s\n",
+               " Unsupported event %d [%lx/%lx] in state %s\n",
-               object->events, object->event_mask,
+               event, object->events, object->event_mask,
               fscache_object_states[object->state]);
        BUG();
 }
@@ -587,8 +621,6 @@ static void fscache_object_available(struct fscache_object *object)
        if (object->n_in_progress == 0) {
                if (object->n_ops > 0) {
                        ASSERTCMP(object->n_ops, >=, object->n_obj_ops);
-                        ASSERTIF(object->n_ops > object->n_obj_ops,
-                                 !list_empty(&object->pending_ops));
                        fscache_start_operations(object);
                } else {
                        ASSERT(list_empty(&object->pending_ops));
@@ -681,6 +713,7 @@ static void fscache_withdraw_object(struct fscache_object *object)
                if (object->cookie == cookie) {
                        hlist_del_init(&object->cookie_link);
                        object->cookie = NULL;
+                        fscache_invalidation_complete(cookie);
                        detached = true;
                }
                spin_unlock(&cookie->lock);
@@ -890,3 +923,55 @@ enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
        return result;
 }
 EXPORT_SYMBOL(fscache_check_aux);
+/*
+ * Asynchronously invalidate an object.
+ *
+ * fscache_invalidate_writes() and fscache_cancel_all_ops() are called first,
+ * then an exclusive, asynchronous operation whose processor is the cache
+ * backend's invalidate_object method is allocated and submitted.  Once it has
+ * been submitted, FSCACHE_COOKIE_NO_DATA_YET is set on the cookie and
+ * fscache_invalidation_complete() wakes anyone waiting on the invalidation.
+ *
+ * If the operation cannot be allocated or submitted,
+ * FSCACHE_OBJECT_EV_ERROR is raised on the object instead.
+ * NOTE(review): those error paths return without clearing
+ * FSCACHE_COOKIE_INVALIDATING -- confirm it is cleared elsewhere.
+ */
+static void fscache_invalidate_object(struct fscache_object *object)
+{
+        struct fscache_operation *op;
+        struct fscache_cookie *cookie = object->cookie;
+        _enter("{OBJ%x}", object->debug_id);
+        /* Reject any new read/write ops and abort any that are pending. */
+        fscache_invalidate_writes(cookie);
+        clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
+        fscache_cancel_all_ops(object);
+        /* Now we have to wait for in-progress reads and writes.  The op is
+         * marked exclusive, and fscache_submit_exclusive_op() parks it on the
+         * object's pending list while other ops are still in progress.
+         */
+        op = kzalloc(sizeof(*op), GFP_KERNEL);
+        if (!op) {
+                fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
+                _leave(" [ENOMEM]");
+                return;
+        }
+        fscache_operation_init(op, object->cache->ops->invalidate_object, NULL);
+        op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
+        spin_lock(&cookie->lock);
+        if (fscache_submit_exclusive_op(object, op) < 0)
+                goto submit_op_failed;
+        spin_unlock(&cookie->lock);
+        fscache_put_operation(op);
+        /* Once we've completed the invalidation, we know there will be no data
+         * stored in the cache and thus we can reinstate the data-check-skip
+         * optimisation.
+         */
+        set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+        /* We can allow read and write requests to come in once again.  They'll
+         * queue up behind our exclusive invalidation operation.
+         */
+        fscache_invalidation_complete(cookie);
+        _leave("");
+        return;
+submit_op_failed:
+        spin_unlock(&cookie->lock);
+        kfree(op);      /* freed directly; the submit was rejected */
+        fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
+        _leave(" [EIO]");
+}
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 30afdfa7aec7..762a9ec4ffa4 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -37,6 +37,7 @@ void fscache_enqueue_operation(struct fscache_operation *op)
        ASSERT(op->processor != NULL);
        ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
        ASSERTCMP(atomic_read(&op->usage), >, 0);
+        ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
        fscache_stat(&fscache_n_op_enqueue);
        switch (op->flags & FSCACHE_OP_TYPE) {
@@ -64,6 +65,9 @@ EXPORT_SYMBOL(fscache_enqueue_operation);
 static void fscache_run_op(struct fscache_object *object,
                           struct fscache_operation *op)
 {
+        ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
+        op->state = FSCACHE_OP_ST_IN_PROGRESS;
        object->n_in_progress++;
        if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
                wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
@@ -84,18 +88,21 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
        _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
+        ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED);
+        ASSERTCMP(atomic_read(&op->usage), >, 0);
        spin_lock(&object->lock);
        ASSERTCMP(object->n_ops, >=, object->n_in_progress);
        ASSERTCMP(object->n_ops, >=, object->n_exclusive);
        ASSERT(list_empty(&op->pend_link));
-        ret = -ENOBUFS;
+        op->state = FSCACHE_OP_ST_PENDING;
        if (fscache_object_is_active(object)) {
                op->object = object;
                object->n_ops++;
                object->n_exclusive++;  /* reads and writes must wait */
-                if (object->n_ops > 1) {
+                if (object->n_in_progress > 0) {
                        atomic_inc(&op->usage);
                        list_add_tail(&op->pend_link, &object->pending_ops);
                        fscache_stat(&fscache_n_op_pend);
@@ -121,8 +128,11 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
                fscache_stat(&fscache_n_op_pend);
                ret = 0;
        } else {
-                /* not allowed to submit ops in any other state */
+                /* If we're in any other state, there must have been an I/O
-                BUG();
+                 * error of some nature.
+                 */
+                ASSERT(test_bit(FSCACHE_IOERROR, &object->cache->flags));
+                ret = -EIO;
        }
        spin_unlock(&object->lock);
@@ -186,6 +196,7 @@ int fscache_submit_op(struct fscache_object *object,
        _enter("{OBJ%x OP%x},{%u}",
               object->debug_id, op->debug_id, atomic_read(&op->usage));
+        ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED);
        ASSERTCMP(atomic_read(&op->usage), >, 0);
        spin_lock(&object->lock);
@@ -196,6 +207,7 @@ int fscache_submit_op(struct fscache_object *object,
        ostate = object->state;
        smp_rmb();
+        op->state = FSCACHE_OP_ST_PENDING;
        if (fscache_object_is_active(object)) {
                op->object = object;
                object->n_ops++;
@@ -225,12 +237,15 @@ int fscache_submit_op(struct fscache_object *object,
                   object->state == FSCACHE_OBJECT_LC_DYING ||
                   object->state == FSCACHE_OBJECT_WITHDRAWING) {
                fscache_stat(&fscache_n_op_rejected);
+                op->state = FSCACHE_OP_ST_CANCELLED;
                ret = -ENOBUFS;
        } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
                fscache_report_unexpected_submission(object, op, ostate);
                ASSERT(!fscache_object_is_active(object));
+                op->state = FSCACHE_OP_ST_CANCELLED;
                ret = -ENOBUFS;
        } else {
+                op->state = FSCACHE_OP_ST_CANCELLED;
                ret = -ENOBUFS;
        }
@@ -283,20 +298,28 @@ void fscache_start_operations(struct fscache_object *object)
 /*
 * cancel an operation that's pending on an object
 */
-int fscache_cancel_op(struct fscache_operation *op)
+int fscache_cancel_op(struct fscache_operation *op,
+                      void (*do_cancel)(struct fscache_operation *))
 {
        struct fscache_object *object = op->object;
        int ret;
        _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id);
+        ASSERTCMP(op->state, >=, FSCACHE_OP_ST_PENDING);
+        ASSERTCMP(op->state, !=, FSCACHE_OP_ST_CANCELLED);
+        ASSERTCMP(atomic_read(&op->usage), >, 0);
        spin_lock(&object->lock);
        ret = -EBUSY;
-        if (!list_empty(&op->pend_link)) {
+        if (op->state == FSCACHE_OP_ST_PENDING) {
+                ASSERT(!list_empty(&op->pend_link));
                fscache_stat(&fscache_n_op_cancelled);
                list_del_init(&op->pend_link);
-                object->n_ops--;
+                if (do_cancel)
+                        do_cancel(op);
+                op->state = FSCACHE_OP_ST_CANCELLED;
                if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
                        object->n_exclusive--;
                if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
@@ -311,6 +334,70 @@ int fscache_cancel_op(struct fscache_operation *op)
 }
 /*
+ * Cancel all pending operations on an object
+ *
+ * With object->lock held, each op is taken off the head of
+ * object->pending_ops, asserted to be in FSCACHE_OP_ST_PENDING, marked
+ * FSCACHE_OP_ST_CANCELLED and released with fscache_put_operation().
+ * n_exclusive is decremented for exclusive ops and anyone sleeping on
+ * FSCACHE_OP_WAITING is woken.  cond_resched_lock() is called after each op,
+ * so the lock may be dropped and retaken between ops.
+ */
+void fscache_cancel_all_ops(struct fscache_object *object)
+{
+        struct fscache_operation *op;
+        _enter("OBJ%x", object->debug_id);
+        spin_lock(&object->lock);
+        while (!list_empty(&object->pending_ops)) {
+                op = list_entry(object->pending_ops.next,
+                                struct fscache_operation, pend_link);
+                fscache_stat(&fscache_n_op_cancelled);
+                list_del_init(&op->pend_link);
+                ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
+                op->state = FSCACHE_OP_ST_CANCELLED;
+                if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
+                        object->n_exclusive--;
+                if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
+                        wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
+                fscache_put_operation(op);
+                cond_resched_lock(&object->lock);
+        }
+        spin_unlock(&object->lock);
+        _leave("");
+}
+/*
+ * Record the completion or cancellation of an in-progress operation.
+ *
+ * @op must be in FSCACHE_OP_ST_IN_PROGRESS (asserted).  Its state becomes
+ * FSCACHE_OP_ST_CANCELLED if @cancelled is true and FSCACHE_OP_ST_COMPLETE
+ * otherwise.  The object's n_in_progress count (and n_exclusive, for an
+ * exclusive op) is dropped under object->lock, and when no operation remains
+ * in progress fscache_start_operations() is called.
+ *
+ * NOTE(review): the assertions read the object's counters before
+ * object->lock is taken -- confirm that callers make this safe.
+ */
+void fscache_op_complete(struct fscache_operation *op, bool cancelled)
+{
+        struct fscache_object *object = op->object;
+        _enter("OBJ%x", object->debug_id);
+        ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
+        ASSERTCMP(object->n_in_progress, >, 0);
+        ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags),
+                    object->n_exclusive, >, 0);
+        ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags),
+                    object->n_in_progress, ==, 1);
+        spin_lock(&object->lock);
+        op->state = cancelled ?
+                FSCACHE_OP_ST_CANCELLED : FSCACHE_OP_ST_COMPLETE;
+        if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
+                object->n_exclusive--;
+        object->n_in_progress--;
+        if (object->n_in_progress == 0)
+                fscache_start_operations(object);
+        spin_unlock(&object->lock);
+        _leave("");
+}
+EXPORT_SYMBOL(fscache_op_complete);
+/*
 * release an operation
 * - queues pending ops if this is the last in-progress op
 */
@@ -328,8 +415,9 @@ void fscache_put_operation(struct fscache_operation *op)
                return;
        _debug("PUT OP");
-        if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
+        ASSERTIFCMP(op->state != FSCACHE_OP_ST_COMPLETE,
-                BUG();
+                    op->state, ==, FSCACHE_OP_ST_CANCELLED);
+        op->state = FSCACHE_OP_ST_DEAD;
        fscache_stat(&fscache_n_op_release);
@@ -340,8 +428,14 @@ void fscache_put_operation(struct fscache_operation *op)
        object = op->object;
-        if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
+        if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) {
-                atomic_dec(&object->n_reads);
+                if (atomic_dec_and_test(&object->n_reads)) {
+                        clear_bit(FSCACHE_COOKIE_WAITING_ON_READS,
+                                  &object->cookie->flags);
+                        wake_up_bit(&object->cookie->flags,
+                                    FSCACHE_COOKIE_WAITING_ON_READS);
+                }
+        }
        /* now... we may get called with the object spinlock held, so we
         * complete the cleanup here only if we can immediately acquire the
@@ -359,16 +453,6 @@ void fscache_put_operation(struct fscache_operation *op)
                return;
        }
-        if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
-                ASSERTCMP(object->n_exclusive, >, 0);
-                object->n_exclusive--;
-        }
-        ASSERTCMP(object->n_in_progress, >, 0);
-        object->n_in_progress--;
-        if (object->n_in_progress == 0)
-                fscache_start_operations(object);
        ASSERTCMP(object->n_ops, >, 0);
        object->n_ops--;
        if (object->n_ops == 0)
@@ -407,23 +491,14 @@ void fscache_operation_gc(struct work_struct *work)
                spin_unlock(&cache->op_gc_list_lock);
                object = op->object;
+                spin_lock(&object->lock);
                _debug("GC DEFERRED REL OBJ%x OP%x",
                       object->debug_id, op->debug_id);
                fscache_stat(&fscache_n_op_gc);
                ASSERTCMP(atomic_read(&op->usage), ==, 0);
+                ASSERTCMP(op->state, ==, FSCACHE_OP_ST_DEAD);
-                spin_lock(&object->lock);
-                if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
-                        ASSERTCMP(object->n_exclusive, >, 0);
-                        object->n_exclusive--;
-                }
-                ASSERTCMP(object->n_in_progress, >, 0);
-                object->n_in_progress--;
-                if (object->n_in_progress == 0)
-                        fscache_start_operations(object);
                ASSERTCMP(object->n_ops, >, 0);
                object->n_ops--;
@@ -431,6 +506,7 @@ void fscache_operation_gc(struct work_struct *work)
                        fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
                spin_unlock(&object->lock);
+                kfree(op);
        } while (count++ < 20);
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 3f7a59bfa7ad..ff000e52072d 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -56,6 +56,7 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
        _enter("%p,%p,%x", cookie, page, gfp);
+try_again:
        rcu_read_lock();
        val = radix_tree_lookup(&cookie->stores, page->index);
        if (!val) {
@@ -104,11 +105,19 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
        return true;
 page_busy:
-        /* we might want to wait here, but that could deadlock the allocator as
+        /* We will wait here if we're allowed to, but that could deadlock the
-         * the work threads writing to the cache may all end up sleeping
+         * allocator as the work threads writing to the cache may all end up
-         * on memory allocation */
+         * sleeping on memory allocation, so we may need to impose a timeout
-        fscache_stat(&fscache_n_store_vmscan_busy);
+         * too. */
-        return false;
+        if (!(gfp & __GFP_WAIT)) {
+                fscache_stat(&fscache_n_store_vmscan_busy);
+                return false;
+        }
+        fscache_stat(&fscache_n_store_vmscan_wait);
+        __fscache_wait_on_page_write(cookie, page);
+        gfp &= ~__GFP_WAIT;
+        goto try_again;
 }
 EXPORT_SYMBOL(__fscache_maybe_release_page);
@@ -162,6 +171,7 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
                        fscache_abort_object(object);
        }
+        fscache_op_complete(op, true);
        _leave("");
 }
@@ -223,6 +233,8 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
        _enter("{OP%x}", op->op.debug_id);
+        ASSERTCMP(op->n_pages, ==, 0);
        fscache_hist(fscache_retrieval_histogram, op->start_time);
        if (op->context)
                fscache_put_context(op->op.object->cookie, op->context);
@@ -291,6 +303,17 @@ static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
 }
 /*
+ * Handle cancellation of a pending retrieval op
+ */
+static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
+{
+        struct fscache_retrieval *op =
+                container_of(_op, struct fscache_retrieval, op);
+        op->n_pages = 0;
+}
+/*
 * wait for an object to become active (or dead)
 */
 static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
@@ -307,8 +330,8 @@ static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
        fscache_stat(stat_op_waits);
        if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
                        fscache_wait_bit_interruptible,
-                        TASK_INTERRUPTIBLE) < 0) {
+                        TASK_INTERRUPTIBLE) != 0) {
-                ret = fscache_cancel_op(&op->op);
+                ret = fscache_cancel_op(&op->op, fscache_do_cancel_retrieval);
                if (ret == 0)
                        return -ERESTARTSYS;
@@ -320,7 +343,14 @@ static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
        _debug("<<< GO");
 check_if_dead:
+        if (op->op.state == FSCACHE_OP_ST_CANCELLED) {
+                fscache_stat(stat_object_dead);
+                _leave(" = -ENOBUFS [cancelled]");
+                return -ENOBUFS;
+        }
        if (unlikely(fscache_object_is_dead(object))) {
+                pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->op.state);
+                fscache_cancel_op(&op->op, fscache_do_cancel_retrieval);
                fscache_stat(stat_object_dead);
                return -ENOBUFS;
        }
@@ -353,6 +383,11 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
        if (hlist_empty(&cookie->backing_objects))
                goto nobufs;
+        if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
+                _leave(" = -ENOBUFS [invalidating]");
+                return -ENOBUFS;
+        }
        ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
        ASSERTCMP(page, !=, NULL);
@@ -364,6 +399,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
                _leave(" = -ENOMEM");
                return -ENOMEM;
        }
+        op->n_pages = 1;
        spin_lock(&cookie->lock);
@@ -375,10 +411,10 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
        ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
        atomic_inc(&object->n_reads);
-        set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
+        __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
        if (fscache_submit_op(object, &op->op) < 0)
-                goto nobufs_unlock;
+                goto nobufs_unlock_dec;
        spin_unlock(&cookie->lock);
        fscache_stat(&fscache_n_retrieval_ops);
@@ -425,6 +461,8 @@ error:
        _leave(" = %d", ret);
        return ret;
+nobufs_unlock_dec:
+        atomic_dec(&object->n_reads);
 nobufs_unlock:
        spin_unlock(&cookie->lock);
        kfree(op);
@@ -472,6 +510,11 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
        if (hlist_empty(&cookie->backing_objects))
                goto nobufs;
+        if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
+                _leave(" = -ENOBUFS [invalidating]");
+                return -ENOBUFS;
+        }
        ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
        ASSERTCMP(*nr_pages, >, 0);
        ASSERT(!list_empty(pages));
@@ -482,6 +525,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
        op = fscache_alloc_retrieval(mapping, end_io_func, context);
        if (!op)
                return -ENOMEM;
+        op->n_pages = *nr_pages;
        spin_lock(&cookie->lock);
@@ -491,10 +535,10 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
                             struct fscache_object, cookie_link);
        atomic_inc(&object->n_reads);
-        set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
+        __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
        if (fscache_submit_op(object, &op->op) < 0)
-                goto nobufs_unlock;
+                goto nobufs_unlock_dec;
        spin_unlock(&cookie->lock);
        fscache_stat(&fscache_n_retrieval_ops);
@@ -541,6 +585,8 @@ error:
        _leave(" = %d", ret);
        return ret;
+nobufs_unlock_dec:
+        atomic_dec(&object->n_reads);
 nobufs_unlock:
        spin_unlock(&cookie->lock);
        kfree(op);
@@ -577,12 +623,18 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
        ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
        ASSERTCMP(page, !=, NULL);
+        if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
+                _leave(" = -ENOBUFS [invalidating]");
+                return -ENOBUFS;
+        }
        if (fscache_wait_for_deferred_lookup(cookie) < 0)
                return -ERESTARTSYS;
        op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
        if (!op)
                return -ENOMEM;
+        op->n_pages = 1;
        spin_lock(&cookie->lock);
@@ -658,9 +710,27 @@ static void fscache_write_op(struct fscache_operation *_op)
        spin_lock(&object->lock);
        cookie = object->cookie;
-        if (!fscache_object_is_active(object) || !cookie) {
+        if (!fscache_object_is_active(object)) {
+                /* If we get here, then the on-disk cache object likely no
+                 * longer exists, so we should just cancel this write operation.
+                 */
+                spin_unlock(&object->lock);
+                fscache_op_complete(&op->op, false);
+                _leave(" [inactive]");
+                return;
+        }
+        if (!cookie) {
+                /* If we get here, then the cookie belonging to the object was
+                 * detached, probably by the cookie being withdrawn due to
+                 * memory pressure, which means that the pages we might write
+                 * to the cache from no longer exist - therefore, we can just
+                 * cancel this write operation.
+                 */
                spin_unlock(&object->lock);
-                _leave("");
+                fscache_op_complete(&op->op, false);
+                _leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}",
+                       _op->flags, _op->state, object->state, object->flags);
                return;
        }
@@ -696,6 +766,7 @@ static void fscache_write_op(struct fscache_operation *_op)
        fscache_end_page_write(object, page);
        if (ret < 0) {
                fscache_abort_object(object);
+                fscache_op_complete(&op->op, true);
        } else {
                fscache_enqueue_operation(&op->op);
        }
@@ -710,6 +781,38 @@ superseded:
        spin_unlock(&cookie->stores_lock);
        clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
        spin_unlock(&object->lock);
+        fscache_op_complete(&op->op, true);
+        _leave("");
+}
+/*
+ * Clear the pages pending writing for invalidation
+ */
+void fscache_invalidate_writes(struct fscache_cookie *cookie)
+{
+        struct page *page;
+        void *results[16];
+        int n, i;
+        _enter("");
+        while (spin_lock(&cookie->stores_lock),
+               n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
+                                              ARRAY_SIZE(results),
+                                              FSCACHE_COOKIE_PENDING_TAG),
+               n > 0) {
+                for (i = n - 1; i >= 0; i--) {
+                        page = results[i];
+                        radix_tree_delete(&cookie->stores, page->index);
+                }
+                spin_unlock(&cookie->stores_lock);
+                for (i = n - 1; i >= 0; i--)
+                        page_cache_release(results[i]);
+        }
+        spin_unlock(&cookie->stores_lock);
        _leave("");
 }
@@ -759,7 +862,12 @@ int __fscache_write_page(struct fscache_cookie *cookie,
        fscache_stat(&fscache_n_stores);
-        op = kzalloc(sizeof(*op), GFP_NOIO);
+        if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
+                _leave(" = -ENOBUFS [invalidating]");
+                return -ENOBUFS;
+        }
+        op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY);
        if (!op)
                goto nomem;
@@ -915,6 +1023,40 @@ done:
 EXPORT_SYMBOL(__fscache_uncache_page);
 /**
+ * fscache_mark_page_cached - Mark a page as being cached
+ * @op: The retrieval op pages are being marked for
+ * @page: The page to be marked
+ *
+ * Mark a netfs page as being cached.  After this is called, the netfs
+ * must call fscache_uncache_page() to remove the mark.
+ */
+void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page)
+{
+        struct fscache_cookie *cookie = op->op.object->cookie;
+#ifdef CONFIG_FSCACHE_STATS
+        atomic_inc(&fscache_n_marks);
+#endif
+        _debug("- mark %p{%lx}", page, page->index);
+        if (TestSetPageFsCache(page)) {
+                static bool once_only;
+                if (!once_only) {
+                        once_only = true;
+                        printk(KERN_WARNING "FS-Cache:"
+                               " Cookie type %s marked page %lx"
+                               " multiple times\n",
+                               cookie->def->name, page->index);
+                }
+        }
+        if (cookie->def->mark_page_cached)
+                cookie->def->mark_page_cached(cookie->netfs_data,
+                                              op->mapping, page);
+}
+EXPORT_SYMBOL(fscache_mark_page_cached);
+/**
 * fscache_mark_pages_cached - Mark pages as being cached
 * @op: The retrieval op pages are being marked for
 * @pagevec: The pages to be marked
@@ -925,32 +1067,11 @@ EXPORT_SYMBOL(__fscache_uncache_page);
 void fscache_mark_pages_cached(struct fscache_retrieval *op,
                               struct pagevec *pagevec)
 {
-        struct fscache_cookie *cookie = op->op.object->cookie;
        unsigned long loop;
-#ifdef CONFIG_FSCACHE_STATS
+        for (loop = 0; loop < pagevec->nr; loop++)
-        atomic_add(pagevec->nr, &fscache_n_marks);
+                fscache_mark_page_cached(op, pagevec->pages[loop]);
-#endif
-        for (loop = 0; loop < pagevec->nr; loop++) {
-                struct page *page = pagevec->pages[loop];
-                _debug("- mark %p{%lx}", page, page->index);
-                if (TestSetPageFsCache(page)) {
-                        static bool once_only;
-                        if (!once_only) {
-                                once_only = true;
-                                printk(KERN_WARNING "FS-Cache:"
-                                       " Cookie type %s marked page %lx"
-                                       " multiple times\n",
-                                       cookie->def->name, page->index);
-                        }
-                }
-        }
-        if (cookie->def->mark_pages_cached)
-                cookie->def->mark_pages_cached(cookie->netfs_data,
-                                               op->mapping, pagevec);
        pagevec_reinit(pagevec);
 }
 EXPORT_SYMBOL(fscache_mark_pages_cached);
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 4765190d537f..8179e8bc4a3d 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -69,6 +69,7 @@ atomic_t fscache_n_store_vmscan_not_storing;
 atomic_t fscache_n_store_vmscan_gone;
 atomic_t fscache_n_store_vmscan_busy;
 atomic_t fscache_n_store_vmscan_cancelled;
+atomic_t fscache_n_store_vmscan_wait;
 atomic_t fscache_n_marks;
 atomic_t fscache_n_uncaches;
@@ -80,6 +81,9 @@ atomic_t fscache_n_acquires_ok;
 atomic_t fscache_n_acquires_nobufs;
 atomic_t fscache_n_acquires_oom;
+atomic_t fscache_n_invalidates;
+atomic_t fscache_n_invalidates_run;
 atomic_t fscache_n_updates;
 atomic_t fscache_n_updates_null;
 atomic_t fscache_n_updates_run;
@@ -112,6 +116,7 @@ atomic_t fscache_n_cop_alloc_object;
 atomic_t fscache_n_cop_lookup_object;
 atomic_t fscache_n_cop_lookup_complete;
 atomic_t fscache_n_cop_grab_object;
+atomic_t fscache_n_cop_invalidate_object;
 atomic_t fscache_n_cop_update_object;
 atomic_t fscache_n_cop_drop_object;
 atomic_t fscache_n_cop_put_object;
@@ -168,6 +173,10 @@ static int fscache_stats_show(struct seq_file *m, void *v)
                   atomic_read(&fscache_n_object_created),
                   atomic_read(&fscache_n_object_lookups_timed_out));
+        seq_printf(m, "Invals : n=%u run=%u\n",
+                   atomic_read(&fscache_n_invalidates),
+                   atomic_read(&fscache_n_invalidates_run));
        seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
                   atomic_read(&fscache_n_updates),
                   atomic_read(&fscache_n_updates_null),
@@ -224,11 +233,12 @@ static int fscache_stats_show(struct seq_file *m, void *v)
                   atomic_read(&fscache_n_store_radix_deletes),
                   atomic_read(&fscache_n_store_pages_over_limit));
-        seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u\n",
+        seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u wt=%u\n",
                   atomic_read(&fscache_n_store_vmscan_not_storing),
                   atomic_read(&fscache_n_store_vmscan_gone),
                   atomic_read(&fscache_n_store_vmscan_busy),
-                   atomic_read(&fscache_n_store_vmscan_cancelled));
+                   atomic_read(&fscache_n_store_vmscan_cancelled),
+                   atomic_read(&fscache_n_store_vmscan_wait));
        seq_printf(m, "Ops    : pend=%u run=%u enq=%u can=%u rej=%u\n",
                   atomic_read(&fscache_n_op_pend),
@@ -246,7 +256,8 @@ static int fscache_stats_show(struct seq_file *m, void *v)
                   atomic_read(&fscache_n_cop_lookup_object),
                   atomic_read(&fscache_n_cop_lookup_complete),
                   atomic_read(&fscache_n_cop_grab_object));
-        seq_printf(m, "CacheOp: upo=%d dro=%d pto=%d atc=%d syn=%d\n",
+        seq_printf(m, "CacheOp: inv=%d upo=%d dro=%d pto=%d atc=%d syn=%d\n",
+                   atomic_read(&fscache_n_cop_invalidate_object),
                   atomic_read(&fscache_n_cop_update_object),
                   atomic_read(&fscache_n_cop_drop_object),
                   atomic_read(&fscache_n_cop_put_object),
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 0b35903219bc..d47f11658c17 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -35,6 +35,16 @@ static int hfs_readpage(struct file *file, struct page *page)
        return block_read_full_page(page, hfs_get_block);
 }
+static void hfs_write_failed(struct address_space *mapping, loff_t to)
+{
+        struct inode *inode = mapping->host;
+        if (to > inode->i_size) {
+                truncate_pagecache(inode, to, inode->i_size);
+                hfs_file_truncate(inode);
+        }
+}
 static int hfs_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
@@ -45,11 +55,8 @@ static int hfs_write_begin(struct file *file, struct address_space *mapping,
        ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                                hfs_get_block,
                                &HFS_I(mapping->host)->phys_size);
-        if (unlikely(ret)) {
+        if (unlikely(ret))
-                loff_t isize = mapping->host->i_size;
+                hfs_write_failed(mapping, pos + len);
-                if (pos + len > isize)
-                        vmtruncate(mapping->host, isize);
-        }
        return ret;
 }
@@ -120,6 +127,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
                const struct iovec *iov, loff_t offset, unsigned long nr_segs)
 {
        struct file *file = iocb->ki_filp;
+        struct address_space *mapping = file->f_mapping;
        struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
        ssize_t ret;
@@ -135,7 +143,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
                loff_t end = offset + iov_length(iov, nr_segs);
                if (end > isize)
-                        vmtruncate(inode, isize);
+                        hfs_write_failed(mapping, end);
        }
        return ret;
@@ -617,9 +625,12 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
            attr->ia_size != i_size_read(inode)) {
                inode_dio_wait(inode);
-                error = vmtruncate(inode, attr->ia_size);
+                error = inode_newsize_ok(inode, attr->ia_size);
                if (error)
                        return error;
+                truncate_setsize(inode, attr->ia_size);
+                hfs_file_truncate(inode);
        }
        setattr_copy(inode, attr);
@@ -668,7 +679,6 @@ static const struct file_operations hfs_file_operations = {
 static const struct inode_operations hfs_file_inode_operations = {
        .lookup         = hfs_file_lookup,
-        .truncate       = hfs_file_truncate,
        .setattr        = hfs_inode_setattr,
        .setxattr       = hfs_setxattr,
        .getxattr       = hfs_getxattr,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 2172aa5976f5..799b336b59f9 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -28,6 +28,16 @@ static int hfsplus_writepage(struct page *page, struct writeback_control *wbc)
        return block_write_full_page(page, hfsplus_get_block, wbc);
 }
+static void hfsplus_write_failed(struct address_space *mapping, loff_t to)
+{
+        struct inode *inode = mapping->host;
+        if (to > inode->i_size) {
+                truncate_pagecache(inode, to, inode->i_size);
+                hfsplus_file_truncate(inode);
+        }
+}
 static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
@@ -38,11 +48,8 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
        ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                                hfsplus_get_block,
                                &HFSPLUS_I(mapping->host)->phys_size);
-        if (unlikely(ret)) {
+        if (unlikely(ret))
-                loff_t isize = mapping->host->i_size;
+                hfsplus_write_failed(mapping, pos + len);
-                if (pos + len > isize)
-                        vmtruncate(mapping->host, isize);
-        }
        return ret;
 }
@@ -116,6 +123,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
                const struct iovec *iov, loff_t offset, unsigned long nr_segs)
 {
        struct file *file = iocb->ki_filp;
+        struct address_space *mapping = file->f_mapping;
        struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
        ssize_t ret;
@@ -131,7 +139,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
                loff_t end = offset + iov_length(iov, nr_segs);
                if (end > isize)
-                        vmtruncate(inode, isize);
+                        hfsplus_write_failed(mapping, end);
        }
        return ret;
@@ -300,10 +308,8 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
        if ((attr->ia_valid & ATTR_SIZE) &&
            attr->ia_size != i_size_read(inode)) {
                inode_dio_wait(inode);
+                truncate_setsize(inode, attr->ia_size);
-                error = vmtruncate(inode, attr->ia_size);
+                hfsplus_file_truncate(inode);
-                if (error)
-                        return error;
        }
        setattr_copy(inode, attr);
@@ -358,7 +364,6 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
 static const struct inode_operations hfsplus_file_inode_operations = {
        .lookup         = hfsplus_file_lookup,
-        .truncate       = hfsplus_file_truncate,
        .setattr        = hfsplus_setattr,
        .setxattr       = hfsplus_setxattr,
        .getxattr       = hfsplus_getxattr,
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 89d2a5803ae3..fbfe2df5624b 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -50,7 +50,7 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
        return disk_secno;
 }
-static void hpfs_truncate(struct inode *i)
+void hpfs_truncate(struct inode *i)
 {
        if (IS_IMMUTABLE(i)) return /*-EPERM*/;
        hpfs_lock_assert(i->i_sb);
@@ -105,6 +105,16 @@ static int hpfs_readpage(struct file *file, struct page *page)
        return block_read_full_page(page,hpfs_get_block);
 }
+static void hpfs_write_failed(struct address_space *mapping, loff_t to)
+{
+        struct inode *inode = mapping->host;
+        if (to > inode->i_size) {
+                truncate_pagecache(inode, to, inode->i_size);
+                hpfs_truncate(inode);
+        }
+}
 static int hpfs_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
@@ -115,11 +125,8 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping,
        ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                                hpfs_get_block,
                                &hpfs_i(mapping->host)->mmu_private);
-        if (unlikely(ret)) {
+        if (unlikely(ret))
-                loff_t isize = mapping->host->i_size;
+                hpfs_write_failed(mapping, pos + len);
-                if (pos + len > isize)
-                        vmtruncate(mapping->host, isize);
-        }
        return ret;
 }
@@ -166,6 +173,5 @@ const struct file_operations hpfs_file_ops =
 const struct inode_operations hpfs_file_iops =
 {
-        .truncate       = hpfs_truncate,
        .setattr        = hpfs_setattr,
 };
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 7102aaecc244..b7ae286646b5 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -252,6 +252,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *,
 /* file.c */
 int hpfs_file_fsync(struct file *, loff_t, loff_t, int);
+void hpfs_truncate(struct inode *);
 extern const struct file_operations hpfs_file_ops;
 extern const struct inode_operations hpfs_file_iops;
 extern const struct address_space_operations hpfs_aops;
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 804a9a842cbc..5dc06c837105 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -277,9 +277,12 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
        if ((attr->ia_valid & ATTR_SIZE) &&
            attr->ia_size != i_size_read(inode)) {
-                error = vmtruncate(inode, attr->ia_size);
+                error = inode_newsize_ok(inode, attr->ia_size);
                if (error)
                        goto out_unlock;
+                truncate_setsize(inode, attr->ia_size);
+                hpfs_truncate(inode);
        }
        setattr_copy(inode, attr);
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 9d3afd157f99..dd7442c58358 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -119,9 +119,12 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
            iattr->ia_size != i_size_read(inode)) {
                inode_dio_wait(inode);
-                rc = vmtruncate(inode, iattr->ia_size);
+                rc = inode_newsize_ok(inode, iattr->ia_size);
                if (rc)
                        return rc;
+                truncate_setsize(inode, iattr->ia_size);
+                jfs_truncate(inode);
        }
        setattr_copy(inode, iattr);
@@ -133,7 +136,6 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
 }
 const struct inode_operations jfs_file_inode_operations = {
-        .truncate       = jfs_truncate,
        .setxattr       = jfs_setxattr,
        .getxattr       = jfs_getxattr,
        .listxattr      = jfs_listxattr,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 4692bf3ca8cb..b7dc47ba675e 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -300,6 +300,16 @@ static int jfs_readpages(struct file *file, struct address_space *mapping,
        return mpage_readpages(mapping, pages, nr_pages, jfs_get_block);
 }
+static void jfs_write_failed(struct address_space *mapping, loff_t to)
+{
+        struct inode *inode = mapping->host;
+        if (to > inode->i_size) {
+                truncate_pagecache(inode, to, inode->i_size);
+                jfs_truncate(inode);
+        }
+}
 static int jfs_write_begin(struct file *file, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
                                struct page **pagep, void **fsdata)
@@ -308,11 +318,8 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping,
        ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
                                jfs_get_block);
-        if (unlikely(ret)) {
+        if (unlikely(ret))
-                loff_t isize = mapping->host->i_size;
+                jfs_write_failed(mapping, pos + len);
-                if (pos + len > isize)
-                        vmtruncate(mapping->host, isize);
-        }
        return ret;
 }
@@ -326,6 +333,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
        const struct iovec *iov, loff_t offset, unsigned long nr_segs)
 {
        struct file *file = iocb->ki_filp;
+        struct address_space *mapping = file->f_mapping;
        struct inode *inode = file->f_mapping->host;
        ssize_t ret;
@@ -341,7 +349,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
                loff_t end = offset + iov_length(iov, nr_segs);
                if (end > isize)
-                        vmtruncate(inode, isize);
+                        jfs_write_failed(mapping, end);
        }
        return ret;
diff --git a/fs/libfs.c b/fs/libfs.c
index 35fc6e74cd88..916da8c4158b 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -369,8 +369,6 @@ int simple_setattr(struct dentry *dentry, struct iattr *iattr)
        struct inode *inode = dentry->d_inode;
        int error;
-        WARN_ON_ONCE(inode->i_op->truncate);
        error = inode_change_ok(inode, iattr);
        if (error)
                return error;
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index e1a3b6bf6324..9a59cbade2fb 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1887,9 +1887,15 @@ int logfs_truncate(struct inode *inode, u64 target)
                logfs_put_wblocks(sb, NULL, 1);
        }
-        if (!err)
+        if (!err) {
-                err = vmtruncate(inode, target);
+                err = inode_newsize_ok(inode, target);
+                if (err)
+                        goto out;
+                truncate_setsize(inode, target);
+        }
+ out:
        /* I don't trust error recovery yet. */
        WARN_ON(err);
        return err;
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 4493ce695ab8..adc6f5494231 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -34,9 +34,12 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr)
        if ((attr->ia_valid & ATTR_SIZE) &&
            attr->ia_size != i_size_read(inode)) {
-                error = vmtruncate(inode, attr->ia_size);
+                error = inode_newsize_ok(inode, attr->ia_size);
                if (error)
                        return error;
+                truncate_setsize(inode, attr->ia_size);
+                minix_truncate(inode);
        }
        setattr_copy(inode, attr);
@@ -45,7 +48,6 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr)
 }
 const struct inode_operations minix_file_inode_operations = {
-        .truncate       = minix_truncate,
        .setattr        = minix_setattr,
        .getattr        = minix_getattr,
 };
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 4fc5f8ab1c44..99541cceb584 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -395,6 +395,16 @@ int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len)
        return __block_write_begin(page, pos, len, minix_get_block);
 }
+static void minix_write_failed(struct address_space *mapping, loff_t to)
+{
+        struct inode *inode = mapping->host;
+        if (to > inode->i_size) {
+                truncate_pagecache(inode, to, inode->i_size);
+                minix_truncate(inode);
+        }
+}
 static int minix_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
@@ -403,11 +413,8 @@ static int minix_write_begin(struct file *file, struct address_space *mapping,
        ret = block_write_begin(mapping, pos, len, flags, pagep,
                                minix_get_block);
-        if (unlikely(ret)) {
+        if (unlikely(ret))
-                loff_t isize = mapping->host->i_size;
+                minix_write_failed(mapping, pos + len);
-                if (pos + len > isize)
-                        vmtruncate(mapping->host, isize);
-        }
        return ret;
 }
diff --git a/fs/namei.c b/fs/namei.c
index 5f4cdf3ad913..43a97ee1d4c8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1275,9 +1275,7 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
        *need_lookup = false;
        dentry = d_lookup(dir, name);
        if (dentry) {
-                if (d_need_lookup(dentry)) {
+                if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
-                        *need_lookup = true;
-                } else if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
                        error = d_revalidate(dentry, flags);
                        if (unlikely(error <= 0)) {
                                if (error < 0) {
@@ -1383,8 +1381,6 @@ static int lookup_fast(struct nameidata *nd, struct qstr *name,
                        return -ECHILD;
                nd->seq = seq;
-                if (unlikely(d_need_lookup(dentry)))
-                        goto unlazy;
                if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
                        status = d_revalidate(dentry, nd->flags);
                        if (unlikely(status <= 0)) {
@@ -1410,11 +1406,6 @@ unlazy:
        if (unlikely(!dentry))
                goto need_lookup;
-        if (unlikely(d_need_lookup(dentry))) {
-                dput(dentry);
-                goto need_lookup;
-        }
        if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
                status = d_revalidate(dentry, nd->flags);
        if (unlikely(status <= 0)) {
@@ -1859,7 +1850,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
        if (flags & LOOKUP_ROOT) {
                struct inode *inode = nd->root.dentry->d_inode;
                if (*name) {
-                        if (!inode->i_op->lookup)
+                        if (!can_lookup(inode))
                                return -ENOTDIR;
                        retval = inode_permission(inode, MAY_EXEC);
                        if (retval)
@@ -1903,6 +1894,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
                        get_fs_pwd(current->fs, &nd->path);
                }
        } else {
+                /* Caller must check execute permissions on the starting path component */
                struct fd f = fdget_raw(dfd);
                struct dentry *dentry;
@@ -1912,16 +1904,10 @@ static int path_init(int dfd, const char *name, unsigned int flags,
                dentry = f.file->f_path.dentry;
                if (*name) {
-                        if (!S_ISDIR(dentry->d_inode->i_mode)) {
+                        if (!can_lookup(dentry->d_inode)) {
                                fdput(f);
                                return -ENOTDIR;
                        }
-                        retval = inode_permission(dentry->d_inode, MAY_EXEC);
-                        if (retval) {
-                                fdput(f);
-                                return retval;
-                        }
                }
                nd->path = f.file->f_path;
@@ -2189,15 +2175,19 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
 *     path-walking is complete.
 */
 static struct filename *
-user_path_parent(int dfd, const char __user *path, struct nameidata *nd)
+user_path_parent(int dfd, const char __user *path, struct nameidata *nd,
+                 unsigned int flags)
 {
        struct filename *s = getname(path);
        int error;
+        /* only LOOKUP_REVAL is allowed in extra flags */
+        flags &= LOOKUP_REVAL;
        if (IS_ERR(s))
                return s;
-        error = filename_lookup(dfd, s, LOOKUP_PARENT, nd);
+        error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd);
        if (error) {
                putname(s);
                return ERR_PTR(error);
@@ -3044,12 +3034,22 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
        return file;
 }
-struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir)
+struct dentry *kern_path_create(int dfd, const char *pathname,
+                                struct path *path, unsigned int lookup_flags)
 {
        struct dentry *dentry = ERR_PTR(-EEXIST);
        struct nameidata nd;
        int err2;
-        int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
+        int error;
+        bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
+        /*
+         * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
+         * other flags passed in are ignored!
+         */
+        lookup_flags &= LOOKUP_REVAL;
+        error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd);
        if (error)
                return ERR_PTR(error);
@@ -3113,13 +3113,14 @@ void done_path_create(struct path *path, struct dentry *dentry)
 }
 EXPORT_SYMBOL(done_path_create);
-struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir)
+struct dentry *user_path_create(int dfd, const char __user *pathname,
+                                struct path *path, unsigned int lookup_flags)
 {
        struct filename *tmp = getname(pathname);
        struct dentry *res;
        if (IS_ERR(tmp))
                return ERR_CAST(tmp);
-        res = kern_path_create(dfd, tmp->name, path, is_dir);
+        res = kern_path_create(dfd, tmp->name, path, lookup_flags);
        putname(tmp);
        return res;
 }
@@ -3175,12 +3176,13 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
        struct dentry *dentry;
        struct path path;
        int error;
+        unsigned int lookup_flags = 0;
        error = may_mknod(mode);
        if (error)
                return error;
+retry:
-        dentry = user_path_create(dfd, filename, &path, 0);
+        dentry = user_path_create(dfd, filename, &path, lookup_flags);
        if (IS_ERR(dentry))
                return PTR_ERR(dentry);
@@ -3203,6 +3205,10 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
        }
 out:
        done_path_create(&path, dentry);
+        if (retry_estale(error, lookup_flags)) {
+                lookup_flags |= LOOKUP_REVAL;
+                goto retry;
+        }
        return error;
 }
@@ -3241,8 +3247,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
        struct dentry *dentry;
        struct path path;
        int error;
+        unsigned int lookup_flags = LOOKUP_DIRECTORY;
-        dentry = user_path_create(dfd, pathname, &path, 1);
+retry:
+        dentry = user_path_create(dfd, pathname, &path, lookup_flags);
        if (IS_ERR(dentry))
                return PTR_ERR(dentry);
@@ -3252,6 +3260,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
        if (!error)
                error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
        done_path_create(&path, dentry);
+        if (retry_estale(error, lookup_flags)) {
+                lookup_flags |= LOOKUP_REVAL;
+                goto retry;
+        }
        return error;
 }
@@ -3327,8 +3339,9 @@ static long do_rmdir(int dfd, const char __user *pathname)
        struct filename *name;
        struct dentry *dentry;
        struct nameidata nd;
+        unsigned int lookup_flags = 0;
-        name = user_path_parent(dfd, pathname, &nd);
+retry:
+        name = user_path_parent(dfd, pathname, &nd, lookup_flags);
        if (IS_ERR(name))
                return PTR_ERR(name);
@@ -3370,6 +3383,10 @@ exit2:
 exit1:
        path_put(&nd.path);
        putname(name);
+        if (retry_estale(error, lookup_flags)) {
+                lookup_flags |= LOOKUP_REVAL;
+                goto retry;
+        }
        return error;
 }
@@ -3423,8 +3440,9 @@ static long do_unlinkat(int dfd, const char __user *pathname)
        struct dentry *dentry;
        struct nameidata nd;
        struct inode *inode = NULL;
+        unsigned int lookup_flags = 0;
-        name = user_path_parent(dfd, pathname, &nd);
+retry:
+        name = user_path_parent(dfd, pathname, &nd, lookup_flags);
        if (IS_ERR(name))
                return PTR_ERR(name);
@@ -3462,6 +3480,11 @@ exit2:
 exit1:
        path_put(&nd.path);
        putname(name);
+        if (retry_estale(error, lookup_flags)) {
+                lookup_flags |= LOOKUP_REVAL;
+                inode = NULL;
+                goto retry;
+        }
        return error;
 slashes:
@@ -3513,12 +3536,13 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
        struct filename *from;
        struct dentry *dentry;
        struct path path;
+        unsigned int lookup_flags = 0;
        from = getname(oldname);
        if (IS_ERR(from))
                return PTR_ERR(from);
+retry:
-        dentry = user_path_create(newdfd, newname, &path, 0);
+        dentry = user_path_create(newdfd, newname, &path, lookup_flags);
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto out_putname;
@@ -3527,6 +3551,10 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
        if (!error)
                error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
        done_path_create(&path, dentry);
+        if (retry_estale(error, lookup_flags)) {
+                lookup_flags |= LOOKUP_REVAL;
+                goto retry;
+        }
 out_putname:
        putname(from);
        return error;
@@ -3613,12 +3641,13 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
        if (flags & AT_SYMLINK_FOLLOW)
                how |= LOOKUP_FOLLOW;
+retry:
        error = user_path_at(olddfd, oldname, how, &old_path);
        if (error)
                return error;
-        new_dentry = user_path_create(newdfd, newname, &new_path, 0);
+        new_dentry = user_path_create(newdfd, newname, &new_path,
+                                        (how & LOOKUP_REVAL));
        error = PTR_ERR(new_dentry);
        if (IS_ERR(new_dentry))
                goto out;
@@ -3635,6 +3664,15 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
        error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry);
 out_dput:
        done_path_create(&new_path, new_dentry);
+        if (retry_estale(error, how)) {
+                /*
+                 * The retry repeats user_path_at(), which fills old_path
+                 * again, so release the reference held now before looping.
+                 */
+                path_put(&old_path);
+                how |= LOOKUP_REVAL;
+                goto retry;
+        }
 out:
        path_put(&old_path);
@@ -3807,15 +3840,17 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
        struct nameidata oldnd, newnd;
        struct filename *from;
        struct filename *to;
+        unsigned int lookup_flags = 0;
+        bool should_retry = false;
        int error;
+retry:
-        from = user_path_parent(olddfd, oldname, &oldnd);
+        from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
        if (IS_ERR(from)) {
                error = PTR_ERR(from);
                goto exit;
        }
-        to = user_path_parent(newdfd, newname, &newnd);
+        to = user_path_parent(newdfd, newname, &newnd, lookup_flags);
        if (IS_ERR(to)) {
                error = PTR_ERR(to);
                goto exit1;
@@ -3887,11 +3922,18 @@ exit3:
        unlock_rename(new_dir, old_dir);
        mnt_drop_write(oldnd.path.mnt);
 exit2:
+        if (retry_estale(error, lookup_flags))
+                should_retry = true;
        path_put(&newnd.path);
        putname(to);
 exit1:
        path_put(&oldnd.path);
        putname(from);
+        if (should_retry) {
+                should_retry = false;
+                lookup_flags |= LOOKUP_REVAL;
+                goto retry;
+        }
 exit:
        return error;
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index 398a50ff2438..55605c552787 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -313,7 +313,7 @@ int __mnt_want_write(struct vfsmount *m)
         * incremented count after it has set MNT_WRITE_HOLD.
         */
        smp_mb();
-        while (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
+        while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
                cpu_relax();
        /*
         * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d7e9fe77188a..1acdad7fcec7 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -976,9 +976,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
                        goto out;
                if (attr->ia_size != i_size_read(inode)) {
-                        result = vmtruncate(inode, attr->ia_size);
+                        truncate_setsize(inode, attr->ia_size);
-                        if (result)
-                                goto out;
                        mark_inode_dirty(inode);
                }
        }
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index c817787fbdb4..24d1d1c5fcaf 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -307,6 +307,7 @@ void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
                nfs_fscache_inode_unlock(inode);
        }
 }
+EXPORT_SYMBOL_GPL(nfs_fscache_set_inode_cookie);
 /*
 * Replace a per-inode cookie due to revalidation detecting a file having
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index c5b11b53ff33..277b02782897 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -153,6 +153,22 @@ static inline void nfs_readpage_to_fscache(struct inode *inode,
 }
 /*
+ * Invalidate the contents of fscache for this inode.  This will not sleep.
+ */
+static inline void nfs_fscache_invalidate(struct inode *inode)
+{
+        fscache_invalidate(NFS_I(inode)->fscache);
+}
+/*
+ * Wait for an object to finish being invalidated.
+ */
+static inline void nfs_fscache_wait_on_invalidate(struct inode *inode)
+{
+        fscache_wait_on_invalidate(NFS_I(inode)->fscache);
+}
+/*
 * indicate the client caching state as readable text
 */
 static inline const char *nfs_server_fscache_state(struct nfs_server *server)
@@ -162,7 +178,6 @@ static inline const char *nfs_server_fscache_state(struct nfs_server *server)
        return "no ";
 }
 #else /* CONFIG_NFS_FSCACHE */
 static inline int nfs_fscache_register(void) { return 0; }
 static inline void nfs_fscache_unregister(void) {}
@@ -205,6 +220,11 @@ static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
 static inline void nfs_readpage_to_fscache(struct inode *inode,
                                            struct page *page, int sync) {}
+/* No-op stubs so callers still build when CONFIG_NFS_FSCACHE is disabled. */
+static inline void nfs_fscache_invalidate(struct inode *inode) {}
+static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) {}
 static inline const char *nfs_server_fscache_state(struct nfs_server *server)
 {
        return "no ";
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 2faae14d89f4..ebeb94ce1b0b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -161,10 +161,12 @@ static void nfs_zap_caches_locked(struct inode *inode)
        nfsi->attrtimeo_timestamp = jiffies;
        memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
-        if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
+        if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
                nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
-        else
+                nfs_fscache_invalidate(inode);
+        } else {
                nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
+        }
 }
 void nfs_zap_caches(struct inode *inode)
@@ -179,6 +181,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
        if (mapping->nrpages != 0) {
                spin_lock(&inode->i_lock);
                NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
+                nfs_fscache_invalidate(inode);
                spin_unlock(&inode->i_lock);
        }
 }
@@ -881,7 +884,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
                memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
        spin_unlock(&inode->i_lock);
        nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
-        nfs_fscache_reset_inode_cookie(inode);
+        nfs_fscache_wait_on_invalidate(inode);
        dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
                        inode->i_sb->s_id, (long long)NFS_FILEID(inode));
        return 0;
@@ -957,6 +960,10 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
                i_size_write(inode, nfs_size_to_loff_t(fattr->size));
                ret |= NFS_INO_INVALID_ATTR;
        }
+        if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+                nfs_fscache_invalidate(inode);
        return ret;
 }
@@ -1205,8 +1212,10 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr
        struct nfs_inode *nfsi = NFS_I(inode);
        nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
-        if (S_ISDIR(inode->i_mode))
+        if (S_ISDIR(inode->i_mode)) {
                nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+                nfs_fscache_invalidate(inode);
+        }
        if ((fattr->valid & NFS_ATTR_FATTR) == 0)
                return 0;
        return nfs_refresh_inode_locked(inode, fattr);
@@ -1494,6 +1503,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        (save_cache_validity & NFS_INO_REVAL_FORCED))
                nfsi->cache_validity |= invalid;
+        if (invalid & NFS_INO_INVALID_DATA)
+                nfs_fscache_invalidate(inode);
        return 0;
 out_err:
        /*
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index e7699308364a..08ddcccb8887 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -5,6 +5,7 @@
 */
 #include <linux/nfs_fs.h>
 #include "internal.h"
+#include "fscache.h"
 #include "pnfs.h"
 #define NFSDBG_FACILITY         NFSDBG_FILE
@@ -74,6 +75,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
        nfs_file_set_open_context(filp, ctx);
+        nfs_fscache_set_inode_cookie(inode, filp);
        err = 0;
 out_put_ctx:
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 493f0f41c554..5d864fb36578 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -64,7 +64,7 @@
 #include "pnfs.h"
 #include "netns.h"
 #include "nfs4session.h"
+#include "fscache.h"
 #define NFSDBG_FACILITY         NFSDBG_PROC
@@ -734,6 +734,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
        if (!cinfo->atomic || cinfo->before != dir->i_version)
                nfs_force_lookup_revalidate(dir);
        dir->i_version = cinfo->after;
+        nfs_fscache_invalidate(dir);
        spin_unlock(&dir->i_lock);
 }
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5209916e1222..b673be31590e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1794,7 +1794,8 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
        if (PagePrivate(page))
                return -EBUSY;
-        nfs_fscache_release_page(page, GFP_KERNEL);
+        if (!nfs_fscache_release_page(page, GFP_KERNEL))
+                return -EBUSY;
        return migrate_page(mapping, newpage, page, mode);
 }
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
index e6c38159622f..e761ee95617f 100644
--- a/fs/nfsd/fault_inject.c
+++ b/fs/nfsd/fault_inject.c
@@ -8,61 +8,144 @@
 #include <linux/fs.h>
 #include <linux/debugfs.h>
 #include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <linux/sunrpc/clnt.h>
+#include <asm/uaccess.h>
 #include "state.h"
-#include "fault_inject.h"
+#include "netns.h"
 struct nfsd_fault_inject_op {
        char *file;
-        void (*func)(u64);
+        u64 (*forget)(struct nfs4_client *, u64);
+        u64 (*print)(struct nfs4_client *, u64);
 };
 static struct nfsd_fault_inject_op inject_ops[] = {
        {
                .file   = "forget_clients",
-                .func   = nfsd_forget_clients,
+                .forget = nfsd_forget_client,
+                .print  = nfsd_print_client,
        },
        {
                .file   = "forget_locks",
-                .func   = nfsd_forget_locks,
+                .forget = nfsd_forget_client_locks,
+                .print  = nfsd_print_client_locks,
        },
        {
                .file   = "forget_openowners",
-                .func   = nfsd_forget_openowners,
+                .forget = nfsd_forget_client_openowners,
+                .print  = nfsd_print_client_openowners,
        },
        {
                .file   = "forget_delegations",
-                .func   = nfsd_forget_delegations,
+                .forget = nfsd_forget_client_delegations,
+                .print  = nfsd_print_client_delegations,
        },
        {
                .file   = "recall_delegations",
-                .func   = nfsd_recall_delegations,
+                .forget = nfsd_recall_client_delegations,
+                .print  = nfsd_print_client_delegations,
        },
 };
 static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op);
 static struct dentry *debug_dir;
-static int nfsd_inject_set(void *op_ptr, u64 val)
+static void nfsd_inject_set(struct nfsd_fault_inject_op *op, u64 val)
 {
-        struct nfsd_fault_inject_op *op = op_ptr;
+        u64 count = 0;
        if (val == 0)
                printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file);
        else
                printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val);
-        op->func(val);
+        nfs4_lock_state();
-        return 0;
+        count = nfsd_for_n_state(val, op->forget);
+        nfs4_unlock_state();
+        printk(KERN_INFO "NFSD: %s: found %llu", op->file, count);
 }
-static int nfsd_inject_get(void *data, u64 *val)
+static void nfsd_inject_set_client(struct nfsd_fault_inject_op *op,
+                                   struct sockaddr_storage *addr,
+                                   size_t addr_size)
 {
-        *val = 0;
+        char buf[INET6_ADDRSTRLEN];
-        return 0;
+        struct nfs4_client *clp;
+        u64 count;
+        nfs4_lock_state();
+        clp = nfsd_find_client(addr, addr_size);
+        if (clp) {
+                count = op->forget(clp, 0);
+                rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
+                printk(KERN_INFO "NFSD [%s]: Client %s had %llu state object(s)\n", op->file, buf, count);
+        }
+        nfs4_unlock_state();
+}
+static void nfsd_inject_get(struct nfsd_fault_inject_op *op, u64 *val)
+{
+        nfs4_lock_state();
+        *val = nfsd_for_n_state(0, op->print);
+        nfs4_unlock_state();
 }
-DEFINE_SIMPLE_ATTRIBUTE(fops_nfsd, nfsd_inject_get, nfsd_inject_set, "%llu\n");
+/*
+ * read() handler for the fault injection files: reports the value produced
+ * by nfsd_inject_get() for this file's op as a decimal number followed by a
+ * newline.  Returns the number of bytes copied, 0 once the offset is at or
+ * past the end of the text, -EINVAL for a negative offset, or -EFAULT when
+ * nothing could be copied to @buf.
+ */
+static ssize_t fault_inject_read(struct file *file, char __user *buf,
+                                 size_t len, loff_t *ppos)
+{
+        /*
+         * Only refreshed when reading from offset 0, so a read resuming at a
+         * later offset formats the same number.  NOTE(review): being static,
+         * this is shared by every file using these fops and is not
+         * serialized between concurrent readers - confirm that is acceptable.
+         */
+        static u64 val;
+        char read_buf[25];
+        size_t size;
+        if (!*ppos)
+                nfsd_inject_get(file->f_dentry->d_inode->i_private, &val);
+        size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val);
+        /* Offset checks, length clamping and the user copy are the helper's job. */
+        return simple_read_from_buffer(buf, len, ppos, read_buf, size);
+}
+/*
+ * write() handler for the fault injection files.  At most
+ * INET6_ADDRSTRLEN - 1 bytes of input are examined.  If rpc_pton() returns a
+ * non-zero length for the text, the op is applied to the client found for
+ * that address; otherwise the text is parsed as a number and handed to
+ * nfsd_inject_set().  Returns @len, or -EFAULT if the input cannot be copied.
+ */
+static ssize_t fault_inject_write(struct file *file, const char __user *buf,
+                                  size_t len, loff_t *ppos)
+{
+        struct nfsd_fault_inject_op *op = file->f_dentry->d_inode->i_private;
+        struct net *net = current->nsproxy->net_ns;
+        char write_buf[INET6_ADDRSTRLEN];
+        struct sockaddr_storage sa;
+        size_t size = min(sizeof(write_buf) - 1, len);
+        if (copy_from_user(write_buf, buf, size))
+                return -EFAULT;
+        write_buf[size] = '\0';
+        size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa));
+        if (size > 0) {
+                /* Input was accepted as an address: act on that one client. */
+                nfsd_inject_set_client(op, &sa, size);
+                return len;
+        }
+        /* Not an address: treat the input as a count. */
+        nfsd_inject_set(op, simple_strtoll(write_buf, NULL, 0));
+        return len; /* on success, claim we got the whole input */
+}
+static const struct file_operations fops_nfsd = {
+        .owner   = THIS_MODULE,
+        .read    = fault_inject_read,
+        .write   = fault_inject_write,
+};
 void nfsd_fault_inject_cleanup(void)
 {
diff --git a/fs/nfsd/fault_inject.h b/fs/nfsd/fault_inject.h
deleted file mode 100644
index 90bd0570956c..000000000000
--- a/fs/nfsd/fault_inject.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
- *
- * Function definitions for fault injection
- */
-#ifndef LINUX_NFSD_FAULT_INJECT_H
-#define LINUX_NFSD_FAULT_INJECT_H
-#ifdef CONFIG_NFSD_FAULT_INJECTION
-int nfsd_fault_inject_init(void);
-void nfsd_fault_inject_cleanup(void);
-void nfsd_forget_clients(u64);
-void nfsd_forget_locks(u64);
-void nfsd_forget_openowners(u64);
-void nfsd_forget_delegations(u64);
-void nfsd_recall_delegations(u64);
-#else /* CONFIG_NFSD_FAULT_INJECTION */
-static inline int nfsd_fault_inject_init(void) { return 0; }
-static inline void nfsd_fault_inject_cleanup(void) {}
-static inline void nfsd_forget_clients(u64 num) {}
-static inline void nfsd_forget_locks(u64 num) {}
-static inline void nfsd_forget_openowners(u64 num) {}
-static inline void nfsd_forget_delegations(u64 num) {}
-static inline void nfsd_recall_delegations(u64 num) {}
-#endif /* CONFIG_NFSD_FAULT_INJECTION */
-#endif /* LINUX_NFSD_FAULT_INJECT_H */
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 65c2431ea32f..1051bebff1b0 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -24,7 +24,18 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+/* Hash tables for nfs4_clientid state */
+#define CLIENT_HASH_BITS                 4
+#define CLIENT_HASH_SIZE                (1 << CLIENT_HASH_BITS)
+#define CLIENT_HASH_MASK                (CLIENT_HASH_SIZE - 1)
+#define LOCKOWNER_INO_HASH_BITS         8
+#define LOCKOWNER_INO_HASH_SIZE         (1 << LOCKOWNER_INO_HASH_BITS)
+#define SESSION_HASH_SIZE       512
 struct cld_net;
+struct nfsd4_client_tracking_ops;
 struct nfsd_net {
        struct cld_net *cld_net;
@@ -38,7 +49,62 @@ struct nfsd_net {
        struct lock_manager nfsd4_manager;
        bool grace_ended;
        time_t boot_time;
+        /*
+         * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
+         * used in reboot/reset lease grace period processing
+         *
+         * conf_id_hashtbl[], and conf_name_tree hold confirmed
+         * setclientid_confirmed info.
+         *
+         * unconf_id_hashtbl[] and unconf_name_tree hold unconfirmed
+         * setclientid info.
+         */
+        struct list_head *reclaim_str_hashtbl;
+        int reclaim_str_hashtbl_size;
+        struct list_head *conf_id_hashtbl;
+        struct rb_root conf_name_tree;
+        struct list_head *unconf_id_hashtbl;
+        struct rb_root unconf_name_tree;
+        struct list_head *ownerstr_hashtbl;
+        struct list_head *lockowner_ino_hashtbl;
+        struct list_head *sessionid_hashtbl;
+        /*
+         * client_lru holds client queue ordered by nfs4_client.cl_time
+         * for lease renewal.
+         *
+         * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time
+         * for last close replay.
+         *
+         * All of the above fields are protected by the client_mutex.
+         */
+        struct list_head client_lru;
+        struct list_head close_lru;
+        struct delayed_work laundromat_work;
+        /* client_lock protects the client lru list and session hash table */
+        spinlock_t client_lock;
+        struct file *rec_file;
+        bool in_grace;
+        struct nfsd4_client_tracking_ops *client_tracking_ops;
+        time_t nfsd4_lease;
+        time_t nfsd4_grace;
+        bool nfsd_net_up;
+        /*
+         * Time of server startup
+         */
+        struct timeval nfssvc_boot;
+        struct svc_serv *nfsd_serv;
 };
+/* Simple check to find out if a given net was properly initialized */
+#define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl)
 extern int nfsd_net_id;
 #endif /* __NFSD_NETNS_H__ */
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index b314888825d5..9170861c804a 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -253,7 +253,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
                (resp->mask & NFS_ACL)   ? resp->acl_access  : NULL,
                (resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
        while (w > 0) {
-                if (!rqstp->rq_respages[rqstp->rq_resused++])
+                if (!*(rqstp->rq_next_page++))
                        return 0;
                w -= PAGE_SIZE;
        }
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index a596e9d987e4..9cbc1a841f87 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -184,7 +184,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
                        (resp->mask & NFS_ACL)   ? resp->acl_access  : NULL,
                        (resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
                while (w > 0) {
-                        if (!rqstp->rq_respages[rqstp->rq_resused++])
+                        if (!*(rqstp->rq_next_page++))
                                return 0;
                        w -= PAGE_SIZE;
                }
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 97d90d1c8608..1fc02dfdc5c4 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -460,7 +460,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
        __be32  nfserr;
        int     count = 0;
        loff_t  offset;
-        int     i;
+        struct page **p;
        caddr_t page_addr = NULL;
        dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n",
@@ -484,8 +484,8 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
                                     &resp->common,
                                     nfs3svc_encode_entry_plus);
        memcpy(resp->verf, argp->verf, 8);
-        for (i=1; i<rqstp->rq_resused ; i++) {
+        for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) {
-                page_addr = page_address(rqstp->rq_respages[i]);
+                page_addr = page_address(*p);
                if (((caddr_t)resp->buffer >= page_addr) &&
                    ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 43f46cd9edea..324c0baf7cda 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -7,8 +7,10 @@
 */
 #include <linux/namei.h>
+#include <linux/sunrpc/svc_xprt.h>
 #include "xdr3.h"
 #include "auth.h"
+#include "netns.h"
 #define NFSDDBG_FACILITY                NFSDDBG_XDR
@@ -323,7 +325,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
                                        struct nfsd3_readargs *args)
 {
        unsigned int len;
-        int v,pn;
+        int v;
        u32 max_blocksize = svc_max_payload(rqstp);
        if (!(p = decode_fh(p, &args->fh)))
@@ -338,8 +340,9 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
        /* set up the kvec */
        v=0;
        while (len > 0) {
-                pn = rqstp->rq_resused++;
+                struct page *p = *(rqstp->rq_next_page++);
-                rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
+                rqstp->rq_vec[v].iov_base = page_address(p);
                rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE;
                len -= rqstp->rq_vec[v].iov_len;
                v++;
@@ -461,8 +464,7 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
        len = ntohl(*p++);
        if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE)
                return 0;
-        args->tname = new =
+        args->tname = new = page_address(*(rqstp->rq_next_page++));
-                page_address(rqstp->rq_respages[rqstp->rq_resused++]);
        args->tlen = len;
        /* first copy and check from the first page */
        old = (char*)p;
@@ -533,8 +535,7 @@ nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p,
 {
        if (!(p = decode_fh(p, &args->fh)))
                return 0;
-        args->buffer =
+        args->buffer = page_address(*(rqstp->rq_next_page++));
-                page_address(rqstp->rq_respages[rqstp->rq_resused++]);
        return xdr_argsize_check(rqstp, p);
 }
@@ -565,8 +566,7 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
        if (args->count > PAGE_SIZE)
                args->count = PAGE_SIZE;
-        args->buffer =
+        args->buffer = page_address(*(rqstp->rq_next_page++));
-                page_address(rqstp->rq_respages[rqstp->rq_resused++]);
        return xdr_argsize_check(rqstp, p);
 }
@@ -575,7 +575,7 @@ int
 nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
                                        struct nfsd3_readdirargs *args)
 {
-        int len, pn;
+        int len;
        u32 max_blocksize = svc_max_payload(rqstp);
        if (!(p = decode_fh(p, &args->fh)))
@@ -590,9 +590,9 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
        args->count = len;
        while (len > 0) {
-                pn = rqstp->rq_resused++;
+                struct page *p = *(rqstp->rq_next_page++);
                if (!args->buffer)
-                        args->buffer = page_address(rqstp->rq_respages[pn]);
+                        args->buffer = page_address(p);
                len -= PAGE_SIZE;
        }
@@ -720,12 +720,14 @@ int
 nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p,
                                        struct nfsd3_writeres *resp)
 {
+        struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
        p = encode_wcc_data(rqstp, p, &resp->fh);
        if (resp->status == 0) {
                *p++ = htonl(resp->count);
                *p++ = htonl(resp->committed);
-                *p++ = htonl(nfssvc_boot.tv_sec);
+                *p++ = htonl(nn->nfssvc_boot.tv_sec);
-                *p++ = htonl(nfssvc_boot.tv_usec);
+                *p++ = htonl(nn->nfssvc_boot.tv_usec);
        }
        return xdr_ressize_check(rqstp, p);
 }
@@ -876,7 +878,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
                                                        common);
        __be32          *p = cd->buffer;
        caddr_t         curr_page_addr = NULL;
-        int             pn;             /* current page number */
+        struct page **  page;
        int             slen;           /* string (name) length */
        int             elen;           /* estimated entry length in words */
        int             num_entry_words = 0;    /* actual number of words */
@@ -913,8 +915,9 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
        }
        /* determine which page in rq_respages[] we are currently filling */
-        for (pn=1; pn < cd->rqstp->rq_resused; pn++) {
+        for (page = cd->rqstp->rq_respages + 1;
-                curr_page_addr = page_address(cd->rqstp->rq_respages[pn]);
+                                page < cd->rqstp->rq_next_page; page++) {
+                curr_page_addr = page_address(*page);
                if (((caddr_t)cd->buffer >= curr_page_addr) &&
                    ((caddr_t)cd->buffer <  curr_page_addr + PAGE_SIZE))
@@ -929,14 +932,14 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
                if (plus)
                        p = encode_entryplus_baggage(cd, p, name, namlen);
                num_entry_words = p - cd->buffer;
-        } else if (cd->rqstp->rq_respages[pn+1] != NULL) {
+        } else if (*(page+1) != NULL) {
                /* temporarily encode entry into next page, then move back to
                 * current and next page in rq_respages[] */
                __be32 *p1, *tmp;
                int len1, len2;
                /* grab next page for temporary storage of entry */
-                p1 = tmp = page_address(cd->rqstp->rq_respages[pn+1]);
+                p1 = tmp = page_address(*(page+1));
                p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
@@ -1082,11 +1085,13 @@ int
 nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p,
                                        struct nfsd3_commitres *resp)
 {
+        struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
        p = encode_wcc_data(rqstp, p, &resp->fh);
        /* Write verifier */
        if (resp->status == 0) {
-                *p++ = htonl(nfssvc_boot.tv_sec);
+                *p++ = htonl(nn->nfssvc_boot.tv_sec);
-                *p++ = htonl(nfssvc_boot.tv_usec);
+                *p++ = htonl(nn->nfssvc_boot.tv_usec);
        }
        return xdr_ressize_check(rqstp, p);
 }
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index bdf29c96e4cd..99bc85ff0217 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -36,6 +36,7 @@
 #include <linux/slab.h>
 #include "nfsd.h"
 #include "state.h"
+#include "netns.h"
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
@@ -625,20 +626,46 @@ static const struct rpc_program cb_program = {
        .pipe_dir_name          = "nfsd4_cb",
 };
-static int max_cb_time(void)
+static int max_cb_time(struct net *net)
 {
-        return max(nfsd4_lease/10, (time_t)1) * HZ;
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+        return max(nn->nfsd4_lease/10, (time_t)1) * HZ;
 }
+static struct rpc_cred *callback_cred;
+/*
+ * Populate the file-scope callback_cred on first use via
+ * rpc_lookup_machine_cred("nfs"); later calls reuse the stored value.
+ * Returns 0 on success or -ENOMEM when the lookup yields NULL.
+ */
+int set_callback_cred(void)
+{
+        if (!callback_cred) {
+                callback_cred = rpc_lookup_machine_cred("nfs");
+                if (!callback_cred)
+                        return -ENOMEM;
+        }
+        return 0;
+}
+/*
+ * Choose the credential used for callbacks to @clp over @client.
+ * Minor version 0 clients get the shared callback_cred, handed out through
+ * get_rpccred(). Other clients get a credential looked up on the client's
+ * auth handle from the uid/gid stored in @ses->se_cb_sec.
+ */
+static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses)
+{
+        struct auth_cred acred = {};
+        struct rpc_auth *auth;
+        if (clp->cl_minorversion == 0)
+                return get_rpccred(callback_cred);
+        auth = client->cl_auth;
+        acred.uid = ses->se_cb_sec.uid;
+        acred.gid = ses->se_cb_sec.gid;
+        return auth->au_ops->lookup_cred(auth, &acred, 0);
+}
 static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
 {
        struct rpc_timeout      timeparms = {
-                .to_initval     = max_cb_time(),
+                .to_initval     = max_cb_time(clp->net),
                .to_retries     = 0,
        };
        struct rpc_create_args args = {
-                .net            = &init_net,
+                .net            = clp->net,
                .address        = (struct sockaddr *) &conn->cb_addr,
                .addrsize       = conn->cb_addrlen,
                .saddress       = (struct sockaddr *) &conn->cb_saddr,
@@ -648,6 +675,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
                .flags          = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
        };
        struct rpc_clnt *client;
+        struct rpc_cred *cred;
        if (clp->cl_minorversion == 0) {
                if (!clp->cl_cred.cr_principal &&
@@ -666,7 +694,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
                args.bc_xprt = conn->cb_xprt;
                args.prognumber = clp->cl_cb_session->se_cb_prog;
                args.protocol = XPRT_TRANSPORT_BC_TCP;
-                args.authflavor = RPC_AUTH_UNIX;
+                args.authflavor = ses->se_cb_sec.flavor;
        }
        /* Create RPC client */
        client = rpc_create(&args);
@@ -675,9 +703,14 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
                        PTR_ERR(client));
                return PTR_ERR(client);
        }
+        cred = get_backchannel_cred(clp, client, ses);
+        if (IS_ERR(cred)) {
+                rpc_shutdown_client(client);
+                return PTR_ERR(cred);
+        }
        clp->cl_cb_client = client;
+        clp->cl_cb_cred = cred;
        return 0;
 }
 static void warn_no_callback_path(struct nfs4_client *clp, int reason)
@@ -714,18 +747,6 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
        .rpc_call_done = nfsd4_cb_probe_done,
 };
-static struct rpc_cred *callback_cred;
-int set_callback_cred(void)
-{
-        if (callback_cred)
-                return 0;
-        callback_cred = rpc_lookup_machine_cred("nfs");
-        if (!callback_cred)
-                return -ENOMEM;
-        return 0;
-}
 static struct workqueue_struct *callback_wq;
 static void run_nfsd4_cb(struct nfsd4_callback *cb)
@@ -743,7 +764,6 @@ static void do_probe_callback(struct nfs4_client *clp)
        cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
        cb->cb_msg.rpc_argp = NULL;
        cb->cb_msg.rpc_resp = NULL;
-        cb->cb_msg.rpc_cred = callback_cred;
        cb->cb_ops = &nfsd4_cb_probe_ops;
@@ -962,6 +982,8 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
        if (clp->cl_cb_client) {
                rpc_shutdown_client(clp->cl_cb_client);
                clp->cl_cb_client = NULL;
+                put_rpccred(clp->cl_cb_cred);
+                clp->cl_cb_cred = NULL;
        }
        if (clp->cl_cb_conn.cb_xprt) {
                svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -995,7 +1017,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
                run_nfsd4_cb(cb);
 }
-void nfsd4_do_callback_rpc(struct work_struct *w)
+static void nfsd4_do_callback_rpc(struct work_struct *w)
 {
        struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work);
        struct nfs4_client *clp = cb->cb_clp;
@@ -1010,10 +1032,16 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
                nfsd4_release_cb(cb);
                return;
        }
+        cb->cb_msg.rpc_cred = clp->cl_cb_cred;
        rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
                        cb->cb_ops, cb);
 }
+/*
+ * Initialise @cb's work item so that, when run, it executes
+ * nfsd4_do_callback_rpc().
+ */
+void nfsd4_init_callback(struct nfsd4_callback *cb)
+{
+        INIT_WORK(&cb->cb_work, nfsd4_do_callback_rpc);
+}
 void nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
        struct nfsd4_callback *cb = &dp->dl_recall;
@@ -1025,7 +1053,6 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
        cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
        cb->cb_msg.rpc_argp = cb;
        cb->cb_msg.rpc_resp = cb;
-        cb->cb_msg.rpc_cred = callback_cred;
        cb->cb_ops = &nfsd4_cb_recall_ops;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 6c9a4b291dba..9d1c5dba2bbb 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -40,6 +40,7 @@
 #include "xdr4.h"
 #include "vfs.h"
 #include "current_stateid.h"
+#include "netns.h"
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
@@ -194,6 +195,7 @@ static __be32
 do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
 {
        struct svc_fh *resfh;
+        int accmode;
        __be32 status;
        resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
@@ -253,9 +255,10 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
        /* set reply cache */
        fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
                        &resfh->fh_handle);
-        if (!open->op_created)
+        accmode = NFSD_MAY_NOP;
-                status = do_open_permission(rqstp, resfh, open,
+        if (open->op_created)
-                                            NFSD_MAY_NOP);
+                accmode |= NFSD_MAY_OWNER_OVERRIDE;
+        status = do_open_permission(rqstp, resfh, open, accmode);
        set_change_info(&open->op_cinfo, current_fh);
        fh_dup2(current_fh, resfh);
 out:
@@ -304,6 +307,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 {
        __be32 status;
        struct nfsd4_compoundres *resp;
+        struct net *net = SVC_NET(rqstp);
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n",
                (int)open->op_fname.len, open->op_fname.data,
@@ -331,7 +336,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        /* check seqid for replay. set nfs4_owner */
        resp = rqstp->rq_resp;
-        status = nfsd4_process_open1(&resp->cstate, open);
+        status = nfsd4_process_open1(&resp->cstate, open, nn);
        if (status == nfserr_replay_me) {
                struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay;
                fh_put(&cstate->current_fh);
@@ -354,10 +359,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        /* Openowner is now set, so sequence id will get bumped.  Now we need
         * these checks before we do any creates: */
        status = nfserr_grace;
-        if (locks_in_grace(SVC_NET(rqstp)) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
+        if (locks_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
                goto out;
        status = nfserr_no_grace;
-        if (!locks_in_grace(SVC_NET(rqstp)) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
+        if (!locks_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
                goto out;
        switch (open->op_claim_type) {
@@ -370,7 +375,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                        break;
                case NFS4_OPEN_CLAIM_PREVIOUS:
                        open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
-                        status = nfs4_check_open_reclaim(&open->op_clientid, cstate->minorversion);
+                        status = nfs4_check_open_reclaim(&open->op_clientid,
+                                                         cstate->minorversion,
+                                                         nn);
                        if (status)
                                goto out;
                case NFS4_OPEN_CLAIM_FH:
@@ -490,12 +497,13 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                           &access->ac_supported);
 }
-static void gen_boot_verifier(nfs4_verifier *verifier)
+static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)
 {
        __be32 verf[2];
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-        verf[0] = (__be32)nfssvc_boot.tv_sec;
+        verf[0] = (__be32)nn->nfssvc_boot.tv_sec;
-        verf[1] = (__be32)nfssvc_boot.tv_usec;
+        verf[1] = (__be32)nn->nfssvc_boot.tv_usec;
        memcpy(verifier->data, verf, sizeof(verifier->data));
 }
@@ -503,7 +511,7 @@ static __be32
 nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
             struct nfsd4_commit *commit)
 {
-        gen_boot_verifier(&commit->co_verf);
+        gen_boot_verifier(&commit->co_verf, SVC_NET(rqstp));
        return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
                             commit->co_count);
 }
@@ -684,6 +692,17 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        if (read->rd_offset >= OFFSET_MAX)
                return nfserr_inval;
+        /*
+         * If we do a zero copy read, then a client will see read data
+         * that reflects the state of the file *after* performing the
+         * following compound.
+         *
+         * To ensure proper ordering, we therefore turn off zero copy if
+         * the client wants us to do more in this compound:
+         */
+        if (!nfsd4_last_compound_op(rqstp))
+                rqstp->rq_splice_ok = false;
        nfs4_lock_state();
        /* check stateid */
        if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp),
@@ -876,6 +895,24 @@ out:
        return status;
 }
+/*
+ * Describe the payload of @write as a kvec array in @vec.
+ * vec[0] covers the bytes held in wr_head (capped at wr_buflen); each
+ * following entry maps one page of wr_pagelist, at most PAGE_SIZE bytes,
+ * until wr_buflen bytes are accounted for.
+ * Returns the number of @vec entries filled in. No bound on @vec is
+ * checked here.
+ */
+static int fill_in_write_vector(struct kvec *vec, struct nfsd4_write *write)
+{
+        int remaining = write->wr_buflen;
+        int nvecs;
+        vec[0].iov_base = write->wr_head.iov_base;
+        vec[0].iov_len = min_t(int, remaining, write->wr_head.iov_len);
+        remaining -= vec[0].iov_len;
+        /* entry n (n >= 1) is backed by page n - 1 of the page list */
+        for (nvecs = 1; remaining; nvecs++) {
+                vec[nvecs].iov_base = page_address(write->wr_pagelist[nvecs - 1]);
+                vec[nvecs].iov_len = min_t(int, PAGE_SIZE, remaining);
+                remaining -= vec[nvecs].iov_len;
+        }
+        return nvecs;
+}
 static __be32
 nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
            struct nfsd4_write *write)
@@ -884,6 +921,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        struct file *filp = NULL;
        __be32 status = nfs_ok;
        unsigned long cnt;
+        int nvecs;
        /* no need to check permission - this will be done in nfsd_write() */
@@ -904,10 +942,13 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        cnt = write->wr_buflen;
        write->wr_how_written = write->wr_stable_how;
-        gen_boot_verifier(&write->wr_verifier);
+        gen_boot_verifier(&write->wr_verifier, SVC_NET(rqstp));
+        nvecs = fill_in_write_vector(rqstp->rq_vec, write);
+        WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
        status =  nfsd_write(rqstp, &cstate->current_fh, filp,
-                             write->wr_offset, rqstp->rq_vec, write->wr_vlen,
+                             write->wr_offset, rqstp->rq_vec, nvecs,
                             &cnt, &write->wr_how_written);
        if (filp)
                fput(filp);
@@ -1666,6 +1707,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
                .op_name = "OP_EXCHANGE_ID",
                .op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize,
        },
+        [OP_BACKCHANNEL_CTL] = {
+                .op_func = (nfsd4op_func)nfsd4_backchannel_ctl,
+                .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
+                .op_name = "OP_BACKCHANNEL_CTL",
+                .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+        },
        [OP_BIND_CONN_TO_SESSION] = {
                .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
@@ -1719,6 +1766,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
                .op_func = (nfsd4op_func)nfsd4_free_stateid,
                .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
                .op_name = "OP_FREE_STATEID",
+                .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
                .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
        },
 };
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 43295d45cc2b..ba6fdd4a0455 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -58,13 +58,11 @@ struct nfsd4_client_tracking_ops {
        void (*create)(struct nfs4_client *);
        void (*remove)(struct nfs4_client *);
        int (*check)(struct nfs4_client *);
-        void (*grace_done)(struct net *, time_t);
+        void (*grace_done)(struct nfsd_net *, time_t);
 };
 /* Globals */
-static struct file *rec_file;
 static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
-static struct nfsd4_client_tracking_ops *client_tracking_ops;
 static int
 nfs4_save_creds(const struct cred **original_creds)
@@ -102,33 +100,39 @@ md5_to_hex(char *out, char *md5)
        *out = '\0';
 }
-__be32
+static int
-nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
+nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname)
 {
        struct xdr_netobj cksum;
        struct hash_desc desc;
        struct scatterlist sg;
-        __be32 status = nfserr_jukebox;
+        int status;
        dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
                        clname->len, clname->data);
        desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
        desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
-        if (IS_ERR(desc.tfm))
+        if (IS_ERR(desc.tfm)) {
+                status = PTR_ERR(desc.tfm);
                goto out_no_tfm;
+        }
        cksum.len = crypto_hash_digestsize(desc.tfm);
        cksum.data = kmalloc(cksum.len, GFP_KERNEL);
-        if (cksum.data == NULL)
+        if (cksum.data == NULL) {
+                status = -ENOMEM;
                goto out;
+        }
        sg_init_one(&sg, clname->data, clname->len);
-        if (crypto_hash_digest(&desc, &sg, sg.length, cksum.data))
+        status = crypto_hash_digest(&desc, &sg, sg.length, cksum.data);
+        if (status)
                goto out;
        md5_to_hex(dname, cksum.data);
-        status = nfs_ok;
+        status = 0;
 out:
        kfree(cksum.data);
        crypto_free_hash(desc.tfm);
@@ -136,29 +140,61 @@ out_no_tfm:
        return status;
 }
+/*
+ * Report a failure to build the legacy tracker's recovery-directory name.
+ * The error is always logged; for -ENOENT the failure is treated as
+ * permanent and legacy client tracking is shut down.
+ */
+static void
+legacy_recdir_name_error(int error)
+{
+        printk(KERN_ERR "NFSD: unable to generate recoverydir "
+                        "name (%d).\n", error);
+        /* anything other than a missing algorithm may be transient */
+        if (error != -ENOENT)
+                return;
+        /*
+         * -ENOENT means the algorithm just doesn't exist (the crypto libs
+         * will generally return this if FIPS is enabled as well), so give
+         * up on the recovery tracker altogether.
+         */
+        printk(KERN_ERR "NFSD: disabling legacy clientid tracking. "
+                "Reboot recovery will not function correctly!\n");
+        /* the argument is ignored by the legacy exit function */
+        nfsd4_client_tracking_exit(NULL);
+}
 static void
 nfsd4_create_clid_dir(struct nfs4_client *clp)
 {
        const struct cred *original_cred;
-        char *dname = clp->cl_recdir;
+        char dname[HEXDIR_LEN];
        struct dentry *dir, *dentry;
+        struct nfs4_client_reclaim *crp;
        int status;
+        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
        dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
        if (test_and_set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
                return;
-        if (!rec_file)
+        if (!nn->rec_file)
                return;
+        status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+        if (status)
+                return legacy_recdir_name_error(status);
        status = nfs4_save_creds(&original_cred);
        if (status < 0)
                return;
-        status = mnt_want_write_file(rec_file);
+        status = mnt_want_write_file(nn->rec_file);
        if (status)
                return;
-        dir = rec_file->f_path.dentry;
+        dir = nn->rec_file->f_path.dentry;
        /* lock the parent */
        mutex_lock(&dir->d_inode->i_mutex);
@@ -182,18 +218,24 @@ out_put:
        dput(dentry);
 out_unlock:
        mutex_unlock(&dir->d_inode->i_mutex);
-        if (status == 0)
+        if (status == 0) {
-                vfs_fsync(rec_file, 0);
+                if (nn->in_grace) {
-        else
+                        crp = nfs4_client_to_reclaim(dname, nn);
+                        if (crp)
+                                crp->cr_clp = clp;
+                }
+                vfs_fsync(nn->rec_file, 0);
+        } else {
                printk(KERN_ERR "NFSD: failed to write recovery record"
                                " (err %d); please check that %s exists"
                                " and is writeable", status,
                                user_recovery_dirname);
-        mnt_drop_write_file(rec_file);
+        }
+        mnt_drop_write_file(nn->rec_file);
        nfs4_reset_creds(original_cred);
 }
-typedef int (recdir_func)(struct dentry *, struct dentry *);
+typedef int (recdir_func)(struct dentry *, struct dentry *, struct nfsd_net *);
 struct name_list {
        char name[HEXDIR_LEN];
@@ -219,10 +261,10 @@ nfsd4_build_namelist(void *arg, const char *name, int namlen,
 }
 static int
-nfsd4_list_rec_dir(recdir_func *f)
+nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
 {
        const struct cred *original_cred;
-        struct dentry *dir = rec_file->f_path.dentry;
+        struct dentry *dir = nn->rec_file->f_path.dentry;
        LIST_HEAD(names);
        int status;
@@ -230,13 +272,13 @@ nfsd4_list_rec_dir(recdir_func *f)
        if (status < 0)
                return status;
-        status = vfs_llseek(rec_file, 0, SEEK_SET);
+        status = vfs_llseek(nn->rec_file, 0, SEEK_SET);
        if (status < 0) {
                nfs4_reset_creds(original_cred);
                return status;
        }
-        status = vfs_readdir(rec_file, nfsd4_build_namelist, &names);
+        status = vfs_readdir(nn->rec_file, nfsd4_build_namelist, &names);
        mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
        while (!list_empty(&names)) {
                struct name_list *entry;
@@ -248,7 +290,7 @@ nfsd4_list_rec_dir(recdir_func *f)
                                status = PTR_ERR(dentry);
                                break;
                        }
-                        status = f(dir, dentry);
+                        status = f(dir, dentry, nn);
                        dput(dentry);
                }
                list_del(&entry->list);
@@ -260,14 +302,14 @@ nfsd4_list_rec_dir(recdir_func *f)
 }
 static int
-nfsd4_unlink_clid_dir(char *name, int namlen)
+nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
 {
        struct dentry *dir, *dentry;
        int status;
        dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
-        dir = rec_file->f_path.dentry;
+        dir = nn->rec_file->f_path.dentry;
        mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
        dentry = lookup_one_len(name, dir, namlen);
        if (IS_ERR(dentry)) {
@@ -289,37 +331,52 @@ static void
 nfsd4_remove_clid_dir(struct nfs4_client *clp)
 {
        const struct cred *original_cred;
+        struct nfs4_client_reclaim *crp;
+        char dname[HEXDIR_LEN];
        int status;
+        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-        if (!rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+        if (!nn->rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
                return;
-        status = mnt_want_write_file(rec_file);
+        status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+        if (status)
+                return legacy_recdir_name_error(status);
+        status = mnt_want_write_file(nn->rec_file);
        if (status)
                goto out;
        clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
        status = nfs4_save_creds(&original_cred);
        if (status < 0)
-                goto out;
+                goto out_drop_write;
-        status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
+        status = nfsd4_unlink_clid_dir(dname, HEXDIR_LEN-1, nn);
        nfs4_reset_creds(original_cred);
-        if (status == 0)
+        if (status == 0) {
-                vfs_fsync(rec_file, 0);
+                vfs_fsync(nn->rec_file, 0);
-        mnt_drop_write_file(rec_file);
+                if (nn->in_grace) {
+                        /* remove reclaim record */
+                        crp = nfsd4_find_reclaim_client(dname, nn);
+                        if (crp)
+                                nfs4_remove_reclaim_record(crp, nn);
+                }
+        }
+out_drop_write:
+        mnt_drop_write_file(nn->rec_file);
 out:
        if (status)
                printk("NFSD: Failed to remove expired client state directory"
-                                " %.*s\n", HEXDIR_LEN, clp->cl_recdir);
+                                " %.*s\n", HEXDIR_LEN, dname);
 }
 static int
-purge_old(struct dentry *parent, struct dentry *child)
+purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 {
        int status;
-        if (nfs4_has_reclaimed_state(child->d_name.name, false))
+        if (nfs4_has_reclaimed_state(child->d_name.name, nn))
                return 0;
        status = vfs_rmdir(parent->d_inode, child);
@@ -331,27 +388,29 @@ purge_old(struct dentry *parent, struct dentry *child)
 }
 static void
-nfsd4_recdir_purge_old(struct net *net, time_t boot_time)
+nfsd4_recdir_purge_old(struct nfsd_net *nn, time_t boot_time)
 {
        int status;
-        if (!rec_file)
+        nn->in_grace = false;
+        if (!nn->rec_file)
                return;
-        status = mnt_want_write_file(rec_file);
+        status = mnt_want_write_file(nn->rec_file);
        if (status)
                goto out;
-        status = nfsd4_list_rec_dir(purge_old);
+        status = nfsd4_list_rec_dir(purge_old, nn);
        if (status == 0)
-                vfs_fsync(rec_file, 0);
+                vfs_fsync(nn->rec_file, 0);
-        mnt_drop_write_file(rec_file);
+        mnt_drop_write_file(nn->rec_file);
 out:
+        nfs4_release_reclaim(nn);
        if (status)
                printk("nfsd4: failed to purge old clients from recovery"
-                        " directory %s\n", rec_file->f_path.dentry->d_name.name);
+                        " directory %s\n", nn->rec_file->f_path.dentry->d_name.name);
 }
 static int
-load_recdir(struct dentry *parent, struct dentry *child)
+load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 {
        if (child->d_name.len != HEXDIR_LEN - 1) {
                printk("nfsd4: illegal name %s in recovery directory\n",
@@ -359,21 +418,22 @@ load_recdir(struct dentry *parent, struct dentry *child)
                /* Keep trying; maybe the others are OK: */
                return 0;
        }
-        nfs4_client_to_reclaim(child->d_name.name);
+        nfs4_client_to_reclaim(child->d_name.name, nn);
        return 0;
 }
 static int
-nfsd4_recdir_load(void) {
+nfsd4_recdir_load(struct net *net) {
        int status;
+        struct nfsd_net *nn =  net_generic(net, nfsd_net_id);
-        if (!rec_file)
+        if (!nn->rec_file)
                return 0;
-        status = nfsd4_list_rec_dir(load_recdir);
+        status = nfsd4_list_rec_dir(load_recdir, nn);
        if (status)
                printk("nfsd4: failed loading clients from recovery"
-                        " directory %s\n", rec_file->f_path.dentry->d_name.name);
+                        " directory %s\n", nn->rec_file->f_path.dentry->d_name.name);
        return status;
 }
@@ -382,15 +442,16 @@ nfsd4_recdir_load(void) {
 */
 static int
-nfsd4_init_recdir(void)
+nfsd4_init_recdir(struct net *net)
 {
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        const struct cred *original_cred;
        int status;
        printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
                        user_recovery_dirname);
-        BUG_ON(rec_file);
+        BUG_ON(nn->rec_file);
        status = nfs4_save_creds(&original_cred);
        if (status < 0) {
@@ -400,23 +461,65 @@ nfsd4_init_recdir(void)
                return status;
        }
-        rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0);
+        nn->rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0);
-        if (IS_ERR(rec_file)) {
+        if (IS_ERR(nn->rec_file)) {
                printk("NFSD: unable to find recovery directory %s\n",
                                user_recovery_dirname);
-                status = PTR_ERR(rec_file);
+                status = PTR_ERR(nn->rec_file);
-                rec_file = NULL;
+                nn->rec_file = NULL;
        }
        nfs4_reset_creds(original_cred);
+        if (!status)
+                nn->in_grace = true;
        return status;
 }
+/*
+ * Allocate the per-net reclaim string hash table and initialise every
+ * bucket to an empty list.  Returns 0 on success or -ENOMEM if the table
+ * cannot be allocated.
+ */
+static int
+nfs4_legacy_state_init(struct net *net)
+{
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+        struct list_head *tbl;
+        int bucket;
+        tbl = kmalloc(sizeof(*tbl) * CLIENT_HASH_SIZE, GFP_KERNEL);
+        nn->reclaim_str_hashtbl = tbl;
+        if (!tbl)
+                return -ENOMEM;
+        for (bucket = 0; bucket < CLIENT_HASH_SIZE; bucket++)
+                INIT_LIST_HEAD(&tbl[bucket]);
+        nn->reclaim_str_hashtbl_size = 0;
+        return 0;
+}
+/* Free the reclaim string hash table set up by nfs4_legacy_state_init(). */
+static void
+nfs4_legacy_state_shutdown(struct net *net)
+{
+        struct nfsd_net *nn;
+        nn = net_generic(net, nfsd_net_id);
+        kfree(nn->reclaim_str_hashtbl);
+}
 static int
 nfsd4_load_reboot_recovery_data(struct net *net)
 {
        int status;
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+        /*
+         * Open the recovery directory, then load the client records found
+         * in it.  Returns 0 on success or the error from whichever step
+         * failed.
+         */
+        status = nfsd4_init_recdir(net);
+        if (status)
+                goto out_err;
+        status = nfsd4_recdir_load(net);
+        if (!status)
+                return 0;
+        /*
+         * A failed init never reaches the ->exit op, so nothing else will
+         * drop the reference nfsd4_init_recdir() took on the recovery
+         * directory.  Release it here instead of leaking the open file.
+         */
+        if (nn->rec_file) {
+                fput(nn->rec_file);
+                nn->rec_file = NULL;
+        }
+out_err:
+        printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n");
+        return status;
+}
+static int
+nfsd4_legacy_tracking_init(struct net *net)
+{
+        int status;
        /* XXX: The legacy code won't work in a container */
        if (net != &init_net) {
                WARN(1, KERN_ERR "NFSD: attempt to initialize legacy client "
@@ -424,30 +527,37 @@ nfsd4_load_reboot_recovery_data(struct net *net)
                return -EINVAL;
        }
-        nfs4_lock_state();
+        status = nfs4_legacy_state_init(net);
-        status = nfsd4_init_recdir();
-        if (!status)
-                status = nfsd4_recdir_load();
-        nfs4_unlock_state();
        if (status)
-                printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n");
+                return status;
+        status = nfsd4_load_reboot_recovery_data(net);
+        if (status)
+                goto err;
+        return 0;
+err:
+        nfs4_legacy_state_shutdown(net);
        return status;
 }
 static void
-nfsd4_shutdown_recdir(void)
+nfsd4_shutdown_recdir(struct nfsd_net *nn)
 {
-        if (!rec_file)
+        if (!nn->rec_file)
                return;
-        fput(rec_file);
+        fput(nn->rec_file);
-        rec_file = NULL;
+        nn->rec_file = NULL;
 }
 static void
 nfsd4_legacy_tracking_exit(struct net *net)
 {
-        nfs4_release_reclaim();
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-        nfsd4_shutdown_recdir();
+        nfs4_release_reclaim(nn);
+        nfsd4_shutdown_recdir(nn);
+        nfs4_legacy_state_shutdown(net);
 }
 /*
@@ -480,13 +590,26 @@ nfs4_recoverydir(void)
 static int
 nfsd4_check_legacy_client(struct nfs4_client *clp)
 {
+        int status;
+        char dname[HEXDIR_LEN];
+        struct nfs4_client_reclaim *crp;
+        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
        /* did we already find that this client is stable? */
        if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
                return 0;
+        status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+        if (status) {
+                legacy_recdir_name_error(status);
+                return status;
+        }
        /* look for it in the reclaim hashtable otherwise */
-        if (nfsd4_find_reclaim_client(clp)) {
+        crp = nfsd4_find_reclaim_client(dname, nn);
+        if (crp) {
                set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+                crp->cr_clp = clp;
                return 0;
        }
@@ -494,7 +617,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp)
 }
 static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
-        .init           = nfsd4_load_reboot_recovery_data,
+        .init           = nfsd4_legacy_tracking_init,
        .exit           = nfsd4_legacy_tracking_exit,
        .create         = nfsd4_create_clid_dir,
        .remove         = nfsd4_remove_clid_dir,
@@ -785,8 +908,7 @@ nfsd4_cld_create(struct nfs4_client *clp)
 {
        int ret;
        struct cld_upcall *cup;
-        /* FIXME: determine net from clp */
+        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-        struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
        struct cld_net *cn = nn->cld_net;
        /* Don't upcall if it's already stored */
@@ -823,8 +945,7 @@ nfsd4_cld_remove(struct nfs4_client *clp)
 {
        int ret;
        struct cld_upcall *cup;
-        /* FIXME: determine net from clp */
+        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-        struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
        struct cld_net *cn = nn->cld_net;
        /* Don't upcall if it's already removed */
@@ -861,8 +982,7 @@ nfsd4_cld_check(struct nfs4_client *clp)
 {
        int ret;
        struct cld_upcall *cup;
-        /* FIXME: determine net from clp */
+        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-        struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
        struct cld_net *cn = nn->cld_net;
        /* Don't upcall if one was already stored during this grace pd */
@@ -892,11 +1012,10 @@ nfsd4_cld_check(struct nfs4_client *clp)
 }
 static void
-nfsd4_cld_grace_done(struct net *net, time_t boot_time)
+nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time)
 {
        int ret;
        struct cld_upcall *cup;
-        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        struct cld_net *cn = nn->cld_net;
        cup = alloc_cld_upcall(cn);
@@ -926,28 +1045,261 @@ static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
        .grace_done     = nfsd4_cld_grace_done,
 };
+/* upcall via usermodehelper */
+static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack";
+module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog),
+                        S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(cltrack_prog, "Path to the nfsdcltrack upcall program");
+static bool cltrack_legacy_disable;
+module_param(cltrack_legacy_disable, bool, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(cltrack_legacy_disable,
+                "Disable legacy recoverydir conversion. Default: false");
+#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR="
+#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR="
+/*
+ * Build the LEGACY_TOPDIR_ENV_PREFIX "<recoverydir>" environment string
+ * for the upcall.  Returns a kmalloc'ed string the caller must kfree(),
+ * or NULL when legacy conversion is disabled, the allocation fails, or
+ * the formatted output did not fit in the buffer.
+ */
+static char *
+nfsd4_cltrack_legacy_topdir(void)
+{
+        char *env;
+        size_t bufsz;
+        int written;
+        if (cltrack_legacy_disable)
+                return NULL;
+        /* prefix + recovery directory + terminating NUL */
+        bufsz = strlen(LEGACY_TOPDIR_ENV_PREFIX);
+        bufsz += strlen(nfs4_recoverydir()) + 1;
+        env = kmalloc(bufsz, GFP_KERNEL);
+        if (!env)
+                return NULL;
+        written = snprintf(env, bufsz, LEGACY_TOPDIR_ENV_PREFIX "%s",
+                                nfs4_recoverydir());
+        if (written < bufsz)
+                return env;
+        /* output was truncated: hand back nothing */
+        kfree(env);
+        return NULL;
+}
+/*
+ * Build the LEGACY_RECDIR_ENV_PREFIX "<recoverydir>/" environment string
+ * and append the name that nfs4_make_rec_clidname() derives from @name.
+ * Returns a kmalloc'ed string the caller must kfree(), or NULL when
+ * legacy conversion is disabled, the allocation fails, the prefix leaves
+ * fewer than HEXDIR_LEN bytes free, or the name cannot be generated.
+ */
+static char *
+nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name)
+{
+        char *env;
+        size_t bufsz;
+        int prefix_len;
+        int err;
+        if (cltrack_legacy_disable)
+                return NULL;
+        /* +1 is for '/' between "topdir" and "recdir" */
+        bufsz = strlen(LEGACY_RECDIR_ENV_PREFIX) +
+                strlen(nfs4_recoverydir()) + 1 + HEXDIR_LEN;
+        env = kmalloc(bufsz, GFP_KERNEL);
+        if (!env)
+                return NULL;
+        prefix_len = snprintf(env, bufsz, LEGACY_RECDIR_ENV_PREFIX "%s/",
+                                nfs4_recoverydir());
+        /* bail out if the remaining HEXDIR_LEN bytes are no longer free */
+        if (prefix_len > (bufsz - HEXDIR_LEN))
+                goto out_free;
+        err = nfs4_make_rec_clidname(env + prefix_len, name);
+        if (err)
+                goto out_free;
+        return env;
+out_free:
+        kfree(env);
+        return NULL;
+}
+/*
+ * Run the cltrack_prog usermode helper as "<cltrack_prog> <cmd> [arg]"
+ * via call_usermodehelper() with UMH_WAIT_PROC.
+ * @cmd:    subcommand, passed as argv[1]
+ * @arg:    optional argument, passed as argv[2]; may be NULL
+ * @legacy: optional environment string, passed as envp[0]; may be NULL
+ *
+ * Returns -EACCES without upcalling when cltrack_prog is blank, otherwise
+ * whatever call_usermodehelper() returned.
+ */
+static int
+nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy)
+{
+        char *envp[2];
+        char *argv[4];
+        int ret;
+        if (unlikely(!cltrack_prog[0])) {
+                dprintk("%s: cltrack_prog is disabled\n", __func__);
+                return -EACCES;
+        }
+        dprintk("%s: cmd: %s\n", __func__, cmd);
+        dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)");
+        dprintk("%s: legacy: %s\n", __func__, legacy ? legacy : "(null)");
+        envp[0] = legacy;
+        envp[1] = NULL;
+        argv[0] = cltrack_prog;
+        argv[1] = cmd;
+        argv[2] = arg;
+        argv[3] = NULL;
+        ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+        /* log the result while cltrack_prog still names the helper */
+        dprintk("%s: %s return value: %d\n", __func__, cltrack_prog, ret);
+        /*
+         * Disable the upcall mechanism if we're getting an ENOENT or EACCES
+         * error. The admin can re-enable it on the fly by using sysfs
+         * once the problem has been fixed.
+         */
+        if (ret == -ENOENT || ret == -EACCES) {
+                dprintk("NFSD: %s was not found or isn't executable (%d). "
+                        "Setting cltrack_prog to blank string!\n",
+                        cltrack_prog, ret);
+                cltrack_prog[0] = '\0';
+        }
+        return ret;
+}
+/*
+ * Return a newly allocated, NUL-terminated hex rendering (two lowercase
+ * digits per byte) of the @srclen bytes at @src, or NULL if the
+ * allocation fails.  The caller frees the result with kfree().
+ */
+static char *
+bin_to_hex_dup(const unsigned char *src, int srclen)
+{
+        int i;
+        char *buf, *hex;
+        /* two hex digits per byte, +1 for the terminating NUL */
+        buf = kmalloc((srclen * 2) + 1, GFP_KERNEL);
+        if (!buf)
+                return buf;
+        hex = buf;
+        for (i = 0; i < srclen; i++) {
+                sprintf(hex, "%2.2x", *src++);
+                hex += 2;
+        }
+        /*
+         * sprintf() only terminates the string if the loop ran at least
+         * once; terminate explicitly so an empty source yields "" rather
+         * than an uninitialized byte.
+         */
+        *hex = '\0';
+        return buf;
+}
+/*
+ * ->init hook for the usermode-helper tracking ops: issue the "init"
+ * upcall with no argument and no legacy environment.  @net is unused.
+ */
+static int
+nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net)
+{
+        int status = nfsd4_umh_cltrack_upcall("init", NULL, NULL);
+        return status;
+}
+/*
+ * ->create hook: issue the "create" upcall with the hex-encoded client
+ * name as its argument.  Allocation failure is only logged.
+ */
+static void
+nfsd4_umh_cltrack_create(struct nfs4_client *clp)
+{
+        char *hexname = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
+        if (hexname) {
+                nfsd4_umh_cltrack_upcall("create", hexname, NULL);
+                kfree(hexname);
+                return;
+        }
+        dprintk("%s: can't allocate memory for upcall!\n", __func__);
+}
+/*
+ * ->remove hook: issue the "remove" upcall with the hex-encoded client
+ * name as its argument.  Allocation failure is only logged.
+ */
+static void
+nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
+{
+        char *hexname = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
+        if (hexname) {
+                nfsd4_umh_cltrack_upcall("remove", hexname, NULL);
+                kfree(hexname);
+                return;
+        }
+        dprintk("%s: can't allocate memory for upcall!\n", __func__);
+}
+/*
+ * ->check hook: issue the "check" upcall with the hex-encoded client name
+ * as its argument and the legacy recovery-directory string (possibly NULL)
+ * as its environment.  Returns -ENOMEM if the name cannot be encoded,
+ * otherwise the upcall's result.
+ */
+static int
+nfsd4_umh_cltrack_check(struct nfs4_client *clp)
+{
+        char *hexname, *legacy_env;
+        int status;
+        hexname = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
+        if (hexname == NULL) {
+                dprintk("%s: can't allocate memory for upcall!\n", __func__);
+                return -ENOMEM;
+        }
+        /* a NULL legacy string is fine: the upcall then has no environment */
+        legacy_env = nfsd4_cltrack_legacy_recdir(&clp->cl_name);
+        status = nfsd4_umh_cltrack_upcall("check", hexname, legacy_env);
+        kfree(legacy_env);
+        kfree(hexname);
+        return status;
+}
+/*
+ * ->grace_done hook: issue the "gracedone" upcall with @boot_time printed
+ * in decimal as its argument and the legacy top-directory string (possibly
+ * NULL) as its environment.  @nn is unused.
+ */
+static void
+nfsd4_umh_cltrack_grace_done(struct nfsd_net __attribute__((unused)) *nn,
+                                time_t boot_time)
+{
+        char timestr[22]; /* FIXME: better way to determine max size? */
+        char *legacy_env = nfsd4_cltrack_legacy_topdir();
+        sprintf(timestr, "%ld", boot_time);
+        nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy_env);
+        kfree(legacy_env);
+}
+/*
+ * Client tracking operations backed by the cltrack_prog usermode helper.
+ * There is no ->exit hook, so callers must check it for NULL before use.
+ */
+static struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = {
+        .init           = nfsd4_umh_cltrack_init,
+        .create         = nfsd4_umh_cltrack_create,
+        .remove         = nfsd4_umh_cltrack_remove,
+        .check          = nfsd4_umh_cltrack_check,
+        .grace_done     = nfsd4_umh_cltrack_grace_done,
+        .exit           = NULL,
+};
 int
 nfsd4_client_tracking_init(struct net *net)
 {
        int status;
        struct path path;
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-        if (!client_tracking_ops) {
+        /* just run the init if the method is already decided */
-                client_tracking_ops = &nfsd4_cld_tracking_ops;
+        if (nn->client_tracking_ops)
-                status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
+                goto do_init;
-                if (!status) {
-                        if (S_ISDIR(path.dentry->d_inode->i_mode))
+        /*
-                                client_tracking_ops =
+         * First, try a UMH upcall. It should succeed or fail quickly, so
-                                                &nfsd4_legacy_tracking_ops;
+         * there's little harm in trying that first.
-                        path_put(&path);
+         */
-                }
+        nn->client_tracking_ops = &nfsd4_umh_tracking_ops;
+        status = nn->client_tracking_ops->init(net);
+        if (!status)
+                return status;
+        /*
+         * See if the recoverydir exists and is a directory. If it is,
+         * then use the legacy ops.
+         */
+        nn->client_tracking_ops = &nfsd4_legacy_tracking_ops;
+        status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
+        if (!status) {
+                status = S_ISDIR(path.dentry->d_inode->i_mode);
+                path_put(&path);
+                if (status)
+                        goto do_init;
        }
-        status = client_tracking_ops->init(net);
+        /* Finally, try to use nfsdcld */
+        nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
+        printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be "
+                        "removed in 3.10. Please transition to using "
+                        "nfsdcltrack.\n");
+do_init:
+        status = nn->client_tracking_ops->init(net);
        if (status) {
                printk(KERN_WARNING "NFSD: Unable to initialize client "
                                    "recovery tracking! (%d)\n", status);
-                client_tracking_ops = NULL;
+                nn->client_tracking_ops = NULL;
        }
        return status;
 }
@@ -955,40 +1307,49 @@ nfsd4_client_tracking_init(struct net *net)
 void
 nfsd4_client_tracking_exit(struct net *net)
 {
-        if (client_tracking_ops) {
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-                client_tracking_ops->exit(net);
-                client_tracking_ops = NULL;
+        if (nn->client_tracking_ops) {
+                if (nn->client_tracking_ops->exit)
+                        nn->client_tracking_ops->exit(net);
+                nn->client_tracking_ops = NULL;
        }
 }
 void
 nfsd4_client_record_create(struct nfs4_client *clp)
 {
-        if (client_tracking_ops)
+        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-                client_tracking_ops->create(clp);
+        if (nn->client_tracking_ops)
+                nn->client_tracking_ops->create(clp);
 }
 void
 nfsd4_client_record_remove(struct nfs4_client *clp)
 {
-        if (client_tracking_ops)
+        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-                client_tracking_ops->remove(clp);
+        if (nn->client_tracking_ops)
+                nn->client_tracking_ops->remove(clp);
 }
 int
 nfsd4_client_record_check(struct nfs4_client *clp)
 {
-        if (client_tracking_ops)
+        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-                return client_tracking_ops->check(clp);
+        if (nn->client_tracking_ops)
+                return nn->client_tracking_ops->check(clp);
        return -EOPNOTSUPP;
 }
 void
-nfsd4_record_grace_done(struct net *net, time_t boot_time)
+nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time)
 {
-        if (client_tracking_ops)
+        if (nn->client_tracking_ops)
-                client_tracking_ops->grace_done(net, boot_time);
+                nn->client_tracking_ops->grace_done(nn, boot_time);
 }
 static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d0237f872cc4..ac8ed96c4199 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -44,16 +44,11 @@
 #include "xdr4.h"
 #include "vfs.h"
 #include "current_stateid.h"
-#include "fault_inject.h"
 #include "netns.h"
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
-/* Globals */
-time_t nfsd4_lease = 90;     /* default lease time */
-time_t nfsd4_grace = 90;
 #define all_ones {{~0,~0},~0}
 static const stateid_t one_stateid = {
        .si_generation = ~0,
@@ -176,8 +171,6 @@ static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
        return ret & OWNER_HASH_MASK;
 }
-static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
 /* hash table for nfs4_file */
 #define FILE_HASH_BITS                   8
 #define FILE_HASH_SIZE                  (1 << FILE_HASH_BITS)
@@ -192,7 +185,7 @@ static struct list_head file_hashtbl[FILE_HASH_SIZE];
 static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
 {
-        BUG_ON(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR]));
+        WARN_ON_ONCE(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR]));
        atomic_inc(&fp->fi_access[oflag]);
 }
@@ -251,7 +244,7 @@ static inline int get_new_stid(struct nfs4_stid *stid)
         * preallocations that can exist at a time, but the state lock
         * prevents anyone from using ours before we get here:
         */
-        BUG_ON(error);
+        WARN_ON_ONCE(error);
        /*
         * It shouldn't be a problem to reuse an opaque stateid value.
         * I don't think it is for 4.1.  But with 4.0 I worry that, for
@@ -340,7 +333,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
        fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
        dp->dl_time = 0;
        atomic_set(&dp->dl_count, 1);
-        INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
+        nfsd4_init_callback(&dp->dl_recall);
        return dp;
 }
@@ -390,14 +383,6 @@ unhash_delegation(struct nfs4_delegation *dp)
 * SETCLIENTID state 
 */
-/* client_lock protects the client lru list and session hash table */
-static DEFINE_SPINLOCK(client_lock);
-/* Hash tables for nfs4_clientid state */
-#define CLIENT_HASH_BITS                 4
-#define CLIENT_HASH_SIZE                (1 << CLIENT_HASH_BITS)
-#define CLIENT_HASH_MASK                (CLIENT_HASH_SIZE - 1)
 static unsigned int clientid_hashval(u32 id)
 {
        return id & CLIENT_HASH_MASK;
@@ -409,31 +394,6 @@ static unsigned int clientstr_hashval(const char *name)
 }
 /*
- * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
- * used in reboot/reset lease grace period processing
- *
- * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed
- * setclientid_confirmed info. 
- *
- * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed 
- * setclientid info.
- *
- * client_lru holds client queue ordered by nfs4_client.cl_time
- * for lease renewal.
- *
- * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time
- * for last close replay.
- */
-static struct list_head reclaim_str_hashtbl[CLIENT_HASH_SIZE];
-static int reclaim_str_hashtbl_size = 0;
-static struct list_head conf_id_hashtbl[CLIENT_HASH_SIZE];
-static struct list_head conf_str_hashtbl[CLIENT_HASH_SIZE];
-static struct list_head unconf_str_hashtbl[CLIENT_HASH_SIZE];
-static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE];
-static struct list_head client_lru;
-static struct list_head close_lru;
-/*
 * We store the NONE, READ, WRITE, and BOTH bits separately in the
 * st_{access,deny}_bmap field of the stateid, in order to track not
 * only what share bits are currently in force, but also what
@@ -526,7 +486,8 @@ static int nfs4_access_to_omode(u32 access)
        case NFS4_SHARE_ACCESS_BOTH:
                return O_RDWR;
        }
-        BUG();
+        WARN_ON_ONCE(1);
+        return O_RDONLY;
 }
 /* release all access and file references for a given stateid */
@@ -652,9 +613,6 @@ static void release_openowner(struct nfs4_openowner *oo)
        nfs4_free_openowner(oo);
 }
-#define SESSION_HASH_SIZE       512
-static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
 static inline int
 hash_sessionid(struct nfs4_sessionid *sessionid)
 {
@@ -785,9 +743,12 @@ out_free:
        return NULL;
 }
-static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4_channel_attrs *req, int numslots, int slotsize)
+static void init_forechannel_attrs(struct nfsd4_channel_attrs *new,
+                                   struct nfsd4_channel_attrs *req,
+                                   int numslots, int slotsize,
+                                   struct nfsd_net *nn)
 {
-        u32 maxrpc = nfsd_serv->sv_max_mesg;
+        u32 maxrpc = nn->nfsd_serv->sv_max_mesg;
        new->maxreqs = numslots;
        new->maxresp_cached = min_t(u32, req->maxresp_cached,
@@ -906,21 +867,27 @@ static void __free_session(struct nfsd4_session *ses)
 static void free_session(struct kref *kref)
 {
        struct nfsd4_session *ses;
+        struct nfsd_net *nn;
-        lockdep_assert_held(&client_lock);
        ses = container_of(kref, struct nfsd4_session, se_ref);
+        nn = net_generic(ses->se_client->net, nfsd_net_id);
+        lockdep_assert_held(&nn->client_lock);
        nfsd4_del_conns(ses);
        __free_session(ses);
 }
 void nfsd4_put_session(struct nfsd4_session *ses)
 {
-        spin_lock(&client_lock);
+        struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id);
+        spin_lock(&nn->client_lock);
        nfsd4_put_session_locked(ses);
-        spin_unlock(&client_lock);
+        spin_unlock(&nn->client_lock);
 }
-static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan)
+static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan,
+                                           struct nfsd_net *nn)
 {
        struct nfsd4_session *new;
        int numslots, slotsize;
@@ -941,13 +908,14 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan)
                nfsd4_put_drc_mem(slotsize, fchan->maxreqs);
                return NULL;
        }
-        init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize);
+        init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize, nn);
        return new;
 }
-static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses)
+static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses)
 {
        int idx;
+        struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
        new->se_client = clp;
        gen_sessionid(new);
@@ -957,14 +925,15 @@ static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_s
        new->se_cb_seq_nr = 1;
        new->se_flags = cses->flags;
        new->se_cb_prog = cses->callback_prog;
+        new->se_cb_sec = cses->cb_sec;
        kref_init(&new->se_ref);
        idx = hash_sessionid(&new->se_sessionid);
-        spin_lock(&client_lock);
+        spin_lock(&nn->client_lock);
-        list_add(&new->se_hash, &sessionid_hashtbl[idx]);
+        list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
        spin_lock(&clp->cl_lock);
        list_add(&new->se_perclnt, &clp->cl_sessions);
        spin_unlock(&clp->cl_lock);
-        spin_unlock(&client_lock);
+        spin_unlock(&nn->client_lock);
        if (cses->flags & SESSION4_BACK_CHAN) {
                struct sockaddr *sa = svc_addr(rqstp);
@@ -978,20 +947,20 @@ static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_s
                rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
                clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
        }
-        return new;
 }
 /* caller must hold client_lock */
 static struct nfsd4_session *
-find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
+find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net)
 {
        struct nfsd4_session *elem;
        int idx;
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        dump_sessionid(__func__, sessionid);
        idx = hash_sessionid(sessionid);
        /* Search in the appropriate list */
-        list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
+        list_for_each_entry(elem, &nn->sessionid_hashtbl[idx], se_hash) {
                if (!memcmp(elem->se_sessionid.data, sessionid->data,
                            NFS4_MAX_SESSIONID_LEN)) {
                        return elem;
@@ -1016,6 +985,8 @@ unhash_session(struct nfsd4_session *ses)
 static inline void
 renew_client_locked(struct nfs4_client *clp)
 {
+        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
        if (is_client_expired(clp)) {
                WARN_ON(1);
                printk("%s: client (clientid %08x/%08x) already expired\n",
@@ -1028,16 +999,18 @@ renew_client_locked(struct nfs4_client *clp)
        dprintk("renewing client (clientid %08x/%08x)\n", 
                        clp->cl_clientid.cl_boot, 
                        clp->cl_clientid.cl_id);
-        list_move_tail(&clp->cl_lru, &client_lru);
+        list_move_tail(&clp->cl_lru, &nn->client_lru);
        clp->cl_time = get_seconds();
 }
 static inline void
 renew_client(struct nfs4_client *clp)
 {
-        spin_lock(&client_lock);
+        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+        spin_lock(&nn->client_lock);
        renew_client_locked(clp);
-        spin_unlock(&client_lock);
+        spin_unlock(&nn->client_lock);
 }
 /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */
@@ -1075,7 +1048,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 static inline void
 free_client(struct nfs4_client *clp)
 {
-        lockdep_assert_held(&client_lock);
+        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+        lockdep_assert_held(&nn->client_lock);
        while (!list_empty(&clp->cl_sessions)) {
                struct nfsd4_session *ses;
                ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
@@ -1092,15 +1067,16 @@ void
 release_session_client(struct nfsd4_session *session)
 {
        struct nfs4_client *clp = session->se_client;
+        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-        if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock))
+        if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock))
                return;
        if (is_client_expired(clp)) {
                free_client(clp);
                session->se_client = NULL;
        } else
                renew_client_locked(clp);
-        spin_unlock(&client_lock);
+        spin_unlock(&nn->client_lock);
 }
 /* must be called under the client_lock */
@@ -1123,6 +1099,7 @@ destroy_client(struct nfs4_client *clp)
        struct nfs4_openowner *oo;
        struct nfs4_delegation *dp;
        struct list_head reaplist;
+        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
        INIT_LIST_HEAD(&reaplist);
        spin_lock(&recall_lock);
@@ -1144,12 +1121,15 @@ destroy_client(struct nfs4_client *clp)
        if (clp->cl_cb_conn.cb_xprt)
                svc_xprt_put(clp->cl_cb_conn.cb_xprt);
        list_del(&clp->cl_idhash);
-        list_del(&clp->cl_strhash);
+        if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
-        spin_lock(&client_lock);
+                rb_erase(&clp->cl_namenode, &nn->conf_name_tree);
+        else
+                rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
+        spin_lock(&nn->client_lock);
        unhash_client_locked(clp);
        if (atomic_read(&clp->cl_refcount) == 0)
                free_client(clp);
-        spin_unlock(&client_lock);
+        spin_unlock(&nn->client_lock);
 }
 static void expire_client(struct nfs4_client *clp)
@@ -1187,6 +1167,17 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source)
        return 0;
 }
+/*
+ * Three-way comparison of two opaque blobs: a shorter blob sorts before
+ * a longer one, and blobs of equal length are ordered by memcmp() of
+ * their contents.  Returns a negative value, zero or a positive value.
+ */
+static long long
+compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2)
+{
+        /*
+         * Compare the lengths directly instead of subtracting them: with
+         * unsigned lengths the difference wraps to a large positive value
+         * and can never be negative, which breaks the ordering.
+         */
+        if (o1->len < o2->len)
+                return -1;
+        if (o1->len > o2->len)
+                return 1;
+        return (long long)memcmp(o1->data, o2->data, o1->len);
+}
 static int same_name(const char *n1, const char *n2)
 {
        return 0 == memcmp(n1, n2, HEXDIR_LEN);
@@ -1247,10 +1238,9 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
        return 0 == strcmp(cr1->cr_principal, cr2->cr_principal);
 }
-static void gen_clid(struct nfs4_client *clp)
+static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
 {
        static u32 current_clientid = 1;
-        struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
        clp->cl_clientid.cl_boot = nn->boot_time;
        clp->cl_clientid.cl_id = current_clientid++; 
@@ -1283,12 +1273,14 @@ static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t
        return NULL;
 }
-static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
+static struct nfs4_client *create_client(struct xdr_netobj name,
                struct svc_rqst *rqstp, nfs4_verifier *verf)
 {
        struct nfs4_client *clp;
        struct sockaddr *sa = svc_addr(rqstp);
        int ret;
+        struct net *net = SVC_NET(rqstp);
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        clp = alloc_client(name);
        if (clp == NULL)
@@ -1297,23 +1289,21 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
        INIT_LIST_HEAD(&clp->cl_sessions);
        ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
        if (ret) {
-                spin_lock(&client_lock);
+                spin_lock(&nn->client_lock);
                free_client(clp);
-                spin_unlock(&client_lock);
+                spin_unlock(&nn->client_lock);
                return NULL;
        }
        idr_init(&clp->cl_stateids);
-        memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
        atomic_set(&clp->cl_refcount, 0);
        clp->cl_cb_state = NFSD4_CB_UNKNOWN;
        INIT_LIST_HEAD(&clp->cl_idhash);
-        INIT_LIST_HEAD(&clp->cl_strhash);
        INIT_LIST_HEAD(&clp->cl_openowners);
        INIT_LIST_HEAD(&clp->cl_delegations);
        INIT_LIST_HEAD(&clp->cl_lru);
        INIT_LIST_HEAD(&clp->cl_callbacks);
        spin_lock_init(&clp->cl_lock);
-        INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
+        nfsd4_init_callback(&clp->cl_cb_null);
        clp->cl_time = get_seconds();
        clear_bit(0, &clp->cl_cb_slot_busy);
        rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
@@ -1321,17 +1311,60 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
        rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
        gen_confirm(clp);
        clp->cl_cb_session = NULL;
+        clp->net = net;
        return clp;
 }
 static void
-add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval)
+add_clp_to_name_tree(struct nfs4_client *new_clp, struct rb_root *root)
+{       /*
+         * Link new_clp (through its cl_namenode) into the rbtree at root,
+         * keyed on cl_name as ordered by compare_blob().  No lookup is done
+         * first, so an entry with an equal name is not detected here.
+         */
+        struct rb_node **new = &(root->rb_node), *parent = NULL;
+        struct nfs4_client *clp;
+        while (*new) {  /* descend until an empty child slot is found */
+                clp = rb_entry(*new, struct nfs4_client, cl_namenode);
+                parent = *new;
+                if (compare_blob(&clp->cl_name, &new_clp->cl_name) > 0)
+                        new = &((*new)->rb_left);
+                else    /* a zero ("equal") result also descends right */
+                        new = &((*new)->rb_right);
+        }
+        rb_link_node(&new_clp->cl_namenode, parent, new);      /* attach at the slot found above */
+        rb_insert_color(&new_clp->cl_namenode, root);  /* hand the new node to the rbtree code for fix-up */
+}
+static struct nfs4_client *
+find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root)
+{       /*
+         * Search the rbtree at root for a client whose cl_name compares
+         * equal to name under compare_blob().  Returns that client, or
+         * NULL when the walk runs off the tree without a match.
+         */
+        long long cmp;
+        struct rb_node *node = root->rb_node;
+        struct nfs4_client *clp;
+        while (node) {
+                clp = rb_entry(node, struct nfs4_client, cl_namenode);
+                cmp = compare_blob(&clp->cl_name, name);
+                if (cmp > 0)    /* same left/right rule as add_clp_to_name_tree() */
+                        node = node->rb_left;
+                else if (cmp < 0)
+                        node = node->rb_right;
+                else    /* cmp == 0: names match */
+                        return clp;
+        }
+        return NULL;
+}
+static void     /* files clp in the unconfirmed tables of clp->net */
+add_to_unconfirmed(struct nfs4_client *clp)
 {
         unsigned int idhashval;
+        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-        list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]);
+        clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);     /* set again by move_to_confirmed() */
+        add_clp_to_name_tree(clp, &nn->unconf_name_tree);      /* index by client name */
         idhashval = clientid_hashval(clp->cl_clientid.cl_id);
-        list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]);
+        list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]);  /* index by clientid */
         renew_client(clp);
 }
@@ -1339,22 +1372,23 @@ static void
 move_to_confirmed(struct nfs4_client *clp)
 {
         unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id);
-        unsigned int strhashval;
+        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);      /* tables live in clp's own net */
         dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
-        list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
+        list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]);   /* clientid index: into the confirmed table */
-        strhashval = clientstr_hashval(clp->cl_recdir);
+        rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);    /* name index: leave the unconfirmed tree ... */
-        list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
+        add_clp_to_name_tree(clp, &nn->conf_name_tree);        /* ... and join the confirmed one */
+        set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);       /* counterpart of the clear_bit() in add_to_unconfirmed() */
         renew_client(clp);
 }
 static struct nfs4_client *
-find_confirmed_client(clientid_t *clid, bool sessions)
+find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
 {
        struct nfs4_client *clp;
        unsigned int idhashval = clientid_hashval(clid->cl_id);
-        list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) {
+        list_for_each_entry(clp, &nn->conf_id_hashtbl[idhashval], cl_idhash) {
                if (same_clid(&clp->cl_clientid, clid)) {
                        if ((bool)clp->cl_minorversion != sessions)
                                return NULL;
@@ -1366,12 +1400,12 @@ find_confirmed_client(clientid_t *clid, bool sessions)
 }
 static struct nfs4_client *
-find_unconfirmed_client(clientid_t *clid, bool sessions)
+find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
 {
        struct nfs4_client *clp;
        unsigned int idhashval = clientid_hashval(clid->cl_id);
-        list_for_each_entry(clp, &unconf_id_hashtbl[idhashval], cl_idhash) {
+        list_for_each_entry(clp, &nn->unconf_id_hashtbl[idhashval], cl_idhash) {
                if (same_clid(&clp->cl_clientid, clid)) {
                        if ((bool)clp->cl_minorversion != sessions)
                                return NULL;
@@ -1387,27 +1421,15 @@ static bool clp_used_exchangeid(struct nfs4_client *clp)
 } 
 static struct nfs4_client *
-find_confirmed_client_by_str(const char *dname, unsigned int hashval)
+find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)    /* name lookup among nn's confirmed clients */
 {
-        struct nfs4_client *clp;
+        return find_clp_in_name_tree(name, &nn->conf_name_tree);       /* NULL when no client has this name */
-        list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
-                if (same_name(clp->cl_recdir, dname))
-                        return clp;
-        }
-        return NULL;
 }
 static struct nfs4_client *
-find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
+find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)  /* name lookup among nn's unconfirmed clients */
 {
-        struct nfs4_client *clp;
+        return find_clp_in_name_tree(name, &nn->unconf_name_tree);     /* NULL when no client has this name */
-        list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
-                if (same_name(clp->cl_recdir, dname))
-                        return clp;
-        }
-        return NULL;
 }
 static void
@@ -1428,7 +1450,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r
        else
                goto out_err;
-        conn->cb_addrlen = rpc_uaddr2sockaddr(&init_net, se->se_callback_addr_val,
+        conn->cb_addrlen = rpc_uaddr2sockaddr(clp->net, se->se_callback_addr_val,
                                            se->se_callback_addr_len,
                                            (struct sockaddr *)&conn->cb_addr,
                                            sizeof(conn->cb_addr));
@@ -1572,12 +1594,11 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 {
        struct nfs4_client *unconf, *conf, *new;
        __be32 status;
-        unsigned int            strhashval;
-        char                    dname[HEXDIR_LEN];
        char                    addr_str[INET6_ADDRSTRLEN];
        nfs4_verifier           verf = exid->verifier;
        struct sockaddr         *sa = svc_addr(rqstp);
        bool    update = exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A;
+        struct nfsd_net         *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
        rpc_ntop(sa, addr_str, sizeof(addr_str));
        dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
@@ -1592,24 +1613,16 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
        switch (exid->spa_how) {
        case SP4_NONE:
                break;
+        default:                                /* checked by xdr code */
+                WARN_ON_ONCE(1);
        case SP4_SSV:
-                return nfserr_serverfault;
-        default:
-                BUG();                          /* checked by xdr code */
        case SP4_MACH_CRED:
                return nfserr_serverfault;      /* no excuse :-/ */
        }
-        status = nfs4_make_rec_clidname(dname, &exid->clname);
-        if (status)
-                return status;
-        strhashval = clientstr_hashval(dname);
        /* Cases below refer to rfc 5661 section 18.35.4: */
        nfs4_lock_state();
-        conf = find_confirmed_client_by_str(dname, strhashval);
+        conf = find_confirmed_client_by_name(&exid->clname, nn);
        if (conf) {
                bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred);
                bool verfs_match = same_verf(&verf, &conf->cl_verifier);
@@ -1654,21 +1667,21 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
                goto out;
        }
-        unconf  = find_unconfirmed_client_by_str(dname, strhashval);
+        unconf  = find_unconfirmed_client_by_name(&exid->clname, nn);
        if (unconf) /* case 4, possible retry or client restart */
                expire_client(unconf);
        /* case 1 (normal case) */
 out_new:
-        new = create_client(exid->clname, dname, rqstp, &verf);
+        new = create_client(exid->clname, rqstp, &verf);
        if (new == NULL) {
                status = nfserr_jukebox;
                goto out;
        }
        new->cl_minorversion = 1;
-        gen_clid(new);
+        gen_clid(new, nn);
-        add_to_unconfirmed(new, strhashval);
+        add_to_unconfirmed(new);
 out_copy:
        exid->clientid.cl_boot = new->cl_clientid.cl_boot;
        exid->clientid.cl_id = new->cl_clientid.cl_id;
@@ -1761,12 +1774,13 @@ nfsd4_create_session(struct svc_rqst *rqstp,
        struct nfsd4_conn *conn;
        struct nfsd4_clid_slot *cs_slot = NULL;
        __be32 status = 0;
+        struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
        if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
                return nfserr_inval;
        if (check_forechannel_attrs(cr_ses->fore_channel))
                return nfserr_toosmall;
-        new = alloc_session(&cr_ses->fore_channel);
+        new = alloc_session(&cr_ses->fore_channel, nn);
        if (!new)
                return nfserr_jukebox;
        status = nfserr_jukebox;
@@ -1775,8 +1789,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
                goto out_free_session;
        nfs4_lock_state();
-        unconf = find_unconfirmed_client(&cr_ses->clientid, true);
+        unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn);
-        conf = find_confirmed_client(&cr_ses->clientid, true);
+        conf = find_confirmed_client(&cr_ses->clientid, true, nn);
        if (conf) {
                cs_slot = &conf->cl_cs_slot;
@@ -1789,7 +1803,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
                        goto out_free_conn;
                }
        } else if (unconf) {
-                unsigned int hash;
                struct nfs4_client *old;
                if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
                    !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
@@ -1803,8 +1816,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
                        status = nfserr_seq_misordered;
                        goto out_free_conn;
                }
-                hash = clientstr_hashval(unconf->cl_recdir);
+                old = find_confirmed_client_by_name(&unconf->cl_name, nn);
-                old = find_confirmed_client_by_str(unconf->cl_recdir, hash);
                if (old)
                        expire_client(old);
                move_to_confirmed(unconf);
@@ -1843,14 +1855,6 @@ out_free_session:
        goto out;
 }
-static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
-{
-        struct nfsd4_compoundres *resp = rqstp->rq_resp;
-        struct nfsd4_compoundargs *argp = rqstp->rq_argp;
-        return argp->opcnt == resp->opcnt;
-}
 static __be32 nfsd4_map_bcts_dir(u32 *dir)
 {
        switch (*dir) {
@@ -1865,24 +1869,40 @@ static __be32 nfsd4_map_bcts_dir(u32 *dir)
        return nfserr_inval;
 }
+__be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_backchannel_ctl *bc)
+{       /*
+         * Copy the callback program (bc_cb_program) and callback security
+         * (bc_cb_sec) values from bc into the compound's current session,
+         * then call nfsd4_probe_callback() for that session's client.
+         * Always returns nfs_ok.
+         *
+         * NOTE(review): cstate->session is dereferenced unchecked;
+         * presumably the caller guarantees a session here - confirm.
+         */
+        struct nfsd4_session *session = cstate->session;
+        struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+        spin_lock(&nn->client_lock);    /* both fields change together under the per-net client_lock */
+        session->se_cb_prog = bc->bc_cb_program;
+        session->se_cb_sec = bc->bc_cb_sec;
+        spin_unlock(&nn->client_lock);
+        nfsd4_probe_callback(session->se_client);      /* issued after the lock is dropped */
+        return nfs_ok;
+}
 __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
                     struct nfsd4_compound_state *cstate,
                     struct nfsd4_bind_conn_to_session *bcts)
 {
        __be32 status;
        struct nfsd4_conn *conn;
+        struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
        if (!nfsd4_last_compound_op(rqstp))
                return nfserr_not_only_op;
-        spin_lock(&client_lock);
+        spin_lock(&nn->client_lock);
-        cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid);
+        cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp));
        /* Sorta weird: we only need the refcnt'ing because new_conn acquires
         * client_lock iself: */
        if (cstate->session) {
                nfsd4_get_session(cstate->session);
                atomic_inc(&cstate->session->se_client->cl_refcount);
        }
-        spin_unlock(&client_lock);
+        spin_unlock(&nn->client_lock);
        if (!cstate->session)
                return nfserr_badsession;
@@ -1910,6 +1930,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
 {
        struct nfsd4_session *ses;
        __be32 status = nfserr_badsession;
+        struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id);
        /* Notes:
         * - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid
@@ -1923,24 +1944,24 @@ nfsd4_destroy_session(struct svc_rqst *r,
                        return nfserr_not_only_op;
        }
        dump_sessionid(__func__, &sessionid->sessionid);
-        spin_lock(&client_lock);
+        spin_lock(&nn->client_lock);
-        ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
+        ses = find_in_sessionid_hashtbl(&sessionid->sessionid, SVC_NET(r));
        if (!ses) {
-                spin_unlock(&client_lock);
+                spin_unlock(&nn->client_lock);
                goto out;
        }
        unhash_session(ses);
-        spin_unlock(&client_lock);
+        spin_unlock(&nn->client_lock);
        nfs4_lock_state();
        nfsd4_probe_callback_sync(ses->se_client);
        nfs4_unlock_state();
-        spin_lock(&client_lock);
+        spin_lock(&nn->client_lock);
        nfsd4_del_conns(ses);
        nfsd4_put_session_locked(ses);
-        spin_unlock(&client_lock);
+        spin_unlock(&nn->client_lock);
        status = nfs_ok;
 out:
        dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -2006,6 +2027,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
        struct nfsd4_slot *slot;
        struct nfsd4_conn *conn;
        __be32 status;
+        struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
        if (resp->opcnt != 1)
                return nfserr_sequence_pos;
@@ -2018,9 +2040,9 @@ nfsd4_sequence(struct svc_rqst *rqstp,
        if (!conn)
                return nfserr_jukebox;
-        spin_lock(&client_lock);
+        spin_lock(&nn->client_lock);
        status = nfserr_badsession;
-        session = find_in_sessionid_hashtbl(&seq->sessionid);
+        session = find_in_sessionid_hashtbl(&seq->sessionid, SVC_NET(rqstp));
        if (!session)
                goto out;
@@ -2094,7 +2116,7 @@ out:
                }
        }
        kfree(conn);
-        spin_unlock(&client_lock);
+        spin_unlock(&nn->client_lock);
        dprintk("%s: return %d\n", __func__, ntohl(status));
        return status;
 }
@@ -2104,10 +2126,11 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
 {
        struct nfs4_client *conf, *unconf, *clp;
        __be32 status = 0;
+        struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
        nfs4_lock_state();
-        unconf = find_unconfirmed_client(&dc->clientid, true);
+        unconf = find_unconfirmed_client(&dc->clientid, true, nn);
-        conf = find_confirmed_client(&dc->clientid, true);
+        conf = find_confirmed_client(&dc->clientid, true, nn);
        if (conf) {
                clp = conf;
@@ -2181,20 +2204,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 {
        struct xdr_netobj       clname = setclid->se_name;
        nfs4_verifier           clverifier = setclid->se_verf;
-        unsigned int            strhashval;
        struct nfs4_client      *conf, *unconf, *new;
        __be32                  status;
-        char                    dname[HEXDIR_LEN];
+        struct nfsd_net         *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
-        
-        status = nfs4_make_rec_clidname(dname, &clname);
-        if (status)
-                return status;
-        strhashval = clientstr_hashval(dname);
        /* Cases below refer to rfc 3530 section 14.2.33: */
        nfs4_lock_state();
-        conf = find_confirmed_client_by_str(dname, strhashval);
+        conf = find_confirmed_client_by_name(&clname, nn);
        if (conf) {
                /* case 0: */
                status = nfserr_clid_inuse;
@@ -2209,21 +2225,21 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                        goto out;
                }
        }
-        unconf = find_unconfirmed_client_by_str(dname, strhashval);
+        unconf = find_unconfirmed_client_by_name(&clname, nn);
        if (unconf)
                expire_client(unconf);
        status = nfserr_jukebox;
-        new = create_client(clname, dname, rqstp, &clverifier);
+        new = create_client(clname, rqstp, &clverifier);
        if (new == NULL)
                goto out;
        if (conf && same_verf(&conf->cl_verifier, &clverifier))
                /* case 1: probable callback update */
                copy_clid(new, conf);
        else /* case 4 (new client) or cases 2, 3 (client reboot): */
-                gen_clid(new);
+                gen_clid(new, nn);
        new->cl_minorversion = 0;
        gen_callback(new, setclid, rqstp);
-        add_to_unconfirmed(new, strhashval);
+        add_to_unconfirmed(new);
        setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
        setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
        memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data));
@@ -2243,14 +2259,14 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
        nfs4_verifier confirm = setclientid_confirm->sc_confirm; 
        clientid_t * clid = &setclientid_confirm->sc_clientid;
        __be32 status;
-        struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+        struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
        if (STALE_CLIENTID(clid, nn))
                return nfserr_stale_clientid;
        nfs4_lock_state();
-        conf = find_confirmed_client(clid, false);
+        conf = find_confirmed_client(clid, false, nn);
-        unconf = find_unconfirmed_client(clid, false);
+        unconf = find_unconfirmed_client(clid, false, nn);
        /*
         * We try hard to give out unique clientid's, so if we get an
         * attempt to confirm the same clientid with a different cred,
@@ -2276,9 +2292,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
                nfsd4_probe_callback(conf);
                expire_client(unconf);
        } else { /* case 3: normal case; new or rebooted client */
-                unsigned int hash = clientstr_hashval(unconf->cl_recdir);
+                conf = find_confirmed_client_by_name(&unconf->cl_name, nn);
-                conf = find_confirmed_client_by_str(unconf->cl_recdir, hash);
                if (conf)
                        expire_client(conf);
                move_to_confirmed(unconf);
@@ -2340,7 +2354,7 @@ nfsd4_init_slabs(void)
        if (openowner_slab == NULL)
                goto out_nomem;
        lockowner_slab = kmem_cache_create("nfsd4_lockowners",
-                        sizeof(struct nfs4_openowner), 0, 0, NULL);
+                        sizeof(struct nfs4_lockowner), 0, 0, NULL);
        if (lockowner_slab == NULL)
                goto out_nomem;
        file_slab = kmem_cache_create("nfsd4_files",
@@ -2404,7 +2418,9 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj
 static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
 {
-        list_add(&oo->oo_owner.so_strhash, &ownerstr_hashtbl[strhashval]);
+        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);      /* owner hash table of the client's net */
+        list_add(&oo->oo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]); /* bucket chosen by the caller-supplied strhashval */
         list_add(&oo->oo_perclient, &clp->cl_openowners);
 }
@@ -2444,11 +2460,13 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
 }
 static void
-move_to_close_lru(struct nfs4_openowner *oo)
+move_to_close_lru(struct nfs4_openowner *oo, struct net *net)  /* net selects which per-net close_lru is used */
 {
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
         dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo);
-        list_move_tail(&oo->oo_close_lru, &close_lru);
+        list_move_tail(&oo->oo_close_lru, &nn->close_lru);     /* goes to the tail of that list */
         oo->oo_time = get_seconds();
 }
@@ -2462,13 +2480,14 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner,
 }
 static struct nfs4_openowner *
-find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, bool sessions)
+find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open,
+                        bool sessions, struct nfsd_net *nn)
 {
        struct nfs4_stateowner *so;
        struct nfs4_openowner *oo;
        struct nfs4_client *clp;
-        list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
+        list_for_each_entry(so, &nn->ownerstr_hashtbl[hashval], so_strhash) {
                if (!so->so_is_open_owner)
                        continue;
                if (same_owner_str(so, &open->op_owner, &open->op_clientid)) {
@@ -2555,9 +2574,14 @@ static void nfsd_break_deleg_cb(struct file_lock *fl)
        struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
        struct nfs4_delegation *dp;
-        BUG_ON(!fp);
+        if (!fp) {
-        /* We assume break_lease is only called once per lease: */
+                WARN(1, "(%p)->fl_owner NULL\n", fl);
-        BUG_ON(fp->fi_had_conflict);
+                return;
+        }
+        if (fp->fi_had_conflict) {
+                WARN(1, "duplicate break on %p\n", fp);
+                return;
+        }
        /*
         * We don't want the locks code to timeout the lease for us;
         * we'll remove it ourself if a delegation isn't returned
@@ -2599,14 +2623,13 @@ static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4
 __be32
 nfsd4_process_open1(struct nfsd4_compound_state *cstate,
-                    struct nfsd4_open *open)
+                    struct nfsd4_open *open, struct nfsd_net *nn)
 {
        clientid_t *clientid = &open->op_clientid;
        struct nfs4_client *clp = NULL;
        unsigned int strhashval;
        struct nfs4_openowner *oo = NULL;
        __be32 status;
-        struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
        if (STALE_CLIENTID(&open->op_clientid, nn))
                return nfserr_stale_clientid;
@@ -2619,10 +2642,11 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
                return nfserr_jukebox;
        strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner);
-        oo = find_openstateowner_str(strhashval, open, cstate->minorversion);
+        oo = find_openstateowner_str(strhashval, open, cstate->minorversion, nn);
        open->op_openowner = oo;
        if (!oo) {
-                clp = find_confirmed_client(clientid, cstate->minorversion);
+                clp = find_confirmed_client(clientid, cstate->minorversion,
+                                            nn);
                if (clp == NULL)
                        return nfserr_expired;
                goto new_owner;
@@ -2891,7 +2915,7 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
                        open->op_why_no_deleg = WND4_CANCELLED;
                        break;
                case NFS4_SHARE_WANT_NO_DELEG:
-                        BUG();  /* not supposed to get here */
+                        WARN_ON_ONCE(1);
                }
        }
 }
@@ -2959,6 +2983,7 @@ out:
        }
        return;
 out_free:
+        unhash_stid(&dp->dl_stid);
        nfs4_put_delegation(dp);
 out_no_deleg:
        flag = NFS4_OPEN_DELEGATE_NONE;
@@ -3104,27 +3129,32 @@ void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status)
                free_generic_stateid(open->op_stp);
 }
+static __be32 lookup_clientid(clientid_t *clid, bool session, struct nfsd_net *nn, struct nfs4_client **clp)
+{       /*
+         * Resolve clid to a confirmed client in nn.
+         *
+         * Returns nfserr_stale_clientid when STALE_CLIENTID() rejects clid,
+         * nfserr_expired when no confirmed client is found, and nfs_ok
+         * otherwise.  clp may be NULL if the caller only wants the status.
+         */
+        struct nfs4_client *found;
+        if (STALE_CLIENTID(clid, nn))   /* *clp is left untouched on this path */
+                return nfserr_stale_clientid;
+        found = find_confirmed_client(clid, session, nn);
+        if (clp)
+                *clp = found;   /* may store NULL; the return value says which */
+        return found ? nfs_ok : nfserr_expired;
+}
 __be32
 nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
            clientid_t *clid)
 {
        struct nfs4_client *clp;
        __be32 status;
-        struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+        struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
        nfs4_lock_state();
        dprintk("process_renew(%08x/%08x): starting\n", 
                        clid->cl_boot, clid->cl_id);
-        status = nfserr_stale_clientid;
+        status = lookup_clientid(clid, cstate->minorversion, nn, &clp);
-        if (STALE_CLIENTID(clid, nn))
+        if (status)
-                goto out;
-        clp = find_confirmed_client(clid, cstate->minorversion);
-        status = nfserr_expired;
-        if (clp == NULL) {
-                /* We assume the client took too long to RENEW. */
-                dprintk("nfsd4_renew: clientid not found!\n");
                goto out;
-        }
        status = nfserr_cb_path_down;
        if (!list_empty(&clp->cl_delegations)
                        && clp->cl_cb_state != NFSD4_CB_UP)
@@ -3136,44 +3166,42 @@ out:
 }
 static void
-nfsd4_end_grace(struct net *net)
+nfsd4_end_grace(struct nfsd_net *nn)   /* ends the grace period tracked in nn */
 {
-        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
         /* do nothing if grace period already ended */
         if (nn->grace_ended)
                 return;
         dprintk("NFSD: end of grace period\n");
         nn->grace_ended = true;
-        nfsd4_record_grace_done(net, nn->boot_time);
+        nfsd4_record_grace_done(nn, nn->boot_time);
         locks_end_grace(&nn->nfsd4_manager);
         /*
          * Now that every NFSv4 client has had the chance to recover and
          * to see the (possibly new, possibly shorter) lease time, we
          * can safely set the next grace time to the current lease time:
          */
-        nfsd4_grace = nfsd4_lease;
+        nn->nfsd4_grace = nn->nfsd4_lease;     /* grace and lease times are both fields of nn */
 }
 static time_t
-nfs4_laundromat(void)
+nfs4_laundromat(struct nfsd_net *nn)
 {
        struct nfs4_client *clp;
        struct nfs4_openowner *oo;
        struct nfs4_delegation *dp;
        struct list_head *pos, *next, reaplist;
-        time_t cutoff = get_seconds() - nfsd4_lease;
+        time_t cutoff = get_seconds() - nn->nfsd4_lease;
-        time_t t, clientid_val = nfsd4_lease;
+        time_t t, clientid_val = nn->nfsd4_lease;
-        time_t u, test_val = nfsd4_lease;
+        time_t u, test_val = nn->nfsd4_lease;
        nfs4_lock_state();
        dprintk("NFSD: laundromat service - starting\n");
-        nfsd4_end_grace(&init_net);
+        nfsd4_end_grace(nn);
        INIT_LIST_HEAD(&reaplist);
-        spin_lock(&client_lock);
+        spin_lock(&nn->client_lock);
-        list_for_each_safe(pos, next, &client_lru) {
+        list_for_each_safe(pos, next, &nn->client_lru) {
                clp = list_entry(pos, struct nfs4_client, cl_lru);
                if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
                        t = clp->cl_time - cutoff;
@@ -3189,7 +3217,7 @@ nfs4_laundromat(void)
                unhash_client_locked(clp);
                list_add(&clp->cl_lru, &reaplist);
        }
-        spin_unlock(&client_lock);
+        spin_unlock(&nn->client_lock);
        list_for_each_safe(pos, next, &reaplist) {
                clp = list_entry(pos, struct nfs4_client, cl_lru);
                dprintk("NFSD: purging unused client (clientid %08x)\n",
@@ -3199,6 +3227,8 @@ nfs4_laundromat(void)
        spin_lock(&recall_lock);
        list_for_each_safe(pos, next, &del_recall_lru) {
                dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
+                if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn)
+                        continue;
                if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) {
                        u = dp->dl_time - cutoff;
                        if (test_val > u)
@@ -3212,8 +3242,8 @@ nfs4_laundromat(void)
                dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
                unhash_delegation(dp);
        }
-        test_val = nfsd4_lease;
+        test_val = nn->nfsd4_lease;
-        list_for_each_safe(pos, next, &close_lru) {
+        list_for_each_safe(pos, next, &nn->close_lru) {
                oo = container_of(pos, struct nfs4_openowner, oo_close_lru);
                if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) {
                        u = oo->oo_time - cutoff;
@@ -3231,16 +3261,19 @@ nfs4_laundromat(void)
 static struct workqueue_struct *laundry_wq;
 static void laundromat_main(struct work_struct *);
-static DECLARE_DELAYED_WORK(laundromat_work, laundromat_main);
 static void
-laundromat_main(struct work_struct *not_used)
+laundromat_main(struct work_struct *laundry)   /* laundry identifies which nfsd_net to clean */
 {
         time_t t;
+        struct delayed_work *dwork = container_of(laundry, struct delayed_work,
+                                                  work);       /* laundry is the work member of a delayed_work ... */
+        struct nfsd_net *nn = container_of(dwork, struct nfsd_net,
+                                           laundromat_work);   /* ... which is the laundromat_work member of an nfsd_net */
-        t = nfs4_laundromat();
+        t = nfs4_laundromat(nn);
         dprintk("NFSD: laundromat_main - sleeping for %ld seconds\n", t);
-        queue_delayed_work(laundry_wq, &laundromat_work, t*HZ);
+        queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ);    /* re-queue the same net's work item with a delay of t*HZ */
 }
 static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
@@ -3385,16 +3418,17 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
        return nfs_ok;
 }
-static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, bool sessions)
+static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask,
+                                   struct nfs4_stid **s, bool sessions,
+                                   struct nfsd_net *nn)
 {
        struct nfs4_client *cl;
-        struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
        if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
                return nfserr_bad_stateid;
        if (STALE_STATEID(stateid, nn))
                return nfserr_stale_stateid;
-        cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions);
+        cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions, nn);
        if (!cl)
                return nfserr_expired;
        *s = find_stateid_by_type(cl, stateid, typemask);
@@ -3416,6 +3450,7 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
        struct nfs4_delegation *dp = NULL;
        struct svc_fh *current_fh = &cstate->current_fh;
        struct inode *ino = current_fh->fh_dentry->d_inode;
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        __be32 status;
        if (filpp)
@@ -3427,7 +3462,8 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
        if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
                return check_special_stateids(net, current_fh, stateid, flags);
-        status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s, cstate->minorversion);
+        status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
+                                      &s, cstate->minorversion, nn);
        if (status)
                return status;
        status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
@@ -3441,7 +3477,11 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
                        goto out;
                if (filpp) {
                        *filpp = dp->dl_file->fi_deleg_file;
-                        BUG_ON(!*filpp);
+                        if (!*filpp) {
+                                WARN_ON_ONCE(1);
+                                status = nfserr_serverfault;
+                                goto out;
+                        }
                }
                break;
        case NFS4_OPEN_STID:
@@ -3568,7 +3608,8 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
 static __be32
 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
                         stateid_t *stateid, char typemask,
-                         struct nfs4_ol_stateid **stpp)
+                         struct nfs4_ol_stateid **stpp,
+                         struct nfsd_net *nn)
 {
        __be32 status;
        struct nfs4_stid *s;
@@ -3577,7 +3618,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
                seqid, STATEID_VAL(stateid));
        *stpp = NULL;
-        status = nfsd4_lookup_stateid(stateid, typemask, &s, cstate->minorversion);
+        status = nfsd4_lookup_stateid(stateid, typemask, &s,
+                                      cstate->minorversion, nn);
        if (status)
                return status;
        *stpp = openlockstateid(s);
@@ -3586,13 +3628,14 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
        return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp);
 }
-static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_ol_stateid **stpp)
+static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
+                                                 stateid_t *stateid, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn)
 {
        __be32 status;
        struct nfs4_openowner *oo;
        status = nfs4_preprocess_seqid_op(cstate, seqid, stateid,
-                                                NFS4_OPEN_STID, stpp);
+                                                NFS4_OPEN_STID, stpp, nn);
        if (status)
                return status;
        oo = openowner((*stpp)->st_stateowner);
@@ -3608,6 +3651,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        __be32 status;
        struct nfs4_openowner *oo;
        struct nfs4_ol_stateid *stp;
+        struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
        dprintk("NFSD: nfsd4_open_confirm on file %.*s\n",
                        (int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3621,7 +3665,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        status = nfs4_preprocess_seqid_op(cstate,
                                        oc->oc_seqid, &oc->oc_req_stateid,
-                                        NFS4_OPEN_STID, &stp);
+                                        NFS4_OPEN_STID, &stp, nn);
        if (status)
                goto out;
        oo = openowner(stp->st_stateowner);
@@ -3664,7 +3708,7 @@ static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_ac
        case NFS4_SHARE_ACCESS_BOTH:
                break;
        default:
-                BUG();
+                WARN_ON_ONCE(1);
        }
 }
@@ -3685,6 +3729,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 {
        __be32 status;
        struct nfs4_ol_stateid *stp;
+        struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
        dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", 
                        (int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3697,7 +3742,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
        nfs4_lock_state();
        status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid,
-                                        &od->od_stateid, &stp);
+                                        &od->od_stateid, &stp, nn);
        if (status)
                goto out; 
        status = nfserr_inval;
@@ -3760,6 +3805,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        __be32 status;
        struct nfs4_openowner *oo;
        struct nfs4_ol_stateid *stp;
+        struct net *net = SVC_NET(rqstp);
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        dprintk("NFSD: nfsd4_close on file %.*s\n", 
                        (int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3769,7 +3816,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
                                        &close->cl_stateid,
                                        NFS4_OPEN_STID|NFS4_CLOSED_STID,
-                                        &stp);
+                                        &stp, nn);
        if (status)
                goto out; 
        oo = openowner(stp->st_stateowner);
@@ -3791,7 +3838,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                         * little while to handle CLOSE replay.
                         */
                        if (list_empty(&oo->oo_owner.so_stateids))
-                                move_to_close_lru(oo);
+                                move_to_close_lru(oo, SVC_NET(rqstp));
                }
        }
 out:
@@ -3807,15 +3854,15 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        struct nfs4_delegation *dp;
        stateid_t *stateid = &dr->dr_stateid;
        struct nfs4_stid *s;
-        struct inode *inode;
        __be32 status;
+        struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
        if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
                return status;
-        inode = cstate->current_fh.fh_dentry->d_inode;
        nfs4_lock_state();
-        status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s, cstate->minorversion);
+        status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s,
+                                      cstate->minorversion, nn);
        if (status)
                goto out;
        dp = delegstateid(s);
@@ -3833,8 +3880,6 @@ out:
 #define LOFF_OVERFLOW(start, len)      ((u64)(len) > ~(u64)(start))
-#define LOCKOWNER_INO_HASH_BITS 8
-#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS)
 #define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1)
 static inline u64
@@ -3852,7 +3897,7 @@ last_byte_offset(u64 start, u64 len)
 {
        u64 end;
-        BUG_ON(!len);
+        WARN_ON_ONCE(!len);
        end = start + len;
        return end > start ? end - 1: NFS4_MAX_UINT64;
 }
@@ -3864,8 +3909,6 @@ static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct
                & LOCKOWNER_INO_HASH_MASK;
 }
-static struct list_head lockowner_ino_hashtbl[LOCKOWNER_INO_HASH_SIZE];
 /*
 * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
 * we can't properly handle lock requests that go beyond the (2^63 - 1)-th
@@ -3931,12 +3974,12 @@ static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, c
 static struct nfs4_lockowner *
 find_lockowner_str(struct inode *inode, clientid_t *clid,
-                struct xdr_netobj *owner)
+                   struct xdr_netobj *owner, struct nfsd_net *nn)
 {
        unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner);
        struct nfs4_lockowner *lo;
-        list_for_each_entry(lo, &lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) {
+        list_for_each_entry(lo, &nn->lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) {
                if (same_lockowner_ino(lo, inode, clid, owner))
                        return lo;
        }
@@ -3948,9 +3991,10 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s
        struct inode *inode = open_stp->st_file->fi_inode;
        unsigned int inohash = lockowner_ino_hashval(inode,
                        clp->cl_clientid.cl_id, &lo->lo_owner.so_owner);
+        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-        list_add(&lo->lo_owner.so_strhash, &ownerstr_hashtbl[strhashval]);
+        list_add(&lo->lo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]);
-        list_add(&lo->lo_owner_ino_hash, &lockowner_ino_hashtbl[inohash]);
+        list_add(&lo->lo_owner_ino_hash, &nn->lockowner_ino_hashtbl[inohash]);
        list_add(&lo->lo_perstateid, &open_stp->st_lockowners);
 }
@@ -4024,8 +4068,10 @@ static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, s
        struct nfs4_client *cl = oo->oo_owner.so_client;
        struct nfs4_lockowner *lo;
        unsigned int strhashval;
+        struct nfsd_net *nn = net_generic(cl->net, nfsd_net_id);
-        lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, &lock->v.new.owner);
+        lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid,
+                                &lock->v.new.owner, nn);
        if (lo) {
                if (!cstate->minorversion)
                        return nfserr_bad_seqid;
@@ -4065,7 +4111,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        bool new_state = false;
        int lkflg;
        int err;
-        struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+        struct net *net = SVC_NET(rqstp);
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
                (long long) lock->lk_offset,
@@ -4099,7 +4146,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                status = nfs4_preprocess_confirmed_seqid_op(cstate,
                                        lock->lk_new_open_seqid,
                                        &lock->lk_new_open_stateid,
-                                        &open_stp);
+                                        &open_stp, nn);
                if (status)
                        goto out;
                open_sop = openowner(open_stp->st_stateowner);
@@ -4113,7 +4160,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                status = nfs4_preprocess_seqid_op(cstate,
                                       lock->lk_old_lock_seqid,
                                       &lock->lk_old_lock_stateid,
-                                       NFS4_LOCK_STID, &lock_stp);
+                                       NFS4_LOCK_STID, &lock_stp, nn);
        if (status)
                goto out;
        lock_sop = lockowner(lock_stp->st_stateowner);
@@ -4124,10 +4171,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                goto out;
        status = nfserr_grace;
-        if (locks_in_grace(SVC_NET(rqstp)) && !lock->lk_reclaim)
+        if (locks_in_grace(net) && !lock->lk_reclaim)
                goto out;
        status = nfserr_no_grace;
-        if (!locks_in_grace(SVC_NET(rqstp)) && lock->lk_reclaim)
+        if (!locks_in_grace(net) && lock->lk_reclaim)
                goto out;
        file_lock = locks_alloc_lock();
@@ -4238,7 +4285,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        struct file_lock *file_lock = NULL;
        struct nfs4_lockowner *lo;
        __be32 status;
-        struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+        struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
        if (locks_in_grace(SVC_NET(rqstp)))
                return nfserr_grace;
@@ -4248,9 +4295,11 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        nfs4_lock_state();
-        status = nfserr_stale_clientid;
+        if (!nfsd4_has_session(cstate)) {
-        if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid, nn))
+                status = lookup_clientid(&lockt->lt_clientid, false, nn, NULL);
-                goto out;
+                if (status)
+                        goto out;
+        }
        if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
                goto out;
@@ -4278,7 +4327,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                goto out;
        }
-        lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner);
+        lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner, nn);
        if (lo)
                file_lock->fl_owner = (fl_owner_t)lo;
        file_lock->fl_pid = current->tgid;
@@ -4313,7 +4362,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        struct file_lock *file_lock = NULL;
        __be32 status;
        int err;
-                                                        
+        struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
        dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n",
                (long long) locku->lu_offset,
                (long long) locku->lu_length);
@@ -4324,7 +4374,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        nfs4_lock_state();
                                                                                
        status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid,
-                                        &locku->lu_stateid, NFS4_LOCK_STID, &stp);
+                                        &locku->lu_stateid, NFS4_LOCK_STID,
+                                        &stp, nn);
        if (status)
                goto out;
        filp = find_any_file(stp->st_file);
@@ -4414,23 +4465,21 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
        struct list_head matches;
        unsigned int hashval = ownerstr_hashval(clid->cl_id, owner);
        __be32 status;
-        struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+        struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
        dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
                clid->cl_boot, clid->cl_id);
-        /* XXX check for lease expiration */
-        status = nfserr_stale_clientid;
-        if (STALE_CLIENTID(clid, nn))
-                return status;
        nfs4_lock_state();
+        status = lookup_clientid(clid, cstate->minorversion, nn, NULL);
+        if (status)
+                goto out;
        status = nfserr_locks_held;
        INIT_LIST_HEAD(&matches);
-        list_for_each_entry(sop, &ownerstr_hashtbl[hashval], so_strhash) {
+        list_for_each_entry(sop, &nn->ownerstr_hashtbl[hashval], so_strhash) {
                if (sop->so_is_open_owner)
                        continue;
                if (!same_owner_str(sop, owner, clid))
@@ -4466,73 +4515,74 @@ alloc_reclaim(void)
        return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL);
 }
-int
+bool
-nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
+nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn)
 {
-        unsigned int strhashval = clientstr_hashval(name);
+        struct nfs4_client_reclaim *crp;
-        struct nfs4_client *clp;
-        clp = find_confirmed_client_by_str(name, strhashval);
+        crp = nfsd4_find_reclaim_client(name, nn);
-        if (!clp)
+        return (crp && crp->cr_clp);
-                return 0;
-        return test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
 }
 /*
 * failure => all reset bets are off, nfserr_no_grace...
 */
-int
+struct nfs4_client_reclaim *
-nfs4_client_to_reclaim(const char *name)
+nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn)
 {
        unsigned int strhashval;
-        struct nfs4_client_reclaim *crp = NULL;
+        struct nfs4_client_reclaim *crp;
        dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", HEXDIR_LEN, name);
        crp = alloc_reclaim();
-        if (!crp)
+        if (crp) {
-                return 0;
+                strhashval = clientstr_hashval(name);
-        strhashval = clientstr_hashval(name);
+                INIT_LIST_HEAD(&crp->cr_strhash);
-        INIT_LIST_HEAD(&crp->cr_strhash);
+                list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]);
-        list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]);
+                memcpy(crp->cr_recdir, name, HEXDIR_LEN);
-        memcpy(crp->cr_recdir, name, HEXDIR_LEN);
+                crp->cr_clp = NULL;
-        reclaim_str_hashtbl_size++;
+                nn->reclaim_str_hashtbl_size++;
-        return 1;
+        }
+        return crp;
+}
+/*
+ * Drop one reclaim record: take it off its hash chain, account for its
+ * removal in the per-net record count, and free it.
+ */
+void
+nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn)
+{
+        struct list_head *link = &crp->cr_strhash;
+        list_del(link);
+        nn->reclaim_str_hashtbl_size--;
+        kfree(crp);
 }
 void
-nfs4_release_reclaim(void)
+nfs4_release_reclaim(struct nfsd_net *nn)
 {
        struct nfs4_client_reclaim *crp = NULL;
        int i;
        for (i = 0; i < CLIENT_HASH_SIZE; i++) {
-                while (!list_empty(&reclaim_str_hashtbl[i])) {
+                while (!list_empty(&nn->reclaim_str_hashtbl[i])) {
-                        crp = list_entry(reclaim_str_hashtbl[i].next,
+                        crp = list_entry(nn->reclaim_str_hashtbl[i].next,
                                        struct nfs4_client_reclaim, cr_strhash);
-                        list_del(&crp->cr_strhash);
+                        nfs4_remove_reclaim_record(crp, nn);
-                        kfree(crp);
-                        reclaim_str_hashtbl_size--;
                }
        }
-        BUG_ON(reclaim_str_hashtbl_size);
+        WARN_ON_ONCE(nn->reclaim_str_hashtbl_size);
 }
 /*
 * called from OPEN, CLAIM_PREVIOUS with a new clientid. */
 struct nfs4_client_reclaim *
-nfsd4_find_reclaim_client(struct nfs4_client *clp)
+nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn)
 {
        unsigned int strhashval;
        struct nfs4_client_reclaim *crp = NULL;
-        dprintk("NFSD: nfs4_find_reclaim_client for %.*s with recdir %s\n",
+        dprintk("NFSD: nfs4_find_reclaim_client for recdir %s\n", recdir);
-                            clp->cl_name.len, clp->cl_name.data,
-                            clp->cl_recdir);
-        /* find clp->cl_name in reclaim_str_hashtbl */
+        strhashval = clientstr_hashval(recdir);
-        strhashval = clientstr_hashval(clp->cl_recdir);
+        list_for_each_entry(crp, &nn->reclaim_str_hashtbl[strhashval], cr_strhash) {
-        list_for_each_entry(crp, &reclaim_str_hashtbl[strhashval], cr_strhash) {
+                if (same_name(crp->cr_recdir, recdir)) {
-                if (same_name(crp->cr_recdir, clp->cl_recdir)) {
                        return crp;
                }
        }
@@ -4543,12 +4593,12 @@ nfsd4_find_reclaim_client(struct nfs4_client *clp)
 * Called from OPEN. Look for clientid in reclaim list.
 */
 __be32
-nfs4_check_open_reclaim(clientid_t *clid, bool sessions)
+nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn)
 {
        struct nfs4_client *clp;
        /* find clientid in conf_id_hashtbl */
-        clp = find_confirmed_client(clid, sessions);
+        clp = find_confirmed_client(clid, sessions, nn);
        if (clp == NULL)
                return nfserr_reclaim_bad;
@@ -4557,124 +4607,177 @@ nfs4_check_open_reclaim(clientid_t *clid, bool sessions)
 #ifdef CONFIG_NFSD_FAULT_INJECTION
-void nfsd_forget_clients(u64 num)
+u64 nfsd_forget_client(struct nfs4_client *clp, u64 max)
 {
-        struct nfs4_client *clp, *next;
+        expire_client(clp);
-        int count = 0;
+        return 1;
-        nfs4_lock_state();
-        list_for_each_entry_safe(clp, next, &client_lru, cl_lru) {
-                expire_client(clp);
-                if (++count == num)
-                        break;
-        }
-        nfs4_unlock_state();
-        printk(KERN_INFO "NFSD: Forgot %d clients", count);
 }
-static void release_lockowner_sop(struct nfs4_stateowner *sop)
+u64 nfsd_print_client(struct nfs4_client *clp, u64 num)
 {
-        release_lockowner(lockowner(sop));
+        char buf[INET6_ADDRSTRLEN];
+        rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
+        printk(KERN_INFO "NFS Client: %s\n", buf);
+        return 1;
 }
-static void release_openowner_sop(struct nfs4_stateowner *sop)
+static void nfsd_print_count(struct nfs4_client *clp, unsigned int count,
+                             const char *type)
 {
-        release_openowner(openowner(sop));
+        char buf[INET6_ADDRSTRLEN];
+        rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
+        printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type);
 }
-static int nfsd_release_n_owners(u64 num, bool is_open_owner,
+static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_lockowner *))
-                                void (*release_sop)(struct nfs4_stateowner *))
 {
-        int i, count = 0;
+        struct nfs4_openowner *oop;
-        struct nfs4_stateowner *sop, *next;
+        struct nfs4_lockowner *lop, *lo_next;
+        struct nfs4_ol_stateid *stp, *st_next;
+        u64 count = 0;
-        for (i = 0; i < OWNER_HASH_SIZE; i++) {
+        list_for_each_entry(oop, &clp->cl_openowners, oo_perclient) {
-                list_for_each_entry_safe(sop, next, &ownerstr_hashtbl[i], so_strhash) {
+                list_for_each_entry_safe(stp, st_next, &oop->oo_owner.so_stateids, st_perstateowner) {
-                        if (sop->so_is_open_owner != is_open_owner)
+                        list_for_each_entry_safe(lop, lo_next, &stp->st_lockowners, lo_perstateid) {
-                                continue;
+                                if (func)
-                        release_sop(sop);
+                                        func(lop);
-                        if (++count == num)
+                                if (++count == max)
-                                return count;
+                                        return count;
+                        }
                }
        }
        return count;
 }
-void nfsd_forget_locks(u64 num)
+u64 nfsd_forget_client_locks(struct nfs4_client *clp, u64 max)
 {
-        int count;
+        return nfsd_foreach_client_lock(clp, max, release_lockowner);
+}
-        nfs4_lock_state();
-        count = nfsd_release_n_owners(num, false, release_lockowner_sop);
-        nfs4_unlock_state();
-        printk(KERN_INFO "NFSD: Forgot %d locks", count);
+/*
+ * Count the lockowners reachable from @clp's openowners (bounded by @max,
+ * no callback is run on them), log the total and return it.
+ */
+u64 nfsd_print_client_locks(struct nfs4_client *clp, u64 max)
+{
+        u64 nr_locks;
+        nr_locks = nfsd_foreach_client_lock(clp, max, NULL);
+        nfsd_print_count(clp, nr_locks, "locked files");
+        return nr_locks;
 }
-void nfsd_forget_openowners(u64 num)
+static u64 nfsd_foreach_client_open(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_openowner *))
 {
-        int count;
+        struct nfs4_openowner *oop, *next;
+        u64 count = 0;
-        nfs4_lock_state();
+        list_for_each_entry_safe(oop, next, &clp->cl_openowners, oo_perclient) {
-        count = nfsd_release_n_owners(num, true, release_openowner_sop);
+                if (func)
-        nfs4_unlock_state();
+                        func(oop);
+                if (++count == max)
+                        break;
+        }
-        printk(KERN_INFO "NFSD: Forgot %d open owners", count);
+        return count;
 }
-static int nfsd_process_n_delegations(u64 num, struct list_head *list)
+u64 nfsd_forget_client_openowners(struct nfs4_client *clp, u64 max)
 {
-        int i, count = 0;
+        return nfsd_foreach_client_open(clp, max, release_openowner);
-        struct nfs4_file *fp, *fnext;
+}
-        struct nfs4_delegation *dp, *dnext;
-        for (i = 0; i < FILE_HASH_SIZE; i++) {
+u64 nfsd_print_client_openowners(struct nfs4_client *clp, u64 max)
-                list_for_each_entry_safe(fp, fnext, &file_hashtbl[i], fi_hash) {
+{
-                        list_for_each_entry_safe(dp, dnext, &fp->fi_delegations, dl_perfile) {
+        u64 count = nfsd_foreach_client_open(clp, max, NULL);
-                                list_move(&dp->dl_recall_lru, list);
+        nfsd_print_count(clp, count, "open files");
-                                if (++count == num)
+        return count;
-                                        return count;
+}
-                        }
-                }
+static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max,
-        }
+                                     struct list_head *victims)
+{
+        struct nfs4_delegation *dp, *next;
+        u64 count = 0;
+        list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) {
+                if (victims)
+                        list_move(&dp->dl_recall_lru, victims);
+                if (++count == max)
+                        break;
+        }
        return count;
 }
-void nfsd_forget_delegations(u64 num)
+u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max)
 {
-        unsigned int count;
+        struct nfs4_delegation *dp, *next;
        LIST_HEAD(victims);
-        struct nfs4_delegation *dp, *dnext;
+        u64 count;
        spin_lock(&recall_lock);
-        count = nfsd_process_n_delegations(num, &victims);
+        count = nfsd_find_all_delegations(clp, max, &victims);
        spin_unlock(&recall_lock);
-        nfs4_lock_state();
+        list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
-        list_for_each_entry_safe(dp, dnext, &victims, dl_recall_lru)
                unhash_delegation(dp);
-        nfs4_unlock_state();
-        printk(KERN_INFO "NFSD: Forgot %d delegations", count);
+        return count;
 }
-void nfsd_recall_delegations(u64 num)
+u64 nfsd_recall_client_delegations(struct nfs4_client *clp, u64 max)
 {
-        unsigned int count;
+        struct nfs4_delegation *dp, *next;
        LIST_HEAD(victims);
-        struct nfs4_delegation *dp, *dnext;
+        u64 count;
        spin_lock(&recall_lock);
-        count = nfsd_process_n_delegations(num, &victims);
+        count = nfsd_find_all_delegations(clp, max, &victims);
-        list_for_each_entry_safe(dp, dnext, &victims, dl_recall_lru) {
+        list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
-                list_del(&dp->dl_recall_lru);
                nfsd_break_one_deleg(dp);
-        }
        spin_unlock(&recall_lock);
-        printk(KERN_INFO "NFSD: Recalled %d delegations", count);
+        return count;
+}
+/*
+ * Count the delegations on @clp's delegation list (bounded by @max) while
+ * holding recall_lock, then log the total and return it.  Passing a NULL
+ * victim list means no delegation is moved.
+ */
+u64 nfsd_print_client_delegations(struct nfs4_client *clp, u64 max)
+{
+        u64 nr_delegs;
+        spin_lock(&recall_lock);
+        nr_delegs = nfsd_find_all_delegations(clp, max, NULL);
+        spin_unlock(&recall_lock);
+        nfsd_print_count(clp, nr_delegs, "delegations");
+        return nr_delegs;
+}
+/*
+ * Run @func on each client on the client_lru of the calling task's network
+ * namespace and return the sum of its return values.  Each call is handed
+ * @max minus the running total.  A non-zero @max stops the walk once the
+ * total reaches it; with @max == 0 the walk never stops early.
+ * Returns 0 without calling @func if nfsd_netns_ready() reports false.
+ */
+u64 nfsd_for_n_state(u64 max, u64 (*func)(struct nfs4_client *, u64))
+{
+        struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id);
+        struct nfs4_client *client, *tmp;
+        u64 total = 0;
+        if (nfsd_netns_ready(nn)) {
+                /* _safe variant: the walk tolerates @func unlinking @client. */
+                list_for_each_entry_safe(client, tmp, &nn->client_lru, cl_lru) {
+                        total += func(client, max - total);
+                        if (max != 0 && total >= max)
+                                break;
+                }
+        }
+        return total;
+}
+/*
+ * Return the first client on the client_lru of the calling task's network
+ * namespace whose cl_addr equals the first @addr_size bytes of @addr.
+ * Returns NULL when nothing matches or nfsd_netns_ready() reports false.
+ */
+struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size)
+{
+        struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id);
+        struct nfs4_client *found = NULL;
+        struct nfs4_client *cur;
+        if (nfsd_netns_ready(nn)) {
+                list_for_each_entry(cur, &nn->client_lru, cl_lru) {
+                        if (!memcmp(&cur->cl_addr, addr, addr_size)) {
+                                found = cur;
+                                break;
+                        }
+                }
+        }
+        return found;
 }
 #endif /* CONFIG_NFSD_FAULT_INJECTION */
@@ -4686,27 +4789,10 @@ nfs4_state_init(void)
 {
        int i;
-        for (i = 0; i < CLIENT_HASH_SIZE; i++) {
-                INIT_LIST_HEAD(&conf_id_hashtbl[i]);
-                INIT_LIST_HEAD(&conf_str_hashtbl[i]);
-                INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
-                INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
-                INIT_LIST_HEAD(&reclaim_str_hashtbl[i]);
-        }
-        for (i = 0; i < SESSION_HASH_SIZE; i++)
-                INIT_LIST_HEAD(&sessionid_hashtbl[i]);
        for (i = 0; i < FILE_HASH_SIZE; i++) {
                INIT_LIST_HEAD(&file_hashtbl[i]);
        }
-        for (i = 0; i < OWNER_HASH_SIZE; i++) {
-                INIT_LIST_HEAD(&ownerstr_hashtbl[i]);
-        }
-        for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++)
-                INIT_LIST_HEAD(&lockowner_ino_hashtbl[i]);
-        INIT_LIST_HEAD(&close_lru);
-        INIT_LIST_HEAD(&client_lru);
        INIT_LIST_HEAD(&del_recall_lru);
-        reclaim_str_hashtbl_size = 0;
 }
 /*
@@ -4730,12 +4816,100 @@ set_max_delegations(void)
        max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT);
 }
-/* initialization to perform when the nfsd service is started: */
+/*
+ * Set up the per-net NFSv4 state containers for @net: allocate the client,
+ * owner, lockowner-inode and session hash tables, make every bucket an
+ * empty list, and initialise the name trees, LRU lists, client lock and
+ * laundromat work item.
+ *
+ * Returns 0 on success, having taken a reference on @net with get_net().
+ * Returns -ENOMEM if any table cannot be allocated, after freeing the
+ * tables that were already allocated.
+ */
+static int nfs4_state_create_net(struct net *net)
+{
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+        const size_t bucket = sizeof(struct list_head);
+        int idx;
+        nn->conf_id_hashtbl = kmalloc(bucket * CLIENT_HASH_SIZE, GFP_KERNEL);
+        if (nn->conf_id_hashtbl == NULL)
+                return -ENOMEM;
+        nn->unconf_id_hashtbl = kmalloc(bucket * CLIENT_HASH_SIZE, GFP_KERNEL);
+        if (nn->unconf_id_hashtbl == NULL)
+                goto free_conf_id;
+        nn->ownerstr_hashtbl = kmalloc(bucket * OWNER_HASH_SIZE, GFP_KERNEL);
+        if (nn->ownerstr_hashtbl == NULL)
+                goto free_unconf_id;
+        nn->lockowner_ino_hashtbl = kmalloc(bucket * LOCKOWNER_INO_HASH_SIZE,
+                                            GFP_KERNEL);
+        if (nn->lockowner_ino_hashtbl == NULL)
+                goto free_ownerstr;
+        nn->sessionid_hashtbl = kmalloc(bucket * SESSION_HASH_SIZE, GFP_KERNEL);
+        if (nn->sessionid_hashtbl == NULL)
+                goto free_lockowner_ino;
+        /* kmalloc() does not zero memory: every bucket must be initialised. */
+        for (idx = 0; idx < CLIENT_HASH_SIZE; idx++) {
+                INIT_LIST_HEAD(&nn->conf_id_hashtbl[idx]);
+                INIT_LIST_HEAD(&nn->unconf_id_hashtbl[idx]);
+        }
+        for (idx = 0; idx < OWNER_HASH_SIZE; idx++)
+                INIT_LIST_HEAD(&nn->ownerstr_hashtbl[idx]);
+        for (idx = 0; idx < LOCKOWNER_INO_HASH_SIZE; idx++)
+                INIT_LIST_HEAD(&nn->lockowner_ino_hashtbl[idx]);
+        for (idx = 0; idx < SESSION_HASH_SIZE; idx++)
+                INIT_LIST_HEAD(&nn->sessionid_hashtbl[idx]);
+        nn->conf_name_tree = RB_ROOT;
+        nn->unconf_name_tree = RB_ROOT;
+        INIT_LIST_HEAD(&nn->client_lru);
+        INIT_LIST_HEAD(&nn->close_lru);
+        spin_lock_init(&nn->client_lock);
+        INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
+        get_net(net);
+        return 0;
+        /* Unwind in reverse allocation order; each label frees one table. */
+free_lockowner_ino:
+        kfree(nn->lockowner_ino_hashtbl);
+free_ownerstr:
+        kfree(nn->ownerstr_hashtbl);
+free_unconf_id:
+        kfree(nn->unconf_id_hashtbl);
+free_conf_id:
+        kfree(nn->conf_id_hashtbl);
+        return -ENOMEM;
+}
+/*
+ * Tear down the per-net NFSv4 state of @net: destroy every client found in
+ * the confirmed-id hash table and in the unconfirmed name tree, free the
+ * hash tables allocated by nfs4_state_create_net(), and drop the reference
+ * on @net that nfs4_state_create_net() took with get_net().
+ */
+static void
+nfs4_state_destroy_net(struct net *net)
+{
+        int i;
+        struct nfs4_client *clp = NULL;
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+        struct rb_node *node, *tmp;
+        for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+                /*
+                 * Always takes the head entry, so this only terminates if
+                 * destroy_client() removes the client from the bucket.
+                 * NOTE(review): presumably it unhashes cl_idhash; confirm
+                 * against destroy_client().
+                 */
+                while (!list_empty(&nn->conf_id_hashtbl[i])) {
+                        clp = list_entry(nn->conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
+                        destroy_client(clp);
+                }
+        }
+        node = rb_first(&nn->unconf_name_tree);
+        while (node != NULL) {
+                /* Fetch the successor before the current node is erased. */
+                tmp = node;
+                node = rb_next(tmp);
+                clp = rb_entry(tmp, struct nfs4_client, cl_namenode);
+                rb_erase(tmp, &nn->unconf_name_tree);
+                destroy_client(clp);
+        }
+        /* Free the tables in the reverse of their allocation order. */
+        kfree(nn->sessionid_hashtbl);
+        kfree(nn->lockowner_ino_hashtbl);
+        kfree(nn->ownerstr_hashtbl);
+        kfree(nn->unconf_id_hashtbl);
+        kfree(nn->conf_id_hashtbl);
+        put_net(net);
+}
 int
-nfs4_state_start(void)
+nfs4_state_start_net(struct net *net)
 {
-        struct net *net = &init_net;
        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        int ret;
@@ -4746,18 +4920,32 @@ nfs4_state_start(void)
         * to that instead and then do most of the rest of this on a per-net
         * basis.
         */
-        get_net(net);
+        if (net != &init_net)
+                return -EINVAL;
+        ret = nfs4_state_create_net(net);
+        if (ret)
+                return ret;
        nfsd4_client_tracking_init(net);
        nn->boot_time = get_seconds();
        locks_start_grace(net, &nn->nfsd4_manager);
        nn->grace_ended = false;
-        printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
+        printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
-               nfsd4_grace);
+               nn->nfsd4_grace, net);
+        queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
+        return 0;
+}
+/* initialization to perform when the nfsd service is started: */
+int
+nfs4_state_start(void)
+{
+        int ret;
        ret = set_callback_cred();
-        if (ret) {
+        if (ret)
-                ret = -ENOMEM;
+                return -ENOMEM;
-                goto out_recovery;
-        }
        laundry_wq = create_singlethread_workqueue("nfsd4");
        if (laundry_wq == NULL) {
                ret = -ENOMEM;
@@ -4766,39 +4954,34 @@ nfs4_state_start(void)
        ret = nfsd4_create_callback_queue();
        if (ret)
                goto out_free_laundry;
-        queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ);
        set_max_delegations();
        return 0;
 out_free_laundry:
        destroy_workqueue(laundry_wq);
 out_recovery:
-        nfsd4_client_tracking_exit(net);
-        put_net(net);
        return ret;
 }
-static void
+/* should be called with the state lock held */
-__nfs4_state_shutdown(void)
+void
+nfs4_state_shutdown_net(struct net *net)
 {
-        int i;
-        struct nfs4_client *clp = NULL;
        struct nfs4_delegation *dp = NULL;
        struct list_head *pos, *next, reaplist;
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+        cancel_delayed_work_sync(&nn->laundromat_work);
+        locks_end_grace(&nn->nfsd4_manager);
-        for (i = 0; i < CLIENT_HASH_SIZE; i++) {
-                while (!list_empty(&conf_id_hashtbl[i])) {
-                        clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
-                        destroy_client(clp);
-                }
-                while (!list_empty(&unconf_str_hashtbl[i])) {
-                        clp = list_entry(unconf_str_hashtbl[i].next, struct nfs4_client, cl_strhash);
-                        destroy_client(clp);
-                }
-        }
        INIT_LIST_HEAD(&reaplist);
        spin_lock(&recall_lock);
        list_for_each_safe(pos, next, &del_recall_lru) {
                dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
+                if (dp->dl_stid.sc_client->net != net)
+                        continue;
                list_move(&dp->dl_recall_lru, &reaplist);
        }
        spin_unlock(&recall_lock);
@@ -4807,22 +4990,14 @@ __nfs4_state_shutdown(void)
                unhash_delegation(dp);
        }
-        nfsd4_client_tracking_exit(&init_net);
+        nfsd4_client_tracking_exit(net);
-        put_net(&init_net);
+        nfs4_state_destroy_net(net);
 }
 void
 nfs4_state_shutdown(void)
 {
-        struct net *net = &init_net;
-        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-        cancel_delayed_work_sync(&laundromat_work);
        destroy_workqueue(laundry_wq);
-        locks_end_grace(&nn->nfsd4_manager);
-        nfs4_lock_state();
-        __nfs4_state_shutdown();
-        nfs4_unlock_state();
        nfsd4_destroy_callback_queue();
 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index fd548d155088..0dc11586682f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -53,6 +53,7 @@
 #include "vfs.h"
 #include "state.h"
 #include "cache.h"
+#include "netns.h"
 #define NFSDDBG_FACILITY                NFSDDBG_XDR
@@ -65,17 +66,17 @@
 #define NFS4_REFERRAL_FSID_MINOR        0x8000000ULL
 static __be32
-check_filename(char *str, int len, __be32 err)
+check_filename(char *str, int len)
 {
        int i;
        if (len == 0)
                return nfserr_inval;
        if (isdotent(str, len))
-                return err;
+                return nfserr_badname;
        for (i = 0; i < len; i++)
                if (str[i] == '/')
-                        return err;
+                        return nfserr_badname;
        return 0;
 }
@@ -422,6 +423,86 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
        DECODE_TAIL;
 }
+static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs)
+{
+        DECODE_HEAD;
+        u32 dummy, uid, gid;
+        char *machine_name;
+        int i;
+        int nr_secflavs;
+        /*
+         * callback_sec_params4: a count followed by that many security
+         * flavor entries.  Every entry is walked, but only the first
+         * RPC_AUTH_NULL or RPC_AUTH_UNIX entry is recorded in *cbs;
+         * cbs->flavor stays (u32)(-1) when neither kind is present.
+         * An unrecognized flavor number makes the function return
+         * nfserr_inval.
+         */
+        READ_BUF(4);
+        READ32(nr_secflavs);
+        cbs->flavor = (u32)(-1); /* sentinel: no usable flavor recorded yet */
+        for (i = 0; i < nr_secflavs; ++i) {
+                READ_BUF(4);
+                READ32(dummy); /* flavor number of this entry */
+                switch (dummy) {
+                case RPC_AUTH_NULL:
+                        /* Nothing to read */
+                        if (cbs->flavor == (u32)(-1))
+                                cbs->flavor = RPC_AUTH_NULL;
+                        break;
+                case RPC_AUTH_UNIX:
+                        READ_BUF(8);
+                        /* stamp (read and discarded) */
+                        READ32(dummy);
+                        /*
+                         * machine name: length word, then that many bytes;
+                         * machine_name is not read again in this function
+                         */
+                        READ32(dummy);
+                        READ_BUF(dummy);
+                        SAVEMEM(machine_name, dummy);
+                        /* uid, gid */
+                        READ_BUF(8);
+                        READ32(uid);
+                        READ32(gid);
+                        /* more gids: a count, then count * 4 bytes that are stepped over, not stored */
+                        READ_BUF(4);
+                        READ32(dummy);
+                        READ_BUF(dummy * 4);
+                        if (cbs->flavor == (u32)(-1)) { /* keep these credentials only if no earlier entry was recorded */
+                                cbs->uid = uid;
+                                cbs->gid = gid;
+                                cbs->flavor = RPC_AUTH_UNIX;
+                        }
+                        break;
+                case RPC_AUTH_GSS: /* stepped over only: cbs->flavor is never set from this case */
+                        dprintk("RPC_AUTH_GSS callback secflavor "
+                                "not supported!\n");
+                        READ_BUF(8);
+                        /* gcbp_service */
+                        READ32(dummy);
+                        /* gcbp_handle_from_server: length word, then the handle bytes */
+                        READ32(dummy);
+                        READ_BUF(dummy);
+                        p += XDR_QUADLEN(dummy);
+                        /* gcbp_handle_from_client: length word, then the handle bytes */
+                        READ_BUF(4);
+                        READ32(dummy);
+                        READ_BUF(dummy);
+                        break;
+                default:
+                        dprintk("Illegal callback secflavor\n");
+                        return nfserr_inval; /* leaves directly, without reaching DECODE_TAIL */
+                }
+        }
+        DECODE_TAIL;
+}
+/*
+ * Decode the arguments of BACKCHANNEL_CTL: the callback program number,
+ * stored in bc->bc_cb_program, followed by the callback security
+ * parameters, decoded into bc->bc_cb_sec by nfsd4_decode_cb_sec().
+ *
+ * Returns the status of nfsd4_decode_cb_sec() when that decoder fails;
+ * otherwise falls through to DECODE_TAIL.
+ */
+static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc)
+{
+        DECODE_HEAD;
+        READ_BUF(4);
+        READ32(bc->bc_cb_program);
+        /*
+         * The nested decoder's result must be checked here: when it is
+         * discarded, a failure while parsing the security parameters is
+         * never propagated to the caller.
+         */
+        status = nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec);
+        if (status)
+                return status;
+        DECODE_TAIL;
+}
 static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
 {
        DECODE_HEAD;
@@ -490,7 +571,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
        READ32(create->cr_namelen);
        READ_BUF(create->cr_namelen);
        SAVEMEM(create->cr_name, create->cr_namelen);
-        if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
+        if ((status = check_filename(create->cr_name, create->cr_namelen)))
                return status;
        status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
@@ -522,7 +603,7 @@ nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
        READ32(link->li_namelen);
        READ_BUF(link->li_namelen);
        SAVEMEM(link->li_name, link->li_namelen);
-        if ((status = check_filename(link->li_name, link->li_namelen, nfserr_inval)))
+        if ((status = check_filename(link->li_name, link->li_namelen)))
                return status;
        DECODE_TAIL;
@@ -616,7 +697,7 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup
        READ32(lookup->lo_len);
        READ_BUF(lookup->lo_len);
        SAVEMEM(lookup->lo_name, lookup->lo_len);
-        if ((status = check_filename(lookup->lo_name, lookup->lo_len, nfserr_noent)))
+        if ((status = check_filename(lookup->lo_name, lookup->lo_len)))
                return status;
        DECODE_TAIL;
@@ -780,7 +861,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
                READ32(open->op_fname.len);
                READ_BUF(open->op_fname.len);
                SAVEMEM(open->op_fname.data, open->op_fname.len);
-                if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval)))
+                if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
                        return status;
                break;
        case NFS4_OPEN_CLAIM_PREVIOUS:
@@ -795,7 +876,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
                READ32(open->op_fname.len);
                READ_BUF(open->op_fname.len);
                SAVEMEM(open->op_fname.data, open->op_fname.len);
-                if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval)))
+                if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
                        return status;
                break;
        case NFS4_OPEN_CLAIM_FH:
@@ -907,7 +988,7 @@ nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove
        READ32(remove->rm_namelen);
        READ_BUF(remove->rm_namelen);
        SAVEMEM(remove->rm_name, remove->rm_namelen);
-        if ((status = check_filename(remove->rm_name, remove->rm_namelen, nfserr_noent)))
+        if ((status = check_filename(remove->rm_name, remove->rm_namelen)))
                return status;
        DECODE_TAIL;
@@ -925,9 +1006,9 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
        READ32(rename->rn_tnamelen);
        READ_BUF(rename->rn_tnamelen);
        SAVEMEM(rename->rn_tname, rename->rn_tnamelen);
-        if ((status = check_filename(rename->rn_sname, rename->rn_snamelen, nfserr_noent)))
+        if ((status = check_filename(rename->rn_sname, rename->rn_snamelen)))
                return status;
-        if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen, nfserr_inval)))
+        if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen)))
                return status;
        DECODE_TAIL;
@@ -954,8 +1035,7 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
        READ32(secinfo->si_namelen);
        READ_BUF(secinfo->si_namelen);
        SAVEMEM(secinfo->si_name, secinfo->si_namelen);
-        status = check_filename(secinfo->si_name, secinfo->si_namelen,
+        status = check_filename(secinfo->si_name, secinfo->si_namelen);
-                                                                nfserr_noent);
        if (status)
                return status;
        DECODE_TAIL;
@@ -1026,31 +1106,14 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s
 static __be32
 nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify)
 {
-#if 0
-        struct nfsd4_compoundargs save = {
-                .p = argp->p,
-                .end = argp->end,
-                .rqstp = argp->rqstp,
-        };
-        u32             ve_bmval[2];
-        struct iattr    ve_iattr;           /* request */
-        struct nfs4_acl *ve_acl;            /* request */
-#endif
        DECODE_HEAD;
        if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval)))
                goto out;
        /* For convenience's sake, we compare raw xdr'd attributes in
-         * nfsd4_proc_verify; however we still decode here just to return
+         * nfsd4_proc_verify */
-         * correct error in case of bad xdr. */
-#if 0
-        status = nfsd4_decode_fattr(ve_bmval, &ve_iattr, &ve_acl);
-        if (status == nfserr_inval) {
-                status = nfserrno(status);
-                goto out;
-        }
-#endif
        READ_BUF(4);
        READ32(verify->ve_attrlen);
        READ_BUF(verify->ve_attrlen);
@@ -1063,7 +1126,6 @@ static __be32
 nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
 {
        int avail;
-        int v;
        int len;
        DECODE_HEAD;
@@ -1087,27 +1149,26 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
                                __FILE__, __LINE__);
                goto xdr_error;
        }
-        argp->rqstp->rq_vec[0].iov_base = p;
+        write->wr_head.iov_base = p;
-        argp->rqstp->rq_vec[0].iov_len = avail;
+        write->wr_head.iov_len = avail;
-        v = 0;
+        WARN_ON(avail != (XDR_QUADLEN(avail) << 2));
-        len = write->wr_buflen;
+        write->wr_pagelist = argp->pagelist;
-        while (len > argp->rqstp->rq_vec[v].iov_len) {
-                len -= argp->rqstp->rq_vec[v].iov_len;
+        len = XDR_QUADLEN(write->wr_buflen) << 2;
-                v++;
+        if (len >= avail) {
-                argp->rqstp->rq_vec[v].iov_base = page_address(argp->pagelist[0]);
+                int pages;
-                argp->pagelist++;
-                if (argp->pagelen >= PAGE_SIZE) {
+                len -= avail;
-                        argp->rqstp->rq_vec[v].iov_len = PAGE_SIZE;
-                        argp->pagelen -= PAGE_SIZE;
+                pages = len >> PAGE_SHIFT;
-                } else {
+                argp->pagelist += pages;
-                        argp->rqstp->rq_vec[v].iov_len = argp->pagelen;
+                argp->pagelen -= pages * PAGE_SIZE;
-                        argp->pagelen -= len;
+                len -= pages * PAGE_SIZE;
-                }
+                argp->p = (__be32 *)page_address(argp->pagelist[0]);
+                argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE);
        }
-        argp->end = (__be32*) (argp->rqstp->rq_vec[v].iov_base + argp->rqstp->rq_vec[v].iov_len);
+        argp->p += XDR_QUADLEN(len);
-        argp->p = (__be32*)  (argp->rqstp->rq_vec[v].iov_base + (XDR_QUADLEN(len) << 2));
-        argp->rqstp->rq_vec[v].iov_len = len;
-        write->wr_vlen = v+1;
        DECODE_TAIL;
 }
@@ -1237,11 +1298,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
                            struct nfsd4_create_session *sess)
 {
        DECODE_HEAD;
        u32 dummy;
-        char *machine_name;
-        int i;
-        int nr_secflavs;
        READ_BUF(16);
        COPYMEM(&sess->clientid, 8);
@@ -1282,58 +1339,9 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
                goto xdr_error;
        }
-        READ_BUF(8);
+        READ_BUF(4);
        READ32(sess->callback_prog);
+        nfsd4_decode_cb_sec(argp, &sess->cb_sec);
-        /* callback_sec_params4 */
-        READ32(nr_secflavs);
-        for (i = 0; i < nr_secflavs; ++i) {
-                READ_BUF(4);
-                READ32(dummy);
-                switch (dummy) {
-                case RPC_AUTH_NULL:
-                        /* Nothing to read */
-                        break;
-                case RPC_AUTH_UNIX:
-                        READ_BUF(8);
-                        /* stamp */
-                        READ32(dummy);
-                        /* machine name */
-                        READ32(dummy);
-                        READ_BUF(dummy);
-                        SAVEMEM(machine_name, dummy);
-                        /* uid, gid */
-                        READ_BUF(8);
-                        READ32(sess->uid);
-                        READ32(sess->gid);
-                        /* more gids */
-                        READ_BUF(4);
-                        READ32(dummy);
-                        READ_BUF(dummy * 4);
-                        break;
-                case RPC_AUTH_GSS:
-                        dprintk("RPC_AUTH_GSS callback secflavor "
-                                "not supported!\n");
-                        READ_BUF(8);
-                        /* gcbp_service */
-                        READ32(dummy);
-                        /* gcbp_handle_from_server */
-                        READ32(dummy);
-                        READ_BUF(dummy);
-                        p += XDR_QUADLEN(dummy);
-                        /* gcbp_handle_from_client */
-                        READ_BUF(4);
-                        READ32(dummy);
-                        READ_BUF(dummy);
-                        break;
-                default:
-                        dprintk("Illegal callback secflavor\n");
-                        return nfserr_inval;
-                }
-        }
        DECODE_TAIL;
 }
@@ -1528,7 +1536,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
        [OP_RELEASE_LOCKOWNER]  = (nfsd4_dec)nfsd4_decode_notsupp,
        /* new operations for NFSv4.1 */
-        [OP_BACKCHANNEL_CTL]    = (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_BACKCHANNEL_CTL]    = (nfsd4_dec)nfsd4_decode_backchannel_ctl,
        [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,
        [OP_EXCHANGE_ID]        = (nfsd4_dec)nfsd4_decode_exchange_id,
        [OP_CREATE_SESSION]     = (nfsd4_dec)nfsd4_decode_create_session,
@@ -1568,12 +1576,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
        bool cachethis = false;
        int i;
-        /*
-         * XXX: According to spec, we should check the tag
-         * for UTF-8 compliance.  I'm postponing this for
-         * now because it seems that some clients do use
-         * binary tags.
-         */
        READ_BUF(4);
        READ32(argp->taglen);
        READ_BUF(argp->taglen + 8);
@@ -1603,38 +1605,8 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
                op = &argp->ops[i];
                op->replay = NULL;
-                /*
+                READ_BUF(4);
-                 * We can't use READ_BUF() here because we need to handle
+                READ32(op->opnum);
-                 * a missing opcode as an OP_WRITE + 1. So we need to check
-                 * to see if we're truly at the end of our buffer or if there
-                 * is another page we need to flip to.
-                 */
-                if (argp->p == argp->end) {
-                        if (argp->pagelen < 4) {
-                                /* There isn't an opcode still on the wire */
-                                op->opnum = OP_WRITE + 1;
-                                op->status = nfserr_bad_xdr;
-                                argp->opcnt = i+1;
-                                break;
-                        }
-                        /*
-                         * False alarm. We just hit a page boundary, but there
-                         * is still data available.  Move pointer across page
-                         * boundary.  *snip from READ_BUF*
-                         */
-                        argp->p = page_address(argp->pagelist[0]);
-                        argp->pagelist++;
-                        if (argp->pagelen < PAGE_SIZE) {
-                                argp->end = argp->p + (argp->pagelen>>2);
-                                argp->pagelen = 0;
-                        } else {
-                                argp->end = argp->p + (PAGE_SIZE>>2);
-                                argp->pagelen -= PAGE_SIZE;
-                        }
-                }
-                op->opnum = ntohl(*argp->p++);
                if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP)
                        op->status = ops->decoders[op->opnum](argp, &op->u);
@@ -2014,6 +1986,22 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
        return 0;
 }
+/*
+ * Fetch into @stat the attributes of the location reached by walking up
+ * from the export's path.  Starting at exp->ex_path, follow_up() is applied
+ * repeatedly until it returns 0 or the resulting dentry is no longer the
+ * root of its mount; vfs_getattr() is then called on that location.
+ *
+ * Returns the value returned by vfs_getattr().
+ */
+static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
+{
+        struct path parent = exp->ex_path;
+        int ret;
+        /* work on a referenced copy so the export's own path is untouched */
+        path_get(&parent);
+        for (;;) {
+                if (!follow_up(&parent))
+                        break;
+                if (parent.dentry != parent.mnt->mnt_root)
+                        break;
+        }
+        ret = vfs_getattr(parent.mnt, parent.dentry, stat);
+        path_put(&parent);
+        return ret;
+}
 /*
 * Note: @fhp can be NULL; in this case, we might have to compose the filehandle
 * ourselves.
@@ -2048,6 +2036,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
                .mnt    = exp->ex_path.mnt,
                .dentry = dentry,
        };
+        struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
        BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
        BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
@@ -2208,7 +2197,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
        if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
                if ((buflen -= 4) < 0)
                        goto out_resource;
-                WRITE32(nfsd4_lease);
+                WRITE32(nn->nfsd4_lease);
        }
        if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
                if ((buflen -= 4) < 0)
@@ -2430,18 +2419,8 @@ out_acl:
                 * and this is the root of a cross-mounted filesystem.
                 */
                if (ignore_crossmnt == 0 &&
-                    dentry == exp->ex_path.mnt->mnt_root) {
+                    dentry == exp->ex_path.mnt->mnt_root)
-                        struct path path = exp->ex_path;
+                        get_parent_attributes(exp, &stat);
-                        path_get(&path);
-                        while (follow_up(&path)) {
-                                if (path.dentry != path.mnt->mnt_root)
-                                        break;
-                        }
-                        err = vfs_getattr(path.mnt, path.dentry, &stat);
-                        path_put(&path);
-                        if (err)
-                                goto out_nfserr;
-                }
                WRITE64(stat.ino);
        }
        if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
@@ -2927,7 +2906,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
                  struct nfsd4_read *read)
 {
        u32 eof;
-        int v, pn;
+        int v;
+        struct page *page;
        unsigned long maxcount; 
        long len;
        __be32 *p;
@@ -2946,11 +2926,15 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
        len = maxcount;
        v = 0;
        while (len > 0) {
-                pn = resp->rqstp->rq_resused++;
+                page = *(resp->rqstp->rq_next_page);
-                resp->rqstp->rq_vec[v].iov_base =
+                if (!page) { /* ran out of pages */
-                        page_address(resp->rqstp->rq_respages[pn]);
+                        maxcount -= len;
+                        break;
+                }
+                resp->rqstp->rq_vec[v].iov_base = page_address(page);
                resp->rqstp->rq_vec[v].iov_len =
                        len < PAGE_SIZE ? len : PAGE_SIZE;
+                resp->rqstp->rq_next_page++;
                v++;
                len -= PAGE_SIZE;
        }
@@ -2996,8 +2980,10 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
                return nfserr;
        if (resp->xbuf->page_len)
                return nfserr_resource;
+        if (!*resp->rqstp->rq_next_page)
+                return nfserr_resource;
-        page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]);
+        page = page_address(*(resp->rqstp->rq_next_page++));
        maxcount = PAGE_SIZE;
        RESERVE_SPACE(4);
@@ -3045,6 +3031,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
                return nfserr;
        if (resp->xbuf->page_len)
                return nfserr_resource;
+        if (!*resp->rqstp->rq_next_page)
+                return nfserr_resource;
        RESERVE_SPACE(NFS4_VERIFIER_SIZE);
        savep = p;
@@ -3071,7 +3059,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
                goto err_no_verf;
        }
-        page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]);
+        page = page_address(*(resp->rqstp->rq_next_page++));
        readdir->common.err = 0;
        readdir->buflen = maxcount;
        readdir->buffer = page;
@@ -3094,8 +3082,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
        p = readdir->buffer;
        *p++ = 0;       /* no more entries */
        *p++ = htonl(readdir->common.err == nfserr_eof);
-        resp->xbuf->page_len = ((char*)p) - (char*)page_address(
+        resp->xbuf->page_len = ((char*)p) -
-                resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
+                (char*)page_address(*(resp->rqstp->rq_next_page-1));
        /* Use rest of head for padding and remaining ops: */
        resp->xbuf->tail[0].iov_base = tailbase;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index dab350dfc376..74934284d9a7 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -19,7 +19,7 @@
 #include "idmap.h"
 #include "nfsd.h"
 #include "cache.h"
-#include "fault_inject.h"
+#include "state.h"
 #include "netns.h"
 /*
@@ -186,9 +186,6 @@ static struct file_operations supported_enctypes_ops = {
 };
 #endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
-extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
-extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
 static const struct file_operations pool_stats_operations = {
        .open           = nfsd_pool_stats_open,
        .read           = seq_read,
@@ -399,6 +396,8 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
 {
        char *mesg = buf;
        int rv;
+        struct net *net = &init_net;
        if (size > 0) {
                int newthreads;
                rv = get_int(&mesg, &newthreads);
@@ -406,11 +405,11 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
                        return rv;
                if (newthreads < 0)
                        return -EINVAL;
-                rv = nfsd_svc(newthreads);
+                rv = nfsd_svc(newthreads, net);
                if (rv < 0)
                        return rv;
        } else
-                rv = nfsd_nrthreads();
+                rv = nfsd_nrthreads(net);
        return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv);
 }
@@ -448,9 +447,10 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
        int len;
        int npools;
        int *nthreads;
+        struct net *net = &init_net;
        mutex_lock(&nfsd_mutex);
-        npools = nfsd_nrpools();
+        npools = nfsd_nrpools(net);
        if (npools == 0) {
                /*
                 * NFS is shut down.  The admin can start it by
@@ -478,12 +478,12 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
                        if (nthreads[i] < 0)
                                goto out_free;
                }
-                rv = nfsd_set_nrthreads(i, nthreads);
+                rv = nfsd_set_nrthreads(i, nthreads, net);
                if (rv)
                        goto out_free;
        }
-        rv = nfsd_get_nrthreads(npools, nthreads);
+        rv = nfsd_get_nrthreads(npools, nthreads, net);
        if (rv)
                goto out_free;
@@ -510,11 +510,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
        unsigned minor;
        ssize_t tlen = 0;
        char *sep;
+        struct net *net = &init_net;
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        if (size>0) {
-                if (nfsd_serv)
+                if (nn->nfsd_serv)
                        /* Cannot change versions without updating
-                         * nfsd_serv->sv_xdrsize, and reallocing
+                         * nn->nfsd_serv->sv_xdrsize, and reallocing
                         * rq_argp and rq_resp
                         */
                        return -EBUSY;
@@ -645,11 +647,13 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size)
 * Zero-length write.  Return a list of NFSD's current listener
 * transports.
 */
-static ssize_t __write_ports_names(char *buf)
+static ssize_t __write_ports_names(char *buf, struct net *net)
 {
-        if (nfsd_serv == NULL)
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+        if (nn->nfsd_serv == NULL)
                return 0;
-        return svc_xprt_names(nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT);
+        return svc_xprt_names(nn->nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT);
 }
 /*
@@ -657,28 +661,28 @@ static ssize_t __write_ports_names(char *buf)
 * a socket of a supported family/protocol, and we use it as an
 * nfsd listener.
 */
-static ssize_t __write_ports_addfd(char *buf)
+static ssize_t __write_ports_addfd(char *buf, struct net *net)
 {
        char *mesg = buf;
        int fd, err;
-        struct net *net = &init_net;
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        err = get_int(&mesg, &fd);
        if (err != 0 || fd < 0)
                return -EINVAL;
-        err = nfsd_create_serv();
+        err = nfsd_create_serv(net);
        if (err != 0)
                return err;
-        err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
+        err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
        if (err < 0) {
                nfsd_destroy(net);
                return err;
        }
        /* Decrease the count, but don't shut down the service */
-        nfsd_serv->sv_nrthreads--;
+        nn->nfsd_serv->sv_nrthreads--;
        return err;
 }
@@ -686,12 +690,12 @@ static ssize_t __write_ports_addfd(char *buf)
 * A transport listener is added by writing it's transport name and
 * a port number.
 */
-static ssize_t __write_ports_addxprt(char *buf)
+static ssize_t __write_ports_addxprt(char *buf, struct net *net)
 {
        char transport[16];
        struct svc_xprt *xprt;
        int port, err;
-        struct net *net = &init_net;
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        if (sscanf(buf, "%15s %5u", transport, &port) != 2)
                return -EINVAL;
@@ -699,25 +703,25 @@ static ssize_t __write_ports_addxprt(char *buf)
        if (port < 1 || port > USHRT_MAX)
                return -EINVAL;
-        err = nfsd_create_serv();
+        err = nfsd_create_serv(net);
        if (err != 0)
                return err;
-        err = svc_create_xprt(nfsd_serv, transport, net,
+        err = svc_create_xprt(nn->nfsd_serv, transport, net,
                                PF_INET, port, SVC_SOCK_ANONYMOUS);
        if (err < 0)
                goto out_err;
-        err = svc_create_xprt(nfsd_serv, transport, net,
+        err = svc_create_xprt(nn->nfsd_serv, transport, net,
                                PF_INET6, port, SVC_SOCK_ANONYMOUS);
        if (err < 0 && err != -EAFNOSUPPORT)
                goto out_close;
        /* Decrease the count, but don't shut down the service */
-        nfsd_serv->sv_nrthreads--;
+        nn->nfsd_serv->sv_nrthreads--;
        return 0;
 out_close:
-        xprt = svc_find_xprt(nfsd_serv, transport, net, PF_INET, port);
+        xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port);
        if (xprt != NULL) {
                svc_close_xprt(xprt);
                svc_xprt_put(xprt);
@@ -727,16 +731,17 @@ out_err:
        return err;
 }
-static ssize_t __write_ports(struct file *file, char *buf, size_t size)
+static ssize_t __write_ports(struct file *file, char *buf, size_t size,
+                             struct net *net)
 {
        if (size == 0)
-                return __write_ports_names(buf);
+                return __write_ports_names(buf, net);
        if (isdigit(buf[0]))
-                return __write_ports_addfd(buf);
+                return __write_ports_addfd(buf, net);
        if (isalpha(buf[0]))
-                return __write_ports_addxprt(buf);
+                return __write_ports_addxprt(buf, net);
        return -EINVAL;
 }
@@ -787,9 +792,10 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
 static ssize_t write_ports(struct file *file, char *buf, size_t size)
 {
        ssize_t rv;
+        struct net *net = &init_net;
        mutex_lock(&nfsd_mutex);
-        rv = __write_ports(file, buf, size);
+        rv = __write_ports(file, buf, size, net);
        mutex_unlock(&nfsd_mutex);
        return rv;
 }
@@ -821,6 +827,9 @@ int nfsd_max_blksize;
 static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
 {
        char *mesg = buf;
+        struct net *net = &init_net;
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        if (size > 0) {
                int bsize;
                int rv = get_int(&mesg, &bsize);
@@ -835,7 +844,7 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
                        bsize = NFSSVC_MAXBLKSIZE;
                bsize &= ~(1024-1);
                mutex_lock(&nfsd_mutex);
-                if (nfsd_serv) {
+                if (nn->nfsd_serv) {
                        mutex_unlock(&nfsd_mutex);
                        return -EBUSY;
                }
@@ -848,13 +857,14 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
 }
 #ifdef CONFIG_NFSD_V4
-static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
+static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
+                                  time_t *time, struct nfsd_net *nn)
 {
        char *mesg = buf;
        int rv, i;
        if (size > 0) {
-                if (nfsd_serv)
+                if (nn->nfsd_serv)
                        return -EBUSY;
                rv = get_int(&mesg, &i);
                if (rv)
@@ -879,12 +889,13 @@ static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, tim
        return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time);
 }
-static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
+static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size,
+                                time_t *time, struct nfsd_net *nn)
 {
        ssize_t rv;
        mutex_lock(&nfsd_mutex);
-        rv = __nfsd4_write_time(file, buf, size, time);
+        rv = __nfsd4_write_time(file, buf, size, time, nn);
        mutex_unlock(&nfsd_mutex);
        return rv;
 }
@@ -912,7 +923,8 @@ static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_
 */
 static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
 {
-        return nfsd4_write_time(file, buf, size, &nfsd4_lease);
+        struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+        return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn);
 }
 /**
@@ -927,17 +939,19 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
 */
 static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
 {
-        return nfsd4_write_time(file, buf, size, &nfsd4_grace);
+        struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+        return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn);
 }
-static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
+static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size,
+                                   struct nfsd_net *nn)
 {
        char *mesg = buf;
        char *recdir;
        int len, status;
        if (size > 0) {
-                if (nfsd_serv)
+                if (nn->nfsd_serv)
                        return -EBUSY;
                if (size > PATH_MAX || buf[size-1] != '\n')
                        return -EINVAL;
@@ -981,9 +995,10 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
 {
        ssize_t rv;
+        struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
        mutex_lock(&nfsd_mutex);
-        rv = __write_recoverydir(file, buf, size);
+        rv = __write_recoverydir(file, buf, size, nn);
        mutex_unlock(&nfsd_mutex);
        return rv;
 }
@@ -1063,6 +1078,7 @@ int nfsd_net_id;
 static __net_init int nfsd_init_net(struct net *net)
 {
        int retval;
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        retval = nfsd_export_init(net);
        if (retval)
@@ -1070,6 +1086,8 @@ static __net_init int nfsd_init_net(struct net *net)
        retval = nfsd_idmap_init(net);
        if (retval)
                goto out_idmap_error;
+        nn->nfsd4_lease = 90;   /* default lease time */
+        nn->nfsd4_grace = 90;
        return 0;
 out_idmap_error:
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 80d5ce40aadb..de23db255c69 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -55,7 +55,6 @@ extern struct svc_version	nfsd_version2, nfsd_version3,
                                nfsd_version4;
 extern u32                      nfsd_supported_minorversion;
 extern struct mutex             nfsd_mutex;
-extern struct svc_serv          *nfsd_serv;
 extern spinlock_t               nfsd_drc_lock;
 extern unsigned int             nfsd_drc_max_mem;
 extern unsigned int             nfsd_drc_mem_used;
@@ -65,26 +64,17 @@ extern const struct seq_operations nfs_exports_op;
 /*
 * Function prototypes.
 */
-int             nfsd_svc(int nrservs);
+int             nfsd_svc(int nrservs, struct net *net);
 int             nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp);
-int             nfsd_nrthreads(void);
+int             nfsd_nrthreads(struct net *);
-int             nfsd_nrpools(void);
+int             nfsd_nrpools(struct net *);
-int             nfsd_get_nrthreads(int n, int *);
+int             nfsd_get_nrthreads(int n, int *, struct net *);
-int             nfsd_set_nrthreads(int n, int *);
+int             nfsd_set_nrthreads(int n, int *, struct net *);
 int             nfsd_pool_stats_open(struct inode *, struct file *);
 int             nfsd_pool_stats_release(struct inode *, struct file *);
-static inline void nfsd_destroy(struct net *net)
+void            nfsd_destroy(struct net *net);
-{
-        int destroy = (nfsd_serv->sv_nrthreads == 1);
-        if (destroy)
-                svc_shutdown_net(nfsd_serv, net);
-        svc_destroy(nfsd_serv);
-        if (destroy)
-                nfsd_serv = NULL;
-}
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
 #ifdef CONFIG_NFSD_V2_ACL
@@ -103,7 +93,7 @@ enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL };
 int nfsd_vers(int vers, enum vers_op change);
 int nfsd_minorversion(u32 minorversion, enum vers_op change);
 void nfsd_reset_versions(void);
-int nfsd_create_serv(void);
+int nfsd_create_serv(struct net *net);
 extern int nfsd_max_blksize;
@@ -121,7 +111,9 @@ void nfs4_state_init(void);
 int nfsd4_init_slabs(void);
 void nfsd4_free_slabs(void);
 int nfs4_state_start(void);
+int nfs4_state_start_net(struct net *net);
 void nfs4_state_shutdown(void);
+void nfs4_state_shutdown_net(struct net *net);
 void nfs4_reset_lease(time_t leasetime);
 int nfs4_reset_recoverydir(char *recdir);
 char * nfs4_recoverydir(void);
@@ -130,7 +122,9 @@ static inline void nfs4_state_init(void) { }
 static inline int nfsd4_init_slabs(void) { return 0; }
 static inline void nfsd4_free_slabs(void) { }
 static inline int nfs4_state_start(void) { return 0; }
+static inline int nfs4_state_start_net(struct net *net) { return 0; }
 static inline void nfs4_state_shutdown(void) { }
+static inline void nfs4_state_shutdown_net(struct net *net) { }
 static inline void nfs4_reset_lease(time_t leasetime) { }
 static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
 static inline char * nfs4_recoverydir(void) {return NULL; }
@@ -265,16 +259,8 @@ void		nfsd_lockd_shutdown(void);
 /* Check for dir entries '.' and '..' */
 #define isdotent(n, l)  (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.'))
-/*
- * Time of server startup
- */
-extern struct timeval   nfssvc_boot;
 #ifdef CONFIG_NFSD_V4
-extern time_t nfsd4_lease;
-extern time_t nfsd4_grace;
 /* before processing a COMPOUND operation, we have to check that there
 * is enough space in the buffer for XDR encode to succeed.  otherwise,
 * we might process an operation with side effects, and be unable to
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 032af381b3aa..814afaa4458a 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -572,7 +572,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
                if (inode)
                        _fh_update(fhp, exp, dentry);
-                if (fhp->fh_handle.fh_fileid_type == 255) {
+                if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
                        fh_put(fhp);
                        return nfserr_opnotsupp;
                }
@@ -603,7 +603,7 @@ fh_update(struct svc_fh *fhp)
                        goto out;
                _fh_update(fhp, fhp->fh_export, dentry);
-                if (fhp->fh_handle.fh_fileid_type == 255)
+                if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
                        return nfserr_opnotsupp;
        }
 out:
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 2013aa001dab..cee62ab9d4a3 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -11,7 +11,6 @@
 #include <linux/module.h>
 #include <linux/fs_struct.h>
 #include <linux/swap.h>
-#include <linux/nsproxy.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/svcsock.h>
@@ -22,19 +21,19 @@
 #include "nfsd.h"
 #include "cache.h"
 #include "vfs.h"
+#include "netns.h"
 #define NFSDDBG_FACILITY        NFSDDBG_SVC
 extern struct svc_program       nfsd_program;
 static int                      nfsd(void *vrqstp);
-struct timeval                  nfssvc_boot;
 /*
- * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members
+ * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and the members
 * of the svc_serv struct. In particular, ->sv_nrthreads but also to some
 * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt
 *
- * If (out side the lock) nfsd_serv is non-NULL, then it must point to a
+ * If (outside the lock) nn->nfsd_serv is non-NULL, then it must point to a
 * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number
 * of nfsd threads must exist and each must listed in ->sp_all_threads in each
 * entry of ->sv_pools[].
@@ -52,7 +51,6 @@ struct timeval			nfssvc_boot;
 *      nfsd_versions
 */
 DEFINE_MUTEX(nfsd_mutex);
-struct svc_serv                 *nfsd_serv;
 /*
 * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used.
@@ -173,28 +171,32 @@ int nfsd_minorversion(u32 minorversion, enum vers_op change)
 */
 #define NFSD_MAXSERVS           8192
-int nfsd_nrthreads(void)
+int nfsd_nrthreads(struct net *net)
 {
        int rv = 0;
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        mutex_lock(&nfsd_mutex);
-        if (nfsd_serv)
+        if (nn->nfsd_serv)
-                rv = nfsd_serv->sv_nrthreads;
+                rv = nn->nfsd_serv->sv_nrthreads;
        mutex_unlock(&nfsd_mutex);
        return rv;
 }
-static int nfsd_init_socks(void)
+static int nfsd_init_socks(struct net *net)
 {
        int error;
-        if (!list_empty(&nfsd_serv->sv_permsocks))
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+        if (!list_empty(&nn->nfsd_serv->sv_permsocks))
                return 0;
-        error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, NFS_PORT,
+        error = svc_create_xprt(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT,
                                        SVC_SOCK_DEFAULTS);
        if (error < 0)
                return error;
-        error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, NFS_PORT,
+        error = svc_create_xprt(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT,
                                        SVC_SOCK_DEFAULTS);
        if (error < 0)
                return error;
@@ -202,14 +204,15 @@ static int nfsd_init_socks(void)
        return 0;
 }
-static bool nfsd_up = false;
+static int nfsd_users = 0;
-static int nfsd_startup(int nrservs)
+/*
+ * Set up the state that is not tied to one network namespace: the
+ * readahead cache (nfsd_racache_init) and nfs4_state_start().  Only the
+ * caller that moves nfsd_users from 0 to 1 does the work; later callers
+ * just take a reference.  Returns 0 on success, or the error of the
+ * failing step, in which case the reference taken here is dropped again
+ * so that a later call retries the initialisation instead of skipping it.
+ */
+static int nfsd_startup_generic(int nrservs)
 {
        int ret;
-        if (nfsd_up)
+        if (nfsd_users++)
                return 0;
        /*
         * Readahead param cache - will no-op if it already exists.
         * (Note therefore results will be suboptimal if number of
@@ -218,43 +221,79 @@ static int nfsd_startup(int nrservs)
        ret = nfsd_racache_init(2*nrservs);
        if (ret)
-                return ret;
+                goto dec_users;
-        ret = nfsd_init_socks();
+        ret = nfs4_state_start();
        if (ret)
                goto out_racache;
-        ret = lockd_up(&init_net);
+        return 0;
+out_racache:
+        nfsd_racache_shutdown();
+dec_users:
+        /* undo the increment above, else the next startup would skip init */
+        nfsd_users--;
+        return ret;
+}
+static void nfsd_shutdown_generic(void) /* drop one user of the state set up by nfsd_startup_generic() */
+{
+        if (--nfsd_users) /* other users remain: leave the shared state alone */
+                return;
+        nfs4_state_shutdown(); /* last user: undo the startup steps in reverse order */
+        nfsd_racache_shutdown();
+}
+static int nfsd_startup_net(int nrservs, struct net *net)
+{
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+        int ret;
+        if (nn->nfsd_net_up)
+                return 0;
+        ret = nfsd_startup_generic(nrservs);
        if (ret)
-                goto out_racache;
+                return ret;
-        ret = nfs4_state_start();
+        ret = nfsd_init_socks(net);
+        if (ret)
+                goto out_socks;
+        ret = lockd_up(net);
+        if (ret)
+                goto out_socks;
+        ret = nfs4_state_start_net(net);
        if (ret)
                goto out_lockd;
-        nfsd_up = true;
+        nn->nfsd_net_up = true;
        return 0;
 out_lockd:
-        lockd_down(&init_net);
+        lockd_down(net);
-out_racache:
+out_socks:
-        nfsd_racache_shutdown();
+        nfsd_shutdown_generic();
        return ret;
 }
-static void nfsd_shutdown(void)
+static void nfsd_shutdown_net(struct net *net)
 {
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+        nfs4_state_shutdown_net(net);
+        lockd_down(net);
+        nn->nfsd_net_up = false;
+        nfsd_shutdown_generic();
+}
+static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
+{
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        /*
         * write_ports can create the server without actually starting
         * any threads--if we get shut down before any threads are
         * started, then nfsd_last_thread will be run before any of this
         * other initialization has been done.
         */
-        if (!nfsd_up)
+        if (!nn->nfsd_net_up)
                return;
-        nfs4_state_shutdown();
+        nfsd_shutdown_net(net);
-        lockd_down(&init_net);
-        nfsd_racache_shutdown();
-        nfsd_up = false;
-}
-static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
-{
-        nfsd_shutdown();
        svc_rpcb_cleanup(serv, net);
@@ -327,69 +366,84 @@ static int nfsd_get_default_max_blksize(void)
        return ret;
 }
-int nfsd_create_serv(void)
+/*
+ * Create the svc_serv for @net if it does not exist yet, otherwise just
+ * take another reference on it with svc_get().  Must be called with
+ * nfsd_mutex held.  Returns 0 on success, -ENOMEM if the serv cannot be
+ * allocated, or the error from svc_bind(); on failure nn->nfsd_serv is
+ * left NULL.
+ */
+int nfsd_create_serv(struct net *net)
 {
        int error;
-        struct net *net = current->nsproxy->net_ns;
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        WARN_ON(!mutex_is_locked(&nfsd_mutex));
-        if (nfsd_serv) {
+        if (nn->nfsd_serv) {
-                svc_get(nfsd_serv);
+                svc_get(nn->nfsd_serv);
                return 0;
        }
        if (nfsd_max_blksize == 0)
                nfsd_max_blksize = nfsd_get_default_max_blksize();
        nfsd_reset_versions();
-        nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
+        nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
                                      nfsd_last_thread, nfsd, THIS_MODULE);
-        if (nfsd_serv == NULL)
+        if (nn->nfsd_serv == NULL)
                return -ENOMEM;
-        error = svc_bind(nfsd_serv, net);
+        error = svc_bind(nn->nfsd_serv, net);
        if (error < 0) {
-                svc_destroy(nfsd_serv);
+                svc_destroy(nn->nfsd_serv);
+                /*
+                 * Ours was the only reference, so do not keep a pointer
+                 * to the destroyed serv (same rule as nfsd_destroy()).
+                 */
+                nn->nfsd_serv = NULL;
                return error;
        }
        set_max_drc();
-        do_gettimeofday(&nfssvc_boot);          /* record boot time */
+        do_gettimeofday(&nn->nfssvc_boot);              /* record boot time */
        return 0;
 }
-int nfsd_nrpools(void)
+int nfsd_nrpools(struct net *net)
 {
-        if (nfsd_serv == NULL)
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+        if (nn->nfsd_serv == NULL)
                return 0;
        else
-                return nfsd_serv->sv_nrpools;
+                return nn->nfsd_serv->sv_nrpools;
 }
-int nfsd_get_nrthreads(int n, int *nthreads)
+int nfsd_get_nrthreads(int n, int *nthreads, struct net *net)
 {
        int i = 0;
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-        if (nfsd_serv != NULL) {
+        if (nn->nfsd_serv != NULL) {
-                for (i = 0; i < nfsd_serv->sv_nrpools && i < n; i++)
+                for (i = 0; i < nn->nfsd_serv->sv_nrpools && i < n; i++)
-                        nthreads[i] = nfsd_serv->sv_pools[i].sp_nrthreads;
+                        nthreads[i] = nn->nfsd_serv->sv_pools[i].sp_nrthreads;
        }
        return 0;
 }
-int nfsd_set_nrthreads(int n, int *nthreads)
+void nfsd_destroy(struct net *net) /* release the caller's hold on the nfsd_serv of @net */
+{
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+        int destroy = (nn->nfsd_serv->sv_nrthreads == 1); /* sampled before svc_destroy(); NOTE(review): presumably 1 means this is the last reference -- confirm */
+        if (destroy)
+                svc_shutdown_net(nn->nfsd_serv, net);
+        svc_destroy(nn->nfsd_serv);
+        if (destroy)
+                nn->nfsd_serv = NULL; /* do not leave a stale pointer in the per-net data */
+}
+int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
 {
        int i = 0;
        int tot = 0;
        int err = 0;
-        struct net *net = &init_net;
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        WARN_ON(!mutex_is_locked(&nfsd_mutex));
-        if (nfsd_serv == NULL || n <= 0)
+        if (nn->nfsd_serv == NULL || n <= 0)
                return 0;
-        if (n > nfsd_serv->sv_nrpools)
+        if (n > nn->nfsd_serv->sv_nrpools)
-                n = nfsd_serv->sv_nrpools;
+                n = nn->nfsd_serv->sv_nrpools;
        /* enforce a global maximum number of threads */
        tot = 0;
@@ -419,9 +473,9 @@ int nfsd_set_nrthreads(int n, int *nthreads)
                nthreads[0] = 1;
        /* apply the new numbers */
-        svc_get(nfsd_serv);
+        svc_get(nn->nfsd_serv);
        for (i = 0; i < n; i++) {
-                err = svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i],
+                err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i],
                                          nthreads[i]);
                if (err)
                        break;
@@ -436,11 +490,11 @@ int nfsd_set_nrthreads(int n, int *nthreads)
 * this is the first time nrservs is nonzero.
 */
 int
-nfsd_svc(int nrservs)
+nfsd_svc(int nrservs, struct net *net)
 {
        int     error;
        bool    nfsd_up_before;
-        struct net *net = &init_net;
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        mutex_lock(&nfsd_mutex);
        dprintk("nfsd: creating service\n");
@@ -449,29 +503,29 @@ nfsd_svc(int nrservs)
        if (nrservs > NFSD_MAXSERVS)
                nrservs = NFSD_MAXSERVS;
        error = 0;
-        if (nrservs == 0 && nfsd_serv == NULL)
+        if (nrservs == 0 && nn->nfsd_serv == NULL)
                goto out;
-        error = nfsd_create_serv();
+        error = nfsd_create_serv(net);
        if (error)
                goto out;
-        nfsd_up_before = nfsd_up;
+        nfsd_up_before = nn->nfsd_net_up;
-        error = nfsd_startup(nrservs);
+        error = nfsd_startup_net(nrservs, net);
        if (error)
                goto out_destroy;
-        error = svc_set_num_threads(nfsd_serv, NULL, nrservs);
+        error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs);
        if (error)
                goto out_shutdown;
-        /* We are holding a reference to nfsd_serv which
+        /* We are holding a reference to nn->nfsd_serv which
         * we don't want to count in the return value,
         * so subtract 1
         */
-        error = nfsd_serv->sv_nrthreads - 1;
+        error = nn->nfsd_serv->sv_nrthreads - 1;
 out_shutdown:
        if (error < 0 && !nfsd_up_before)
-                nfsd_shutdown();
+                nfsd_shutdown_net(net);
 out_destroy:
        nfsd_destroy(net);              /* Release server */
 out:
@@ -487,6 +541,8 @@ static int
 nfsd(void *vrqstp)
 {
        struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
+        struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list);
+        struct net *net = perm_sock->xpt_net;
        int err;
        /* Lock module and set up kernel thread */
@@ -551,7 +607,7 @@ out:
        /* Release the thread */
        svc_exit_thread(rqstp);
-        nfsd_destroy(&init_net);
+        nfsd_destroy(net);
        /* Release module */
        mutex_unlock(&nfsd_mutex);
@@ -640,21 +696,24 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
        }
        /* Store reply in cache. */
-        nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1);
+        nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1);
        return 1;
 }
 int nfsd_pool_stats_open(struct inode *inode, struct file *file)
 {
        int ret;
+        struct net *net = &init_net;
+        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        mutex_lock(&nfsd_mutex);
-        if (nfsd_serv == NULL) {
+        if (nn->nfsd_serv == NULL) {
                mutex_unlock(&nfsd_mutex);
                return -ENODEV;
        }
        /* bump up the psudo refcount while traversing */
-        svc_get(nfsd_serv);
+        svc_get(nn->nfsd_serv);
-        ret = svc_pool_stats_open(nfsd_serv, file);
+        ret = svc_pool_stats_open(nn->nfsd_serv, file);
        mutex_unlock(&nfsd_mutex);
        return ret;
 }
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 65ec595e2226..979b42106979 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -246,7 +246,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
                                        struct nfsd_readargs *args)
 {
        unsigned int len;
-        int v,pn;
+        int v;
        if (!(p = decode_fh(p, &args->fh)))
                return 0;
@@ -262,8 +262,9 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
         */
        v=0;
        while (len > 0) {
-                pn = rqstp->rq_resused++;
+                struct page *p = *(rqstp->rq_next_page++);
-                rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
+                rqstp->rq_vec[v].iov_base = page_address(p);
                rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE;
                len -= rqstp->rq_vec[v].iov_len;
                v++;
@@ -355,7 +356,7 @@ nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readli
 {
        if (!(p = decode_fh(p, &args->fh)))
                return 0;
-        args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]);
+        args->buffer = page_address(*(rqstp->rq_next_page++));
        return xdr_argsize_check(rqstp, p);
 }
@@ -396,7 +397,7 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
        if (args->count > PAGE_SIZE)
                args->count = PAGE_SIZE;
-        args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]);
+        args->buffer = page_address(*(rqstp->rq_next_page++));
        return xdr_argsize_check(rqstp, p);
 }
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index e036894bce57..d1c229feed52 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -150,6 +150,12 @@ struct nfsd4_channel_attrs {
        u32             rdma_attrs;
 };
+struct nfsd4_cb_sec {
+        u32     flavor; /* (u32)(-1) used to mean "no valid flavor" */
+        u32     uid;
+        u32     gid;
+};
 struct nfsd4_create_session {
        clientid_t                      clientid;
        struct nfs4_sessionid           sessionid;
@@ -158,8 +164,12 @@ struct nfsd4_create_session {
        struct nfsd4_channel_attrs      fore_channel;
        struct nfsd4_channel_attrs      back_channel;
        u32                             callback_prog;
-        u32                             uid;
+        struct nfsd4_cb_sec             cb_sec;
-        u32                             gid;
+};
+struct nfsd4_backchannel_ctl {
+        u32     bc_cb_program;
+        struct nfsd4_cb_sec             bc_cb_sec;
 };
 struct nfsd4_bind_conn_to_session {
@@ -192,6 +202,7 @@ struct nfsd4_session {
        struct nfs4_sessionid   se_sessionid;
        struct nfsd4_channel_attrs se_fchannel;
        struct nfsd4_channel_attrs se_bchannel;
+        struct nfsd4_cb_sec     se_cb_sec;
        struct list_head        se_conns;
        u32                     se_cb_prog;
        u32                     se_cb_seq_nr;
@@ -221,13 +232,12 @@ struct nfsd4_sessionid {
 */
 struct nfs4_client {
        struct list_head        cl_idhash;      /* hash by cl_clientid.id */
-        struct list_head        cl_strhash;     /* hash by cl_name */
+        struct rb_node          cl_namenode;    /* link into by-name trees */
        struct list_head        cl_openowners;
        struct idr              cl_stateids;    /* stateid lookup */
        struct list_head        cl_delegations;
        struct list_head        cl_lru;         /* tail queue */
        struct xdr_netobj       cl_name;        /* id generated by client */
-        char                    cl_recdir[HEXDIR_LEN]; /* recovery dir */
        nfs4_verifier           cl_verifier;    /* generated by client */
        time_t                  cl_time;        /* time of last lease renewal */
        struct sockaddr_storage cl_addr;        /* client ipaddress */
@@ -242,9 +252,11 @@ struct nfs4_client {
 #define NFSD4_CLIENT_CB_KILL            (1)
 #define NFSD4_CLIENT_STABLE             (2)     /* client on stable storage */
 #define NFSD4_CLIENT_RECLAIM_COMPLETE   (3)     /* reclaim_complete done */
+#define NFSD4_CLIENT_CONFIRMED          (4)     /* client is confirmed */
 #define NFSD4_CLIENT_CB_FLAG_MASK       (1 << NFSD4_CLIENT_CB_UPDATE | \
                                         1 << NFSD4_CLIENT_CB_KILL)
        unsigned long           cl_flags;
+        struct rpc_cred         *cl_cb_cred;
        struct rpc_clnt         *cl_cb_client;
        u32                     cl_cb_ident;
 #define NFSD4_CB_UP             0
@@ -271,6 +283,7 @@ struct nfs4_client {
        unsigned long           cl_cb_slot_busy;
        struct rpc_wait_queue   cl_cb_waitq;    /* backchannel callers may */
                                                /* wait here for slots */
+        struct net              *net;
 };
 static inline void
@@ -292,6 +305,7 @@ is_client_expired(struct nfs4_client *clp)
 */
 struct nfs4_client_reclaim {
        struct list_head        cr_strhash;     /* hash by cr_name */
+        struct nfs4_client      *cr_clp;        /* pointer to associated clp */
        char                    cr_recdir[HEXDIR_LEN]; /* recover dir */
 };
@@ -452,25 +466,26 @@ extern __be32 nfs4_preprocess_stateid_op(struct net *net,
                stateid_t *stateid, int flags, struct file **filp);
 extern void nfs4_lock_state(void);
 extern void nfs4_unlock_state(void);
-extern int nfs4_in_grace(void);
+void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
-extern void nfs4_release_reclaim(void);
+extern void nfs4_release_reclaim(struct nfsd_net *);
-extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct nfs4_client *crp);
+extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
-extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions);
+                                                        struct nfsd_net *nn);
+extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn);
 extern void nfs4_free_openowner(struct nfs4_openowner *);
 extern void nfs4_free_lockowner(struct nfs4_lockowner *);
 extern int set_callback_cred(void);
+extern void nfsd4_init_callback(struct nfsd4_callback *);
 extern void nfsd4_probe_callback(struct nfs4_client *clp);
 extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
 extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
-extern void nfsd4_do_callback_rpc(struct work_struct *);
 extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
 extern int nfsd4_create_callback_queue(void);
 extern void nfsd4_destroy_callback_queue(void);
 extern void nfsd4_shutdown_callback(struct nfs4_client *);
 extern void nfs4_put_delegation(struct nfs4_delegation *dp);
-extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
+extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
-extern int nfs4_client_to_reclaim(const char *name);
+                                                        struct nfsd_net *nn);
-extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id);
+extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
 extern void release_session_client(struct nfsd4_session *);
 extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *);
@@ -480,5 +495,28 @@ extern void nfsd4_client_tracking_exit(struct net *net);
 extern void nfsd4_client_record_create(struct nfs4_client *clp);
 extern void nfsd4_client_record_remove(struct nfs4_client *clp);
 extern int nfsd4_client_record_check(struct nfs4_client *clp);
-extern void nfsd4_record_grace_done(struct net *net, time_t boot_time);
+extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time);
+/* nfs fault injection functions */
+#ifdef CONFIG_NFSD_FAULT_INJECTION
+int nfsd_fault_inject_init(void);
+void nfsd_fault_inject_cleanup(void);
+u64 nfsd_for_n_state(u64, u64 (*)(struct nfs4_client *, u64));
+struct nfs4_client *nfsd_find_client(struct sockaddr_storage *, size_t);
+u64 nfsd_forget_client(struct nfs4_client *, u64);
+u64 nfsd_forget_client_locks(struct nfs4_client*, u64);
+u64 nfsd_forget_client_openowners(struct nfs4_client *, u64);
+u64 nfsd_forget_client_delegations(struct nfs4_client *, u64);
+u64 nfsd_recall_client_delegations(struct nfs4_client *, u64);
+u64 nfsd_print_client(struct nfs4_client *, u64);
+u64 nfsd_print_client_locks(struct nfs4_client *, u64);
+u64 nfsd_print_client_openowners(struct nfs4_client *, u64);
+u64 nfsd_print_client_delegations(struct nfs4_client *, u64);
+#else /* CONFIG_NFSD_FAULT_INJECTION */
+static inline int nfsd_fault_inject_init(void) { return 0; }
+static inline void nfsd_fault_inject_cleanup(void) {}
+#endif /* CONFIG_NFSD_FAULT_INJECTION */
 #endif   /* NFSD4_STATE_H */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c120b48ec305..f0a6d88d7fff 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -886,7 +886,7 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
                  struct splice_desc *sd)
 {
        struct svc_rqst *rqstp = sd->u.data;
-        struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
+        struct page **pp = rqstp->rq_next_page;
        struct page *page = buf->page;
        size_t size;
@@ -894,17 +894,15 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
        if (rqstp->rq_res.page_len == 0) {
                get_page(page);
-                put_page(*pp);
+                put_page(*rqstp->rq_next_page);
-                *pp = page;
+                *(rqstp->rq_next_page++) = page;
-                rqstp->rq_resused++;
                rqstp->rq_res.page_base = buf->offset;
                rqstp->rq_res.page_len = size;
        } else if (page != pp[-1]) {
                get_page(page);
-                if (*pp)
+                if (*rqstp->rq_next_page)
-                        put_page(*pp);
+                        put_page(*rqstp->rq_next_page);
-                *pp = page;
+                *(rqstp->rq_next_page++) = page;
-                rqstp->rq_resused++;
                rqstp->rq_res.page_len += size;
        } else
                rqstp->rq_res.page_len += size;
@@ -936,7 +934,8 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
                        .u.data         = rqstp,
                };
-                rqstp->rq_resused = 1;
+                WARN_ON_ONCE(rqstp->rq_next_page != rqstp->rq_respages + 1);
+                rqstp->rq_next_page = rqstp->rq_respages + 1;
                host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
        } else {
                oldfs = get_fs();
@@ -1020,28 +1019,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
        inode = dentry->d_inode;
        exp   = fhp->fh_export;
-        /*
-         * Request sync writes if
-         *  -   the sync export option has been set, or
-         *  -   the client requested O_SYNC behavior (NFSv3 feature).
-         *  -   The file system doesn't support fsync().
-         * When NFSv2 gathered writes have been configured for this volume,
-         * flushing the data to disk is handled separately below.
-         */
        use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
-        if (!file->f_op->fsync) {/* COMMIT3 cannot work */
-               stable = 2;
-               *stablep = 2; /* FILE_SYNC */
-        }
        if (!EX_ISSYNC(exp))
                stable = 0;
-        if (stable && !use_wgather) {
-                spin_lock(&file->f_lock);
-                file->f_flags |= O_SYNC;
-                spin_unlock(&file->f_lock);
-        }
        /* Write the data. */
        oldfs = get_fs(); set_fs(KERNEL_DS);
@@ -1057,8 +1038,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
        if (inode->i_mode & (S_ISUID | S_ISGID))
                kill_suid(dentry);
-        if (stable && use_wgather)
+        if (stable) {
-                host_err = wait_for_concurrent_writes(file);
+                if (use_wgather)
+                        host_err = wait_for_concurrent_writes(file);
+                else
+                        host_err = vfs_fsync_range(file, offset, offset+*cnt, 0);
+        }
 out_nfserr:
        dprintk("nfsd: write complete host_err=%d\n", host_err);
@@ -1485,13 +1470,19 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
                case NFS3_CREATE_EXCLUSIVE:
                        if (   dchild->d_inode->i_mtime.tv_sec == v_mtime
                            && dchild->d_inode->i_atime.tv_sec == v_atime
-                            && dchild->d_inode->i_size  == 0 )
+                            && dchild->d_inode->i_size  == 0 ) {
+                                if (created)
+                                        *created = 1;
                                break;
+                        }
                case NFS4_CREATE_EXCLUSIVE4_1:
                        if (   dchild->d_inode->i_mtime.tv_sec == v_mtime
                            && dchild->d_inode->i_atime.tv_sec == v_atime
-                            && dchild->d_inode->i_size  == 0 )
+                            && dchild->d_inode->i_size  == 0 ) {
+                                if (created)
+                                        *created = 1;
                                goto set_attr;
+                        }
                         /* fallthru */
                case NFS3_CREATE_GUARDED:
                        err = nfserr_exist;
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index acd127d4ee82..0889bfb43dc9 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -385,7 +385,8 @@ struct nfsd4_write {
        u64             wr_offset;          /* request */
        u32             wr_stable_how;      /* request */
        u32             wr_buflen;          /* request */
-        int             wr_vlen;
+        struct kvec     wr_head;
+        struct page **  wr_pagelist;        /* request */
        u32             wr_bytes_written;   /* response */
        u32             wr_how_written;     /* response */
@@ -462,6 +463,7 @@ struct nfsd4_op {
                /* NFSv4.1 */
                struct nfsd4_exchange_id        exchange_id;
+                struct nfsd4_backchannel_ctl    backchannel_ctl;
                struct nfsd4_bind_conn_to_session bind_conn_to_session;
                struct nfsd4_create_session     create_session;
                struct nfsd4_destroy_session    destroy_session;
@@ -526,6 +528,14 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
                || nfsd4_is_solo_sequence(resp);
 }
+static inline bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
+{ /* Compare the op counters of this request's compound arguments and compound result. */
+        struct nfsd4_compoundres *resp = rqstp->rq_resp;
+        struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+        return argp->opcnt == resp->opcnt; /* true when both counters are equal; presumably "the current op is the last one" - confirm against callers */
+}
 #define NFS4_SVC_XDRSIZE                sizeof(struct nfsd4_compoundargs)
 static inline void
@@ -566,6 +576,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
                struct nfsd4_sequence *seq);
 extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
                struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
+extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *);
 extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *);
 extern __be32 nfsd4_create_session(struct svc_rqst *,
                struct nfsd4_compound_state *,
@@ -579,7 +590,7 @@ extern __be32 nfsd4_destroy_session(struct svc_rqst *,
 extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *);
 __be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);
 extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
-                struct nfsd4_open *open);
+                struct nfsd4_open *open, struct nfsd_net *nn);
 extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
                struct svc_fh *current_fh, struct nfsd4_open *open);
 extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 16f35f7423c5..61946883025c 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -167,7 +167,6 @@ const struct file_operations nilfs_file_operations = {
 };
 const struct inode_operations nilfs_file_inode_operations = {
-        .truncate       = nilfs_truncate,
        .setattr        = nilfs_setattr,
        .permission     = nilfs_permission,
        .fiemap         = nilfs_fiemap,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 4d31d2cca7fd..6b49f14eac8c 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -213,6 +213,16 @@ static int nilfs_set_page_dirty(struct page *page)
        return ret;
 }
+void nilfs_write_failed(struct address_space *mapping, loff_t to)
+{ /* Clean up after a failed write on @mapping that would have ended at offset @to. */
+        struct inode *inode = mapping->host;
+        if (to > inode->i_size) { /* nothing to undo unless the attempt reached past the current i_size */
+                truncate_pagecache(inode, to, inode->i_size); /* @to is passed as the old end, i_size as the end to keep */
+                nilfs_truncate(inode); /* then run the nilfs2 truncate routine on the inode */
+        }
+}
 static int nilfs_write_begin(struct file *file, struct address_space *mapping,
                             loff_t pos, unsigned len, unsigned flags,
                             struct page **pagep, void **fsdata)
@@ -227,10 +237,7 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping,
        err = block_write_begin(mapping, pos, len, flags, pagep,
                                nilfs_get_block);
        if (unlikely(err)) {
-                loff_t isize = mapping->host->i_size;
+                nilfs_write_failed(mapping, pos + len);
-                if (pos + len > isize)
-                        vmtruncate(mapping->host, isize);
                nilfs_transaction_abort(inode->i_sb);
        }
        return err;
@@ -259,6 +266,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
                loff_t offset, unsigned long nr_segs)
 {
        struct file *file = iocb->ki_filp;
+        struct address_space *mapping = file->f_mapping;
        struct inode *inode = file->f_mapping->host;
        ssize_t size;
@@ -278,7 +286,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
                loff_t end = offset + iov_length(iov, nr_segs);
                if (end > isize)
-                        vmtruncate(inode, isize);
+                        nilfs_write_failed(mapping, end);
        }
        return size;
@@ -786,10 +794,8 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
        if ((iattr->ia_valid & ATTR_SIZE) &&
            iattr->ia_size != i_size_read(inode)) {
                inode_dio_wait(inode);
+                truncate_setsize(inode, iattr->ia_size);
-                err = vmtruncate(inode, iattr->ia_size);
+                nilfs_truncate(inode);
-                if (unlikely(err))
-                        goto out_err;
        }
        setattr_copy(inode, iattr);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 74cece80e9a3..9bc72dec3fa6 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -277,6 +277,7 @@ extern void nilfs_update_inode(struct inode *, struct buffer_head *);
 extern void nilfs_truncate(struct inode *);
 extern void nilfs_evict_inode(struct inode *);
 extern int nilfs_setattr(struct dentry *, struct iattr *);
+extern void nilfs_write_failed(struct address_space *mapping, loff_t to);
 int nilfs_permission(struct inode *inode, int mask);
 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
 extern int nilfs_inode_dirty(struct inode *);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index f1626f5011c5..ff00a0b7acb9 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -527,7 +527,8 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
                if (unlikely(err)) {
                        loff_t isize = inode->i_size;
                        if (pos + blocksize > isize)
-                                vmtruncate(inode, isize);
+                                nilfs_write_failed(inode->i_mapping,
+                                                        pos + blocksize);
                        goto failed_inode;
                }
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 1ecf46448f85..5b2d4f0853ac 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1762,6 +1762,16 @@ err_out:
        return err;
 }
+static void ntfs_write_failed(struct address_space *mapping, loff_t to)
+{ /* Clean up after a failed write on @mapping that would have ended at offset @to. */
+        struct inode *inode = mapping->host;
+        if (to > inode->i_size) { /* nothing to undo unless the attempt reached past the current i_size */
+                truncate_pagecache(inode, to, inode->i_size); /* @to is passed as the old end, i_size as the end to keep */
+                ntfs_truncate_vfs(inode); /* then run the ntfs truncate wrapper on the inode */
+        }
+}
 /**
 * ntfs_file_buffered_write -
 *
@@ -2022,8 +2032,9 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
                                 * allocated space, which is not a disaster.
                                 */
                                i_size = i_size_read(vi);
-                                if (pos + bytes > i_size)
+                                if (pos + bytes > i_size) {
-                                        vmtruncate(vi, i_size);
+                                        ntfs_write_failed(mapping, pos + bytes);
+                                }
                                break;
                        }
                }
@@ -2227,7 +2238,6 @@ const struct file_operations ntfs_file_ops = {
 const struct inode_operations ntfs_file_inode_ops = {
 #ifdef NTFS_RW
-        .truncate       = ntfs_truncate_vfs,
        .setattr        = ntfs_setattr,
 #endif /* NTFS_RW */
 };
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 1d27331e6fc9..d3e118cc6ffa 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2866,9 +2866,11 @@ conv_err_out:
 *
 * See ntfs_truncate() description above for details.
 */
+#ifdef NTFS_RW
 void ntfs_truncate_vfs(struct inode *vi) {
        ntfs_truncate(vi);
 }
+#endif /* NTFS_RW */
 /**
 * ntfs_setattr - called from notify_change() when an attribute is being changed
@@ -2914,8 +2916,10 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
                                                NInoCompressed(ni) ?
                                                "compressed" : "encrypted");
                                err = -EOPNOTSUPP;
-                        } else
+                        } else {
-                                err = vmtruncate(vi, attr->ia_size);
+                                truncate_setsize(vi, attr->ia_size);
+                                ntfs_truncate_vfs(vi);
+                        }
                        if (err || ia_valid == ATTR_SIZE)
                                goto out;
                } else {
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index db29695f845c..76b6cfb579d7 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -316,6 +316,10 @@ static inline void ntfs_commit_inode(struct inode *vi)
        return;
 }
+#else /* !NTFS_RW */
+static inline void ntfs_truncate_vfs(struct inode *vi) {} /* empty stub so callers still compile in this branch */
 #endif /* NTFS_RW */
 #endif /* _LINUX_NTFS_INODE_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index fe492e1a3cfc..37d313ede159 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1218,24 +1218,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
                }
        }
-        /*
-         * This will intentionally not wind up calling truncate_setsize(),
-         * since all the work for a size change has been done above.
-         * Otherwise, we could get into problems with truncate as
-         * ip_alloc_sem is used there to protect against i_size
-         * changes.
-         *
-         * XXX: this means the conditional below can probably be removed.
-         */
-        if ((attr->ia_valid & ATTR_SIZE) &&
-            attr->ia_size != i_size_read(inode)) {
-                status = vmtruncate(inode, attr->ia_size);
-                if (status) {
-                        mlog_errno(status);
-                        goto bail_commit;
-                }
-        }
        setattr_copy(inode, attr);
        mark_inode_dirty(inode);
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 77e3cb2962b4..e0d9b3e722bd 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -306,6 +306,16 @@ omfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
        return mpage_writepages(mapping, wbc, omfs_get_block);
 }
+static void omfs_write_failed(struct address_space *mapping, loff_t to)
+{ /* Clean up after a failed write on @mapping that would have ended at offset @to. */
+        struct inode *inode = mapping->host;
+        if (to > inode->i_size) { /* nothing to undo unless the attempt reached past the current i_size */
+                truncate_pagecache(inode, to, inode->i_size); /* @to is passed as the old end, i_size as the end to keep */
+                omfs_truncate(inode); /* then run the omfs truncate routine on the inode */
+        }
+}
 static int omfs_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
@@ -314,11 +324,8 @@ static int omfs_write_begin(struct file *file, struct address_space *mapping,
        ret = block_write_begin(mapping, pos, len, flags, pagep,
                                omfs_get_block);
-        if (unlikely(ret)) {
+        if (unlikely(ret))
-                loff_t isize = mapping->host->i_size;
+                omfs_write_failed(mapping, pos + len);
-                if (pos + len > isize)
-                        vmtruncate(mapping->host, isize);
-        }
        return ret;
 }
@@ -350,9 +357,11 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
        if ((attr->ia_valid & ATTR_SIZE) &&
            attr->ia_size != i_size_read(inode)) {
-                error = vmtruncate(inode, attr->ia_size);
+                error = inode_newsize_ok(inode, attr->ia_size);
                if (error)
                        return error;
+                truncate_setsize(inode, attr->ia_size);
+                omfs_truncate(inode);
        }
        setattr_copy(inode, attr);
@@ -362,7 +371,6 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
 const struct inode_operations omfs_file_inops = {
        .setattr = omfs_setattr,
-        .truncate = omfs_truncate
 };
 const struct address_space_operations omfs_aops = {
diff --git a/fs/open.c b/fs/open.c
index 182d8667b7bd..9b33c0cbfacf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -61,33 +61,22 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
        return ret;
 }
-static long do_sys_truncate(const char __user *pathname, loff_t length)
+long vfs_truncate(struct path *path, loff_t length)
 {
-        struct path path;
        struct inode *inode;
-        int error;
+        long error;
-        error = -EINVAL;
-        if (length < 0) /* sorry, but loff_t says... */
-                goto out;
-        error = user_path(pathname, &path);
+        inode = path->dentry->d_inode;
-        if (error)
-                goto out;
-        inode = path.dentry->d_inode;
        /* For directories it's -EISDIR, for other non-regulars - -EINVAL */
-        error = -EISDIR;
        if (S_ISDIR(inode->i_mode))
-                goto dput_and_out;
+                return -EISDIR;
-        error = -EINVAL;
        if (!S_ISREG(inode->i_mode))
-                goto dput_and_out;
+                return -EINVAL;
-        error = mnt_want_write(path.mnt);
+        error = mnt_want_write(path->mnt);
        if (error)
-                goto dput_and_out;
+                goto out;
        error = inode_permission(inode, MAY_WRITE);
        if (error)
@@ -111,19 +100,40 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
        error = locks_verify_truncate(inode, NULL, length);
        if (!error)
-                error = security_path_truncate(&path);
+                error = security_path_truncate(path);
        if (!error)
-                error = do_truncate(path.dentry, length, 0, NULL);
+                error = do_truncate(path->dentry, length, 0, NULL);
 put_write_and_out:
        put_write_access(inode);
 mnt_drop_write_and_out:
-        mnt_drop_write(path.mnt);
+        mnt_drop_write(path->mnt);
-dput_and_out:
-        path_put(&path);
 out:
        return error;
 }
+EXPORT_SYMBOL_GPL(vfs_truncate);
+static long do_sys_truncate(const char __user *pathname, loff_t length)
+{ /* Resolve the user-supplied @pathname and pass the resulting path to vfs_truncate() with @length. */
+        unsigned int lookup_flags = LOOKUP_FOLLOW;
+        struct path path;
+        int error;
+        if (length < 0) /* sorry, but loff_t says... */
+                return -EINVAL;
+retry: /* re-entered with LOOKUP_REVAL added to lookup_flags when retry_estale() asks for it */
+        error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+        if (!error) {
+                error = vfs_truncate(&path, length);
+                path_put(&path); /* release the path obtained by the successful lookup above */
+        }
+        if (retry_estale(error, lookup_flags)) { /* evaluated for errors from the lookup as well as from vfs_truncate() */
+                lookup_flags |= LOOKUP_REVAL;
+                goto retry;
+        }
+        return error;
+}
 SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
 {
@@ -306,6 +316,7 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
        struct path path;
        struct inode *inode;
        int res;
+        unsigned int lookup_flags = LOOKUP_FOLLOW;
        if (mode & ~S_IRWXO)    /* where's F_OK, X_OK, W_OK, R_OK? */
                return -EINVAL;
@@ -328,8 +339,8 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
        }
        old_cred = override_creds(override_cred);
+retry:
-        res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
+        res = user_path_at(dfd, filename, lookup_flags, &path);
        if (res)
                goto out;
@@ -364,6 +375,10 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
 out_path_release:
        path_put(&path);
+        if (retry_estale(res, lookup_flags)) {
+                lookup_flags |= LOOKUP_REVAL;
+                goto retry;
+        }
 out:
        revert_creds(old_cred);
        put_cred(override_cred);
@@ -379,8 +394,9 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
 {
        struct path path;
        int error;
+        unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
-        error = user_path_dir(filename, &path);
+retry:
+        error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
        if (error)
                goto out;
@@ -392,6 +408,10 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
 dput_and_out:
        path_put(&path);
+        if (retry_estale(error, lookup_flags)) {
+                lookup_flags |= LOOKUP_REVAL;
+                goto retry;
+        }
 out:
        return error;
 }
@@ -425,8 +445,9 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
 {
        struct path path;
        int error;
+        unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
-        error = user_path_dir(filename, &path);
+retry:
+        error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
        if (error)
                goto out;
@@ -445,6 +466,10 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
        error = 0;
 dput_and_out:
        path_put(&path);
+        if (retry_estale(error, lookup_flags)) {
+                lookup_flags |= LOOKUP_REVAL;
+                goto retry;
+        }
 out:
        return error;
 }
@@ -489,11 +514,16 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode
 {
        struct path path;
        int error;
+        unsigned int lookup_flags = LOOKUP_FOLLOW;
-        error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
+retry:
+        error = user_path_at(dfd, filename, lookup_flags, &path);
        if (!error) {
                error = chmod_common(&path, mode);
                path_put(&path);
+                if (retry_estale(error, lookup_flags)) {
+                        lookup_flags |= LOOKUP_REVAL;
+                        goto retry;
+                }
        }
        return error;
 }
@@ -552,6 +582,7 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
        lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
        if (flag & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;
+retry:
        error = user_path_at(dfd, filename, lookup_flags, &path);
        if (error)
                goto out;
@@ -562,6 +593,10 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
        mnt_drop_write(path.mnt);
 out_release:
        path_put(&path);
+        if (retry_estale(error, lookup_flags)) {
+                lookup_flags |= LOOKUP_REVAL;
+                goto retry;
+        }
 out:
        return error;
 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5a5a0be40e40..9b43ff77a51e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -542,13 +542,6 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
        if (error)
                return error;
-        if ((attr->ia_valid & ATTR_SIZE) &&
-            attr->ia_size != i_size_read(inode)) {
-                error = vmtruncate(inode, attr->ia_size);
-                if (error)
-                        return error;
-        }
        setattr_copy(inode, attr);
        mark_inode_dirty(inode);
        return 0;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index e659a0ff1da7..e064f562b1f7 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -261,16 +261,9 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
        if (error)
                return error;
-        if ((iattr->ia_valid & ATTR_SIZE) &&
-            iattr->ia_size != i_size_read(inode)) {
-                error = vmtruncate(inode, iattr->ia_size);
-                if (error)
-                        return error;
-        }
        setattr_copy(inode, iattr);
        mark_inode_dirty(inode);
-        
        de->uid = inode->i_uid;
        de->gid = inode->i_gid;
        de->mode = inode->i_mode;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 701580ddfcc3..1827d88ad58b 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -736,13 +736,6 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
        if (error)
                return error;
-        if ((attr->ia_valid & ATTR_SIZE) &&
-            attr->ia_size != i_size_read(inode)) {
-                error = vmtruncate(inode, attr->ia_size);
-                if (error)
-                        return error;
-        }
        setattr_copy(inode, attr);
        mark_inode_dirty(inode);
        return 0;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 8375c922c0d5..50302d6f8895 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -126,7 +126,7 @@ static int reiserfs_file_open(struct inode *inode, struct file *file)
        return err;
 }
-static void reiserfs_vfs_truncate_file(struct inode *inode)
+void reiserfs_vfs_truncate_file(struct inode *inode)
 {
        mutex_lock(&(REISERFS_I(inode)->tailpack));
        reiserfs_truncate_file(inode, 1);
@@ -312,7 +312,6 @@ const struct file_operations reiserfs_file_operations = {
 };
 const struct inode_operations reiserfs_file_inode_operations = {
-        .truncate = reiserfs_vfs_truncate_file,
        .setattr = reiserfs_setattr,
        .setxattr = reiserfs_setxattr,
        .getxattr = reiserfs_getxattr,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d83736fbc26c..95d7680ead47 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3085,8 +3085,10 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
                loff_t isize = i_size_read(inode);
                loff_t end = offset + iov_length(iov, nr_segs);
-                if (end > isize)
+                if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
-                        vmtruncate(inode, isize);
+                        truncate_setsize(inode, isize);
+                        reiserfs_vfs_truncate_file(inode);
+                }
        }
        return ret;
@@ -3200,8 +3202,13 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
         */
        reiserfs_write_unlock_once(inode->i_sb, depth);
        if ((attr->ia_valid & ATTR_SIZE) &&
-            attr->ia_size != i_size_read(inode))
+            attr->ia_size != i_size_read(inode)) {
-                error = vmtruncate(inode, attr->ia_size);
+                error = inode_newsize_ok(inode, attr->ia_size);
+                if (!error) {
+                        truncate_setsize(inode, attr->ia_size);
+                        reiserfs_vfs_truncate_file(inode);
+                }
+        }
        if (!error) {
                setattr_copy(inode, attr);
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 33215f57ea06..157e474ab303 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -2455,6 +2455,7 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
                                                                    *,
                                                                    int count);
 int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
+void reiserfs_vfs_truncate_file(struct inode *inode);
 int reiserfs_commit_page(struct inode *inode, struct page *page,
                         unsigned from, unsigned to);
 void reiserfs_flush_old_commits(struct super_block *);
diff --git a/fs/stat.c b/fs/stat.c
index eae494630a36..14f45459c83d 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -74,7 +74,7 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
 {
        struct path path;
        int error = -EINVAL;
-        int lookup_flags = 0;
+        unsigned int lookup_flags = 0;
        if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
                      AT_EMPTY_PATH)) != 0)
@@ -84,13 +84,17 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
                lookup_flags |= LOOKUP_FOLLOW;
        if (flag & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;
+retry:
        error = user_path_at(dfd, filename, lookup_flags, &path);
        if (error)
                goto out;
        error = vfs_getattr(path.mnt, path.dentry, stat);
        path_put(&path);
+        if (retry_estale(error, lookup_flags)) {
+                lookup_flags |= LOOKUP_REVAL;
+                goto retry;
+        }
 out:
        return error;
 }
@@ -296,11 +300,13 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
        struct path path;
        int error;
        int empty = 0;
+        unsigned int lookup_flags = LOOKUP_EMPTY;
        if (bufsiz <= 0)
                return -EINVAL;
-        error = user_path_at_empty(dfd, pathname, LOOKUP_EMPTY, &path, &empty);
+retry:
+        error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty);
        if (!error) {
                struct inode *inode = path.dentry->d_inode;
@@ -314,6 +320,10 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
                        }
                }
                path_put(&path);
+                if (retry_estale(error, lookup_flags)) {
+                        lookup_flags |= LOOKUP_REVAL;
+                        goto retry;
+                }
        }
        return error;
 }
diff --git a/fs/statfs.c b/fs/statfs.c
index f8e832e6f0a2..c219e733f553 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -77,10 +77,17 @@ EXPORT_SYMBOL(vfs_statfs);
 int user_statfs(const char __user *pathname, struct kstatfs *st)
 {
        struct path path;
-        int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
+        int error;
+        unsigned int lookup_flags = LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT; /* kept in a variable so the retry below can add LOOKUP_REVAL */
+retry: /* re-entered with LOOKUP_REVAL added to lookup_flags when retry_estale() asks for it */
+        error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
        if (!error) {
                error = vfs_statfs(&path, st);
                path_put(&path);
+                if (retry_estale(error, lookup_flags)) { /* only reached after a successful lookup, so it tests the vfs_statfs() result */
+                        lookup_flags |= LOOKUP_REVAL;
+                        goto retry;
+                }
        }
        return error;
 }
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 0a65939508e9..9d4dc6831792 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -41,9 +41,11 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
        if ((attr->ia_valid & ATTR_SIZE) &&
            attr->ia_size != i_size_read(inode)) {
-                error = vmtruncate(inode, attr->ia_size);
+                error = inode_newsize_ok(inode, attr->ia_size);
                if (error)
                        return error;
+                truncate_setsize(inode, attr->ia_size);
+                sysv_truncate(inode);
        }
        setattr_copy(inode, attr);
@@ -52,7 +54,6 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
 }
+/*
+ * No .truncate method here: sysv_setattr() calls sysv_truncate()
+ * itself after updating the size with truncate_setsize().
+ */
 const struct inode_operations sysv_file_inode_operations = {
-        .truncate       = sysv_truncate,
        .setattr        = sysv_setattr,
        .getattr        = sysv_getattr,
 };
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 90b54b438789..c1a591a4725b 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -464,6 +464,16 @@ int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len)
        return __block_write_begin(page, pos, len, get_block);
 }
+/*
+ * Clean up after a failed write that was meant to end at offset @to.
+ *
+ * Nothing is done unless @to lies beyond the inode's current i_size.
+ * In that case the page cache is truncated back to i_size and
+ * sysv_truncate() is called on the inode.
+ * NOTE(review): presumably sysv_truncate() frees blocks that were
+ * instantiated past i_size -- confirm against its definition.
+ */
+static void sysv_write_failed(struct address_space *mapping, loff_t to)
+{
+        struct inode *inode = mapping->host;
+        if (to > inode->i_size) {
+                truncate_pagecache(inode, to, inode->i_size);
+                sysv_truncate(inode);
+        }
+}
 static int sysv_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
@@ -471,11 +481,8 @@ static int sysv_write_begin(struct file *file, struct address_space *mapping,
        int ret;
        ret = block_write_begin(mapping, pos, len, flags, pagep, get_block);
-        if (unlikely(ret)) {
+        if (unlikely(ret))
-                loff_t isize = mapping->host->i_size;
+                sysv_write_failed(mapping, pos + len);
-                if (pos + len > isize)
-                        vmtruncate(mapping->host, isize);
-        }
        return ret;
 }
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index eb6d0b7dc879..ff24e4449ece 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -526,6 +526,14 @@ int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len)
        return __block_write_begin(page, pos, len, ufs_getfrag_block);
 }
+/*
+ * Clean up after a failed write that was meant to end at offset @to.
+ *
+ * If @to lies beyond the inode's current i_size, the page cache is
+ * truncated back to i_size; otherwise nothing is done.  Unlike the
+ * vmtruncate() call this replaces, only the page cache is touched here.
+ */
+static void ufs_write_failed(struct address_space *mapping, loff_t to)
+{
+        struct inode *inode = mapping->host;
+        if (to > inode->i_size)
+                truncate_pagecache(inode, to, inode->i_size);
+}
 static int ufs_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
@@ -534,11 +542,8 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping,
        ret = block_write_begin(mapping, pos, len, flags, pagep,
                                ufs_getfrag_block);
-        if (unlikely(ret)) {
+        if (unlikely(ret))
-                loff_t isize = mapping->host->i_size;
+                ufs_write_failed(mapping, pos + len);
-                if (pos + len > isize)
-                        vmtruncate(mapping->host, isize);
-        }
        return ret;
 }
diff --git a/fs/utimes.c b/fs/utimes.c
index bb0696a41735..f4fb7eca10e8 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -158,13 +158,17 @@ long do_utimes(int dfd, const char __user *filename, struct timespec *times,
                if (!(flags & AT_SYMLINK_NOFOLLOW))
                        lookup_flags |= LOOKUP_FOLLOW;
+retry:
                error = user_path_at(dfd, filename, lookup_flags, &path);
                if (error)
                        goto out;
                error = utimes_common(&path, times);
                path_put(&path);
+                if (retry_estale(error, lookup_flags)) {
+                        lookup_flags |= LOOKUP_REVAL;
+                        goto retry;
+                }
        }
 out:
diff --git a/fs/xattr.c b/fs/xattr.c
index e21c119f4f99..3377dff18404 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -370,8 +370,9 @@ SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
 {
        struct path path;
        int error;
+        unsigned int lookup_flags = LOOKUP_FOLLOW;
-        error = user_path(pathname, &path);
+retry:
+        error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
        if (error)
                return error;
        error = mnt_want_write(path.mnt);
@@ -380,6 +381,10 @@ SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
                mnt_drop_write(path.mnt);
        }
        path_put(&path);
+        if (retry_estale(error, lookup_flags)) {
+                lookup_flags |= LOOKUP_REVAL;
+                goto retry;
+        }
        return error;
 }
@@ -389,8 +394,9 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
 {
        struct path path;
        int error;
+        unsigned int lookup_flags = 0;
-        error = user_lpath(pathname, &path);
+retry:
+        error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
        if (error)
                return error;
        error = mnt_want_write(path.mnt);
@@ -399,6 +405,10 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
                mnt_drop_write(path.mnt);
        }
        path_put(&path);
+        if (retry_estale(error, lookup_flags)) {
+                lookup_flags |= LOOKUP_REVAL;
+                goto retry;
+        }
        return error;
 }
@@ -476,12 +486,17 @@ SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
 {
        struct path path;
        ssize_t error;
+        unsigned int lookup_flags = LOOKUP_FOLLOW;
-        error = user_path(pathname, &path);
+retry:
+        error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
        if (error)
                return error;
        error = getxattr(path.dentry, name, value, size);
        path_put(&path);
+        if (retry_estale(error, lookup_flags)) {
+                lookup_flags |= LOOKUP_REVAL;
+                goto retry;
+        }
        return error;
 }
@@ -490,12 +505,17 @@ SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
 {
        struct path path;
        ssize_t error;
+        unsigned int lookup_flags = 0;
-        error = user_lpath(pathname, &path);
+retry:
+        error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
        if (error)
                return error;
        error = getxattr(path.dentry, name, value, size);
        path_put(&path);
+        if (retry_estale(error, lookup_flags)) {
+                lookup_flags |= LOOKUP_REVAL;
+                goto retry;
+        }
        return error;
 }
@@ -556,12 +576,17 @@ SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
 {
        struct path path;
        ssize_t error;
+        unsigned int lookup_flags = LOOKUP_FOLLOW;
-        error = user_path(pathname, &path);
+retry:
+        error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
        if (error)
                return error;
        error = listxattr(path.dentry, list, size);
        path_put(&path);
+        if (retry_estale(error, lookup_flags)) {
+                lookup_flags |= LOOKUP_REVAL;
+                goto retry;
+        }
        return error;
 }
@@ -570,12 +595,17 @@ SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
 {
        struct path path;
        ssize_t error;
+        unsigned int lookup_flags = 0;
-        error = user_lpath(pathname, &path);
+retry:
+        error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
        if (error)
                return error;
        error = listxattr(path.dentry, list, size);
        path_put(&path);
+        if (retry_estale(error, lookup_flags)) {
+                lookup_flags |= LOOKUP_REVAL;
+                goto retry;
+        }
        return error;
 }
@@ -615,8 +645,9 @@ SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
 {
        struct path path;
        int error;
+        unsigned int lookup_flags = LOOKUP_FOLLOW;
-        error = user_path(pathname, &path);
+retry:
+        error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
        if (error)
                return error;
        error = mnt_want_write(path.mnt);
@@ -625,6 +656,10 @@ SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
                mnt_drop_write(path.mnt);
        }
        path_put(&path);
+        if (retry_estale(error, lookup_flags)) {
+                lookup_flags |= LOOKUP_REVAL;
+                goto retry;
+        }
        return error;
 }
@@ -633,8 +668,9 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
 {
        struct path path;
        int error;
+        unsigned int lookup_flags = 0;
-        error = user_lpath(pathname, &path);
+retry:
+        error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
        if (error)
                return error;
        error = mnt_want_write(path.mnt);
@@ -643,6 +679,10 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
                mnt_drop_write(path.mnt);
        }
        path_put(&path);
+        if (retry_estale(error, lookup_flags)) {
+                lookup_flags |= LOOKUP_REVAL;
+                goto retry;
+        }
        return error;
 }