Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6: (52 commits)
  split invalidate_inodes()
  fs: skip I_FREEING inodes in writeback_sb_inodes
  fs: fold invalidate_list into invalidate_inodes
  fs: do not drop inode_lock in dispose_list
  fs: inode split IO and LRU lists
  fs: switch bdev inode bdi's correctly
  fs: fix buffer invalidation in invalidate_list
  fsnotify: use dget_parent
  smbfs: use dget_parent
  exportfs: use dget_parent
  fs: use RCU read side protection in d_validate
  fs: clean up dentry lru modification
  fs: split __shrink_dcache_sb
  fs: improve DCACHE_REFERENCED usage
  fs: use percpu counter for nr_dentry and nr_dentry_unused
  fs: simplify __d_free
  fs: take dcache_lock inside __d_path
  fs: do not assign default i_ino in new_inode
  fs: introduce a per-cpu last_ino allocator
  new helper: ihold()
  ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2010-10-26 20:58:44 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2010-10-26 20:58:44 -0400
commit: 426e1f5cec4821945642230218876b0e89aafab1 (patch)
tree: 2728ace018d0698886989da586210ef1543a7098 /fs
parent: 9e5fca251f44832cb996961048ea977f80faf6ea (diff)
parent: 63997e98a3be68d7cec806d22bf9b02b2e1daabb (diff)
90 files changed, 760 insertions, 643 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 9e670d527646..ef5905f7c8a3 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1789,9 +1789,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
                kfree(st);
        } else {
                /* Caching disabled. No need to get upto date stat info.
-                 * This dentry will be released immediately. So, just i_count++
+                 * This dentry will be released immediately. So, just hold the
+                 * inode
                 */
-                atomic_inc(&old_dentry->d_inode->i_count);
+                ihold(old_dentry->d_inode);
        }
        dentry->d_op = old_dentry->d_op;
diff --git a/fs/affs/file.c b/fs/affs/file.c
index c4a9875bd1a6..0a90dcd46de2 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -894,9 +894,9 @@ affs_truncate(struct inode *inode)
                if (AFFS_SB(sb)->s_flags & SF_OFS) {
                        struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0);
                        u32 tmp;
-                        if (IS_ERR(ext_bh)) {
+                        if (IS_ERR(bh)) {
                                affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)",
-                                             ext, PTR_ERR(ext_bh));
+                                             ext, PTR_ERR(bh));
                                return;
                        }
                        tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 3a0fdec175ba..5d828903ac69 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -388,7 +388,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
                affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain));
                mark_buffer_dirty_inode(inode_bh, inode);
                inode->i_nlink = 2;
-                atomic_inc(&inode->i_count);
+                ihold(inode);
        }
        affs_fix_checksum(sb, bh);
        mark_buffer_dirty_inode(bh, inode);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 0d38c09bd55e..5439e1bc9a86 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1045,7 +1045,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
        if (ret < 0)
                goto link_error;
-        atomic_inc(&vnode->vfs_inode.i_count);
+        ihold(&vnode->vfs_inode);
        d_instantiate(dentry, &vnode->vfs_inode);
        key_put(key);
        _leave(" = 0");
diff --git a/fs/aio.c b/fs/aio.c
index 250b0a73c8a8..8c8f6c5b6d79 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1543,7 +1543,19 @@ static void aio_batch_add(struct address_space *mapping,
        }
        abe = mempool_alloc(abe_pool, GFP_KERNEL);
-        BUG_ON(!igrab(mapping->host));
+        /*
+         * we should be using igrab here, but
+         * we don't want to hammer on the global
+         * inode spinlock just to take an extra
+         * reference on a file that we must already
+         * have a reference to.
+         *
+         * When we're called, we always have a reference
+         * on the file, so we must always have a reference
+         * on the inode, so ihold() is safe here.
+         */
+        ihold(mapping->host);
        abe->mapping = mapping;
        hlist_add_head(&abe->list, &batch_hash[bucket]);
        return;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e4b75d6eda83..5365527ca43f 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -111,10 +111,9 @@ struct file *anon_inode_getfile(const char *name,
        path.mnt = mntget(anon_inode_mnt);
        /*
         * We know the anon_inode inode count is always greater than zero,
-         * so we can avoid doing an igrab() and we can use an open-coded
+         * so ihold() is safe.
-         * atomic_inc().
         */
-        atomic_inc(&anon_inode_inode->i_count);
+        ihold(anon_inode_inode);
        path.dentry->d_op = &anon_inodefs_dentry_operations;
        d_instantiate(path.dentry, anon_inode_inode);
@@ -194,6 +193,7 @@ static struct inode *anon_inode_mkinode(void)
        if (!inode)
                return ERR_PTR(-ENOMEM);
+        inode->i_ino = get_next_ino();
        inode->i_fop = &anon_inode_fops;
        inode->i_mapping->a_ops = &anon_aops;
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 821b2b955dac..ac87e49fa706 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -398,6 +398,7 @@ struct inode *autofs4_get_inode(struct super_block *sb,
                inode->i_gid = sb->s_root->d_inode->i_gid;
        }
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+        inode->i_ino = get_next_ino();
        if (S_ISDIR(inf->mode)) {
                inode->i_nlink = 2;
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index d967e052b779..685ecff3ab31 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -176,7 +176,7 @@ static int bfs_link(struct dentry *old, struct inode *dir,
        inc_nlink(inode);
        inode->i_ctime = CURRENT_TIME_SEC;
        mark_inode_dirty(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        d_instantiate(new, inode);
        mutex_unlock(&info->bfs_lock);
        return 0;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 139fc8083f53..29990f0eee0c 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -495,6 +495,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
        struct inode * inode = new_inode(sb);
        if (inode) {
+                inode->i_ino = get_next_ino();
                inode->i_mode = mode;
                inode->i_atime = inode->i_mtime = inode->i_ctime =
                        current_fs_time(inode->i_sb);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b737451e2e9d..dea3b628a6ce 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -48,6 +48,21 @@ inline struct block_device *I_BDEV(struct inode *inode)
 EXPORT_SYMBOL(I_BDEV);
+/*
+ * move the inode from its current bdi to a new bdi. if the inode is dirty
+ * we need to move it onto the dirty list of @dst so that the inode is always
+ * on the right list.
+ */
+static void bdev_inode_switch_bdi(struct inode *inode,
+                        struct backing_dev_info *dst)
+{
+        spin_lock(&inode_lock);
+        inode->i_data.backing_dev_info = dst;
+        if (inode->i_state & I_DIRTY)
+                list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+        spin_unlock(&inode_lock);
+}
 static sector_t max_block(struct block_device *bdev)
 {
        sector_t retval = ~((sector_t)0);
@@ -550,7 +565,7 @@ EXPORT_SYMBOL(bdget);
 */
 struct block_device *bdgrab(struct block_device *bdev)
 {
-        atomic_inc(&bdev->bd_inode->i_count);
+        ihold(bdev->bd_inode);
        return bdev;
 }
@@ -580,7 +595,7 @@ static struct block_device *bd_acquire(struct inode *inode)
        spin_lock(&bdev_lock);
        bdev = inode->i_bdev;
        if (bdev) {
-                atomic_inc(&bdev->bd_inode->i_count);
+                ihold(bdev->bd_inode);
                spin_unlock(&bdev_lock);
                return bdev;
        }
@@ -591,12 +606,12 @@ static struct block_device *bd_acquire(struct inode *inode)
                spin_lock(&bdev_lock);
                if (!inode->i_bdev) {
                        /*
-                         * We take an additional bd_inode->i_count for inode,
+                         * We take an additional reference to bd_inode,
                         * and it's released in clear_inode() of inode.
                         * So, we can access it via ->i_mapping always
                         * without igrab().
                         */
-                        atomic_inc(&bdev->bd_inode->i_count);
+                        ihold(bdev->bd_inode);
                        inode->i_bdev = bdev;
                        inode->i_mapping = bdev->bd_inode->i_mapping;
                        list_add(&inode->i_devices, &bdev->bd_inodes);
@@ -1390,7 +1405,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                                bdi = blk_get_backing_dev_info(bdev);
                                if (bdi == NULL)
                                        bdi = &default_backing_dev_info;
-                                bdev->bd_inode->i_data.backing_dev_info = bdi;
+                                bdev_inode_switch_bdi(bdev->bd_inode, bdi);
                        }
                        if (bdev->bd_invalidated)
                                rescan_partitions(disk, bdev);
@@ -1405,8 +1420,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                        if (ret)
                                goto out_clear;
                        bdev->bd_contains = whole;
-                        bdev->bd_inode->i_data.backing_dev_info =
+                        bdev_inode_switch_bdi(bdev->bd_inode,
-                           whole->bd_inode->i_data.backing_dev_info;
+                                whole->bd_inode->i_data.backing_dev_info);
                        bdev->bd_part = disk_get_part(disk, partno);
                        if (!(disk->flags & GENHD_FL_UP) ||
                            !bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1439,7 +1454,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
        disk_put_part(bdev->bd_part);
        bdev->bd_disk = NULL;
        bdev->bd_part = NULL;
-        bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
+        bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
        if (bdev != bdev->bd_contains)
                __blkdev_put(bdev->bd_contains, mode, 1);
        bdev->bd_contains = NULL;
@@ -1533,7 +1548,8 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
                disk_put_part(bdev->bd_part);
                bdev->bd_part = NULL;
                bdev->bd_disk = NULL;
-                bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
+                bdev_inode_switch_bdi(bdev->bd_inode,
+                                        &default_backing_dev_info);
                if (bdev != bdev->bd_contains)
                        victim = bdev->bd_contains;
                bdev->bd_contains = NULL;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c03864406af3..64f99cf69ce0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3849,7 +3849,7 @@ again:
        p = &root->inode_tree.rb_node;
        parent = NULL;
-        if (hlist_unhashed(&inode->i_hash))
+        if (inode_unhashed(inode))
                return;
        spin_lock(&root->inode_lock);
@@ -4758,7 +4758,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        }
        btrfs_set_trans_block_group(trans, dir);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        err = btrfs_add_nondir(trans, dentry, inode, 1, index);
diff --git a/fs/buffer.c b/fs/buffer.c
index 8d595ab2aed1..5930e382959b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1833,9 +1833,11 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 }
 EXPORT_SYMBOL(page_zero_new_buffers);
-int block_prepare_write(struct page *page, unsigned from, unsigned to,
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                get_block_t *get_block)
 {
+        unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+        unsigned to = from + len;
        struct inode *inode = page->mapping->host;
        unsigned block_start, block_end;
        sector_t block;
@@ -1915,7 +1917,7 @@ int block_prepare_write(struct page *page, unsigned from, unsigned to,
        }
        return err;
 }
-EXPORT_SYMBOL(block_prepare_write);
+EXPORT_SYMBOL(__block_write_begin);
 static int __block_commit_write(struct inode *inode, struct page *page,
                unsigned from, unsigned to)
@@ -1952,15 +1954,6 @@ static int __block_commit_write(struct inode *inode, struct page *page,
        return 0;
 }
-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
-                get_block_t *get_block)
-{
-        unsigned start = pos & (PAGE_CACHE_SIZE - 1);
-        return block_prepare_write(page, start, start + len, get_block);
-}
-EXPORT_SYMBOL(__block_write_begin);
 /*
 * block_write_begin takes care of the basic task of block allocation and
 * bringing partial write blocks uptodate first.
@@ -2378,7 +2371,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
        else
                end = PAGE_CACHE_SIZE;
-        ret = block_prepare_write(page, 0, end, get_block);
+        ret = __block_write_begin(page, 0, end, get_block);
        if (!ret)
                ret = block_commit_write(page, 0, end);
@@ -2465,11 +2458,10 @@ int nobh_write_begin(struct address_space *mapping,
        *fsdata = NULL;
        if (page_has_buffers(page)) {
-                unlock_page(page);
+                ret = __block_write_begin(page, pos, len, get_block);
-                page_cache_release(page);
+                if (unlikely(ret))
-                *pagep = NULL;
+                        goto out_release;
-                return block_write_begin(mapping, pos, len, flags, pagep,
+                return ret;
-                                         get_block);
        }
        if (PageMappedToDisk(page))
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 96fbeab77f2f..5d8b35539601 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -276,7 +276,7 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode,
        }
        coda_dir_update_mtime(dir_inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        d_instantiate(de, inode);
        inc_nlink(inode);
        return 0;
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index cf78d44a8d6a..253476d78ed8 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -135,6 +135,7 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
 {
        struct inode * inode = new_inode(configfs_sb);
        if (inode) {
+                inode->i_ino = get_next_ino();
                inode->i_mapping->a_ops = &configfs_aops;
                inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
                inode->i_op = &configfs_inode_operations;
diff --git a/fs/dcache.c b/fs/dcache.c
index 83293be48149..23702a9d4e6d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -67,33 +67,43 @@ struct dentry_stat_t dentry_stat = {
        .age_limit = 45,
 };
-static void __d_free(struct dentry *dentry)
+static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
+static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp;
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
+                   size_t *lenp, loff_t *ppos)
+{
+        dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
+        dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+        return proc_dointvec(table, write, buffer, lenp, ppos);
+}
+#endif
+static void __d_free(struct rcu_head *head)
 {
+        struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
        WARN_ON(!list_empty(&dentry->d_alias));
        if (dname_external(dentry))
                kfree(dentry->d_name.name);
        kmem_cache_free(dentry_cache, dentry); 
 }
-static void d_callback(struct rcu_head *head)
-{
-        struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu);
-        __d_free(dentry);
-}
 /*
- * no dcache_lock, please.  The caller must decrement dentry_stat.nr_dentry
+ * no dcache_lock, please.
- * inside dcache_lock.
 */
 static void d_free(struct dentry *dentry)
 {
+        percpu_counter_dec(&nr_dentry);
        if (dentry->d_op && dentry->d_op->d_release)
                dentry->d_op->d_release(dentry);
        /* if dentry was never inserted into hash, immediate free is OK */
        if (hlist_unhashed(&dentry->d_hash))
-                __d_free(dentry);
+                __d_free(&dentry->d_u.d_rcu);
        else
-                call_rcu(&dentry->d_u.d_rcu, d_callback);
+                call_rcu(&dentry->d_u.d_rcu, __d_free);
 }
 /*
@@ -123,37 +133,34 @@ static void dentry_iput(struct dentry * dentry)
 }
 /*
- * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held.
+ * dentry_lru_(add|del|move_tail) must be called with dcache_lock held.
 */
 static void dentry_lru_add(struct dentry *dentry)
 {
-        list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+        if (list_empty(&dentry->d_lru)) {
-        dentry->d_sb->s_nr_dentry_unused++;
+                list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
-        dentry_stat.nr_unused++;
+                dentry->d_sb->s_nr_dentry_unused++;
-}
+                percpu_counter_inc(&nr_dentry_unused);
+        }
-static void dentry_lru_add_tail(struct dentry *dentry)
-{
-        list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
-        dentry->d_sb->s_nr_dentry_unused++;
-        dentry_stat.nr_unused++;
 }
 static void dentry_lru_del(struct dentry *dentry)
 {
        if (!list_empty(&dentry->d_lru)) {
-                list_del(&dentry->d_lru);
+                list_del_init(&dentry->d_lru);
                dentry->d_sb->s_nr_dentry_unused--;
-                dentry_stat.nr_unused--;
+                percpu_counter_dec(&nr_dentry_unused);
        }
 }
-static void dentry_lru_del_init(struct dentry *dentry)
+static void dentry_lru_move_tail(struct dentry *dentry)
 {
-        if (likely(!list_empty(&dentry->d_lru))) {
+        if (list_empty(&dentry->d_lru)) {
-                list_del_init(&dentry->d_lru);
+                list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
-                dentry->d_sb->s_nr_dentry_unused--;
+                dentry->d_sb->s_nr_dentry_unused++;
-                dentry_stat.nr_unused--;
+                percpu_counter_inc(&nr_dentry_unused);
+        } else {
+                list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
        }
 }
@@ -172,7 +179,6 @@ static struct dentry *d_kill(struct dentry *dentry)
        struct dentry *parent;
        list_del(&dentry->d_u.d_child);
-        dentry_stat.nr_dentry--;        /* For d_free, below */
        /*drops the locks, at that point nobody can reach this dentry */
        dentry_iput(dentry);
        if (IS_ROOT(dentry))
@@ -237,13 +243,15 @@ repeat:
                if (dentry->d_op->d_delete(dentry))
                        goto unhash_it;
        }
        /* Unreachable? Get rid of it */
        if (d_unhashed(dentry))
                goto kill_it;
-        if (list_empty(&dentry->d_lru)) {
-                dentry->d_flags |= DCACHE_REFERENCED;
+        /* Otherwise leave it cached and ensure it's on the LRU */
-                dentry_lru_add(dentry);
+        dentry->d_flags |= DCACHE_REFERENCED;
-        }
+        dentry_lru_add(dentry);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&dcache_lock);
        return;
@@ -318,11 +326,10 @@ int d_invalidate(struct dentry * dentry)
 EXPORT_SYMBOL(d_invalidate);
 /* This should be called _only_ with dcache_lock held */
 static inline struct dentry * __dget_locked(struct dentry *dentry)
 {
        atomic_inc(&dentry->d_count);
-        dentry_lru_del_init(dentry);
+        dentry_lru_del(dentry);
        return dentry;
 }
@@ -441,73 +448,27 @@ static void prune_one_dentry(struct dentry * dentry)
                if (dentry->d_op && dentry->d_op->d_delete)
                        dentry->d_op->d_delete(dentry);
-                dentry_lru_del_init(dentry);
+                dentry_lru_del(dentry);
                __d_drop(dentry);
                dentry = d_kill(dentry);
                spin_lock(&dcache_lock);
        }
 }
-/*
+static void shrink_dentry_list(struct list_head *list)
- * Shrink the dentry LRU on a given superblock.
- * @sb   : superblock to shrink dentry LRU.
- * @count: If count is NULL, we prune all dentries on superblock.
- * @flags: If flags is non-zero, we need to do special processing based on
- * which flags are set. This means we don't need to maintain multiple
- * similar copies of this loop.
- */
-static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
 {
-        LIST_HEAD(referenced);
-        LIST_HEAD(tmp);
        struct dentry *dentry;
-        int cnt = 0;
-        BUG_ON(!sb);
+        while (!list_empty(list)) {
-        BUG_ON((flags & DCACHE_REFERENCED) && count == NULL);
+                dentry = list_entry(list->prev, struct dentry, d_lru);
-        spin_lock(&dcache_lock);
+                dentry_lru_del(dentry);
-        if (count != NULL)
-                /* called from prune_dcache() and shrink_dcache_parent() */
-                cnt = *count;
-restart:
-        if (count == NULL)
-                list_splice_init(&sb->s_dentry_lru, &tmp);
-        else {
-                while (!list_empty(&sb->s_dentry_lru)) {
-                        dentry = list_entry(sb->s_dentry_lru.prev,
-                                        struct dentry, d_lru);
-                        BUG_ON(dentry->d_sb != sb);
-                        spin_lock(&dentry->d_lock);
-                        /*
-                         * If we are honouring the DCACHE_REFERENCED flag and
-                         * the dentry has this flag set, don't free it. Clear
-                         * the flag and put it back on the LRU.
-                         */
-                        if ((flags & DCACHE_REFERENCED)
-                                && (dentry->d_flags & DCACHE_REFERENCED)) {
-                                dentry->d_flags &= ~DCACHE_REFERENCED;
-                                list_move(&dentry->d_lru, &referenced);
-                                spin_unlock(&dentry->d_lock);
-                        } else {
-                                list_move_tail(&dentry->d_lru, &tmp);
-                                spin_unlock(&dentry->d_lock);
-                                cnt--;
-                                if (!cnt)
-                                        break;
-                        }
-                        cond_resched_lock(&dcache_lock);
-                }
-        }
-        while (!list_empty(&tmp)) {
-                dentry = list_entry(tmp.prev, struct dentry, d_lru);
-                dentry_lru_del_init(dentry);
-                spin_lock(&dentry->d_lock);
                /*
                 * We found an inuse dentry which was not removed from
                 * the LRU because of laziness during lookup.  Do not free
                 * it - just keep it off the LRU list.
                 */
+                spin_lock(&dentry->d_lock);
                if (atomic_read(&dentry->d_count)) {
                        spin_unlock(&dentry->d_lock);
                        continue;
@@ -516,13 +477,60 @@ restart:
                /* dentry->d_lock was dropped in prune_one_dentry() */
                cond_resched_lock(&dcache_lock);
        }
-        if (count == NULL && !list_empty(&sb->s_dentry_lru))
+}
-                goto restart;
-        if (count != NULL)
+/**
-                *count = cnt;
+ * __shrink_dcache_sb - shrink the dentry LRU on a given superblock
+ * @sb:         superblock to shrink dentry LRU.
+ * @count:      number of entries to prune
+ * @flags:      flags to control the dentry processing
+ *
+ * If flags contains DCACHE_REFERENCED, referenced dentries will not be pruned.
+ */
+static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
+{
+        /* called from prune_dcache() and shrink_dcache_parent() */
+        struct dentry *dentry;
+        LIST_HEAD(referenced);
+        LIST_HEAD(tmp);
+        int cnt = *count;
+        spin_lock(&dcache_lock);
+        while (!list_empty(&sb->s_dentry_lru)) {
+                dentry = list_entry(sb->s_dentry_lru.prev,
+                                struct dentry, d_lru);
+                BUG_ON(dentry->d_sb != sb);
+                /*
+                 * If we are honouring the DCACHE_REFERENCED flag and the
+                 * dentry has this flag set, don't free it.  Clear the flag
+                 * and put it back on the LRU.
+                 */
+                if (flags & DCACHE_REFERENCED) {
+                        spin_lock(&dentry->d_lock);
+                        if (dentry->d_flags & DCACHE_REFERENCED) {
+                                dentry->d_flags &= ~DCACHE_REFERENCED;
+                                list_move(&dentry->d_lru, &referenced);
+                                spin_unlock(&dentry->d_lock);
+                                cond_resched_lock(&dcache_lock);
+                                continue;
+                        }
+                        spin_unlock(&dentry->d_lock);
+                }
+                list_move_tail(&dentry->d_lru, &tmp);
+                if (!--cnt)
+                        break;
+                cond_resched_lock(&dcache_lock);
+        }
+        *count = cnt;
+        shrink_dentry_list(&tmp);
        if (!list_empty(&referenced))
                list_splice(&referenced, &sb->s_dentry_lru);
        spin_unlock(&dcache_lock);
 }
 /**
@@ -538,7 +546,7 @@ static void prune_dcache(int count)
 {
        struct super_block *sb, *p = NULL;
        int w_count;
-        int unused = dentry_stat.nr_unused;
+        int unused = percpu_counter_sum_positive(&nr_dentry_unused);
        int prune_ratio;
        int pruned;
@@ -608,13 +616,19 @@ static void prune_dcache(int count)
 * shrink_dcache_sb - shrink dcache for a superblock
 * @sb: superblock
 *
- * Shrink the dcache for the specified super block. This
+ * Shrink the dcache for the specified super block. This is used to free
- * is used to free the dcache before unmounting a file
+ * the dcache before unmounting a file system.
- * system
 */
-void shrink_dcache_sb(struct super_block * sb)
+void shrink_dcache_sb(struct super_block *sb)
 {
-        __shrink_dcache_sb(sb, NULL, 0);
+        LIST_HEAD(tmp);
+        spin_lock(&dcache_lock);
+        while (!list_empty(&sb->s_dentry_lru)) {
+                list_splice_init(&sb->s_dentry_lru, &tmp);
+                shrink_dentry_list(&tmp);
+        }
+        spin_unlock(&dcache_lock);
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
@@ -632,7 +646,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
        /* detach this root from the system */
        spin_lock(&dcache_lock);
-        dentry_lru_del_init(dentry);
+        dentry_lru_del(dentry);
        __d_drop(dentry);
        spin_unlock(&dcache_lock);
@@ -646,7 +660,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
                        spin_lock(&dcache_lock);
                        list_for_each_entry(loop, &dentry->d_subdirs,
                                            d_u.d_child) {
-                                dentry_lru_del_init(loop);
+                                dentry_lru_del(loop);
                                __d_drop(loop);
                                cond_resched_lock(&dcache_lock);
                        }
@@ -703,20 +717,13 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
                         * otherwise we ascend to the parent and move to the
                         * next sibling if there is one */
                        if (!parent)
-                                goto out;
+                                return;
                        dentry = parent;
                } while (list_empty(&dentry->d_subdirs));
                dentry = list_entry(dentry->d_subdirs.next,
                                    struct dentry, d_u.d_child);
        }
-out:
-        /* several dentries were freed, need to correct nr_dentry */
-        spin_lock(&dcache_lock);
-        dentry_stat.nr_dentry -= detached;
-        spin_unlock(&dcache_lock);
 }
 /*
@@ -830,14 +837,15 @@ resume:
                struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
                next = tmp->next;
-                dentry_lru_del_init(dentry);
                /* 
                 * move only zero ref count dentries to the end 
                 * of the unused list for prune_dcache
                 */
                if (!atomic_read(&dentry->d_count)) {
-                        dentry_lru_add_tail(dentry);
+                        dentry_lru_move_tail(dentry);
                        found++;
+                } else {
+                        dentry_lru_del(dentry);
                }
                /*
@@ -900,12 +908,16 @@ EXPORT_SYMBOL(shrink_dcache_parent);
 */
 static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
+        int nr_unused;
        if (nr) {
                if (!(gfp_mask & __GFP_FS))
                        return -1;
                prune_dcache(nr);
        }
-        return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+        nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+        return (nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
 static struct shrinker dcache_shrinker = {
@@ -972,9 +984,10 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
        spin_lock(&dcache_lock);
        if (parent)
                list_add(&dentry->d_u.d_child, &parent->d_subdirs);
-        dentry_stat.nr_dentry++;
        spin_unlock(&dcache_lock);
+        percpu_counter_inc(&nr_dentry);
        return dentry;
 }
 EXPORT_SYMBOL(d_alloc);
@@ -1478,33 +1491,26 @@ out:
 * This is used by ncpfs in its readdir implementation.
 * Zero is returned in the dentry is invalid.
 */
- 
+int d_validate(struct dentry *dentry, struct dentry *parent)
-int d_validate(struct dentry *dentry, struct dentry *dparent)
 {
-        struct hlist_head *base;
+        struct hlist_head *head = d_hash(parent, dentry->d_name.hash);
-        struct hlist_node *lhp;
+        struct hlist_node *node;
+        struct dentry *d;
        /* Check whether the ptr might be valid at all.. */
        if (!kmem_ptr_validate(dentry_cache, dentry))
-                goto out;
+                return 0;
+        if (dentry->d_parent != parent)
-        if (dentry->d_parent != dparent)
+                return 0;
-                goto out;
-        spin_lock(&dcache_lock);
+        rcu_read_lock();
-        base = d_hash(dparent, dentry->d_name.hash);
+        hlist_for_each_entry_rcu(d, node, head, d_hash) {
-        hlist_for_each(lhp,base) { 
+                if (d == dentry) {
-                /* hlist_for_each_entry_rcu() not required for d_hash list
+                        dget(dentry);
-                 * as it is parsed under dcache_lock
-                 */
-                if (dentry == hlist_entry(lhp, struct dentry, d_hash)) {
-                        __dget_locked(dentry);
-                        spin_unlock(&dcache_lock);
                        return 1;
                }
        }
-        spin_unlock(&dcache_lock);
+        rcu_read_unlock();
-out:
        return 0;
 }
 EXPORT_SYMBOL(d_validate);
@@ -1994,7 +2000,7 @@ global_root:
 * Returns a pointer into the buffer or an error code if the
 * path was too long.
 *
- * "buflen" should be positive. Caller holds the dcache_lock.
+ * "buflen" should be positive.
 *
 * If path is not reachable from the supplied root, then the value of
 * root is changed (without modifying refcounts).
@@ -2006,10 +2012,12 @@ char *__d_path(const struct path *path, struct path *root,
        int error;
        prepend(&res, &buflen, "\0", 1);
+        spin_lock(&dcache_lock);
        error = prepend_path(path, root, &res, &buflen);
+        spin_unlock(&dcache_lock);
        if (error)
                return ERR_PTR(error);
        return res;
 }
@@ -2419,6 +2427,9 @@ static void __init dcache_init(void)
 {
        int loop;
+        percpu_counter_init(&nr_dentry, 0);
+        percpu_counter_init(&nr_dentry_unused, 0);
        /* 
         * A constructor could be added for stable state like the lists,
         * but it is probably not worth it because of the cache nature
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 30a87b3dbcac..a4ed8380e98a 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -40,6 +40,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
        struct inode *inode = new_inode(sb);
        if (inode) {
+                inode->i_ino = get_next_ino();
                inode->i_mode = mode;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                switch (mode & S_IFMT) {
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 68cb23e3bb98..b905c79b4f0a 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -46,10 +46,6 @@ static int exofs_file_fsync(struct file *filp, int datasync)
 {
        int ret;
        struct inode *inode = filp->f_mapping->host;
-        struct writeback_control wbc = {
-                .sync_mode = WB_SYNC_ALL,
-                .nr_to_write = 0, /* metadata-only; caller takes care of data */
-        };
        struct super_block *sb;
        if (!(inode->i_state & I_DIRTY))
@@ -57,7 +53,7 @@ static int exofs_file_fsync(struct file *filp, int datasync)
        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
                return 0;
-        ret = sync_inode(inode, &wbc);
+        ret = sync_inode_metadata(inode, 1);
        /* This is a good place to write the sb */
        /* TODO: Sechedule an sb-sync on create */
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index b7dd0c236863..264e95d02830 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -153,7 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
        inode->i_ctime = CURRENT_TIME;
        inode_inc_link_count(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        return exofs_add_nondir(dentry, inode);
 }
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index e9e175949a63..51b304056f10 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -74,21 +74,20 @@ static struct dentry *
 find_disconnected_root(struct dentry *dentry)
 {
        dget(dentry);
-        spin_lock(&dentry->d_lock);
+        while (!IS_ROOT(dentry)) {
-        while (!IS_ROOT(dentry) &&
+                struct dentry *parent = dget_parent(dentry);
-               (dentry->d_parent->d_flags & DCACHE_DISCONNECTED)) {
-                struct dentry *parent = dentry->d_parent;
+                if (!(parent->d_flags & DCACHE_DISCONNECTED)) {
-                dget(parent);
+                        dput(parent);
-                spin_unlock(&dentry->d_lock);
+                        break;
+                }
                dput(dentry);
                dentry = parent;
-                spin_lock(&dentry->d_lock);
        }
-        spin_unlock(&dentry->d_lock);
        return dentry;
 }
 /*
 * Make sure target_dir is fully connected to the dentry tree.
 *
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 764109886ec0..2709b34206ab 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -98,7 +98,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
        if (IS_DIRSYNC(dir)) {
                err = write_one_page(page, 1);
                if (!err)
-                        err = ext2_sync_inode(dir);
+                        err = sync_inode_metadata(dir, 1);
        } else {
                unlock_page(page);
        }
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 416daa62242c..6346a2acf326 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -120,7 +120,6 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
 extern struct inode *ext2_iget (struct super_block *, unsigned long);
 extern int ext2_write_inode (struct inode *, struct writeback_control *);
 extern void ext2_evict_inode(struct inode *);
-extern int ext2_sync_inode (struct inode *);
 extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern int ext2_setattr (struct dentry *, struct iattr *);
 extern void ext2_set_inode_flags(struct inode *inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 533699c16040..40ad210a5049 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1203,7 +1203,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
        inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
        if (inode_needs_sync(inode)) {
                sync_mapping_buffers(inode->i_mapping);
-                ext2_sync_inode (inode);
+                sync_inode_metadata(inode, 1);
        } else {
                mark_inode_dirty(inode);
        }
@@ -1523,15 +1523,6 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
        return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
 }
-int ext2_sync_inode(struct inode *inode)
-{
-        struct writeback_control wbc = {
-                .sync_mode = WB_SYNC_ALL,
-                .nr_to_write = 0,       /* sys_fsync did this */
-        };
-        return sync_inode(inode, &wbc);
-}
 int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 {
        struct inode *inode = dentry->d_inode;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 71efb0e9a3f2..f8aecd2e3297 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -206,7 +206,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
        inode->i_ctime = CURRENT_TIME_SEC;
        inode_inc_link_count(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        err = ext2_add_link(dentry, inode);
        if (!err) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 85df87d0f7b7..0901320671da 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1221,9 +1221,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
        }
        es = sbi->s_es;
-        if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
+        if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) {
-            (old_mount_opt & EXT2_MOUNT_XIP)) &&
-            invalidate_inodes(sb)) {
                ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
                         "xip flag with busy inodes while remounting");
                sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 8c29ae15129e..f84700be3274 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -699,7 +699,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
        EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
        inode->i_ctime = CURRENT_TIME_SEC;
        if (IS_SYNC(inode)) {
-                error = ext2_sync_inode (inode);
+                error = sync_inode_metadata(inode, 1);
                /* In case sync failed due to ENOSPC the inode was actually
                 * written (only some dirty data were not) so we just proceed
                 * as if nothing happened and cleanup the unused block */
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5e0faf4cda79..ad05353040a1 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1696,8 +1696,8 @@ static int ext3_journalled_writepage(struct page *page,
                 * doesn't seem much point in redirtying the page here.
                 */
                ClearPageChecked(page);
-                ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+                ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
-                                        ext3_get_block);
+                                          ext3_get_block);
                if (ret != 0) {
                        ext3_journal_stop(handle);
                        goto out_unlock;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 2b35ddb70d65..bce9dce639b8 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2260,7 +2260,7 @@ retry:
        inode->i_ctime = CURRENT_TIME_SEC;
        inc_nlink(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        err = ext3_add_entry(handle, dentry, inode);
        if (!err) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4b8debeb3965..49635ef236f8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1538,10 +1538,10 @@ static int do_journal_get_write_access(handle_t *handle,
        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
        /*
-         * __block_prepare_write() could have dirtied some buffers. Clean
+         * __block_write_begin() could have dirtied some buffers. Clean
         * the dirty bit as jbd2_journal_get_write_access() could complain
         * otherwise about fs integrity issues. Setting of the dirty bit
-         * by __block_prepare_write() isn't a real problem here as we clear
+         * by __block_write_begin() isn't a real problem here as we clear
         * the bit before releasing a page lock and thus writeback cannot
         * ever write the buffer.
         */
@@ -2550,8 +2550,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                if (buffer_delay(bh))
                        return 0; /* Not sure this could or should happen */
                /*
-                 * XXX: __block_prepare_write() unmaps passed block,
+                 * XXX: __block_write_begin() unmaps passed block, is it OK?
-                 * is it OK?
                 */
                ret = ext4_da_reserve_space(inode, iblock);
                if (ret)
@@ -2583,7 +2582,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 /*
 * This function is used as a standard get_block_t calback function
 * when there is no desire to allocate any blocks.  It is used as a
- * callback function for block_prepare_write() and block_write_full_page().
+ * callback function for block_write_begin() and block_write_full_page().
 * These functions should only try to map a single block at a time.
 *
 * Since this function doesn't do block allocations even if the caller
@@ -2743,7 +2742,7 @@ static int ext4_writepage(struct page *page,
                 * all are mapped and non delay. We don't want to
                 * do block allocation here.
                 */
-                ret = block_prepare_write(page, 0, len,
+                ret = __block_write_begin(page, 0, len,
                                          noalloc_get_block_write);
                if (!ret) {
                        page_bufs = page_buffers(page);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 19aa0d44d822..42f77b1dc72d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2373,6 +2373,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
                printk(KERN_ERR "EXT4-fs: can't get new inode\n");
                goto err_freesgi;
        }
+        sbi->s_buddy_cache->i_ino = get_next_ino();
        EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
        for (i = 0; i < ngroups; i++) {
                desc = ext4_get_group_desc(sb, i, NULL);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 314c0d3b3fa9..bd39885b5998 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2312,7 +2312,7 @@ retry:
        inode->i_ctime = ext4_current_time(inode);
        ext4_inc_count(handle, inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        err = ext4_add_entry(handle, dentry, inode);
        if (!err) {
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 79d1b4ea13e7..8c04eac5079d 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -260,6 +260,7 @@ vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip)
        struct inode                    *ip = NULL;
        if ((ip = new_inode(sbp))) {
+                ip->i_ino = get_next_ino();
                vxfs_iinit(ip, vip);
                ip->i_mapping->a_ops = &vxfs_aops;
        }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9e46aec10d1a..aed881a76b22 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -79,6 +79,11 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
        return sb->s_bdi;
 }
+static inline struct inode *wb_inode(struct list_head *head)
+{
+        return list_entry(head, struct inode, i_wb_list);
+}
 static void bdi_queue_work(struct backing_dev_info *bdi,
                struct wb_writeback_work *work)
 {
@@ -172,11 +177,11 @@ static void redirty_tail(struct inode *inode)
        if (!list_empty(&wb->b_dirty)) {
                struct inode *tail;
-                tail = list_entry(wb->b_dirty.next, struct inode, i_list);
+                tail = wb_inode(wb->b_dirty.next);
                if (time_before(inode->dirtied_when, tail->dirtied_when))
                        inode->dirtied_when = jiffies;
        }
-        list_move(&inode->i_list, &wb->b_dirty);
+        list_move(&inode->i_wb_list, &wb->b_dirty);
 }
 /*
@@ -186,7 +191,7 @@ static void requeue_io(struct inode *inode)
 {
        struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-        list_move(&inode->i_list, &wb->b_more_io);
+        list_move(&inode->i_wb_list, &wb->b_more_io);
 }
 static void inode_sync_complete(struct inode *inode)
@@ -227,14 +232,14 @@ static void move_expired_inodes(struct list_head *delaying_queue,
        int do_sb_sort = 0;
        while (!list_empty(delaying_queue)) {
-                inode = list_entry(delaying_queue->prev, struct inode, i_list);
+                inode = wb_inode(delaying_queue->prev);
                if (older_than_this &&
                    inode_dirtied_after(inode, *older_than_this))
                        break;
                if (sb && sb != inode->i_sb)
                        do_sb_sort = 1;
                sb = inode->i_sb;
-                list_move(&inode->i_list, &tmp);
+                list_move(&inode->i_wb_list, &tmp);
        }
        /* just one sb in list, splice to dispatch_queue and we're done */
@@ -245,12 +250,11 @@ static void move_expired_inodes(struct list_head *delaying_queue,
        /* Move inodes from one superblock together */
        while (!list_empty(&tmp)) {
-                inode = list_entry(tmp.prev, struct inode, i_list);
+                sb = wb_inode(tmp.prev)->i_sb;
-                sb = inode->i_sb;
                list_for_each_prev_safe(pos, node, &tmp) {
-                        inode = list_entry(pos, struct inode, i_list);
+                        inode = wb_inode(pos);
                        if (inode->i_sb == sb)
-                                list_move(&inode->i_list, dispatch_queue);
+                                list_move(&inode->i_wb_list, dispatch_queue);
                }
        }
 }
@@ -408,16 +412,13 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
                         * completion.
                         */
                        redirty_tail(inode);
-                } else if (atomic_read(&inode->i_count)) {
-                        /*
-                         * The inode is clean, inuse
-                         */
-                        list_move(&inode->i_list, &inode_in_use);
                } else {
                        /*
-                         * The inode is clean, unused
+                         * The inode is clean.  At this point we either have
+                         * a reference to the inode or it's on its way out.
+                         * No need to add it back to the LRU.
                         */
-                        list_move(&inode->i_list, &inode_unused);
+                        list_del_init(&inode->i_wb_list);
                }
        }
        inode_sync_complete(inode);
@@ -465,8 +466,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 {
        while (!list_empty(&wb->b_io)) {
                long pages_skipped;
-                struct inode *inode = list_entry(wb->b_io.prev,
+                struct inode *inode = wb_inode(wb->b_io.prev);
-                                                 struct inode, i_list);
                if (inode->i_sb != sb) {
                        if (only_this_sb) {
@@ -487,10 +487,16 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
                        return 0;
                }
-                if (inode->i_state & (I_NEW | I_WILL_FREE)) {
+                /*
+                 * Don't bother with new inodes or inodes being freed, first
+                 * kind does not need periodic writeout yet, and for the latter
+                 * kind writeout is handled by the freer.
+                 */
+                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        requeue_io(inode);
                        continue;
                }
                /*
                 * Was this inode dirtied after sync_sb_inodes was called?
                 * This keeps sync from extra jobs and livelock.
@@ -498,7 +504,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
                if (inode_dirtied_after(inode, wbc->wb_start))
                        return 1;
-                BUG_ON(inode->i_state & I_FREEING);
                __iget(inode);
                pages_skipped = wbc->pages_skipped;
                writeback_single_inode(inode, wbc);
@@ -536,8 +541,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
                queue_io(wb, wbc->older_than_this);
        while (!list_empty(&wb->b_io)) {
-                struct inode *inode = list_entry(wb->b_io.prev,
+                struct inode *inode = wb_inode(wb->b_io.prev);
-                                                 struct inode, i_list);
                struct super_block *sb = inode->i_sb;
                if (!pin_sb_for_writeback(sb)) {
@@ -675,8 +679,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                 */
                spin_lock(&inode_lock);
                if (!list_empty(&wb->b_more_io))  {
-                        inode = list_entry(wb->b_more_io.prev,
+                        inode = wb_inode(wb->b_more_io.prev);
-                                                struct inode, i_list);
                        trace_wbc_writeback_wait(&wbc, wb->bdi);
                        inode_wait_for_writeback(inode);
                }
@@ -727,7 +730,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
         */
        nr_pages = global_page_state(NR_FILE_DIRTY) +
                        global_page_state(NR_UNSTABLE_NFS) +
-                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+                        get_nr_dirty_inodes();
        if (nr_pages) {
                struct wb_writeback_work work = {
@@ -966,7 +969,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
                 * dirty list.  Add blockdev inodes as well.
                 */
                if (!S_ISBLK(inode->i_mode)) {
-                        if (hlist_unhashed(&inode->i_hash))
+                        if (inode_unhashed(inode))
                                goto out;
                }
                if (inode->i_state & I_FREEING)
@@ -994,7 +997,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
                        }
                        inode->dirtied_when = jiffies;
-                        list_move(&inode->i_list, &bdi->wb.b_dirty);
+                        list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
                }
        }
 out:
@@ -1094,8 +1097,7 @@ void writeback_inodes_sb(struct super_block *sb)
        WARN_ON(!rwsem_is_locked(&sb->s_umount));
-        work.nr_pages = nr_dirty + nr_unstable +
+        work.nr_pages = nr_dirty + nr_unstable + get_nr_dirty_inodes();
-                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);
        bdi_queue_work(sb->s_bdi, &work);
        wait_for_completion(&done);
@@ -1202,3 +1204,23 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
        return ret;
 }
 EXPORT_SYMBOL(sync_inode);
+/**
+ * sync_inode_metadata - write an inode to disk
+ * @inode: the inode to sync
+ * @wait: wait for I/O to complete.
+ *
+ * Write an inode to disk and adjust its dirty state after completion.
+ *
+ * Note: only writes the actual inode, no associated data or other metadata.
+ */
+int sync_inode_metadata(struct inode *inode, int wait)
+{
+        struct writeback_control wbc = {
+                .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+                .nr_to_write = 0, /* metadata-only */
+        };
+        return sync_inode(inode, &wbc);
+}
+EXPORT_SYMBOL(sync_inode_metadata);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 7367e177186f..4eba07661e5c 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -222,6 +222,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
        if (!inode)
                return NULL;
+        inode->i_ino = get_next_ino();
        inode->i_mode = mode;
        inode->i_uid = fc->user_id;
        inode->i_gid = fc->group_id;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 6b24afb96aae..4f36f8832b9b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -618,7 +618,6 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
        struct gfs2_alloc *al = NULL;
        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
        unsigned from = pos & (PAGE_CACHE_SIZE - 1);
-        unsigned to = from + len;
        struct page *page;
        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
@@ -691,7 +690,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
        }
 prepare_write:
-        error = block_prepare_write(page, from, to, gfs2_block_map);
+        error = __block_write_begin(page, from, len, gfs2_block_map);
 out:
        if (error == 0)
                return 0;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index aeafc233dc89..cade1acbcea9 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1219,7 +1219,6 @@ fail_sb:
 fail_locking:
        init_locking(sdp, &mount_gh, UNDO);
 fail_lm:
-        invalidate_inodes(sb);
        gfs2_gl_hash_clear(sdp);
        gfs2_lm_unmount(sdp);
 fail_sys:
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 0534510200d5..12cbea7502c2 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -255,7 +255,7 @@ out_parent:
        gfs2_holder_uninit(ghs);
        gfs2_holder_uninit(ghs + 1);
        if (!error) {
-                atomic_inc(&inode->i_count);
+                ihold(inode);
                d_instantiate(dentry, inode);
                mark_inode_dirty(inode);
        }
@@ -1294,7 +1294,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
        int error;
        if (!page_has_buffers(page)) {
-                error = block_prepare_write(page, from, to, gfs2_block_map);
+                error = __block_write_begin(page, from, to - from, gfs2_block_map);
                if (unlikely(error))
                        return error;
@@ -1313,7 +1313,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
                next += bh->b_size;
                if (buffer_mapped(bh)) {
                        if (end) {
-                                error = block_prepare_write(page, start, end,
+                                error = __block_write_begin(page, start, end - start,
                                                            gfs2_block_map);
                                if (unlikely(error))
                                        return error;
@@ -1328,7 +1328,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
        } while (next < to);
        if (end) {
-                error = block_prepare_write(page, start, end, gfs2_block_map);
+                error = __block_write_begin(page, start, end - start, gfs2_block_map);
                if (unlikely(error))
                        return error;
                empty_write_end(page, start, end);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 047d1176096c..2b2c4997430b 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -857,7 +857,6 @@ restart:
        gfs2_clear_rgrpd(sdp);
        gfs2_jindex_free(sdp);
        /*  Take apart glock structures and buffer lists  */
-        invalidate_inodes(sdp->sd_vfs);
        gfs2_gl_hash_clear(sdp);
        /*  Unmount the locking protocol  */
        gfs2_lm_unmount(sdp);
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 4f55651aaa51..c8cffb81e849 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -147,8 +147,6 @@ struct hfs_sb_info {
        u16 blockoffset;
        int fs_div;
-        struct hlist_head rsrc_inodes;
 };
 #define HFS_FLG_BITMAP_DIRTY    0
@@ -254,17 +252,6 @@ static inline void hfs_bitmap_dirty(struct super_block *sb)
        sb->s_dirt = 1;
 }
-static inline void hfs_buffer_sync(struct buffer_head *bh)
-{
-        while (buffer_locked(bh)) {
-                wait_on_buffer(bh);
-        }
-        if (buffer_dirty(bh)) {
-                ll_rw_block(WRITE, 1, &bh);
-                wait_on_buffer(bh);
-        }
-}
 #define sb_bread512(sb, sec, data) ({                   \
        struct buffer_head *__bh;                       \
        sector_t __block;                               \
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 397b7adc7ce6..dffb4e996643 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -524,7 +524,7 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
        HFS_I(inode)->rsrc_inode = dir;
        HFS_I(dir)->rsrc_inode = inode;
        igrab(dir);
-        hlist_add_head(&inode->i_hash, &HFS_SB(dir->i_sb)->rsrc_inodes);
+        hlist_add_fake(&inode->i_hash);
        mark_inode_dirty(inode);
 out:
        d_add(dentry, inode);
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 86428f5ac991..1563d5ce5764 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -220,7 +220,7 @@ int hfs_mdb_get(struct super_block *sb)
                mdb->drLsMod = hfs_mtime();
                mark_buffer_dirty(HFS_SB(sb)->mdb_bh);
-                hfs_buffer_sync(HFS_SB(sb)->mdb_bh);
+                sync_dirty_buffer(HFS_SB(sb)->mdb_bh);
        }
        return 0;
@@ -287,7 +287,7 @@ void hfs_mdb_commit(struct super_block *sb)
                HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT);
                HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT);
                mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh);
-                hfs_buffer_sync(HFS_SB(sb)->alt_mdb_bh);
+                sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh);
        }
        if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) {
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 33254160f650..6ee1586f2334 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -382,7 +382,6 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
                return -ENOMEM;
        sb->s_fs_info = sbi;
-        INIT_HLIST_HEAD(&sbi->rsrc_inodes);
        res = -EINVAL;
        if (!parse_options((char *)data, sbi)) {
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index d236d85ec9d7..e318bbc0daf6 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -286,7 +286,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
        inc_nlink(inode);
        hfsplus_instantiate(dst_dentry, inode, cnid);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        inode->i_ctime = CURRENT_TIME_SEC;
        mark_inode_dirty(inode);
        sbi->file_count++;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 78449280dae0..8afd7e84f98d 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -211,7 +211,7 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
         * appear hashed, but do not put on any lists.  hlist_del()
         * will work fine and require no locking.
         */
-        inode->i_hash.pprev = &inode->i_hash.next;
+        hlist_add_fake(&inode->i_hash);
        mark_inode_dirty(inode);
 out:
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a14328d270e8..b14be3f781c7 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -456,6 +456,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
        inode = new_inode(sb);
        if (inode) {
                struct hugetlbfs_inode_info *info;
+                inode->i_ino = get_next_ino();
                inode->i_mode = mode;
                inode->i_uid = uid;
                inode->i_gid = gid;
diff --git a/fs/inode.c b/fs/inode.c
index 56d909d69bc8..ae2727ab0c3a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -29,7 +29,6 @@
 /*
 * This is needed for the following functions:
 *  - inode_has_buffers
- *  - invalidate_inode_buffers
 *  - invalidate_bdev
 *
 * FIXME: remove all knowledge of the buffer layer from this file
@@ -73,8 +72,7 @@ static unsigned int i_hash_shift __read_mostly;
 * allowing for low-overhead inode sync() operations.
 */
-LIST_HEAD(inode_in_use);
+static LIST_HEAD(inode_lru);
-LIST_HEAD(inode_unused);
 static struct hlist_head *inode_hashtable __read_mostly;
 /*
@@ -104,8 +102,41 @@ static DECLARE_RWSEM(iprune_sem);
 */
 struct inodes_stat_t inodes_stat;
+static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
+static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
 static struct kmem_cache *inode_cachep __read_mostly;
+static inline int get_nr_inodes(void)
+{
+        return percpu_counter_sum_positive(&nr_inodes);
+}
+static inline int get_nr_inodes_unused(void)
+{
+        return percpu_counter_sum_positive(&nr_inodes_unused);
+}
+int get_nr_dirty_inodes(void)
+{
+        int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
+        return nr_dirty > 0 ? nr_dirty : 0;
+}
+/*
+ * Handle nr_inode sysctl
+ */
+#ifdef CONFIG_SYSCTL
+int proc_nr_inodes(ctl_table *table, int write,
+                   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+        inodes_stat.nr_inodes = get_nr_inodes();
+        inodes_stat.nr_unused = get_nr_inodes_unused();
+        return proc_dointvec(table, write, buffer, lenp, ppos);
+}
+#endif
 static void wake_up_inode(struct inode *inode)
 {
        /*
@@ -193,6 +224,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
        inode->i_fsnotify_mask = 0;
 #endif
+        percpu_counter_inc(&nr_inodes);
        return 0;
 out:
        return -ENOMEM;
@@ -233,11 +266,13 @@ void __destroy_inode(struct inode *inode)
        if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
                posix_acl_release(inode->i_default_acl);
 #endif
+        percpu_counter_dec(&nr_inodes);
 }
 EXPORT_SYMBOL(__destroy_inode);
-void destroy_inode(struct inode *inode)
+static void destroy_inode(struct inode *inode)
 {
+        BUG_ON(!list_empty(&inode->i_lru));
        __destroy_inode(inode);
        if (inode->i_sb->s_op->destroy_inode)
                inode->i_sb->s_op->destroy_inode(inode);
@@ -256,6 +291,8 @@ void inode_init_once(struct inode *inode)
        INIT_HLIST_NODE(&inode->i_hash);
        INIT_LIST_HEAD(&inode->i_dentry);
        INIT_LIST_HEAD(&inode->i_devices);
+        INIT_LIST_HEAD(&inode->i_wb_list);
+        INIT_LIST_HEAD(&inode->i_lru);
        INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
        spin_lock_init(&inode->i_data.tree_lock);
        spin_lock_init(&inode->i_data.i_mmap_lock);
@@ -282,14 +319,109 @@ static void init_once(void *foo)
 */
 void __iget(struct inode *inode)
 {
-        if (atomic_inc_return(&inode->i_count) != 1)
+        atomic_inc(&inode->i_count);
-                return;
+}
+/*
+ * ihold - get an additional reference to an inode
+ * @inode: inode whose i_count is incremented
+ *
+ * The caller must already hold a reference. i_count is incremented
+ * unconditionally; if the resulting count is below 2 (meaning no
+ * reference was held beforehand) WARN_ON() fires, but the increment is
+ * not undone.
+ */
+void ihold(struct inode *inode)
+{
+        WARN_ON(atomic_inc_return(&inode->i_count) < 2);
+}
+EXPORT_SYMBOL(ihold);
+static void inode_lru_list_add(struct inode *inode)
+{
+        if (list_empty(&inode->i_lru)) {
+                list_add(&inode->i_lru, &inode_lru);
+                percpu_counter_inc(&nr_inodes_unused);
+        }
+}
-        if (!(inode->i_state & (I_DIRTY|I_SYNC)))
+static void inode_lru_list_del(struct inode *inode)
-                list_move(&inode->i_list, &inode_in_use);
+{
-        inodes_stat.nr_unused--;
+        if (!list_empty(&inode->i_lru)) {
+                list_del_init(&inode->i_lru);
+                percpu_counter_dec(&nr_inodes_unused);
+        }
+}
+static inline void __inode_sb_list_add(struct inode *inode)
+{
+        list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
 }
+/**
+ * inode_sb_list_add - add inode to the superblock list of inodes
+ * @inode: inode to add
+ *
+ * Links @inode->i_sb_list onto @inode->i_sb->s_inodes, so @inode->i_sb
+ * must already be set. inode_lock is taken and released here around the
+ * insertion.
+ */
+void inode_sb_list_add(struct inode *inode)
+{
+        spin_lock(&inode_lock);
+        __inode_sb_list_add(inode);
+        spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL_GPL(inode_sb_list_add);
+static inline void __inode_sb_list_del(struct inode *inode)
+{
+        list_del_init(&inode->i_sb_list);
+}
+static unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+        unsigned long tmp;
+        tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
+                        L1_CACHE_BYTES;
+        tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
+        return tmp & I_HASHMASK;
+}
+/**
+ *      __insert_inode_hash - hash an inode
+ *      @inode: unhashed inode
+ *      @hashval: unsigned long value used to locate this object in the
+ *              inode_hashtable.
+ *
+ *      Add an inode to the inode hash for this superblock. The bucket is
+ *      chosen by hash(@inode->i_sb, @hashval) and the inode is added at
+ *      the head of that bucket. inode_lock is taken and released here
+ *      around the insertion.
+ */
+void __insert_inode_hash(struct inode *inode, unsigned long hashval)
+{
+        struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
+        spin_lock(&inode_lock);
+        hlist_add_head(&inode->i_hash, b);
+        spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL(__insert_inode_hash);
+/**
+ *      __remove_inode_hash - remove an inode from the hash
+ *      @inode: inode to unhash
+ *
+ *      Remove an inode from the inode hash with hlist_del_init(). No
+ *      locking is done here: the callers in this file (dispose_list()
+ *      and iput_final()) invoke it with inode_lock held.
+ */
+static void __remove_inode_hash(struct inode *inode)
+{
+        hlist_del_init(&inode->i_hash);
+}
+/**
+ *      remove_inode_hash - remove an inode from the hash
+ *      @inode: inode to unhash
+ *
+ *      Remove an inode from the inode hash. inode_lock is taken and
+ *      released here around the hlist_del_init() call.
+ */
+void remove_inode_hash(struct inode *inode)
+{
+        spin_lock(&inode_lock);
+        hlist_del_init(&inode->i_hash);
+        spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL(remove_inode_hash);
 void end_writeback(struct inode *inode)
 {
        might_sleep();
@@ -328,101 +460,113 @@ static void evict(struct inode *inode)
 */
 static void dispose_list(struct list_head *head)
 {
-        int nr_disposed = 0;
        while (!list_empty(head)) {
                struct inode *inode;
-                inode = list_first_entry(head, struct inode, i_list);
+                inode = list_first_entry(head, struct inode, i_lru);
-                list_del(&inode->i_list);
+                list_del_init(&inode->i_lru);
                evict(inode);
                spin_lock(&inode_lock);
-                hlist_del_init(&inode->i_hash);
+                __remove_inode_hash(inode);
-                list_del_init(&inode->i_sb_list);
+                __inode_sb_list_del(inode);
                spin_unlock(&inode_lock);
                wake_up_inode(inode);
                destroy_inode(inode);
-                nr_disposed++;
        }
-        spin_lock(&inode_lock);
-        inodes_stat.nr_inodes -= nr_disposed;
-        spin_unlock(&inode_lock);
 }
-/*
+/**
- * Invalidate all inodes for a device.
+ * evict_inodes - evict all evictable inodes for a superblock
+ * @sb:         superblock to operate on
+ *
+ * Make sure that no inodes with zero refcount are retained.  This is
+ * called by superblock shutdown after having MS_ACTIVE flag removed,
+ * so any inode reaching zero refcount during or after that call will
+ * be immediately evicted.
 */
-static int invalidate_list(struct list_head *head, struct list_head *dispose)
+void evict_inodes(struct super_block *sb)
 {
-        struct list_head *next;
+        struct inode *inode, *next;
-        int busy = 0, count = 0;
+        LIST_HEAD(dispose);
-        next = head->next;
-        for (;;) {
-                struct list_head *tmp = next;
-                struct inode *inode;
-                /*
+        down_write(&iprune_sem);
-                 * We can reschedule here without worrying about the list's
-                 * consistency because the per-sb list of inodes must not
-                 * change during umount anymore, and because iprune_sem keeps
-                 * shrink_icache_memory() away.
-                 */
-                cond_resched_lock(&inode_lock);
-                next = next->next;
+        spin_lock(&inode_lock);
-                if (tmp == head)
+        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
-                        break;
+                if (atomic_read(&inode->i_count))
-                inode = list_entry(tmp, struct inode, i_sb_list);
-                if (inode->i_state & I_NEW)
                        continue;
-                invalidate_inode_buffers(inode);
-                if (!atomic_read(&inode->i_count)) {
+                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
-                        list_move(&inode->i_list, dispose);
+                        WARN_ON(1);
-                        WARN_ON(inode->i_state & I_NEW);
-                        inode->i_state |= I_FREEING;
-                        count++;
                        continue;
                }
-                busy = 1;
+                inode->i_state |= I_FREEING;
+                /*
+                 * Move the inode off the IO lists and LRU once I_FREEING is
+                 * set so that it won't get moved back on there if it is dirty.
+                 */
+                list_move(&inode->i_lru, &dispose);
+                list_del_init(&inode->i_wb_list);
+                if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+                        percpu_counter_dec(&nr_inodes_unused);
        }
-        /* only unused inodes may be cached with i_count zero */
+        spin_unlock(&inode_lock);
-        inodes_stat.nr_unused -= count;
-        return busy;
+        dispose_list(&dispose);
+        up_write(&iprune_sem);
 }
 /**
- *      invalidate_inodes       - discard the inodes on a device
+ * invalidate_inodes    - attempt to free all inodes on a superblock
- *      @sb: superblock
+ * @sb:         superblock to operate on
 *
- *      Discard all of the inodes for a given superblock. If the discard
+ * Attempts to free all inodes for a given superblock.  If there were any
- *      fails because there are busy inodes then a non zero value is returned.
+ * busy inodes return a non-zero value, else zero.
- *      If the discard is successful all the inodes have been discarded.
 */
 int invalidate_inodes(struct super_block *sb)
 {
-        int busy;
+        int busy = 0;
-        LIST_HEAD(throw_away);
+        struct inode *inode, *next;
+        LIST_HEAD(dispose);
        down_write(&iprune_sem);
        spin_lock(&inode_lock);
-        fsnotify_unmount_inodes(&sb->s_inodes);
+        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
-        busy = invalidate_list(&sb->s_inodes, &throw_away);
+                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
+                        continue;
+                if (atomic_read(&inode->i_count)) {
+                        busy = 1;
+                        continue;
+                }
+                inode->i_state |= I_FREEING;
+                /*
+                 * Move the inode off the IO lists and LRU once I_FREEING is
+                 * set so that it won't get moved back on there if it is dirty.
+                 */
+                list_move(&inode->i_lru, &dispose);
+                list_del_init(&inode->i_wb_list);
+                if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+                        percpu_counter_dec(&nr_inodes_unused);
+        }
        spin_unlock(&inode_lock);
-        dispose_list(&throw_away);
+        dispose_list(&dispose);
        up_write(&iprune_sem);
        return busy;
 }
-EXPORT_SYMBOL(invalidate_inodes);
 static int can_unuse(struct inode *inode)
 {
-        if (inode->i_state)
+        if (inode->i_state & ~I_REFERENCED)
                return 0;
        if (inode_has_buffers(inode))
                return 0;
@@ -434,22 +578,24 @@ static int can_unuse(struct inode *inode)
 }
 /*
- * Scan `goal' inodes on the unused list for freeable ones. They are moved to
+ * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
- * a temporary list and then are freed outside inode_lock by dispose_list().
+ * temporary list and then are freed outside inode_lock by dispose_list().
 *
 * Any inodes which are pinned purely because of attached pagecache have their
- * pagecache removed.  We expect the final iput() on that inode to add it to
+ * pagecache removed.  If the inode has metadata buffers attached to
- * the front of the inode_unused list.  So look for it there and if the
+ * mapping->private_list then try to remove them.
- * inode is still freeable, proceed.  The right inode is found 99.9% of the
- * time in testing on a 4-way.
 *
- * If the inode has metadata buffers attached to mapping->private_list then
+ * If the inode has the I_REFERENCED flag set, then it means that it has been
- * try to remove them.
+ * used recently - the flag is set in iput_final(). When we encounter such an
+ * inode, clear the flag and move it to the back of the LRU so it gets another
+ * pass through the LRU before it gets reclaimed. This is necessary because of
+ * the fact we are doing lazy LRU updates to minimise lock contention so the
+ * LRU does not have strict ordering. Hence we don't want to reclaim inodes
+ * with this flag set because they are the inodes that are out of order.
 */
 static void prune_icache(int nr_to_scan)
 {
        LIST_HEAD(freeable);
-        int nr_pruned = 0;
        int nr_scanned;
        unsigned long reap = 0;
@@ -458,13 +604,26 @@ static void prune_icache(int nr_to_scan)
        for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
                struct inode *inode;
-                if (list_empty(&inode_unused))
+                if (list_empty(&inode_lru))
                        break;
-                inode = list_entry(inode_unused.prev, struct inode, i_list);
+                inode = list_entry(inode_lru.prev, struct inode, i_lru);
-                if (inode->i_state || atomic_read(&inode->i_count)) {
+                /*
-                        list_move(&inode->i_list, &inode_unused);
+                 * Referenced or dirty inodes are still in use. We cannot
+                 * reclaim them now, so take them off the LRU.
+                 */
+                if (atomic_read(&inode->i_count) ||
+                    (inode->i_state & ~I_REFERENCED)) {
+                        list_del_init(&inode->i_lru);
+                        percpu_counter_dec(&nr_inodes_unused);
+                        continue;
+                }
+                /* recently referenced inodes get one more pass */
+                if (inode->i_state & I_REFERENCED) {
+                        list_move(&inode->i_lru, &inode_lru);
+                        inode->i_state &= ~I_REFERENCED;
                        continue;
                }
                if (inode_has_buffers(inode) || inode->i_data.nrpages) {
@@ -476,18 +635,23 @@ static void prune_icache(int nr_to_scan)
                        iput(inode);
                        spin_lock(&inode_lock);
-                        if (inode != list_entry(inode_unused.next,
+                        if (inode != list_entry(inode_lru.next,
-                                                struct inode, i_list))
+                                                struct inode, i_lru))
                                continue;       /* wrong inode or list_empty */
                        if (!can_unuse(inode))
                                continue;
                }
-                list_move(&inode->i_list, &freeable);
                WARN_ON(inode->i_state & I_NEW);
                inode->i_state |= I_FREEING;
-                nr_pruned++;
+                /*
+                 * Move the inode off the IO lists and LRU once I_FREEING is
+                 * set so that it won't get moved back on there if it is dirty.
+                 */
+                list_move(&inode->i_lru, &freeable);
+                list_del_init(&inode->i_wb_list);
+                percpu_counter_dec(&nr_inodes_unused);
        }
-        inodes_stat.nr_unused -= nr_pruned;
        if (current_is_kswapd())
                __count_vm_events(KSWAPD_INODESTEAL, reap);
        else
@@ -519,7 +683,7 @@ static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
                        return -1;
                prune_icache(nr);
        }
-        return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+        return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
 }
 static struct shrinker icache_shrinker = {
@@ -530,9 +694,6 @@ static struct shrinker icache_shrinker = {
 static void __wait_on_freeing_inode(struct inode *inode);
 /*
 * Called with the inode lock held.
- * NOTE: we are not increasing the inode-refcount, you must call __iget()
- * by hand after calling find_inode now! This simplifies iunique and won't
- * add any additional branch in the common code.
 */
 static struct inode *find_inode(struct super_block *sb,
                                struct hlist_head *head,
@@ -552,9 +713,10 @@ repeat:
                        __wait_on_freeing_inode(inode);
                        goto repeat;
                }
-                break;
+                __iget(inode);
+                return inode;
        }
-        return node ? inode : NULL;
+        return NULL;
 }
 /*
@@ -577,53 +739,49 @@ repeat:
                        __wait_on_freeing_inode(inode);
                        goto repeat;
                }
-                break;
+                __iget(inode);
+                return inode;
        }
-        return node ? inode : NULL;
+        return NULL;
-}
-static unsigned long hash(struct super_block *sb, unsigned long hashval)
-{
-        unsigned long tmp;
-        tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
-                        L1_CACHE_BYTES;
-        tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
-        return tmp & I_HASHMASK;
-}
-static inline void
-__inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
-                        struct inode *inode)
-{
-        inodes_stat.nr_inodes++;
-        list_add(&inode->i_list, &inode_in_use);
-        list_add(&inode->i_sb_list, &sb->s_inodes);
-        if (head)
-                hlist_add_head(&inode->i_hash, head);
 }
-/**
+/*
- * inode_add_to_lists - add a new inode to relevant lists
+ * Each cpu owns a range of LAST_INO_BATCH numbers.
- * @sb: superblock inode belongs to
+ * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
- * @inode: inode to mark in use
+ * to renew the exhausted range.
 *
- * When an inode is allocated it needs to be accounted for, added to the in use
+ * This does not significantly increase overflow rate because every CPU can
- * list, the owning superblock and the inode hash. This needs to be done under
+ * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
- * the inode_lock, so export a function to do this rather than the inode lock
+ * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
- * itself. We calculate the hash list to add to here so it is all internal
+ * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
- * which requires the caller to have already set up the inode number in the
+ * overflow rate by 2x, which does not seem too significant.
- * inode to add.
+ *
+ * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
+ * error if st_ino won't fit in target struct field. Use 32bit counter
+ * here to attempt to avoid that.
 */
-void inode_add_to_lists(struct super_block *sb, struct inode *inode)
+#define LAST_INO_BATCH 1024
+static DEFINE_PER_CPU(unsigned int, last_ino);
+unsigned int get_next_ino(void)
 {
-        struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino);
+        unsigned int *p = &get_cpu_var(last_ino);
+        unsigned int res = *p;
-        spin_lock(&inode_lock);
+#ifdef CONFIG_SMP
-        __inode_add_to_lists(sb, head, inode);
+        if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
-        spin_unlock(&inode_lock);
+                static atomic_t shared_last_ino;
+                int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
+                res = next - LAST_INO_BATCH;
+        }
+#endif
+        *p = ++res;
+        put_cpu_var(last_ino);
+        return res;
 }
-EXPORT_SYMBOL_GPL(inode_add_to_lists);
+EXPORT_SYMBOL(get_next_ino);
 /**
 *      new_inode       - obtain an inode
@@ -639,12 +797,6 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists);
 */
 struct inode *new_inode(struct super_block *sb)
 {
-        /*
-         * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
-         * error if st_ino won't fit in target struct field. Use 32bit counter
-         * here to attempt to avoid that.
-         */
-        static unsigned int last_ino;
        struct inode *inode;
        spin_lock_prefetch(&inode_lock);
@@ -652,8 +804,7 @@ struct inode *new_inode(struct super_block *sb)
        inode = alloc_inode(sb);
        if (inode) {
                spin_lock(&inode_lock);
-                __inode_add_to_lists(sb, NULL, inode);
+                __inode_sb_list_add(inode);
-                inode->i_ino = ++last_ino;
                inode->i_state = 0;
                spin_unlock(&inode_lock);
        }
@@ -664,7 +815,7 @@ EXPORT_SYMBOL(new_inode);
 void unlock_new_inode(struct inode *inode)
 {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-        if (inode->i_mode & S_IFDIR) {
+        if (S_ISDIR(inode->i_mode)) {
                struct file_system_type *type = inode->i_sb->s_type;
                /* Set new key only if filesystem hasn't already changed it */
@@ -721,7 +872,8 @@ static struct inode *get_new_inode(struct super_block *sb,
                        if (set(inode, data))
                                goto set_failed;
-                        __inode_add_to_lists(sb, head, inode);
+                        hlist_add_head(&inode->i_hash, head);
+                        __inode_sb_list_add(inode);
                        inode->i_state = I_NEW;
                        spin_unlock(&inode_lock);
@@ -736,7 +888,6 @@ static struct inode *get_new_inode(struct super_block *sb,
                 * us. Use the old inode instead of the one we just
                 * allocated.
                 */
-                __iget(old);
                spin_unlock(&inode_lock);
                destroy_inode(inode);
                inode = old;
@@ -768,7 +919,8 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
                old = find_inode_fast(sb, head, ino);
                if (!old) {
                        inode->i_ino = ino;
-                        __inode_add_to_lists(sb, head, inode);
+                        hlist_add_head(&inode->i_hash, head);
+                        __inode_sb_list_add(inode);
                        inode->i_state = I_NEW;
                        spin_unlock(&inode_lock);
@@ -783,7 +935,6 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
                 * us. Use the old inode instead of the one we just
                 * allocated.
                 */
-                __iget(old);
                spin_unlock(&inode_lock);
                destroy_inode(inode);
                inode = old;
@@ -792,6 +943,27 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
        return inode;
 }
+/*
+ * Search the inode cache for a matching inode number.
+ * If we find one, then the inode number we are trying to
+ * allocate is not unique and so we should not use it.
+ *
+ * Only the hash bucket selected by hash(sb, ino) is walked, and an entry
+ * matches when both its i_ino and its i_sb are equal to the arguments.
+ * No lock is taken and no reference is acquired here; iunique() calls
+ * this with inode_lock and iunique_lock held.
+ *
+ * Returns 1 if the inode number is unique, 0 if it is not.
+ */
+static int test_inode_iunique(struct super_block *sb, unsigned long ino)
+{
+        struct hlist_head *b = inode_hashtable + hash(sb, ino);
+        struct hlist_node *node;
+        struct inode *inode;
+        hlist_for_each_entry(inode, node, b, i_hash) {
+                if (inode->i_ino == ino && inode->i_sb == sb)
+                        return 0;
+        }
+        return 1;
+}
 /**
 *      iunique - get a unique inode number
 *      @sb: superblock
@@ -813,19 +985,18 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
         * error if st_ino won't fit in target struct field. Use 32bit counter
         * here to attempt to avoid that.
         */
+        static DEFINE_SPINLOCK(iunique_lock);
        static unsigned int counter;
-        struct inode *inode;
-        struct hlist_head *head;
        ino_t res;
        spin_lock(&inode_lock);
+        spin_lock(&iunique_lock);
        do {
                if (counter <= max_reserved)
                        counter = max_reserved + 1;
                res = counter++;
-                head = inode_hashtable + hash(sb, res);
+        } while (!test_inode_iunique(sb, res));
-                inode = find_inode_fast(sb, head, res);
+        spin_unlock(&iunique_lock);
-        } while (inode != NULL);
        spin_unlock(&inode_lock);
        return res;
@@ -877,7 +1048,6 @@ static struct inode *ifind(struct super_block *sb,
        spin_lock(&inode_lock);
        inode = find_inode(sb, head, test, data);
        if (inode) {
-                __iget(inode);
                spin_unlock(&inode_lock);
                if (likely(wait))
                        wait_on_inode(inode);
@@ -910,7 +1080,6 @@ static struct inode *ifind_fast(struct super_block *sb,
        spin_lock(&inode_lock);
        inode = find_inode_fast(sb, head, ino);
        if (inode) {
-                __iget(inode);
                spin_unlock(&inode_lock);
                wait_on_inode(inode);
                return inode;
@@ -1096,7 +1265,7 @@ int insert_inode_locked(struct inode *inode)
                __iget(old);
                spin_unlock(&inode_lock);
                wait_on_inode(old);
-                if (unlikely(!hlist_unhashed(&old->i_hash))) {
+                if (unlikely(!inode_unhashed(old))) {
                        iput(old);
                        return -EBUSY;
                }
@@ -1135,7 +1304,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
                __iget(old);
                spin_unlock(&inode_lock);
                wait_on_inode(old);
-                if (unlikely(!hlist_unhashed(&old->i_hash))) {
+                if (unlikely(!inode_unhashed(old))) {
                        iput(old);
                        return -EBUSY;
                }
@@ -1144,36 +1313,6 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 }
 EXPORT_SYMBOL(insert_inode_locked4);
-/**
- *      __insert_inode_hash - hash an inode
- *      @inode: unhashed inode
- *      @hashval: unsigned long value used to locate this object in the
- *              inode_hashtable.
- *
- *      Add an inode to the inode hash for this superblock.
- */
-void __insert_inode_hash(struct inode *inode, unsigned long hashval)
-{
-        struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
-        spin_lock(&inode_lock);
-        hlist_add_head(&inode->i_hash, head);
-        spin_unlock(&inode_lock);
-}
-EXPORT_SYMBOL(__insert_inode_hash);
-/**
- *      remove_inode_hash - remove an inode from the hash
- *      @inode: inode to unhash
- *
- *      Remove an inode from the superblock.
- */
-void remove_inode_hash(struct inode *inode)
-{
-        spin_lock(&inode_lock);
-        hlist_del_init(&inode->i_hash);
-        spin_unlock(&inode_lock);
-}
-EXPORT_SYMBOL(remove_inode_hash);
 int generic_delete_inode(struct inode *inode)
 {
@@ -1188,7 +1327,7 @@ EXPORT_SYMBOL(generic_delete_inode);
 */
 int generic_drop_inode(struct inode *inode)
 {
-        return !inode->i_nlink || hlist_unhashed(&inode->i_hash);
+        return !inode->i_nlink || inode_unhashed(inode);
 }
 EXPORT_SYMBOL_GPL(generic_drop_inode);
@@ -1214,10 +1353,11 @@ static void iput_final(struct inode *inode)
                drop = generic_drop_inode(inode);
        if (!drop) {
-                if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-                        list_move(&inode->i_list, &inode_unused);
-                inodes_stat.nr_unused++;
                if (sb->s_flags & MS_ACTIVE) {
+                        inode->i_state |= I_REFERENCED;
+                        if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
+                                inode_lru_list_add(inode);
+                        }
                        spin_unlock(&inode_lock);
                        return;
                }
@@ -1228,19 +1368,23 @@ static void iput_final(struct inode *inode)
                spin_lock(&inode_lock);
                WARN_ON(inode->i_state & I_NEW);
                inode->i_state &= ~I_WILL_FREE;
-                inodes_stat.nr_unused--;
+                __remove_inode_hash(inode);
-                hlist_del_init(&inode->i_hash);
        }
-        list_del_init(&inode->i_list);
-        list_del_init(&inode->i_sb_list);
        WARN_ON(inode->i_state & I_NEW);
        inode->i_state |= I_FREEING;
-        inodes_stat.nr_inodes--;
+        /*
+         * Move the inode off the IO lists and LRU once I_FREEING is
+         * set so that it won't get moved back on there if it is dirty.
+         */
+        inode_lru_list_del(inode);
+        list_del_init(&inode->i_wb_list);
+        __inode_sb_list_del(inode);
        spin_unlock(&inode_lock);
        evict(inode);
-        spin_lock(&inode_lock);
+        remove_inode_hash(inode);
-        hlist_del_init(&inode->i_hash);
-        spin_unlock(&inode_lock);
        wake_up_inode(inode);
        BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
        destroy_inode(inode);
@@ -1504,6 +1648,8 @@ void __init inode_init(void)
                                         SLAB_MEM_SPREAD),
                                         init_once);
        register_shrinker(&icache_shrinker);
+        percpu_counter_init(&nr_inodes, 0);
+        percpu_counter_init(&nr_inodes_unused, 0);
        /* Hash may have been set up in inode_init_early */
        if (!hashdist)
diff --git a/fs/internal.h b/fs/internal.h
index a6910e91cee8..ebad3b90752d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -101,3 +101,10 @@ extern void put_super(struct super_block *sb);
 struct nameidata;
 extern struct file *nameidata_to_filp(struct nameidata *);
 extern void release_open_intent(struct nameidata *);
+/*
+ * inode.c
+ *
+ * evict_inodes() returns nothing; invalidate_inodes() returns non-zero
+ * if it found busy inodes.
+ */
+extern int get_nr_dirty_inodes(void);
+extern void evict_inodes(struct super_block *);
+extern int invalidate_inodes(struct super_block *);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 09ff41a752a0..60c2b944d762 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -962,25 +962,23 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
 * or getblk() if they are not.  Returns the number of blocks inserted
 * (-ve == error.)
 */
-int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
+int isofs_get_blocks(struct inode *inode, sector_t iblock,
                     struct buffer_head **bh, unsigned long nblocks)
 {
-        unsigned long b_off;
+        unsigned long b_off = iblock;
        unsigned offset, sect_size;
        unsigned int firstext;
        unsigned long nextblk, nextoff;
-        long iblock = (long)iblock_s;
        int section, rv, error;
        struct iso_inode_info *ei = ISOFS_I(inode);
        error = -EIO;
        rv = 0;
-        if (iblock < 0 || iblock != iblock_s) {
+        if (iblock != b_off) {
                printk(KERN_DEBUG "%s: block number too large\n", __func__);
                goto abort;
        }
-        b_off = iblock;
        offset = 0;
        firstext = ei->i_first_extent;
@@ -998,8 +996,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
                 * I/O errors.
                 */
                if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
-                        printk(KERN_DEBUG "%s: block >= EOF (%ld, %ld)\n",
+                        printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n",
-                                __func__, iblock, (unsigned long) inode->i_size);
+                                __func__, b_off,
+                                (unsigned long long)inode->i_size);
                        goto abort;
                }
@@ -1025,9 +1024,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
                        if (++section > 100) {
                                printk(KERN_DEBUG "%s: More than 100 file sections ?!?"
                                        " aborting...\n", __func__);
-                                printk(KERN_DEBUG "%s: block=%ld firstext=%u sect_size=%u "
+                                printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u "
                                        "nextblk=%lu nextoff=%lu\n", __func__,
-                                        iblock, firstext, (unsigned) sect_size,
+                                        b_off, firstext, (unsigned) sect_size,
                                        nextblk, nextoff);
                                goto abort;
                        }
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index ed78a3cf3cb0..79121aa5858b 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -289,7 +289,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
                mutex_unlock(&f->sem);
                d_instantiate(dentry, old_dentry->d_inode);
                dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
-                atomic_inc(&old_dentry->d_inode->i_count);
+                ihold(old_dentry->d_inode);
        }
        return ret;
 }
@@ -864,7 +864,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
                printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret);
                /* Might as well let the VFS know */
                d_instantiate(new_dentry, old_dentry->d_inode);
-                atomic_inc(&old_dentry->d_inode->i_count);
+                ihold(old_dentry->d_inode);
                new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
                return ret;
        }
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index f8332dc8eeb2..3a09423b6c22 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -497,7 +497,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
         * appear hashed, but do not put on any lists.  hlist_del()
         * will work fine and require no locking.
         */
-        ip->i_hash.pprev = &ip->i_hash.next;
+        hlist_add_fake(&ip->i_hash);
        return (ip);
 }
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index d945ea76b445..9466957ec841 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1279,7 +1279,7 @@ int txCommit(tid_t tid,		/* transaction identifier */
         * lazy commit thread finishes processing
         */
        if (tblk->xflag & COMMIT_DELETE) {
-                atomic_inc(&tblk->u.ip->i_count);
+                ihold(tblk->u.ip);
                /*
                 * Avoid a rare deadlock
                 *
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index a9cf8e8675be..231ca4af9bce 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -839,7 +839,7 @@ static int jfs_link(struct dentry *old_dentry,
        ip->i_ctime = CURRENT_TIME;
        dir->i_ctime = dir->i_mtime = CURRENT_TIME;
        mark_inode_dirty(dir);
-        atomic_inc(&ip->i_count);
+        ihold(ip);
        iplist[0] = ip;
        iplist[1] = dir;
diff --git a/fs/libfs.c b/fs/libfs.c
index 62baa0387d6e..304a5132ca27 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -255,7 +255,7 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
        inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
        inc_nlink(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        dget(dentry);
        d_instantiate(dentry, inode);
        return 0;
@@ -892,10 +892,6 @@ EXPORT_SYMBOL_GPL(generic_fh_to_parent);
 */
 int generic_file_fsync(struct file *file, int datasync)
 {
-        struct writeback_control wbc = {
-                .sync_mode = WB_SYNC_ALL,
-                .nr_to_write = 0, /* metadata-only; caller takes care of data */
-        };
        struct inode *inode = file->f_mapping->host;
        int err;
        int ret;
@@ -906,7 +902,7 @@ int generic_file_fsync(struct file *file, int datasync)
        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
                return ret;
-        err = sync_inode(inode, &wbc);
+        err = sync_inode_metadata(inode, 1);
        if (ret == 0)
                ret = err;
        return ret;
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 1eb4e89e045b..409dfd65e9a1 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -569,7 +569,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir,
                return -EMLINK;
        inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        inode->i_nlink++;
        mark_inode_dirty_sync(inode);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index f3f3578393a4..c0d35a3accef 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -101,7 +101,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
        inode->i_ctime = CURRENT_TIME_SEC;
        inode_inc_link_count(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        return add_nondir(dentry, inode);
 }
diff --git a/fs/namei.c b/fs/namei.c
index 24896e833565..f7dbc06857ab 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1121,11 +1121,13 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 static struct dentry *__lookup_hash(struct qstr *name,
                struct dentry *base, struct nameidata *nd)
 {
+        struct inode *inode = base->d_inode;
        struct dentry *dentry;
-        struct inode *inode;
        int err;
-        inode = base->d_inode;
+        err = exec_permission(inode);
+        if (err)
+                return ERR_PTR(err);
        /*
         * See if the low-level filesystem might want
@@ -1161,11 +1163,6 @@ out:
 */
 static struct dentry *lookup_hash(struct nameidata *nd)
 {
-        int err;
-        err = exec_permission(nd->path.dentry->d_inode);
-        if (err)
-                return ERR_PTR(err);
        return __lookup_hash(&nd->last, nd->path.dentry, nd);
 }
@@ -1213,9 +1210,6 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
        if (err)
                return ERR_PTR(err);
-        err = exec_permission(base->d_inode);
-        if (err)
-                return ERR_PTR(err);
        return __lookup_hash(&this, base, NULL);
 }
@@ -2291,7 +2285,7 @@ static long do_unlinkat(int dfd, const char __user *pathname)
                        goto slashes;
                inode = dentry->d_inode;
                if (inode)
-                        atomic_inc(&inode->i_count);
+                        ihold(inode);
                error = mnt_want_write(nd.path.mnt);
                if (error)
                        goto exit2;
diff --git a/fs/namespace.c b/fs/namespace.c
index 7ca5182c0bed..8a415c9c5e55 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -595,7 +595,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
                                goto out_free;
                }
-                mnt->mnt_flags = old->mnt_flags;
+                mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD;
                atomic_inc(&sb->s_active);
                mnt->mnt_sb = sb;
                mnt->mnt_root = dget(root);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 257e4052492e..07ac3847e562 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1801,7 +1801,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
        d_drop(dentry);
        error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
        if (error == 0) {
-                atomic_inc(&inode->i_count);
+                ihold(inode);
                d_add(dentry, inode);
        }
        return error;
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index a70e446e1605..ac7b814ce162 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -54,8 +54,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
                        iput(inode);
                        return -ENOMEM;
                }
-                /* Circumvent igrab(): we know the inode is not being freed */
+                ihold(inode);
-                atomic_inc(&inode->i_count);
                /*
                 * Ensure that this dentry is invisible to d_find_alias().
                 * Otherwise, it may be spliced into the tree by
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 661a6cf8e826..184938fcff04 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -281,23 +281,13 @@ commit_metadata(struct svc_fh *fhp)
 {
        struct inode *inode = fhp->fh_dentry->d_inode;
        const struct export_operations *export_ops = inode->i_sb->s_export_op;
-        int error = 0;
        if (!EX_ISSYNC(fhp->fh_export))
                return 0;
-        if (export_ops->commit_metadata) {
+        if (export_ops->commit_metadata)
-                error = export_ops->commit_metadata(inode);
+                return export_ops->commit_metadata(inode);
-        } else {
+        return sync_inode_metadata(inode, 1);
-                struct writeback_control wbc = {
-                        .sync_mode = WB_SYNC_ALL,
-                        .nr_to_write = 0, /* metadata only */
-                };
-                error = sync_inode(inode, &wbc);
-        }
-        return error;
 }
 /*
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 185d1607cb00..6e9557ecf161 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -207,7 +207,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
        inode->i_ctime = CURRENT_TIME;
        inode_inc_link_count(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        err = nilfs_add_nondir(dentry, inode);
        if (!err)
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 36802420d69a..4498a208df94 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -88,8 +88,6 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 {
        struct dentry *parent;
        struct inode *p_inode;
-        bool send = false;
-        bool should_update_children = false;
        if (!dentry)
                dentry = path->dentry;
@@ -97,29 +95,12 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
        if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
                return;
-        spin_lock(&dentry->d_lock);
+        parent = dget_parent(dentry);
-        parent = dentry->d_parent;
        p_inode = parent->d_inode;
-        if (fsnotify_inode_watches_children(p_inode)) {
+        if (unlikely(!fsnotify_inode_watches_children(p_inode)))
-                if (p_inode->i_fsnotify_mask & mask) {
+                __fsnotify_update_child_dentry_flags(p_inode);
-                        dget(parent);
+        else if (p_inode->i_fsnotify_mask & mask) {
-                        send = true;
-                }
-        } else {
-                /*
-                 * The parent doesn't care about events on it's children but
-                 * at least one child thought it did.  We need to run all the
-                 * children and update their d_flags to let them know p_inode
-                 * doesn't care about them any more.
-                 */
-                dget(parent);
-                should_update_children = true;
-        }
-        spin_unlock(&dentry->d_lock);
-        if (send) {
                /* we are notifying a parent so come up with the new mask which
                 * specifies these are events which came from a child. */
                mask |= FS_EVENT_ON_CHILD;
@@ -130,13 +111,9 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
                else
                        fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
                                 dentry->d_name.name, 0);
-                dput(parent);
        }
-        if (unlikely(should_update_children)) {
+        dput(parent);
-                __fsnotify_update_child_dentry_flags(p_inode);
-                dput(parent);
-        }
 }
 EXPORT_SYMBOL_GPL(__fsnotify_parent);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 33297c005060..21ed10660b80 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -240,6 +240,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
 {
        struct inode *inode, *next_i, *need_iput = NULL;
+        spin_lock(&inode_lock);
        list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
                struct inode *need_iput_tmp;
@@ -297,4 +298,5 @@ void fsnotify_unmount_inodes(struct list_head *list)
                spin_lock(&inode_lock);
        }
+        spin_unlock(&inode_lock);
 }
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 19c5180f8a28..d3fbe5730bfc 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2911,8 +2911,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
                goto unl_upcase_iput_tmp_ino_err_out_now;
        }
        if ((sb->s_root = d_alloc_root(vol->root_ino))) {
-                /* We increment i_count simulating an ntfs_iget(). */
+                /* We grab a reference, simulating an ntfs_iget(). */
-                atomic_inc(&vol->root_ino->i_count);
+                ihold(vol->root_ino);
                ntfs_debug("Exiting, status successful.");
                /* Release the default upcase if it has no users. */
                mutex_lock(&ntfs_lock);
@@ -3021,21 +3021,6 @@ iput_tmp_ino_err_out_now:
        if (vol->mft_ino && vol->mft_ino != tmp_ino)
                iput(vol->mft_ino);
        vol->mft_ino = NULL;
-        /*
-         * This is needed to get ntfs_clear_extent_inode() called for each
-         * inode we have ever called ntfs_iget()/iput() on, otherwise we A)
-         * leak resources and B) a subsequent mount fails automatically due to
-         * ntfs_iget() never calling down into our ntfs_read_locked_inode()
-         * method again... FIXME: Do we need to do this twice now because of
-         * attribute inodes? I think not, so leave as is for now... (AIA)
-         */
-        if (invalidate_inodes(sb)) {
-                ntfs_error(sb, "Busy inodes left. This is most likely a NTFS "
-                                "driver bug.");
-                /* Copied from fs/super.c. I just love this message. (-; */
-                printk("NTFS: Busy inodes after umount. Self-destruct in 5 "
-                                "seconds.  Have a nice day...\n");
-        }
        /* Errors at this stage are irrelevant. */
 err_out_now:
        sb->s_fs_info = NULL;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 5cfeee118158..f1e962cb3b73 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -165,7 +165,7 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
         * ocfs2 never allocates in this function - the only time we
         * need to use BH_New is when we're extending i_size on a file
         * system which doesn't support holes, in which case BH_New
-         * allows block_prepare_write() to zero.
+         * allows __block_write_begin() to zero.
         *
         * If we see this on a sparse file system, then a truncate has
         * raced us and removed the cluster. In this case, we clear
@@ -407,21 +407,6 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
        return ret;
 }
-/*
- * This is called from ocfs2_write_zero_page() which has handled it's
- * own cluster locking and has ensured allocation exists for those
- * blocks to be written.
- */
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
-                               unsigned from, unsigned to)
-{
-        int ret;
-        ret = block_prepare_write(page, from, to, ocfs2_get_block);
-        return ret;
-}
 /* Taken from ext3. We don't necessarily need the full blown
 * functionality yet, but IMHO it's better to cut and paste the whole
 * thing so we can avoid introducing our own bugs (and easily pick up
@@ -732,7 +717,7 @@ static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
 }
 /*
- * Some of this taken from block_prepare_write(). We already have our
+ * Some of this taken from __block_write_begin(). We already have our
 * mapping by now though, and the entire write will be allocating or
 * it won't, so not much need to use BH_New.
 *
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 7606f663da6d..76bfdfda691a 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,9 +22,6 @@
 #ifndef OCFS2_AOPS_H
 #define OCFS2_AOPS_H
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
-                               unsigned from, unsigned to);
 handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
                                                         struct page *page,
                                                         unsigned from,
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index a7ebd9d42dc8..75e115f1bd73 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -400,6 +400,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
        if (inode) {
                ip = DLMFS_I(inode);
+                inode->i_ino = get_next_ino();
                inode->i_mode = mode;
                inode->i_uid = current_fsuid();
                inode->i_gid = current_fsgid();
@@ -425,6 +426,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
        if (!inode)
                return NULL;
+        inode->i_ino = get_next_ino();
        inode->i_mode = mode;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 1ca6867935bb..77b4c04a2809 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -796,13 +796,12 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
                block_end = block_start + (1 << inode->i_blkbits);
                /*
-                 * block_start is block-aligned.  Bump it by one to
+                 * block_start is block-aligned.  Bump it by one to force
-                 * force ocfs2_{prepare,commit}_write() to zero the
+                 * __block_write_begin and block_commit_write to zero the
                 * whole block.
                 */
-                ret = ocfs2_prepare_write_nolock(inode, page,
+                ret = __block_write_begin(page, block_start + 1, 0,
-                                                 block_start + 1,
+                                          ocfs2_get_block);
-                                                 block_start + 1);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out_unlock;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index e7bde21149ae..ff5744e1e36f 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -742,7 +742,7 @@ static int ocfs2_link(struct dentry *old_dentry,
                goto out_commit;
        }
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        dentry->d_op = &ocfs2_dentry_ops;
        d_instantiate(dentry, inode);
diff --git a/fs/pipe.c b/fs/pipe.c
index 37eb1ebeaa90..d2d7566ce68e 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -954,6 +954,8 @@ static struct inode * get_pipe_inode(void)
        if (!inode)
                goto fail_inode;
+        inode->i_ino = get_next_ino();
        pipe = alloc_pipe_info(inode);
        if (!pipe)
                goto fail_iput;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 53dc8ad40ae6..9b094c1c8465 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -771,6 +771,8 @@ static const struct file_operations proc_single_file_operations = {
 static int mem_open(struct inode* inode, struct file* file)
 {
        file->private_data = (void*)((long)current->self_exec_id);
+        /*
+         * OK to pass negative loff_t, we can catch out-of-range.
+         * FMODE_UNSIGNED_OFFSET is the f_mode bit that
+         * __negative_fpos_check() in fs/read_write.c tests: with the
+         * bit set a negative position is accepted (returns 0), without
+         * it the same position is rejected with -EINVAL.
+         */
+        file->f_mode |= FMODE_UNSIGNED_OFFSET;
        return 0;
 }
@@ -1646,6 +1648,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
        /* Common stuff */
        ei = PROC_I(inode);
+        inode->i_ino = get_next_ino();
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
        inode->i_op = &proc_def_inode_operations;
@@ -2592,6 +2595,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
        /* Initialize the inode */
        ei = PROC_I(inode);
+        inode->i_ino = get_next_ino();
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
        /*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 2fc52552271d..b652cb00906b 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -23,6 +23,8 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
        if (!inode)
                goto out;
+        inode->i_ino = get_next_ino();
        sysctl_head_get(head);
        ei = PROC_I(inode);
        ei->sysctl = head;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a5ebae70dc6d..67fadb1ad2c1 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -58,6 +58,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
        struct inode * inode = new_inode(sb);
        if (inode) {
+                inode->i_ino = get_next_ino();
                inode_init_owner(inode, dir, mode);
                inode->i_mapping->a_ops = &ramfs_aops;
                inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
diff --git a/fs/read_write.c b/fs/read_write.c
index e757ef26e4ce..9cd9d148105d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,6 +31,20 @@ const struct file_operations generic_ro_fops = {
 EXPORT_SYMBOL(generic_ro_fops);
+static int
+__negative_fpos_check(struct file *file, loff_t pos, size_t count)
+{
+        /*
+         * Decide whether a negative file position is acceptable for
+         * @file.  pos or pos+count is negative here (the llseek helpers
+         * and rw_verify_area() test that before calling), so check
+         * overflow first: a negative pos for which pos + count compares
+         * below pos yields -EOVERFLOW.
+         *
+         * Otherwise the answer depends only on the file: 0 when
+         * FMODE_UNSIGNED_OFFSET is set in f_mode, -EINVAL when it is
+         * not.
+         *
+         * too big "count" will be caught in rw_verify_area().
+         */
+        if ((pos < 0) && (pos + count < pos))
+                return -EOVERFLOW;
+        if (file->f_mode & FMODE_UNSIGNED_OFFSET)
+                return 0;
+        return -EINVAL;
+}
 /**
 * generic_file_llseek_unlocked - lockless generic llseek implementation
 * @file:       file structure to seek on
@@ -62,7 +76,9 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
                break;
        }
-        if (offset < 0 || offset > inode->i_sb->s_maxbytes)
+        if (offset < 0 && __negative_fpos_check(file, offset, 0))
+                return -EINVAL;
+        if (offset > inode->i_sb->s_maxbytes)
                return -EINVAL;
        /* Special lock needed here? */
@@ -137,7 +153,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
                        offset += file->f_pos;
        }
        retval = -EINVAL;
-        if (offset >= 0) {
+        if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) {
                if (offset != file->f_pos) {
                        file->f_pos = offset;
                        file->f_version = 0;
@@ -221,6 +237,7 @@ bad:
 }
 #endif
 /*
 * rw_verify_area doesn't like huge counts. We limit
 * them to something that fits in "int" so that others
@@ -238,8 +255,11 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
        if (unlikely((ssize_t) count < 0))
                return retval;
        pos = *ppos;
-        if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
+        if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) {
-                return retval;
+                retval = __negative_fpos_check(file, pos, count);
+                if (retval)
+                        return retval;
+        }
        if (unlikely(inode->i_flock && mandatory_lock(inode))) {
                retval = locks_mandatory_area(
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index c1f93896cb53..41656d40dc5c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -22,8 +22,6 @@
 int reiserfs_commit_write(struct file *f, struct page *page,
                          unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-                           unsigned from, unsigned to);
 void reiserfs_evict_inode(struct inode *inode)
 {
@@ -165,7 +163,7 @@ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
 ** but tail is still sitting in a direct item, and we can't write to
 ** it.  So, look through this page, and check all the mapped buffers
 ** to make sure they have valid block numbers.  Any that don't need
-** to be unmapped, so that block_prepare_write will correctly call
+** to be unmapped, so that __block_write_begin will correctly call
 ** reiserfs_get_block to convert the tail into an unformatted node
 */
 static inline void fix_tail_page_for_writing(struct page *page)
@@ -439,13 +437,13 @@ static int reiserfs_bmap(struct inode *inode, sector_t block,
 }
 /* special version of get_block that is only used by grab_tail_page right
-** now.  It is sent to block_prepare_write, and when you try to get a
+** now.  It is sent to __block_write_begin, and when you try to get a
 ** block past the end of the file (or a block from a hole) it returns
-** -ENOENT instead of a valid buffer.  block_prepare_write expects to
+** -ENOENT instead of a valid buffer.  __block_write_begin expects to
 ** be able to do i/o on the buffers returned, unless an error value
 ** is also returned.
 **
-** So, this allows block_prepare_write to be used for reading a single block
+** So, this allows __block_write_begin to be used for reading a single block
 ** in a page.  Where it does not produce a valid page for holes, or past the
 ** end of the file.  This turns out to be exactly what we need for reading
 ** tails for conversion.
@@ -558,11 +556,12 @@ static int convert_tail_for_hole(struct inode *inode,
         **
         ** We must fix the tail page for writing because it might have buffers
         ** that are mapped, but have a block number of 0.  This indicates tail
-         ** data that has been read directly into the page, and block_prepare_write
+         ** data that has been read directly into the page, and
-         ** won't trigger a get_block in this case.
+         ** __block_write_begin won't trigger a get_block in this case.
         */
        fix_tail_page_for_writing(tail_page);
-        retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
+        retval = __reiserfs_write_begin(tail_page, tail_start,
+                                      tail_end - tail_start);
        if (retval)
                goto unlock;
@@ -2033,7 +2032,7 @@ static int grab_tail_page(struct inode *inode,
        /* start within the page of the last block in the file */
        start = (offset / blocksize) * blocksize;
-        error = block_prepare_write(page, start, offset,
+        error = __block_write_begin(page, start, offset - start,
                                    reiserfs_get_block_create_0);
        if (error)
                goto unlock;
@@ -2628,8 +2627,7 @@ static int reiserfs_write_begin(struct file *file,
        return ret;
 }
-int reiserfs_prepare_write(struct file *f, struct page *page,
+int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
-                           unsigned from, unsigned to)
 {
        struct inode *inode = page->mapping->host;
        int ret;
@@ -2650,7 +2648,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
                th->t_refcount++;
        }
-        ret = block_prepare_write(page, from, to, reiserfs_get_block);
+        ret = __block_write_begin(page, from, len, reiserfs_get_block);
        if (ret && reiserfs_transaction_running(inode->i_sb)) {
                struct reiserfs_transaction_handle *th = current->journal_info;
                /* this gets a little ugly.  If reiserfs_get_block returned an
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 5cbb81e134ac..adf22b485cea 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -160,8 +160,6 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
 int reiserfs_commit_write(struct file *f, struct page *page,
                          unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-                           unsigned from, unsigned to);
 /*
 ** reiserfs_unpack
 ** Function try to convert tail from direct item into indirect.
@@ -200,7 +198,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
        }
        /* we unpack by finding the page with the tail, and calling
-         ** reiserfs_prepare_write on that page.  This will force a
+         ** __reiserfs_write_begin on that page.  This will force a
         ** reiserfs_get_block to unpack the tail for us.
         */
        index = inode->i_size >> PAGE_CACHE_SHIFT;
@@ -210,7 +208,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
        if (!page) {
                goto out;
        }
-        retval = reiserfs_prepare_write(NULL, page, write_from, write_from);
+        retval = __reiserfs_write_begin(page, write_from, 0);
        if (retval)
                goto out_unlock;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ee78d4a0086a..ba5f51ec3458 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1156,7 +1156,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
        inode->i_ctime = CURRENT_TIME_SEC;
        reiserfs_update_sd(&th, inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        d_instantiate(dentry, inode);
        retval = journal_end(&th, dir->i_sb, jbegin_count);
        reiserfs_write_unlock(dir->i_sb);
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8c4cf273c672..5d04a7828e7a 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -418,13 +418,11 @@ static inline __u32 xattr_hash(const char *msg, int len)
 int reiserfs_commit_write(struct file *f, struct page *page,
                          unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-                           unsigned from, unsigned to);
 static void update_ctime(struct inode *inode)
 {
        struct timespec now = current_fs_time(inode->i_sb);
-        if (hlist_unhashed(&inode->i_hash) || !inode->i_nlink ||
+        if (inode_unhashed(inode) || !inode->i_nlink ||
            timespec_equal(&inode->i_ctime, &now))
                return;
@@ -532,8 +530,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
                        rxh->h_hash = cpu_to_le32(xahash);
                }
-                err = reiserfs_prepare_write(NULL, page, page_offset,
+                err = __reiserfs_write_begin(page, page_offset, chunk + skip);
-                                            page_offset + chunk + skip);
                if (!err) {
                        if (buffer)
                                memcpy(data + skip, buffer + buffer_pos, chunk);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 0e7cb1395a94..05d6b0e78c95 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -462,9 +462,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
        if (size) {
                char *p;
-                spin_lock(&dcache_lock);
                p = __d_path(path, root, buf, size);
-                spin_unlock(&dcache_lock);
                res = PTR_ERR(p);
                if (!IS_ERR(p)) {
                        char *end = mangle_path(buf, p, esc);
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 00a70cab1f36..f678d421e541 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -406,21 +406,15 @@ void
 smb_renew_times(struct dentry * dentry)
 {
        dget(dentry);
-        spin_lock(&dentry->d_lock);
+        dentry->d_time = jiffies;
-        for (;;) {
-                struct dentry *parent;
-                dentry->d_time = jiffies;
+        while (!IS_ROOT(dentry)) {
-                if (IS_ROOT(dentry))
+                struct dentry *parent = dget_parent(dentry);
-                        break;
-                parent = dentry->d_parent;
-                dget(parent);
-                spin_unlock(&dentry->d_lock);
                dput(dentry);
                dentry = parent;
-                spin_lock(&dentry->d_lock);
+                dentry->d_time = jiffies;
        }
-        spin_unlock(&dentry->d_lock);
        dput(dentry);
 }
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 8fc5e50e142f..f6e9ee59757e 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -229,7 +229,6 @@ smb_invalidate_inodes(struct smb_sb_info *server)
 {
        VERBOSE("\n");
        shrink_dcache_sb(SB_of(server));
-        invalidate_inodes(SB_of(server));
 }
 /*
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index 71c29b6670b4..3dcf638d4d3a 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -332,16 +332,15 @@ static int smb_build_path(struct smb_sb_info *server, unsigned char *buf,
         * and store it in reversed order [see reverse_string()]
         */
        dget(entry);
-        spin_lock(&entry->d_lock);
        while (!IS_ROOT(entry)) {
                struct dentry *parent;
                if (maxlen < (3<<unicode)) {
-                        spin_unlock(&entry->d_lock);
                        dput(entry);
                        return -ENAMETOOLONG;
                }
+                spin_lock(&entry->d_lock);
                len = server->ops->convert(path, maxlen-2, 
                                      entry->d_name.name, entry->d_name.len,
                                      server->local_nls, server->remote_nls);
@@ -359,15 +358,12 @@ static int smb_build_path(struct smb_sb_info *server, unsigned char *buf,
                }
                *path++ = '\\';
                maxlen -= len+1;
-                parent = entry->d_parent;
-                dget(parent);
                spin_unlock(&entry->d_lock);
+                parent = dget_parent(entry);
                dput(entry);
                entry = parent;
-                spin_lock(&entry->d_lock);
        }
-        spin_unlock(&entry->d_lock);
        dput(entry);
        reverse_string(buf, path-buf);
diff --git a/fs/super.c b/fs/super.c
index 8819e3a7ff20..b9c9869165db 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -273,14 +273,14 @@ void generic_shutdown_super(struct super_block *sb)
                get_fs_excl();
                sb->s_flags &= ~MS_ACTIVE;
-                /* bad name - it should be evict_inodes() */
+                fsnotify_unmount_inodes(&sb->s_inodes);
-                invalidate_inodes(sb);
+                evict_inodes(sb);
                if (sop->put_super)
                        sop->put_super(sb);
-                /* Forget any remaining inodes */
+                if (!list_empty(&sb->s_inodes)) {
-                if (invalidate_inodes(sb)) {
                        printk("VFS: Busy inodes after unmount of %s. "
                           "Self-destruct in 5 seconds.  Have a nice day...\n",
                           sb->s_id);
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 33e047b59b8d..11e7f7d11cd0 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -126,7 +126,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,
        inode->i_ctime = CURRENT_TIME_SEC;
        inode_inc_link_count(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        return add_nondir(dentry, inode);
 }
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 87ebcce72213..14f64b689d7f 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -550,7 +550,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
        lock_2_inodes(dir, inode);
        inc_nlink(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        inode->i_ctime = ubifs_current_time(inode);
        dir->i_size += sz_change;
        dir_ui->ui_size = dir->i_size;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index bf5fc674193c..6d8dc02baebb 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1101,7 +1101,7 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
        inc_nlink(inode);
        inode->i_ctime = current_fs_time(inode->i_sb);
        mark_inode_dirty(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        d_instantiate(dentry, inode);
        unlock_kernel();
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index b056f02b1fb3..12f39b9e4437 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -180,7 +180,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
        inode->i_ctime = CURRENT_TIME_SEC;
        inode_inc_link_count(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        error = ufs_add_nondir(dentry, inode);
        unlock_kernel();
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ba5312802aa9..63fd2c07cb57 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1580,6 +1580,7 @@ xfs_mapping_buftarg(
                        XFS_BUFTARG_NAME(btp));
                return ENOMEM;
        }
+        inode->i_ino = get_next_ino();
        inode->i_mode = S_IFBLK;
        inode->i_bdev = bdev;
        inode->i_rdev = bdev->bd_dev;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index ec858e09d546..96107efc0c61 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -317,7 +317,7 @@ xfs_vn_link(
        if (unlikely(error))
                return -error;
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        d_instantiate(dentry, inode);
        return 0;
 }
@@ -760,7 +760,9 @@ xfs_setup_inode(
        inode->i_ino = ip->i_ino;
        inode->i_state = I_NEW;
-        inode_add_to_lists(ip->i_mount->m_super, inode);
+        inode_sb_list_add(inode);
+        insert_inode_hash(inode);
        inode->i_mode   = ip->i_d.di_mode;
        inode->i_nlink  = ip->i_d.di_nlink;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index ab31ce5aeaf9..cf808782c065 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -576,7 +576,7 @@ xfs_max_file_offset(
        /* Figure out maximum filesize, on Linux this can depend on
         * the filesystem blocksize (on 32 bit platforms).
-         * __block_prepare_write does this in an [unsigned] long...
+         * __block_write_begin does this in an [unsigned] long...
         *      page->index << (PAGE_CACHE_SHIFT - bbits)
         * So, for page sized blocks (4K on 32 bit platforms),
         * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index fac52290de90..fb2ca2e4cdc9 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -500,7 +500,7 @@ void		xfs_mark_inode_dirty_sync(xfs_inode_t *);
 #define IHOLD(ip) \
 do { \
        ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
-        atomic_inc(&(VFS_I(ip)->i_count)); \
+        ihold(VFS_I(ip)); \
        trace_xfs_ihold(ip, _THIS_IP_); \
 } while (0)
author	Linus Torvalds <torvalds@linux-foundation.org>	2010-10-26 20:58:44 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-10-26 20:58:44 -0400
commit	426e1f5cec4821945642230218876b0e89aafab1 (patch)
tree	2728ace018d0698886989da586210ef1543a7098 /fs
parent	9e5fca251f44832cb996961048ea977f80faf6ea (diff)
parent	63997e98a3be68d7cec806d22bf9b02b2e1daabb (diff)