Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6:
  fs: brlock vfsmount_lock
  fs: scale files_lock
  lglock: introduce special lglock and brlock spin locks
  tty: fix fu_list abuse
  fs: cleanup files_lock locking
  fs: remove extra lookup in __lookup_hash
  fs: fs_struct rwlock to spinlock
  apparmor: use task path helpers
  fs: dentry allocation consolidation
  fs: fix do_lookup false negative
  mbcache: Limit the maximum number of cache entries
  hostfs ->follow_link() braino
  hostfs: dumb (and usually harmless) tpyo - strncpy instead of strlcpy
  remove SWRITE* I/O types
  kill BH_Ordered flag
  vfs: update ctime when changing the file's permission by setfacl
  cramfs: only unlock new inodes
  fix reiserfs_evict_inode end_writeback second call
author: Linus Torvalds <torvalds@linux-foundation.org> 2010-08-18 12:35:08 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2010-08-18 12:35:08 -0400
commit: 145c3ae46b37993b0debb0b3da6256daea4a6ec5 (patch)
tree: 0dbff382ce36b23b3d2dbff87d3eaab73a07a2a4
parent: 81ca03a0e2ea0207b2df80e0edcf4c775c07a505 (diff)
parent: 99b7db7b8ffd6bb755eb0a175596421a0b581cb2 (diff)
43 files changed, 797 insertions, 450 deletions
diff --git a/drivers/char/pty.c b/drivers/char/pty.c
index ad46eae1f9bb..c350d01716bd 100644
--- a/drivers/char/pty.c
+++ b/drivers/char/pty.c
@@ -675,8 +675,8 @@ static int ptmx_open(struct inode *inode, struct file *filp)
        }
        set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */
-        filp->private_data = tty;
-        file_move(filp, &tty->tty_files);
+        tty_add_file(tty, filp);
        retval = devpts_pty_new(inode, tty->link);
        if (retval)
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 0350c42375a2..949067a0bd47 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -136,6 +136,9 @@ LIST_HEAD(tty_drivers);			/* linked list of tty drivers */
 DEFINE_MUTEX(tty_mutex);
 EXPORT_SYMBOL(tty_mutex);
+/* Spinlock to protect the tty->tty_files list */
+DEFINE_SPINLOCK(tty_files_lock);
 static ssize_t tty_read(struct file *, char __user *, size_t, loff_t *);
 static ssize_t tty_write(struct file *, const char __user *, size_t, loff_t *);
 ssize_t redirected_tty_write(struct file *, const char __user *,
@@ -185,6 +188,41 @@ void free_tty_struct(struct tty_struct *tty)
        kfree(tty);
 }
+static inline struct tty_struct *file_tty(struct file *file)
+{
+        return ((struct tty_file_private *)file->private_data)->tty;
+}
+/* Associate a new file with the tty structure */
+void tty_add_file(struct tty_struct *tty, struct file *file)
+{
+        struct tty_file_private *priv;
+        /* XXX: must implement proper error handling in callers */
+        priv = kmalloc(sizeof(*priv), GFP_KERNEL|__GFP_NOFAIL);
+        priv->tty = tty;
+        priv->file = file;
+        file->private_data = priv;
+        spin_lock(&tty_files_lock);
+        list_add(&priv->list, &tty->tty_files);
+        spin_unlock(&tty_files_lock);
+}
+/* Delete file from its tty */
+void tty_del_file(struct file *file)
+{
+        struct tty_file_private *priv = file->private_data;
+        spin_lock(&tty_files_lock);
+        list_del(&priv->list);
+        spin_unlock(&tty_files_lock);
+        file->private_data = NULL;
+        kfree(priv);
+}
 #define TTY_NUMBER(tty) ((tty)->index + (tty)->driver->name_base)
 /**
@@ -235,11 +273,11 @@ static int check_tty_count(struct tty_struct *tty, const char *routine)
        struct list_head *p;
        int count = 0;
-        file_list_lock();
+        spin_lock(&tty_files_lock);
        list_for_each(p, &tty->tty_files) {
                count++;
        }
-        file_list_unlock();
+        spin_unlock(&tty_files_lock);
        if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
            tty->driver->subtype == PTY_TYPE_SLAVE &&
            tty->link && tty->link->count)
@@ -497,6 +535,7 @@ void __tty_hangup(struct tty_struct *tty)
        struct file *cons_filp = NULL;
        struct file *filp, *f = NULL;
        struct task_struct *p;
+        struct tty_file_private *priv;
        int    closecount = 0, n;
        unsigned long flags;
        int refs = 0;
@@ -506,7 +545,7 @@ void __tty_hangup(struct tty_struct *tty)
        spin_lock(&redirect_lock);
-        if (redirect && redirect->private_data == tty) {
+        if (redirect && file_tty(redirect) == tty) {
                f = redirect;
                redirect = NULL;
        }
@@ -519,9 +558,10 @@ void __tty_hangup(struct tty_struct *tty)
           workqueue with the lock held */
        check_tty_count(tty, "tty_hangup");
-        file_list_lock();
+        spin_lock(&tty_files_lock);
        /* This breaks for file handles being sent over AF_UNIX sockets ? */
-        list_for_each_entry(filp, &tty->tty_files, f_u.fu_list) {
+        list_for_each_entry(priv, &tty->tty_files, list) {
+                filp = priv->file;
                if (filp->f_op->write == redirected_tty_write)
                        cons_filp = filp;
                if (filp->f_op->write != tty_write)
@@ -530,7 +570,7 @@ void __tty_hangup(struct tty_struct *tty)
                __tty_fasync(-1, filp, 0);      /* can't block */
                filp->f_op = &hung_up_tty_fops;
        }
-        file_list_unlock();
+        spin_unlock(&tty_files_lock);
        tty_ldisc_hangup(tty);
@@ -889,12 +929,10 @@ static ssize_t tty_read(struct file *file, char __user *buf, size_t count,
                        loff_t *ppos)
 {
        int i;
-        struct tty_struct *tty;
+        struct inode *inode = file->f_path.dentry->d_inode;
-        struct inode *inode;
+        struct tty_struct *tty = file_tty(file);
        struct tty_ldisc *ld;
-        tty = file->private_data;
-        inode = file->f_path.dentry->d_inode;
        if (tty_paranoia_check(tty, inode, "tty_read"))
                return -EIO;
        if (!tty || (test_bit(TTY_IO_ERROR, &tty->flags)))
@@ -1065,12 +1103,11 @@ void tty_write_message(struct tty_struct *tty, char *msg)
 static ssize_t tty_write(struct file *file, const char __user *buf,
                                                size_t count, loff_t *ppos)
 {
-        struct tty_struct *tty;
        struct inode *inode = file->f_path.dentry->d_inode;
+        struct tty_struct *tty = file_tty(file);
+        struct tty_ldisc *ld;
        ssize_t ret;
-        struct tty_ldisc *ld;
-        tty = file->private_data;
        if (tty_paranoia_check(tty, inode, "tty_write"))
                return -EIO;
        if (!tty || !tty->ops->write ||
@@ -1424,9 +1461,9 @@ static void release_one_tty(struct work_struct *work)
        tty_driver_kref_put(driver);
        module_put(driver->owner);
-        file_list_lock();
+        spin_lock(&tty_files_lock);
        list_del_init(&tty->tty_files);
-        file_list_unlock();
+        spin_unlock(&tty_files_lock);
        put_pid(tty->pgrp);
        put_pid(tty->session);
@@ -1507,13 +1544,13 @@ static void release_tty(struct tty_struct *tty, int idx)
 int tty_release(struct inode *inode, struct file *filp)
 {
-        struct tty_struct *tty, *o_tty;
+        struct tty_struct *tty = file_tty(filp);
+        struct tty_struct *o_tty;
        int     pty_master, tty_closing, o_tty_closing, do_sleep;
        int     devpts;
        int     idx;
        char    buf[64];
-        tty = filp->private_data;
        if (tty_paranoia_check(tty, inode, "tty_release_dev"))
                return 0;
@@ -1671,8 +1708,7 @@ int tty_release(struct inode *inode, struct file *filp)
         *  - do_tty_hangup no longer sees this file descriptor as
         *    something that needs to be handled for hangups.
         */
-        file_kill(filp);
+        tty_del_file(filp);
-        filp->private_data = NULL;
        /*
         * Perform some housekeeping before deciding whether to return.
@@ -1839,8 +1875,8 @@ got_driver:
                return PTR_ERR(tty);
        }
-        filp->private_data = tty;
+        tty_add_file(tty, filp);
-        file_move(filp, &tty->tty_files);
        check_tty_count(tty, "tty_open");
        if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
            tty->driver->subtype == PTY_TYPE_MASTER)
@@ -1916,11 +1952,10 @@ got_driver:
 static unsigned int tty_poll(struct file *filp, poll_table *wait)
 {
-        struct tty_struct *tty;
+        struct tty_struct *tty = file_tty(filp);
        struct tty_ldisc *ld;
        int ret = 0;
-        tty = filp->private_data;
        if (tty_paranoia_check(tty, filp->f_path.dentry->d_inode, "tty_poll"))
                return 0;
@@ -1933,11 +1968,10 @@ static unsigned int tty_poll(struct file *filp, poll_table *wait)
 static int __tty_fasync(int fd, struct file *filp, int on)
 {
-        struct tty_struct *tty;
+        struct tty_struct *tty = file_tty(filp);
        unsigned long flags;
        int retval = 0;
-        tty = filp->private_data;
        if (tty_paranoia_check(tty, filp->f_path.dentry->d_inode, "tty_fasync"))
                goto out;
@@ -2491,13 +2525,13 @@ EXPORT_SYMBOL(tty_pair_get_pty);
 */
 long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-        struct tty_struct *tty, *real_tty;
+        struct tty_struct *tty = file_tty(file);
+        struct tty_struct *real_tty;
        void __user *p = (void __user *)arg;
        int retval;
        struct tty_ldisc *ld;
        struct inode *inode = file->f_dentry->d_inode;
-        tty = file->private_data;
        if (tty_paranoia_check(tty, inode, "tty_ioctl"))
                return -EINVAL;
@@ -2619,7 +2653,7 @@ static long tty_compat_ioctl(struct file *file, unsigned int cmd,
                                unsigned long arg)
 {
        struct inode *inode = file->f_dentry->d_inode;
-        struct tty_struct *tty = file->private_data;
+        struct tty_struct *tty = file_tty(file);
        struct tty_ldisc *ld;
        int retval = -ENOIOCTLCMD;
@@ -2711,7 +2745,7 @@ void __do_SAK(struct tty_struct *tty)
                                if (!filp)
                                        continue;
                                if (filp->f_op->read == tty_read &&
-                                    filp->private_data == tty) {
+                                    file_tty(filp) == tty) {
                                        printk(KERN_NOTICE "SAK: killed process %d"
                                            " (%s): fd#%d opened to the tty\n",
                                            task_pid_nr(p), p->comm, i);
diff --git a/drivers/staging/pohmelfs/path_entry.c b/drivers/staging/pohmelfs/path_entry.c
index cdc4dd50d638..8ec83d2dffb7 100644
--- a/drivers/staging/pohmelfs/path_entry.c
+++ b/drivers/staging/pohmelfs/path_entry.c
@@ -44,9 +44,9 @@ int pohmelfs_construct_path_string(struct pohmelfs_inode *pi, void *data, int le
                return -ENOENT;
        }
-        read_lock(&current->fs->lock);
+        spin_lock(&current->fs->lock);
        path.mnt = mntget(current->fs->root.mnt);
-        read_unlock(&current->fs->lock);
+        spin_unlock(&current->fs->lock);
        path.dentry = d;
@@ -91,9 +91,9 @@ int pohmelfs_path_length(struct pohmelfs_inode *pi)
                return -ENOENT;
        }
-        read_lock(&current->fs->lock);
+        spin_lock(&current->fs->lock);
        root = dget(current->fs->root.dentry);
-        read_unlock(&current->fs->lock);
+        spin_unlock(&current->fs->lock);
        spin_lock(&dcache_lock);
diff --git a/fs/buffer.c b/fs/buffer.c
index 50efa339e051..3e7dca279d1c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -770,11 +770,12 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
                                spin_unlock(lock);
                                /*
                                 * Ensure any pending I/O completes so that
-                                 * ll_rw_block() actually writes the current
+                                 * write_dirty_buffer() actually writes the
-                                 * contents - it is a noop if I/O is still in
+                                 * current contents - it is a noop if I/O is
-                                 * flight on potentially older contents.
+                                 * still in flight on potentially older
+                                 * contents.
                                 */
-                                ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
+                                write_dirty_buffer(bh, WRITE_SYNC_PLUG);
                                /*
                                 * Kick off IO for the previous mapping. Note
@@ -2912,13 +2913,6 @@ int submit_bh(int rw, struct buffer_head * bh)
        BUG_ON(buffer_unwritten(bh));
        /*
-         * Mask in barrier bit for a write (could be either a WRITE or a
-         * WRITE_SYNC
-         */
-        if (buffer_ordered(bh) && (rw & WRITE))
-                rw |= WRITE_BARRIER;
-        /*
         * Only clear out a write error when rewriting
         */
        if (test_set_buffer_req(bh) && (rw & WRITE))
@@ -2956,22 +2950,21 @@ EXPORT_SYMBOL(submit_bh);
 /**
 * ll_rw_block: low-level access to block devices (DEPRECATED)
- * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
+ * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
 * @nr: number of &struct buffer_heads in the array
 * @bhs: array of pointers to &struct buffer_head
 *
 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
 * requests an I/O operation on them, either a %READ or a %WRITE.  The third
- * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
+ * %READA option is described in the documentation for generic_make_request()
- * are sent to disk. The fourth %READA option is described in the documentation
+ * which ll_rw_block() calls.
- * for generic_make_request() which ll_rw_block() calls.
 *
 * This function drops any buffer that it cannot get a lock on (with the
- * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
+ * BH_Lock state bit), any buffer that appears to be clean when doing a write
- * clean when doing a write request, and any buffer that appears to be
+ * request, and any buffer that appears to be up-to-date when doing read
- * up-to-date when doing read request.  Further it marks as clean buffers that
+ * request.  Further it marks as clean buffers that are processed for
- * are processed for writing (the buffer cache won't assume that they are
+ * writing (the buffer cache won't assume that they are actually clean
- * actually clean until the buffer gets unlocked).
+ * until the buffer gets unlocked).
 *
 * ll_rw_block sets b_end_io to simple completion handler that marks
 * the buffer up-to-date (if approriate), unlocks the buffer and wakes
@@ -2987,20 +2980,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
        for (i = 0; i < nr; i++) {
                struct buffer_head *bh = bhs[i];
-                if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
+                if (!trylock_buffer(bh))
-                        lock_buffer(bh);
-                else if (!trylock_buffer(bh))
                        continue;
+                if (rw == WRITE) {
-                if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
-                    rw == SWRITE_SYNC_PLUG) {
                        if (test_clear_buffer_dirty(bh)) {
                                bh->b_end_io = end_buffer_write_sync;
                                get_bh(bh);
-                                if (rw == SWRITE_SYNC)
+                                submit_bh(WRITE, bh);
-                                        submit_bh(WRITE_SYNC, bh);
-                                else
-                                        submit_bh(WRITE, bh);
                                continue;
                        }
                } else {
@@ -3016,12 +3002,25 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
 }
 EXPORT_SYMBOL(ll_rw_block);
+void write_dirty_buffer(struct buffer_head *bh, int rw)
+{
+        lock_buffer(bh);
+        if (!test_clear_buffer_dirty(bh)) {
+                unlock_buffer(bh);
+                return;
+        }
+        bh->b_end_io = end_buffer_write_sync;
+        get_bh(bh);
+        submit_bh(rw, bh);
+}
+EXPORT_SYMBOL(write_dirty_buffer);
 /*
 * For a data-integrity writeout, we need to wait upon any in-progress I/O
 * and then start new I/O and then wait upon it.  The caller must have a ref on
 * the buffer_head.
 */
-int sync_dirty_buffer(struct buffer_head *bh)
+int __sync_dirty_buffer(struct buffer_head *bh, int rw)
 {
        int ret = 0;
@@ -3030,7 +3029,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
        if (test_clear_buffer_dirty(bh)) {
                get_bh(bh);
                bh->b_end_io = end_buffer_write_sync;
-                ret = submit_bh(WRITE_SYNC, bh);
+                ret = submit_bh(rw, bh);
                wait_on_buffer(bh);
                if (buffer_eopnotsupp(bh)) {
                        clear_buffer_eopnotsupp(bh);
@@ -3043,6 +3042,12 @@ int sync_dirty_buffer(struct buffer_head *bh)
        }
        return ret;
 }
+EXPORT_SYMBOL(__sync_dirty_buffer);
+int sync_dirty_buffer(struct buffer_head *bh)
+{
+        return __sync_dirty_buffer(bh, WRITE_SYNC);
+}
 EXPORT_SYMBOL(sync_dirty_buffer);
 /*
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index a53b130b366c..1e7a33028d33 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -80,7 +80,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
                }
        } else {
                inode = iget_locked(sb, CRAMINO(cramfs_inode));
-                if (inode) {
+                if (inode && (inode->i_state & I_NEW)) {
                        setup_inode(inode, cramfs_inode);
                        unlock_new_inode(inode);
                }
diff --git a/fs/dcache.c b/fs/dcache.c
index 4d13bf50b7b1..83293be48149 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1332,31 +1332,13 @@ EXPORT_SYMBOL(d_add_ci);
 * d_lookup - search for a dentry
 * @parent: parent dentry
 * @name: qstr of name we wish to find
+ * Returns: dentry, or NULL
 *
- * Searches the children of the parent dentry for the name in question. If
+ * d_lookup searches the children of the parent dentry for the name in
- * the dentry is found its reference count is incremented and the dentry
+ * question. If the dentry is found its reference count is incremented and the
- * is returned. The caller must use dput to free the entry when it has
+ * dentry is returned. The caller must use dput to free the entry when it has
- * finished using it. %NULL is returned on failure.
+ * finished using it. %NULL is returned if the dentry does not exist.
- *
- * __d_lookup is dcache_lock free. The hash list is protected using RCU.
- * Memory barriers are used while updating and doing lockless traversal. 
- * To avoid races with d_move while rename is happening, d_lock is used.
- *
- * Overflows in memcmp(), while d_move, are avoided by keeping the length
- * and name pointer in one structure pointed by d_qstr.
- *
- * rcu_read_lock() and rcu_read_unlock() are used to disable preemption while
- * lookup is going on.
- *
- * The dentry unused LRU is not updated even if lookup finds the required dentry
- * in there. It is updated in places such as prune_dcache, shrink_dcache_sb,
- * select_parent and __dget_locked. This laziness saves lookup from dcache_lock
- * acquisition.
- *
- * d_lookup() is protected against the concurrent renames in some unrelated
- * directory using the seqlockt_t rename_lock.
 */
 struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
 {
        struct dentry * dentry = NULL;
@@ -1372,6 +1354,21 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
 }
 EXPORT_SYMBOL(d_lookup);
+/*
+ * __d_lookup - search for a dentry (racy)
+ * @parent: parent dentry
+ * @name: qstr of name we wish to find
+ * Returns: dentry, or NULL
+ *
+ * __d_lookup is like d_lookup, however it may (rarely) return a
+ * false-negative result due to unrelated rename activity.
+ *
+ * __d_lookup is slightly faster by avoiding rename_lock read seqlock,
+ * however it must be used carefully, eg. with a following d_lookup in
+ * the case of failure.
+ *
+ * __d_lookup callers must be commented.
+ */
 struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 {
        unsigned int len = name->len;
@@ -1382,6 +1379,19 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
        struct hlist_node *node;
        struct dentry *dentry;
+        /*
+         * The hash list is protected using RCU.
+         *
+         * Take d_lock when comparing a candidate dentry, to avoid races
+         * with d_move().
+         *
+         * It is possible that concurrent renames can mess up our list
+         * walk here and result in missing our dentry, resulting in the
+         * false-negative result. d_lookup() protects against concurrent
+         * renames using rename_lock seqlock.
+         *
+         * See Documentation/vfs/dcache-locking.txt for more details.
+         */
        rcu_read_lock();
        
        hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
@@ -1396,8 +1406,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
                /*
                 * Recheck the dentry after taking the lock - d_move may have
-                 * changed things.  Don't bother checking the hash because we're
+                 * changed things. Don't bother checking the hash because
-                 * about to compare the whole name anyway.
+                 * we're about to compare the whole name anyway.
                 */
                if (dentry->d_parent != parent)
                        goto next;
@@ -1925,7 +1935,7 @@ static int prepend_path(const struct path *path, struct path *root,
        bool slash = false;
        int error = 0;
-        spin_lock(&vfsmount_lock);
+        br_read_lock(vfsmount_lock);
        while (dentry != root->dentry || vfsmnt != root->mnt) {
                struct dentry * parent;
@@ -1954,7 +1964,7 @@ out:
        if (!error && !slash)
                error = prepend(buffer, buflen, "/", 1);
-        spin_unlock(&vfsmount_lock);
+        br_read_unlock(vfsmount_lock);
        return error;
 global_root:
@@ -2292,11 +2302,12 @@ int path_is_under(struct path *path1, struct path *path2)
        struct vfsmount *mnt = path1->mnt;
        struct dentry *dentry = path1->dentry;
        int res;
-        spin_lock(&vfsmount_lock);
+        br_read_lock(vfsmount_lock);
        if (mnt != path2->mnt) {
                for (;;) {
                        if (mnt->mnt_parent == mnt) {
-                                spin_unlock(&vfsmount_lock);
+                                br_read_unlock(vfsmount_lock);
                                return 0;
                        }
                        if (mnt->mnt_parent == path2->mnt)
@@ -2306,7 +2317,7 @@ int path_is_under(struct path *path1, struct path *path2)
                dentry = mnt->mnt_mountpoint;
        }
        res = is_subdir(dentry, path2->dentry);
-        spin_unlock(&vfsmount_lock);
+        br_read_unlock(vfsmount_lock);
        return res;
 }
 EXPORT_SYMBOL(path_is_under);
diff --git a/fs/exec.c b/fs/exec.c
index 05c7d6b84df7..2d9455282744 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1118,7 +1118,7 @@ int check_unsafe_exec(struct linux_binprm *bprm)
        bprm->unsafe = tracehook_unsafe_exec(p);
        n_fs = 1;
-        write_lock(&p->fs->lock);
+        spin_lock(&p->fs->lock);
        rcu_read_lock();
        for (t = next_thread(p); t != p; t = next_thread(t)) {
                if (t->fs == p->fs)
@@ -1135,7 +1135,7 @@ int check_unsafe_exec(struct linux_binprm *bprm)
                        res = 1;
                }
        }
-        write_unlock(&p->fs->lock);
+        spin_unlock(&p->fs->lock);
        return res;
 }
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 1fa23f6ffba5..1736f2356388 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -250,7 +250,9 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
 {
        int i, err = 0;
-        ll_rw_block(SWRITE, nr_bhs, bhs);
+        for (i = 0; i < nr_bhs; i++)
+                write_dirty_buffer(bhs[i], WRITE);
        for (i = 0; i < nr_bhs; i++) {
                wait_on_buffer(bhs[i]);
                if (buffer_eopnotsupp(bhs[i])) {
diff --git a/fs/file_table.c b/fs/file_table.c
index edecd36fed9b..a04bdd81c11c 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -20,7 +20,9 @@
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
 #include <linux/sysctl.h>
+#include <linux/lglock.h>
 #include <linux/percpu_counter.h>
+#include <linux/percpu.h>
 #include <linux/ima.h>
 #include <asm/atomic.h>
@@ -32,8 +34,8 @@ struct files_stat_struct files_stat = {
        .max_files = NR_FILE
 };
-/* public. Not pretty! */
+DECLARE_LGLOCK(files_lglock);
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
+DEFINE_LGLOCK(files_lglock);
 /* SLAB cache for file structures */
 static struct kmem_cache *filp_cachep __read_mostly;
@@ -249,7 +251,7 @@ static void __fput(struct file *file)
                cdev_put(inode->i_cdev);
        fops_put(file->f_op);
        put_pid(file->f_owner.pid);
-        file_kill(file);
+        file_sb_list_del(file);
        if (file->f_mode & FMODE_WRITE)
                drop_file_write_access(file);
        file->f_path.dentry = NULL;
@@ -328,41 +330,107 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
        return file;
 }
 void put_filp(struct file *file)
 {
        if (atomic_long_dec_and_test(&file->f_count)) {
                security_file_free(file);
-                file_kill(file);
+                file_sb_list_del(file);
                file_free(file);
        }
 }
-void file_move(struct file *file, struct list_head *list)
+static inline int file_list_cpu(struct file *file)
 {
-        if (!list)
+#ifdef CONFIG_SMP
-                return;
+        return file->f_sb_list_cpu;
-        file_list_lock();
+#else
-        list_move(&file->f_u.fu_list, list);
+        return smp_processor_id();
-        file_list_unlock();
+#endif
+}
+/* helper for file_sb_list_add to reduce ifdefs */
+static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
+{
+        struct list_head *list;
+#ifdef CONFIG_SMP
+        int cpu;
+        cpu = smp_processor_id();
+        file->f_sb_list_cpu = cpu;
+        list = per_cpu_ptr(sb->s_files, cpu);
+#else
+        list = &sb->s_files;
+#endif
+        list_add(&file->f_u.fu_list, list);
 }
-void file_kill(struct file *file)
+/**
+ * file_sb_list_add - add a file to the sb's file list
+ * @file: file to add
+ * @sb: sb to add it to
+ *
+ * Use this function to associate a file with the superblock of the inode it
+ * refers to.
+ */
+void file_sb_list_add(struct file *file, struct super_block *sb)
+{
+        lg_local_lock(files_lglock);
+        __file_sb_list_add(file, sb);
+        lg_local_unlock(files_lglock);
+}
+/**
+ * file_sb_list_del - remove a file from the sb's file list
+ * @file: file to remove
+ * @sb: sb to remove it from
+ *
+ * Use this function to remove a file from its superblock.
+ */
+void file_sb_list_del(struct file *file)
 {
        if (!list_empty(&file->f_u.fu_list)) {
-                file_list_lock();
+                lg_local_lock_cpu(files_lglock, file_list_cpu(file));
                list_del_init(&file->f_u.fu_list);
-                file_list_unlock();
+                lg_local_unlock_cpu(files_lglock, file_list_cpu(file));
        }
 }
+#ifdef CONFIG_SMP
+/*
+ * These macros iterate all files on all CPUs for a given superblock.
+ * files_lglock must be held globally.
+ */
+#define do_file_list_for_each_entry(__sb, __file)               \
+{                                                               \
+        int i;                                                  \
+        for_each_possible_cpu(i) {                              \
+                struct list_head *list;                         \
+                list = per_cpu_ptr((__sb)->s_files, i);         \
+                list_for_each_entry((__file), list, f_u.fu_list)
+#define while_file_list_for_each_entry                          \
+        }                                                       \
+}
+#else
+#define do_file_list_for_each_entry(__sb, __file)               \
+{                                                               \
+        struct list_head *list;                                 \
+        list = &(sb)->s_files;                                  \
+        list_for_each_entry((__file), list, f_u.fu_list)
+#define while_file_list_for_each_entry                          \
+}
+#endif
 int fs_may_remount_ro(struct super_block *sb)
 {
        struct file *file;
        /* Check that no files are currently opened for writing. */
-        file_list_lock();
+        lg_global_lock(files_lglock);
-        list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
+        do_file_list_for_each_entry(sb, file) {
                struct inode *inode = file->f_path.dentry->d_inode;
                /* File with pending delete? */
@@ -372,11 +440,11 @@ int fs_may_remount_ro(struct super_block *sb)
                /* Writeable file? */
                if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
                        goto too_bad;
-        }
+        } while_file_list_for_each_entry;
-        file_list_unlock();
+        lg_global_unlock(files_lglock);
        return 1; /* Tis' cool bro. */
 too_bad:
-        file_list_unlock();
+        lg_global_unlock(files_lglock);
        return 0;
 }
@@ -392,8 +460,8 @@ void mark_files_ro(struct super_block *sb)
        struct file *f;
 retry:
-        file_list_lock();
+        lg_global_lock(files_lglock);
-        list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
+        do_file_list_for_each_entry(sb, f) {
                struct vfsmount *mnt;
                if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
                       continue;
@@ -408,16 +476,13 @@ retry:
                        continue;
                file_release_write(f);
                mnt = mntget(f->f_path.mnt);
-                file_list_unlock();
+                /* This can sleep, so we can't hold the spinlock. */
-                /*
+                lg_global_unlock(files_lglock);
-                 * This can sleep, so we can't hold
-                 * the file_list_lock() spinlock.
-                 */
                mnt_drop_write(mnt);
                mntput(mnt);
                goto retry;
-        }
+        } while_file_list_for_each_entry;
-        file_list_unlock();
+        lg_global_unlock(files_lglock);
 }
 void __init files_init(unsigned long mempages)
@@ -437,5 +502,6 @@ void __init files_init(unsigned long mempages)
        if (files_stat.max_files < NR_FILE)
                files_stat.max_files = NR_FILE;
        files_defer_init();
+        lg_lock_init(files_lglock);
        percpu_counter_init(&nr_files, 0);
 } 
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 1ee40eb9a2c0..ed45a9cf5f3d 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -13,11 +13,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
 {
        struct path old_root;
-        write_lock(&fs->lock);
+        spin_lock(&fs->lock);
        old_root = fs->root;
        fs->root = *path;
        path_get(path);
-        write_unlock(&fs->lock);
+        spin_unlock(&fs->lock);
        if (old_root.dentry)
                path_put(&old_root);
 }
@@ -30,11 +30,11 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
 {
        struct path old_pwd;
-        write_lock(&fs->lock);
+        spin_lock(&fs->lock);
        old_pwd = fs->pwd;
        fs->pwd = *path;
        path_get(path);
-        write_unlock(&fs->lock);
+        spin_unlock(&fs->lock);
        if (old_pwd.dentry)
                path_put(&old_pwd);
@@ -51,7 +51,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
                task_lock(p);
                fs = p->fs;
                if (fs) {
-                        write_lock(&fs->lock);
+                        spin_lock(&fs->lock);
                        if (fs->root.dentry == old_root->dentry
                            && fs->root.mnt == old_root->mnt) {
                                path_get(new_root);
@@ -64,7 +64,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
                                fs->pwd = *new_root;
                                count++;
                        }
-                        write_unlock(&fs->lock);
+                        spin_unlock(&fs->lock);
                }
                task_unlock(p);
        } while_each_thread(g, p);
@@ -87,10 +87,10 @@ void exit_fs(struct task_struct *tsk)
        if (fs) {
                int kill;
                task_lock(tsk);
-                write_lock(&fs->lock);
+                spin_lock(&fs->lock);
                tsk->fs = NULL;
                kill = !--fs->users;
-                write_unlock(&fs->lock);
+                spin_unlock(&fs->lock);
                task_unlock(tsk);
                if (kill)
                        free_fs_struct(fs);
@@ -104,7 +104,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
        if (fs) {
                fs->users = 1;
                fs->in_exec = 0;
-                rwlock_init(&fs->lock);
+                spin_lock_init(&fs->lock);
                fs->umask = old->umask;
                get_fs_root_and_pwd(old, &fs->root, &fs->pwd);
        }
@@ -121,10 +121,10 @@ int unshare_fs_struct(void)
                return -ENOMEM;
        task_lock(current);
-        write_lock(&fs->lock);
+        spin_lock(&fs->lock);
        kill = !--fs->users;
        current->fs = new_fs;
-        write_unlock(&fs->lock);
+        spin_unlock(&fs->lock);
        task_unlock(current);
        if (kill)
@@ -143,7 +143,7 @@ EXPORT_SYMBOL(current_umask);
 /* to be mentioned only in INIT_TASK */
 struct fs_struct init_fs = {
        .users          = 1,
-        .lock           = __RW_LOCK_UNLOCKED(init_fs.lock),
+        .lock           = __SPIN_LOCK_UNLOCKED(init_fs.lock),
        .umask          = 0022,
 };
@@ -156,14 +156,14 @@ void daemonize_fs_struct(void)
                task_lock(current);
-                write_lock(&init_fs.lock);
+                spin_lock(&init_fs.lock);
                init_fs.users++;
-                write_unlock(&init_fs.lock);
+                spin_unlock(&init_fs.lock);
-                write_lock(&fs->lock);
+                spin_lock(&fs->lock);
                current->fs = &init_fs;
                kill = !--fs->users;
-                write_unlock(&fs->lock);
+                spin_unlock(&fs->lock);
                task_unlock(current);
                if (kill)
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 99800e564157..6bc9e3a5a693 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -94,6 +94,7 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value,
                        if (error < 0)
                                goto failed;
                        inode->i_mode = mode;
+                        inode->i_ctime = CURRENT_TIME;
                        if (error == 0) {
                                posix_acl_release(acl);
                                acl = NULL;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index dd1e55535a4e..f7dc9b5f9ef8 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -104,7 +104,7 @@ static char *__dentry_name(struct dentry *dentry, char *name)
                __putname(name);
                return NULL;
        }
-        strncpy(name, root, PATH_MAX);
+        strlcpy(name, root, PATH_MAX);
        if (len > p - name) {
                __putname(name);
                return NULL;
@@ -876,7 +876,7 @@ static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd)
                char *path = dentry_name(dentry);
                int err = -ENOMEM;
                if (path) {
-                        int err = hostfs_do_readlink(path, link, PATH_MAX);
+                        err = hostfs_do_readlink(path, link, PATH_MAX);
                        if (err == PATH_MAX)
                                err = -E2BIG;
                        __putname(path);
diff --git a/fs/internal.h b/fs/internal.h
index 6b706bc60a66..a6910e91cee8 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -9,6 +9,8 @@
 * 2 of the License, or (at your option) any later version.
 */
+#include <linux/lglock.h>
 struct super_block;
 struct linux_binprm;
 struct path;
@@ -70,7 +72,8 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
 extern void __init mnt_init(void);
-extern spinlock_t vfsmount_lock;
+DECLARE_BRLOCK(vfsmount_lock);
 /*
 * fs_struct.c
@@ -80,6 +83,8 @@ extern void chroot_fs_refs(struct path *, struct path *);
 /*
 * file_table.c
 */
+extern void file_sb_list_add(struct file *f, struct super_block *sb);
+extern void file_sb_list_del(struct file *f);
 extern void mark_files_ro(struct super_block *);
 extern struct file *get_empty_filp(void);
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index b0435dd0654d..05a38b9c4c0e 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -254,7 +254,9 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
 {
        int i;
-        ll_rw_block(SWRITE, *batch_count, bhs);
+        for (i = 0; i < *batch_count; i++)
+                write_dirty_buffer(bhs[i], WRITE);
        for (i = 0; i < *batch_count; i++) {
                struct buffer_head *bh = bhs[i];
                clear_buffer_jwrite(bh);
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 28a9ddaa0c49..95d8c11c929e 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -119,7 +119,6 @@ static int journal_write_commit_record(journal_t *journal,
        struct buffer_head *bh;
        journal_header_t *header;
        int ret;
-        int barrier_done = 0;
        if (is_journal_aborted(journal))
                return 0;
@@ -137,34 +136,36 @@ static int journal_write_commit_record(journal_t *journal,
        JBUFFER_TRACE(descriptor, "write commit block");
        set_buffer_dirty(bh);
        if (journal->j_flags & JFS_BARRIER) {
-                set_buffer_ordered(bh);
+                ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER);
-                barrier_done = 1;
-        }
-        ret = sync_dirty_buffer(bh);
-        if (barrier_done)
-                clear_buffer_ordered(bh);
-        /* is it possible for another commit to fail at roughly
-         * the same time as this one?  If so, we don't want to
-         * trust the barrier flag in the super, but instead want
-         * to remember if we sent a barrier request
-         */
-        if (ret == -EOPNOTSUPP && barrier_done) {
-                char b[BDEVNAME_SIZE];
-                printk(KERN_WARNING
+                /*
-                        "JBD: barrier-based sync failed on %s - "
+                 * Is it possible for another commit to fail at roughly
-                        "disabling barriers\n",
+                 * the same time as this one?  If so, we don't want to
-                        bdevname(journal->j_dev, b));
+                 * trust the barrier flag in the super, but instead want
-                spin_lock(&journal->j_state_lock);
+                 * to remember if we sent a barrier request
-                journal->j_flags &= ~JFS_BARRIER;
+                 */
-                spin_unlock(&journal->j_state_lock);
+                if (ret == -EOPNOTSUPP) {
+                        char b[BDEVNAME_SIZE];
-                /* And try again, without the barrier */
+                        printk(KERN_WARNING
-                set_buffer_uptodate(bh);
+                                "JBD: barrier-based sync failed on %s - "
-                set_buffer_dirty(bh);
+                                "disabling barriers\n",
+                                bdevname(journal->j_dev, b));
+                        spin_lock(&journal->j_state_lock);
+                        journal->j_flags &= ~JFS_BARRIER;
+                        spin_unlock(&journal->j_state_lock);
+                        /* And try again, without the barrier */
+                        set_buffer_uptodate(bh);
+                        set_buffer_dirty(bh);
+                        ret = sync_dirty_buffer(bh);
+                }
+        } else {
                ret = sync_dirty_buffer(bh);
        }
        put_bh(bh);             /* One for getblk() */
        journal_put_journal_head(descriptor);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index f19ce94693d8..2c4b1f109da9 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1024,7 +1024,7 @@ void journal_update_superblock(journal_t *journal, int wait)
        if (wait)
                sync_dirty_buffer(bh);
        else
-                ll_rw_block(SWRITE, 1, &bh);
+                write_dirty_buffer(bh, WRITE);
 out:
        /* If we have just flushed the log (by marking s_start==0), then
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index ad717328343a..d29018307e2e 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -617,7 +617,7 @@ static void flush_descriptor(journal_t *journal,
        set_buffer_jwrite(bh);
        BUFFER_TRACE(bh, "write");
        set_buffer_dirty(bh);
-        ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
+        write_dirty_buffer(bh, write_op);
 }
 #endif
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 1c23a0f4e8a3..5247e7ffdcb4 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -255,7 +255,9 @@ __flush_batch(journal_t *journal, int *batch_count)
 {
        int i;
-        ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs);
+        for (i = 0; i < *batch_count; i++)
+                write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE);
        for (i = 0; i < *batch_count; i++) {
                struct buffer_head *bh = journal->j_chkpt_bhs[i];
                clear_buffer_jwrite(bh);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f52e5e8049f1..7c068c189d80 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -101,7 +101,6 @@ static int journal_submit_commit_record(journal_t *journal,
        struct commit_header *tmp;
        struct buffer_head *bh;
        int ret;
-        int barrier_done = 0;
        struct timespec now = current_kernel_time();
        if (is_journal_aborted(journal))
@@ -136,30 +135,22 @@ static int journal_submit_commit_record(journal_t *journal,
        if (journal->j_flags & JBD2_BARRIER &&
            !JBD2_HAS_INCOMPAT_FEATURE(journal,
                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
-                set_buffer_ordered(bh);
+                ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh);
-                barrier_done = 1;
+                if (ret == -EOPNOTSUPP) {
-        }
+                        printk(KERN_WARNING
-        ret = submit_bh(WRITE_SYNC_PLUG, bh);
+                               "JBD2: Disabling barriers on %s, "
-        if (barrier_done)
+                               "not supported by device\n", journal->j_devname);
-                clear_buffer_ordered(bh);
+                        write_lock(&journal->j_state_lock);
+                        journal->j_flags &= ~JBD2_BARRIER;
-        /* is it possible for another commit to fail at roughly
+                        write_unlock(&journal->j_state_lock);
-         * the same time as this one?  If so, we don't want to
-         * trust the barrier flag in the super, but instead want
-         * to remember if we sent a barrier request
-         */
-        if (ret == -EOPNOTSUPP && barrier_done) {
-                printk(KERN_WARNING
-                       "JBD2: Disabling barriers on %s, "
-                       "not supported by device\n", journal->j_devname);
-                write_lock(&journal->j_state_lock);
-                journal->j_flags &= ~JBD2_BARRIER;
-                write_unlock(&journal->j_state_lock);
-                /* And try again, without the barrier */
+                        /* And try again, without the barrier */
-                lock_buffer(bh);
+                        lock_buffer(bh);
-                set_buffer_uptodate(bh);
+                        set_buffer_uptodate(bh);
-                clear_buffer_dirty(bh);
+                        clear_buffer_dirty(bh);
+                        ret = submit_bh(WRITE_SYNC_PLUG, bh);
+                }
+        } else {
                ret = submit_bh(WRITE_SYNC_PLUG, bh);
        }
        *cbh = bh;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index ad5866aaf0f9..0e8014ea6b94 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1124,7 +1124,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
                        set_buffer_uptodate(bh);
                }
        } else
-                ll_rw_block(SWRITE, 1, &bh);
+                write_dirty_buffer(bh, WRITE);
 out:
        /* If we have just flushed the log (by marking s_start==0), then
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index a360b06af2e3..9ad321fd63fd 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -625,7 +625,7 @@ static void flush_descriptor(journal_t *journal,
        set_buffer_jwrite(bh);
        BUFFER_TRACE(bh, "write");
        set_buffer_dirty(bh);
-        ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
+        write_dirty_buffer(bh, write_op);
 }
 #endif
diff --git a/fs/mbcache.c b/fs/mbcache.c
index cf4e6cdfd15b..93444747237b 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -80,6 +80,7 @@ struct mb_cache {
        struct list_head                c_cache_list;
        const char                      *c_name;
        atomic_t                        c_entry_count;
+        int                             c_max_entries;
        int                             c_bucket_bits;
        struct kmem_cache               *c_entry_cache;
        struct list_head                *c_block_hash;
@@ -243,6 +244,12 @@ mb_cache_create(const char *name, int bucket_bits)
        if (!cache->c_entry_cache)
                goto fail2;
+        /*
+         * Set an upper limit on the number of cache entries so that the hash
+         * chains won't grow too long.
+         */
+        cache->c_max_entries = bucket_count << 4;
        spin_lock(&mb_cache_spinlock);
        list_add(&cache->c_cache_list, &mb_cache_list);
        spin_unlock(&mb_cache_spinlock);
@@ -333,7 +340,6 @@ mb_cache_destroy(struct mb_cache *cache)
        kfree(cache);
 }
 /*
 * mb_cache_entry_alloc()
 *
@@ -345,17 +351,29 @@ mb_cache_destroy(struct mb_cache *cache)
 struct mb_cache_entry *
 mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
 {
-        struct mb_cache_entry *ce;
+        struct mb_cache_entry *ce = NULL;
-        ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
+        if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
-        if (ce) {
+                spin_lock(&mb_cache_spinlock);
+                if (!list_empty(&mb_cache_lru_list)) {
+                        ce = list_entry(mb_cache_lru_list.next,
+                                        struct mb_cache_entry, e_lru_list);
+                        list_del_init(&ce->e_lru_list);
+                        __mb_cache_entry_unhash(ce);
+                }
+                spin_unlock(&mb_cache_spinlock);
+        }
+        if (!ce) {
+                ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
+                if (!ce)
+                        return NULL;
                atomic_inc(&cache->c_entry_count);
                INIT_LIST_HEAD(&ce->e_lru_list);
                INIT_LIST_HEAD(&ce->e_block_list);
                ce->e_cache = cache;
-                ce->e_used = 1 + MB_CACHE_WRITER;
                ce->e_queued = 0;
        }
+        ce->e_used = 1 + MB_CACHE_WRITER;
        return ce;
 }
diff --git a/fs/namei.c b/fs/namei.c
index 17ea76bf2fbe..24896e833565 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -595,15 +595,16 @@ int follow_up(struct path *path)
 {
        struct vfsmount *parent;
        struct dentry *mountpoint;
-        spin_lock(&vfsmount_lock);
+        br_read_lock(vfsmount_lock);
        parent = path->mnt->mnt_parent;
        if (parent == path->mnt) {
-                spin_unlock(&vfsmount_lock);
+                br_read_unlock(vfsmount_lock);
                return 0;
        }
        mntget(parent);
        mountpoint = dget(path->mnt->mnt_mountpoint);
-        spin_unlock(&vfsmount_lock);
+        br_read_unlock(vfsmount_lock);
        dput(path->dentry);
        path->dentry = mountpoint;
        mntput(path->mnt);
@@ -686,6 +687,35 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
 }
 /*
+ * Allocate a dentry with name and parent, and perform a parent
+ * directory ->lookup on it. Returns the new dentry, or ERR_PTR
+ * on error. parent->d_inode->i_mutex must be held. d_lookup must
+ * have verified that no child exists while under i_mutex.
+ */
+static struct dentry *d_alloc_and_lookup(struct dentry *parent,
+                                struct qstr *name, struct nameidata *nd)
+{
+        struct inode *inode = parent->d_inode;
+        struct dentry *dentry;
+        struct dentry *old;
+        /* Don't create child dentry for a dead directory. */
+        if (unlikely(IS_DEADDIR(inode)))
+                return ERR_PTR(-ENOENT);
+        dentry = d_alloc(parent, name);
+        if (unlikely(!dentry))
+                return ERR_PTR(-ENOMEM);
+        old = inode->i_op->lookup(inode, dentry, nd);
+        if (unlikely(old)) {
+                dput(dentry);
+                dentry = old;
+        }
+        return dentry;
+}
+/*
 *  It's more convoluted than I'd like it to be, but... it's still fairly
 *  small and for now I'd prefer to have fast path as straight as possible.
 *  It _is_ time-critical.
@@ -706,9 +736,15 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
                        return err;
        }
+        /*
+         * Rename seqlock is not required here because in the off chance
+         * of a false negative due to a concurrent rename, we're going to
+         * do the non-racy lookup, below.
+         */
        dentry = __d_lookup(nd->path.dentry, name);
        if (!dentry)
                goto need_lookup;
+found:
        if (dentry->d_op && dentry->d_op->d_revalidate)
                goto need_revalidate;
 done:
@@ -724,56 +760,28 @@ need_lookup:
        mutex_lock(&dir->i_mutex);
        /*
         * First re-do the cached lookup just in case it was created
-         * while we waited for the directory semaphore..
+         * while we waited for the directory semaphore, or the first
+         * lookup failed due to an unrelated rename.
         *
-         * FIXME! This could use version numbering or similar to
+         * This could use version numbering or similar to avoid unnecessary
-         * avoid unnecessary cache lookups.
+         * cache lookups, but then we'd have to do the first lookup in the
-         *
+         * non-racy way. However in the common case here, everything should
-         * The "dcache_lock" is purely to protect the RCU list walker
+         * be hot in cache, so would it be a big win?
-         * from concurrent renames at this point (we mustn't get false
-         * negatives from the RCU list walk here, unlike the optimistic
-         * fast walk).
-         *
-         * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
         */
        dentry = d_lookup(parent, name);
-        if (!dentry) {
+        if (likely(!dentry)) {
-                struct dentry *new;
+                dentry = d_alloc_and_lookup(parent, name, nd);
-                /* Don't create child dentry for a dead directory. */
-                dentry = ERR_PTR(-ENOENT);
-                if (IS_DEADDIR(dir))
-                        goto out_unlock;
-                new = d_alloc(parent, name);
-                dentry = ERR_PTR(-ENOMEM);
-                if (new) {
-                        dentry = dir->i_op->lookup(dir, new, nd);
-                        if (dentry)
-                                dput(new);
-                        else
-                                dentry = new;
-                }
-out_unlock:
                mutex_unlock(&dir->i_mutex);
                if (IS_ERR(dentry))
                        goto fail;
                goto done;
        }
        /*
         * Uhhuh! Nasty case: the cache was re-populated while
         * we waited on the semaphore. Need to revalidate.
         */
        mutex_unlock(&dir->i_mutex);
-        if (dentry->d_op && dentry->d_op->d_revalidate) {
+        goto found;
-                dentry = do_revalidate(dentry, nd);
-                if (!dentry)
-                        dentry = ERR_PTR(-ENOENT);
-        }
-        if (IS_ERR(dentry))
-                goto fail;
-        goto done;
 need_revalidate:
        dentry = do_revalidate(dentry, nd);
@@ -1130,35 +1138,18 @@ static struct dentry *__lookup_hash(struct qstr *name,
                        goto out;
        }
-        dentry = __d_lookup(base, name);
+        /*
+         * Don't bother with __d_lookup: callers are for creat as
-        /* lockess __d_lookup may fail due to concurrent d_move()
+         * well as unlink, so a lot of the time it would cost
-         * in some unrelated directory, so try with d_lookup
+         * a double lookup.
         */
-        if (!dentry)
+        dentry = d_lookup(base, name);
-                dentry = d_lookup(base, name);
        if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
                dentry = do_revalidate(dentry, nd);
-        if (!dentry) {
+        if (!dentry)
-                struct dentry *new;
+                dentry = d_alloc_and_lookup(base, name, nd);
-                /* Don't create child dentry for a dead directory. */
-                dentry = ERR_PTR(-ENOENT);
-                if (IS_DEADDIR(inode))
-                        goto out;
-                new = d_alloc(base, name);
-                dentry = ERR_PTR(-ENOMEM);
-                if (!new)
-                        goto out;
-                dentry = inode->i_op->lookup(inode, new, nd);
-                if (!dentry)
-                        dentry = new;
-                else
-                        dput(new);
-        }
 out:
        return dentry;
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index 2e10cb19c5b0..de402eb6eafb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -11,6 +11,8 @@
 #include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
 #include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -38,12 +40,10 @@
 #define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
 #define HASH_SIZE (1UL << HASH_SHIFT)
-/* spinlock for vfsmount related operations, inplace of dcache_lock */
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
 static int event;
 static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);
+static DEFINE_SPINLOCK(mnt_id_lock);
 static int mnt_id_start = 0;
 static int mnt_group_start = 1;
@@ -55,6 +55,16 @@ static struct rw_semaphore namespace_sem;
 struct kobject *fs_kobj;
 EXPORT_SYMBOL_GPL(fs_kobj);
+/*
+ * vfsmount lock may be taken for read to prevent changes to the
+ * vfsmount hash, ie. during mountpoint lookups or walking back
+ * up the tree.
+ *
+ * It should be taken for write in all cases where the vfsmount
+ * tree or hash is modified or when a vfsmount structure is modified.
+ */
+DEFINE_BRLOCK(vfsmount_lock);
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
        unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
@@ -65,18 +75,21 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 #define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
-/* allocation is serialized by namespace_sem */
+/*
+ * allocation is serialized by namespace_sem, but we need the spinlock to
+ * serialize with freeing.
+ */
 static int mnt_alloc_id(struct vfsmount *mnt)
 {
        int res;
 retry:
        ida_pre_get(&mnt_id_ida, GFP_KERNEL);
-        spin_lock(&vfsmount_lock);
+        spin_lock(&mnt_id_lock);
        res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
        if (!res)
                mnt_id_start = mnt->mnt_id + 1;
-        spin_unlock(&vfsmount_lock);
+        spin_unlock(&mnt_id_lock);
        if (res == -EAGAIN)
                goto retry;
@@ -86,11 +99,11 @@ retry:
 static void mnt_free_id(struct vfsmount *mnt)
 {
        int id = mnt->mnt_id;
-        spin_lock(&vfsmount_lock);
+        spin_lock(&mnt_id_lock);
        ida_remove(&mnt_id_ida, id);
        if (mnt_id_start > id)
                mnt_id_start = id;
-        spin_unlock(&vfsmount_lock);
+        spin_unlock(&mnt_id_lock);
 }
 /*
@@ -348,7 +361,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 {
        int ret = 0;
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        mnt->mnt_flags |= MNT_WRITE_HOLD;
        /*
         * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -382,15 +395,15 @@ static int mnt_make_readonly(struct vfsmount *mnt)
         */
        smp_wmb();
        mnt->mnt_flags &= ~MNT_WRITE_HOLD;
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        return ret;
 }
 static void __mnt_unmake_readonly(struct vfsmount *mnt)
 {
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        mnt->mnt_flags &= ~MNT_READONLY;
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
 }
 void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
@@ -414,6 +427,7 @@ void free_vfsmnt(struct vfsmount *mnt)
 /*
 * find the first or last mount at @dentry on vfsmount @mnt depending on
 * @dir. If @dir is set return the first mount else return the last mount.
+ * vfsmount_lock must be held for read or write.
 */
 struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
                              int dir)
@@ -443,10 +457,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
 struct vfsmount *lookup_mnt(struct path *path)
 {
        struct vfsmount *child_mnt;
-        spin_lock(&vfsmount_lock);
+        br_read_lock(vfsmount_lock);
        if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
                mntget(child_mnt);
-        spin_unlock(&vfsmount_lock);
+        br_read_unlock(vfsmount_lock);
        return child_mnt;
 }
@@ -455,6 +470,9 @@ static inline int check_mnt(struct vfsmount *mnt)
        return mnt->mnt_ns == current->nsproxy->mnt_ns;
 }
+/*
+ * vfsmount lock must be held for write
+ */
 static void touch_mnt_namespace(struct mnt_namespace *ns)
 {
        if (ns) {
@@ -463,6 +481,9 @@ static void touch_mnt_namespace(struct mnt_namespace *ns)
        }
 }
+/*
+ * vfsmount lock must be held for write
+ */
 static void __touch_mnt_namespace(struct mnt_namespace *ns)
 {
        if (ns && ns->event != event) {
@@ -471,6 +492,9 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
        }
 }
+/*
+ * vfsmount lock must be held for write
+ */
 static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
 {
        old_path->dentry = mnt->mnt_mountpoint;
@@ -482,6 +506,9 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
        old_path->dentry->d_mounted--;
 }
+/*
+ * vfsmount lock must be held for write
+ */
 void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
                        struct vfsmount *child_mnt)
 {
@@ -490,6 +517,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
        dentry->d_mounted++;
 }
+/*
+ * vfsmount lock must be held for write
+ */
 static void attach_mnt(struct vfsmount *mnt, struct path *path)
 {
        mnt_set_mountpoint(path->mnt, path->dentry, mnt);
@@ -499,7 +529,7 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
 }
 /*
- * the caller must hold vfsmount_lock
+ * vfsmount lock must be held for write
 */
 static void commit_tree(struct vfsmount *mnt)
 {
@@ -623,39 +653,43 @@ static inline void __mntput(struct vfsmount *mnt)
 void mntput_no_expire(struct vfsmount *mnt)
 {
 repeat:
-        if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) {
+        if (atomic_add_unless(&mnt->mnt_count, -1, 1))
-                if (likely(!mnt->mnt_pinned)) {
+                return;
-                        spin_unlock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
-                        __mntput(mnt);
+        if (!atomic_dec_and_test(&mnt->mnt_count)) {
-                        return;
+                br_write_unlock(vfsmount_lock);
-                }
+                return;
-                atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
+        }
-                mnt->mnt_pinned = 0;
+        if (likely(!mnt->mnt_pinned)) {
-                spin_unlock(&vfsmount_lock);
+                br_write_unlock(vfsmount_lock);
-                acct_auto_close_mnt(mnt);
+                __mntput(mnt);
-                goto repeat;
+                return;
        }
+        atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
+        mnt->mnt_pinned = 0;
+        br_write_unlock(vfsmount_lock);
+        acct_auto_close_mnt(mnt);
+        goto repeat;
 }
 EXPORT_SYMBOL(mntput_no_expire);
 void mnt_pin(struct vfsmount *mnt)
 {
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        mnt->mnt_pinned++;
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
 }
 EXPORT_SYMBOL(mnt_pin);
 void mnt_unpin(struct vfsmount *mnt)
 {
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        if (mnt->mnt_pinned) {
                atomic_inc(&mnt->mnt_count);
                mnt->mnt_pinned--;
        }
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
 }
 EXPORT_SYMBOL(mnt_unpin);
@@ -746,12 +780,12 @@ int mnt_had_events(struct proc_mounts *p)
        struct mnt_namespace *ns = p->ns;
        int res = 0;
-        spin_lock(&vfsmount_lock);
+        br_read_lock(vfsmount_lock);
        if (p->event != ns->event) {
                p->event = ns->event;
                res = 1;
        }
-        spin_unlock(&vfsmount_lock);
+        br_read_unlock(vfsmount_lock);
        return res;
 }
@@ -952,12 +986,12 @@ int may_umount_tree(struct vfsmount *mnt)
        int minimum_refs = 0;
        struct vfsmount *p;
-        spin_lock(&vfsmount_lock);
+        br_read_lock(vfsmount_lock);
        for (p = mnt; p; p = next_mnt(p, mnt)) {
                actual_refs += atomic_read(&p->mnt_count);
                minimum_refs += 2;
        }
-        spin_unlock(&vfsmount_lock);
+        br_read_unlock(vfsmount_lock);
        if (actual_refs > minimum_refs)
                return 0;
@@ -984,10 +1018,10 @@ int may_umount(struct vfsmount *mnt)
 {
        int ret = 1;
        down_read(&namespace_sem);
-        spin_lock(&vfsmount_lock);
+        br_read_lock(vfsmount_lock);
        if (propagate_mount_busy(mnt, 2))
                ret = 0;
-        spin_unlock(&vfsmount_lock);
+        br_read_unlock(vfsmount_lock);
        up_read(&namespace_sem);
        return ret;
 }
@@ -1003,13 +1037,14 @@ void release_mounts(struct list_head *head)
                if (mnt->mnt_parent != mnt) {
                        struct dentry *dentry;
                        struct vfsmount *m;
-                        spin_lock(&vfsmount_lock);
+                        br_write_lock(vfsmount_lock);
                        dentry = mnt->mnt_mountpoint;
                        m = mnt->mnt_parent;
                        mnt->mnt_mountpoint = mnt->mnt_root;
                        mnt->mnt_parent = mnt;
                        m->mnt_ghosts--;
-                        spin_unlock(&vfsmount_lock);
+                        br_write_unlock(vfsmount_lock);
                        dput(dentry);
                        mntput(m);
                }
@@ -1017,6 +1052,10 @@ void release_mounts(struct list_head *head)
        }
 }
+/*
+ * vfsmount lock must be held for write
+ * namespace_sem must be held for write
+ */
 void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 {
        struct vfsmount *p;
@@ -1107,7 +1146,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
        }
        down_write(&namespace_sem);
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        event++;
        if (!(flags & MNT_DETACH))
@@ -1119,7 +1158,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
                        umount_tree(mnt, 1, &umount_list);
                retval = 0;
        }
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        up_write(&namespace_sem);
        release_mounts(&umount_list);
        return retval;
@@ -1231,19 +1270,19 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
                        q = clone_mnt(p, p->mnt_root, flag);
                        if (!q)
                                goto Enomem;
-                        spin_lock(&vfsmount_lock);
+                        br_write_lock(vfsmount_lock);
                        list_add_tail(&q->mnt_list, &res->mnt_list);
                        attach_mnt(q, &path);
-                        spin_unlock(&vfsmount_lock);
+                        br_write_unlock(vfsmount_lock);
                }
        }
        return res;
 Enomem:
        if (res) {
                LIST_HEAD(umount_list);
-                spin_lock(&vfsmount_lock);
+                br_write_lock(vfsmount_lock);
                umount_tree(res, 0, &umount_list);
-                spin_unlock(&vfsmount_lock);
+                br_write_unlock(vfsmount_lock);
                release_mounts(&umount_list);
        }
        return NULL;
@@ -1262,9 +1301,9 @@ void drop_collected_mounts(struct vfsmount *mnt)
 {
        LIST_HEAD(umount_list);
        down_write(&namespace_sem);
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        umount_tree(mnt, 0, &umount_list);
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        up_write(&namespace_sem);
        release_mounts(&umount_list);
 }
@@ -1392,7 +1431,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
        if (err)
                goto out_cleanup_ids;
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        if (IS_MNT_SHARED(dest_mnt)) {
                for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1411,7 +1450,8 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
                list_del_init(&child->mnt_hash);
                commit_tree(child);
        }
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        return 0;
 out_cleanup_ids:
@@ -1466,10 +1506,10 @@ static int do_change_type(struct path *path, int flag)
                        goto out_unlock;
        }
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
                change_mnt_propagation(m, type);
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
 out_unlock:
        up_write(&namespace_sem);
@@ -1513,9 +1553,10 @@ static int do_loopback(struct path *path, char *old_name,
        err = graft_tree(mnt, path);
        if (err) {
                LIST_HEAD(umount_list);
-                spin_lock(&vfsmount_lock);
+                br_write_lock(vfsmount_lock);
                umount_tree(mnt, 0, &umount_list);
-                spin_unlock(&vfsmount_lock);
+                br_write_unlock(vfsmount_lock);
                release_mounts(&umount_list);
        }
@@ -1568,16 +1609,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
        else
                err = do_remount_sb(sb, flags, data, 0);
        if (!err) {
-                spin_lock(&vfsmount_lock);
+                br_write_lock(vfsmount_lock);
                mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
                path->mnt->mnt_flags = mnt_flags;
-                spin_unlock(&vfsmount_lock);
+                br_write_unlock(vfsmount_lock);
        }
        up_write(&sb->s_umount);
        if (!err) {
-                spin_lock(&vfsmount_lock);
+                br_write_lock(vfsmount_lock);
                touch_mnt_namespace(path->mnt->mnt_ns);
-                spin_unlock(&vfsmount_lock);
+                br_write_unlock(vfsmount_lock);
        }
        return err;
 }
@@ -1754,7 +1795,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
                return;
        down_write(&namespace_sem);
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        /* extract from the expiration list every vfsmount that matches the
         * following criteria:
@@ -1773,7 +1814,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
                touch_mnt_namespace(mnt->mnt_ns);
                umount_tree(mnt, 1, &umounts);
        }
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        up_write(&namespace_sem);
        release_mounts(&umounts);
@@ -1830,6 +1871,8 @@ resume:
 /*
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
+ *
+ * vfsmount_lock must be held for write
 */
 static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
 {
@@ -2048,9 +2091,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
                kfree(new_ns);
                return ERR_PTR(-ENOMEM);
        }
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        /*
         * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2244,7 +2287,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
                goto out2; /* not attached */
        /* make sure we can reach put_old from new_root */
        tmp = old.mnt;
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        if (tmp != new.mnt) {
                for (;;) {
                        if (tmp->mnt_parent == tmp)
@@ -2264,7 +2307,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
        /* mount new_root on / */
        attach_mnt(new.mnt, &root_parent);
        touch_mnt_namespace(current->nsproxy->mnt_ns);
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        chroot_fs_refs(&root, &new);
        error = 0;
        path_put(&root_parent);
@@ -2279,7 +2322,7 @@ out1:
 out0:
        return error;
 out3:
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        goto out2;
 }
@@ -2326,6 +2369,8 @@ void __init mnt_init(void)
        for (u = 0; u < HASH_SIZE; u++)
                INIT_LIST_HEAD(&mount_hashtable[u]);
+        br_lock_init(vfsmount_lock);
        err = sysfs_init();
        if (err)
                printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2344,9 +2389,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
        if (!atomic_dec_and_test(&ns->count))
                return;
        down_write(&namespace_sem);
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        umount_tree(ns->root, 0, &umount_list);
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        up_write(&namespace_sem);
        release_mounts(&umount_list);
        kfree(ns);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index bee60c04109a..922263393c76 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -175,24 +175,24 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
 {
        struct the_nilfs *nilfs = sbi->s_nilfs;
        int err;
-        int barrier_done = 0;
-        if (nilfs_test_opt(sbi, BARRIER)) {
-                set_buffer_ordered(nilfs->ns_sbh[0]);
-                barrier_done = 1;
-        }
 retry:
        set_buffer_dirty(nilfs->ns_sbh[0]);
-        err = sync_dirty_buffer(nilfs->ns_sbh[0]);
-        if (err == -EOPNOTSUPP && barrier_done) {
+        if (nilfs_test_opt(sbi, BARRIER)) {
-                nilfs_warning(sbi->s_super, __func__,
+                err = __sync_dirty_buffer(nilfs->ns_sbh[0],
-                              "barrier-based sync failed. "
+                                          WRITE_SYNC | WRITE_BARRIER);
-                              "disabling barriers\n");
+                if (err == -EOPNOTSUPP) {
-                nilfs_clear_opt(sbi, BARRIER);
+                        nilfs_warning(sbi->s_super, __func__,
-                barrier_done = 0;
+                                      "barrier-based sync failed. "
-                clear_buffer_ordered(nilfs->ns_sbh[0]);
+                                      "disabling barriers\n");
-                goto retry;
+                        nilfs_clear_opt(sbi, BARRIER);
+                        goto retry;
+                }
+        } else {
+                err = sync_dirty_buffer(nilfs->ns_sbh[0]);
        }
        if (unlikely(err)) {
                printk(KERN_ERR
                       "NILFS: unable to write superblock (err=%d)\n", err);
diff --git a/fs/open.c b/fs/open.c
index 630715f9f73d..d74e1983e8dc 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -675,7 +675,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
        f->f_path.mnt = mnt;
        f->f_pos = 0;
        f->f_op = fops_get(inode->i_fop);
-        file_move(f, &inode->i_sb->s_files);
+        file_sb_list_add(f, inode->i_sb);
        error = security_dentry_open(f, cred);
        if (error)
@@ -721,7 +721,7 @@ cleanup_all:
                        mnt_drop_write(mnt);
                }
        }
-        file_kill(f);
+        file_sb_list_del(f);
        f->f_path.dentry = NULL;
        f->f_path.mnt = NULL;
 cleanup_file:
diff --git a/fs/pnode.c b/fs/pnode.c
index 5cc564a83149..8066b8dd748f 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -126,6 +126,9 @@ static int do_make_slave(struct vfsmount *mnt)
        return 0;
 }
+/*
+ * vfsmount lock must be held for write
+ */
 void change_mnt_propagation(struct vfsmount *mnt, int type)
 {
        if (type == MS_SHARED) {
@@ -270,12 +273,12 @@ int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
                prev_src_mnt  = child;
        }
 out:
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        while (!list_empty(&tmp_list)) {
                child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash);
                umount_tree(child, 0, &umount_list);
        }
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        release_mounts(&umount_list);
        return ret;
 }
@@ -296,6 +299,8 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count)
 * other mounts its parent propagates to.
 * Check if any of these mounts that **do not have submounts**
 * have more references than 'refcnt'. If so return busy.
+ *
+ * vfsmount lock must be held for read or write
 */
 int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
 {
@@ -353,6 +358,8 @@ static void __propagate_umount(struct vfsmount *mnt)
 * collect all mounts that receive propagation from the mount in @list,
 * and return these additional mounts in the same list.
 * @list: the list of mounts to be unmounted.
+ *
+ * vfsmount lock must be held for write
 */
 int propagate_umount(struct list_head *list)
 {
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ae35413dcbe1..caa758377d66 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -83,6 +83,7 @@ void reiserfs_evict_inode(struct inode *inode)
        dquot_drop(inode);
        inode->i_blocks = 0;
        reiserfs_write_unlock_once(inode->i_sb, depth);
+        return;
 no_delete:
        end_writeback(inode);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 1ec952b1f036..812e2c05aa29 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2311,7 +2311,7 @@ static int journal_read_transaction(struct super_block *sb,
        /* flush out the real blocks */
        for (i = 0; i < get_desc_trans_len(desc); i++) {
                set_buffer_dirty(real_blocks[i]);
-                ll_rw_block(SWRITE, 1, real_blocks + i);
+                write_dirty_buffer(real_blocks[i], WRITE);
        }
        for (i = 0; i < get_desc_trans_len(desc); i++) {
                wait_on_buffer(real_blocks[i]);
diff --git a/fs/super.c b/fs/super.c
index 9674ab2c8718..8819e3a7ff20 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -54,7 +54,22 @@ static struct super_block *alloc_super(struct file_system_type *type)
                        s = NULL;
                        goto out;
                }
+#ifdef CONFIG_SMP
+                s->s_files = alloc_percpu(struct list_head);
+                if (!s->s_files) {
+                        security_sb_free(s);
+                        kfree(s);
+                        s = NULL;
+                        goto out;
+                } else {
+                        int i;
+                        for_each_possible_cpu(i)
+                                INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
+                }
+#else
                INIT_LIST_HEAD(&s->s_files);
+#endif
                INIT_LIST_HEAD(&s->s_instances);
                INIT_HLIST_HEAD(&s->s_anon);
                INIT_LIST_HEAD(&s->s_inodes);
@@ -108,6 +123,9 @@ out:
 */
 static inline void destroy_super(struct super_block *s)
 {
+#ifdef CONFIG_SMP
+        free_percpu(s->s_files);
+#endif
        security_sb_free(s);
        kfree(s->s_subtype);
        kfree(s->s_options);
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 048484fb10d2..46f7a807bbc1 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -114,10 +114,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
        
        ubh_mark_buffer_dirty (USPI_UBH(uspi));
        ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-        if (sb->s_flags & MS_SYNCHRONOUS) {
+        if (sb->s_flags & MS_SYNCHRONOUS)
-                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+                ubh_sync_block(UCPI_UBH(ucpi));
-                ubh_wait_on_buffer (UCPI_UBH(ucpi));
-        }
        sb->s_dirt = 1;
        
        unlock_super (sb);
@@ -207,10 +205,8 @@ do_more:
        ubh_mark_buffer_dirty (USPI_UBH(uspi));
        ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-        if (sb->s_flags & MS_SYNCHRONOUS) {
+        if (sb->s_flags & MS_SYNCHRONOUS)
-                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+                ubh_sync_block(UCPI_UBH(ucpi));
-                ubh_wait_on_buffer (UCPI_UBH(ucpi));
-        }
        if (overflow) {
                fragment += count;
@@ -558,10 +554,8 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
        
        ubh_mark_buffer_dirty (USPI_UBH(uspi));
        ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-        if (sb->s_flags & MS_SYNCHRONOUS) {
+        if (sb->s_flags & MS_SYNCHRONOUS)
-                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+                ubh_sync_block(UCPI_UBH(ucpi));
-                ubh_wait_on_buffer (UCPI_UBH(ucpi));
-        }
        sb->s_dirt = 1;
        UFSD("EXIT, fragment %llu\n", (unsigned long long)fragment);
@@ -680,10 +674,8 @@ cg_found:
 succed:
        ubh_mark_buffer_dirty (USPI_UBH(uspi));
        ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-        if (sb->s_flags & MS_SYNCHRONOUS) {
+        if (sb->s_flags & MS_SYNCHRONOUS)
-                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+                ubh_sync_block(UCPI_UBH(ucpi));
-                ubh_wait_on_buffer (UCPI_UBH(ucpi));
-        }
        sb->s_dirt = 1;
        result += cgno * uspi->s_fpg;
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 428017e018fe..2eabf04af3de 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -113,10 +113,8 @@ void ufs_free_inode (struct inode * inode)
        ubh_mark_buffer_dirty (USPI_UBH(uspi));
        ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-        if (sb->s_flags & MS_SYNCHRONOUS) {
+        if (sb->s_flags & MS_SYNCHRONOUS)
-                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+                ubh_sync_block(UCPI_UBH(ucpi));
-                ubh_wait_on_buffer (UCPI_UBH(ucpi));
-        }
        
        sb->s_dirt = 1;
        unlock_super (sb);
@@ -156,10 +154,8 @@ static void ufs2_init_inodes_chunk(struct super_block *sb,
        fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb);
        ubh_mark_buffer_dirty(UCPI_UBH(ucpi));
-        if (sb->s_flags & MS_SYNCHRONOUS) {
+        if (sb->s_flags & MS_SYNCHRONOUS)
-                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+                ubh_sync_block(UCPI_UBH(ucpi));
-                ubh_wait_on_buffer(UCPI_UBH(ucpi));
-        }
        UFSD("EXIT\n");
 }
@@ -290,10 +286,8 @@ cg_found:
        }
        ubh_mark_buffer_dirty (USPI_UBH(uspi));
        ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-        if (sb->s_flags & MS_SYNCHRONOUS) {
+        if (sb->s_flags & MS_SYNCHRONOUS)
-                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+                ubh_sync_block(UCPI_UBH(ucpi));
-                ubh_wait_on_buffer (UCPI_UBH(ucpi));
-        }
        sb->s_dirt = 1;
        inode->i_ino = cg * uspi->s_ipg + bit;
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 34d5cb135320..a58f9155fc9a 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -243,10 +243,8 @@ static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p)
                ubh_bforget(ind_ubh);
                ind_ubh = NULL;
        }
-        if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) {
+        if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh))
-                ubh_ll_rw_block(SWRITE, ind_ubh);
+                ubh_sync_block(ind_ubh);
-                ubh_wait_on_buffer (ind_ubh);
-        }
        ubh_brelse (ind_ubh);
        
        UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -307,10 +305,8 @@ static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p)
                ubh_bforget(dind_bh);
                dind_bh = NULL;
        }
-        if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) {
+        if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh))
-                ubh_ll_rw_block(SWRITE, dind_bh);
+                ubh_sync_block(dind_bh);
-                ubh_wait_on_buffer (dind_bh);
-        }
        ubh_brelse (dind_bh);
        
        UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -367,10 +363,8 @@ static int ufs_trunc_tindirect(struct inode *inode)
                ubh_bforget(tind_bh);
                tind_bh = NULL;
        }
-        if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) {
+        if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh))
-                ubh_ll_rw_block(SWRITE, tind_bh);
+                ubh_sync_block(tind_bh);
-                ubh_wait_on_buffer (tind_bh);
-        }
        ubh_brelse (tind_bh);
        
        UFSD("EXIT: ino %lu\n", inode->i_ino);
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 85a7fc9e4a4e..d2c36d53fe66 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -113,21 +113,17 @@ void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag)
        }
 }
+/*
+ * ubh_sync_block - write out every buffer of a ufs_buffer_head and wait
+ * @ubh: buffer group to write; NULL is accepted and makes this a no-op
+ *
+ * Replaces the old ubh_ll_rw_block() + ubh_wait_on_buffer() pair.  All
+ * ubh->count buffers are first handed to write_dirty_buffer(..., WRITE),
+ * and only then is each one waited on with wait_on_buffer(), so the
+ * writes are all submitted before the first wait.
+ */
-void ubh_ll_rw_block(int rw, struct ufs_buffer_head *ubh)
+void ubh_sync_block(struct ufs_buffer_head *ubh)
 {
-        if (!ubh)
+        if (ubh) {
-                return;
+                unsigned i;
-        ll_rw_block(rw, ubh->count, ubh->bh);
+                for (i = 0; i < ubh->count; i++)
-}
+                        write_dirty_buffer(ubh->bh[i], WRITE);
-void ubh_wait_on_buffer (struct ufs_buffer_head * ubh)
+                for (i = 0; i < ubh->count; i++)
-{
+                        wait_on_buffer(ubh->bh[i]);
-        unsigned i;
+        }
-        if (!ubh)
-                return;
-        for ( i = 0; i < ubh->count; i++ )
-                wait_on_buffer (ubh->bh[i]);
 }
 void ubh_bforget (struct ufs_buffer_head * ubh)
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 0466036912f1..9f8775ce381c 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -269,8 +269,7 @@ extern void ubh_brelse (struct ufs_buffer_head *);
 extern void ubh_brelse_uspi (struct ufs_sb_private_info *);
 extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *);
 extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int);
-extern void ubh_ll_rw_block(int, struct ufs_buffer_head *);
+extern void ubh_sync_block(struct ufs_buffer_head *);
-extern void ubh_wait_on_buffer (struct ufs_buffer_head *);
 extern void ubh_bforget (struct ufs_buffer_head *);
 extern int  ubh_buffer_dirty (struct ufs_buffer_head *);
 #define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size)
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 43e649a72529..ec94c12f21da 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -32,7 +32,6 @@ enum bh_state_bits {
        BH_Delay,       /* Buffer is not yet allocated on disk */
        BH_Boundary,    /* Block is followed by a discontiguity */
        BH_Write_EIO,   /* I/O error on write */
-        BH_Ordered,     /* ordered write */
        BH_Eopnotsupp,  /* operation not supported (barrier) */
        BH_Unwritten,   /* Buffer is allocated on disk but not written */
        BH_Quiet,       /* Buffer Error Prinks to be quiet */
@@ -125,7 +124,6 @@ BUFFER_FNS(Async_Write, async_write)
 BUFFER_FNS(Delay, delay)
 BUFFER_FNS(Boundary, boundary)
 BUFFER_FNS(Write_EIO, write_io_error)
-BUFFER_FNS(Ordered, ordered)
 BUFFER_FNS(Eopnotsupp, eopnotsupp)
 BUFFER_FNS(Unwritten, unwritten)
@@ -183,6 +181,8 @@ void unlock_buffer(struct buffer_head *bh);
 void __lock_buffer(struct buffer_head *bh);
 void ll_rw_block(int, int, struct buffer_head * bh[]);
 int sync_dirty_buffer(struct buffer_head *bh);
+int __sync_dirty_buffer(struct buffer_head *bh, int rw);
+void write_dirty_buffer(struct buffer_head *bh, int rw);
 int submit_bh(int, struct buffer_head *);
 void write_boundary_block(struct block_device *bdev,
                        sector_t bblock, unsigned blocksize);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9a96b4d83fc1..76041b614758 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -125,9 +125,6 @@ struct inodes_stat_t {
 *                      block layer could (in theory) choose to ignore this
 *                      request if it runs into resource problems.
 * WRITE                A normal async write. Device will be plugged.
- * SWRITE               Like WRITE, but a special case for ll_rw_block() that
- *                      tells it to lock the buffer first. Normally a buffer
- *                      must be locked before doing IO.
 * WRITE_SYNC_PLUG      Synchronous write. Identical to WRITE, but passes down
 *                      the hint that someone will be waiting on this IO
 *                      shortly. The device must still be unplugged explicitly,
@@ -138,9 +135,6 @@ struct inodes_stat_t {
 *                      immediately after submission. The write equivalent
 *                      of READ_SYNC.
 * WRITE_ODIRECT_PLUG   Special case write for O_DIRECT only.
- * SWRITE_SYNC
- * SWRITE_SYNC_PLUG     Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer.
- *                      See SWRITE.
 * WRITE_BARRIER        Like WRITE_SYNC, but tells the block layer that all
 *                      previously submitted writes must be safely on storage
 *                      before this one is started. Also guarantees that when
@@ -155,7 +149,6 @@ struct inodes_stat_t {
 #define READ                    0
 #define WRITE                   RW_MASK
 #define READA                   RWA_MASK
-#define SWRITE                  (WRITE | READA)
 #define READ_SYNC               (READ | REQ_SYNC | REQ_UNPLUG)
 #define READ_META               (READ | REQ_META)
@@ -165,8 +158,6 @@ struct inodes_stat_t {
 #define WRITE_META              (WRITE | REQ_META)
 #define WRITE_BARRIER           (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
                                 REQ_HARDBARRIER)
-#define SWRITE_SYNC_PLUG        (SWRITE | REQ_SYNC | REQ_NOIDLE)
-#define SWRITE_SYNC             (SWRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG)
 /*
 * These aren't really reads or writes, they pass down information about
@@ -929,6 +920,9 @@ struct file {
 #define f_vfsmnt        f_path.mnt
        const struct file_operations    *f_op;
        spinlock_t              f_lock;  /* f_ep_links, f_flags, no IRQ */
+#ifdef CONFIG_SMP
+        int                     f_sb_list_cpu;
+#endif
        atomic_long_t           f_count;
        unsigned int            f_flags;
        fmode_t                 f_mode;
@@ -953,9 +947,6 @@ struct file {
        unsigned long f_mnt_write_state;
 #endif
 };
-extern spinlock_t files_lock;
-#define file_list_lock() spin_lock(&files_lock);
-#define file_list_unlock() spin_unlock(&files_lock);
 #define get_file(x)     atomic_long_inc(&(x)->f_count)
 #define fput_atomic(x)  atomic_long_add_unless(&(x)->f_count, -1, 1)
@@ -1346,7 +1337,11 @@ struct super_block {
        struct list_head        s_inodes;       /* all inodes */
        struct hlist_head       s_anon;         /* anonymous dentries for (nfs) exporting */
+#ifdef CONFIG_SMP
+        struct list_head __percpu *s_files;
+#else
        struct list_head        s_files;
+#endif
        /* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */
        struct list_head        s_dentry_lru;   /* unused dentry lru */
        int                     s_nr_dentry_unused;     /* # of dentry on lru */
@@ -2197,8 +2192,6 @@ static inline void insert_inode_hash(struct inode *inode) {
        __insert_inode_hash(inode, inode->i_ino);
 }
-extern void file_move(struct file *f, struct list_head *list);
-extern void file_kill(struct file *f);
 #ifdef CONFIG_BLOCK
 extern void submit_bio(int, struct bio *);
 extern int bdev_read_only(struct block_device *);
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index eca3d5202138..a42b5bf02f8b 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -5,7 +5,7 @@
 struct fs_struct {
        int users;
-        rwlock_t lock;
+        spinlock_t lock;
        int umask;
        int in_exec;
        struct path root, pwd;
@@ -23,29 +23,29 @@ extern int unshare_fs_struct(void);
+/*
+ * get_fs_root - snapshot fs->root
+ * @fs:   fs_struct to read from
+ * @root: filled with a copy of fs->root
+ *
+ * The copy and the path_get() on it both happen under fs->lock, which this
+ * change turns from the read side of an rwlock into a plain spinlock.
+ */
 static inline void get_fs_root(struct fs_struct *fs, struct path *root)
 {
-        read_lock(&fs->lock);
+        spin_lock(&fs->lock);
         *root = fs->root;
         path_get(root);
-        read_unlock(&fs->lock);
+        spin_unlock(&fs->lock);
 }
+/*
+ * get_fs_pwd - snapshot fs->pwd
+ * @fs:  fs_struct to read from
+ * @pwd: filled with a copy of fs->pwd
+ *
+ * The copy and the path_get() on it both happen under fs->lock, which this
+ * change turns from the read side of an rwlock into a plain spinlock.
+ */
 static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd)
 {
-        read_lock(&fs->lock);
+        spin_lock(&fs->lock);
         *pwd = fs->pwd;
         path_get(pwd);
-        read_unlock(&fs->lock);
+        spin_unlock(&fs->lock);
 }
+/*
+ * get_fs_root_and_pwd - snapshot fs->root and fs->pwd together
+ * @fs:   fs_struct to read from
+ * @root: filled with a copy of fs->root
+ * @pwd:  filled with a copy of fs->pwd
+ *
+ * Both copies and both path_get() calls are made inside a single fs->lock
+ * critical section, so the two paths are read under the same lock hold.
+ */
 static inline void get_fs_root_and_pwd(struct fs_struct *fs, struct path *root,
                                        struct path *pwd)
 {
-        read_lock(&fs->lock);
+        spin_lock(&fs->lock);
         *root = fs->root;
         path_get(root);
         *pwd = fs->pwd;
         path_get(pwd);
-        read_unlock(&fs->lock);
+        spin_unlock(&fs->lock);
 }
 #endif /* _LINUX_FS_STRUCT_H */
diff --git a/include/linux/lglock.h b/include/linux/lglock.h
new file mode 100644
index 000000000000..b288cb713b90
--- /dev/null
+++ b/include/linux/lglock.h
@@ -0,0 +1,172 @@
+/*
+ * Specialised local-global spinlock. Can only be declared as global variables
+ * to avoid overhead and keep things simple (and we don't want to start using
+ * these inside dynamically allocated structures).
+ *
+ * "local/global locks" (lglocks) can be used to:
+ *
+ * - Provide fast exclusive access to per-CPU data, with exclusive access to
+ *   another CPU's data allowed but possibly subject to contention, and to
+ *   provide very slow exclusive access to all per-CPU data.
+ * - Or to provide very fast and scalable read serialisation, and to provide
+ *   very slow exclusive serialisation of data (not necessarily per-CPU data).
+ *
+ * Brlocks are also implemented as a short-hand notation for the latter use
+ * case.
+ *
+ * Copyright 2009, 2010, Nick Piggin, Novell Inc.
+ */
+#ifndef __LINUX_LGLOCK_H
+#define __LINUX_LGLOCK_H
+#include <linux/spinlock.h>
+#include <linux/lockdep.h>
+#include <linux/percpu.h>
+/* can make br locks by using local lock for read side, global lock for write */
+#define br_lock_init(name)      name##_lock_init()
+#define br_read_lock(name)      name##_local_lock()
+#define br_read_unlock(name)    name##_local_unlock()
+#define br_write_lock(name)     name##_global_lock_online()
+#define br_write_unlock(name)   name##_global_unlock_online()
+#define DECLARE_BRLOCK(name)    DECLARE_LGLOCK(name)
+#define DEFINE_BRLOCK(name)     DEFINE_LGLOCK(name)
+#define lg_lock_init(name)      name##_lock_init()
+#define lg_local_lock(name)     name##_local_lock()
+#define lg_local_unlock(name)   name##_local_unlock()
+#define lg_local_lock_cpu(name, cpu)    name##_local_lock_cpu(cpu)
+#define lg_local_unlock_cpu(name, cpu)  name##_local_unlock_cpu(cpu)
+#define lg_global_lock(name)    name##_global_lock()
+#define lg_global_unlock(name)  name##_global_unlock()
+#define lg_global_lock_online(name) name##_global_lock_online()
+#define lg_global_unlock_online(name) name##_global_unlock_online()
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#define LOCKDEP_INIT_MAP lockdep_init_map
+#define DEFINE_LGLOCK_LOCKDEP(name)                                     \
+ struct lock_class_key name##_lock_key;                                 \
+ struct lockdep_map name##_lock_dep_map;                                \
+ EXPORT_SYMBOL(name##_lock_dep_map)
+#else
+#define LOCKDEP_INIT_MAP(a, b, c, d)
+#define DEFINE_LGLOCK_LOCKDEP(name)
+#endif
+/*
+ * DECLARE_LGLOCK(name) - extern declarations for the init, local, per-cpu
+ * and global lock/unlock functions of the lglock called "name".
+ *
+ * The final declaration deliberately carries no line continuation, so the
+ * macro ends here regardless of what follows on the next line.
+ */
+#define DECLARE_LGLOCK(name)                                            \
+ extern void name##_lock_init(void);                                    \
+ extern void name##_local_lock(void);                                   \
+ extern void name##_local_unlock(void);                                 \
+ extern void name##_local_lock_cpu(int cpu);                            \
+ extern void name##_local_unlock_cpu(int cpu);                          \
+ extern void name##_global_lock(void);                                  \
+ extern void name##_global_unlock(void);                                \
+ extern void name##_global_lock_online(void);                           \
+ extern void name##_global_unlock_online(void);
+/*
+ * DEFINE_LGLOCK(name) - define the per-CPU arch spinlocks, the optional
+ * lockdep map, and the exported lock/unlock functions for the lglock
+ * called "name".
+ *
+ * - name##_lock_init() sets every possible CPU's spinlock to unlocked.
+ * - The local variants disable preemption and take a single CPU's spinlock
+ *   (the current CPU's, or the one passed in for the _cpu variants).
+ * - The global variants disable preemption and take one spinlock per CPU:
+ *   the _online variants walk the online CPUs only, the plain variants
+ *   walk all possible CPUs.
+ */
+#define DEFINE_LGLOCK(name)                                             \
+                                                                        \
+ DEFINE_PER_CPU(arch_spinlock_t, name##_lock);                          \
+ DEFINE_LGLOCK_LOCKDEP(name);                                           \
+                                                                        \
+ void name##_lock_init(void) {                                          \
+        int i;                                                          \
+        LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \
+        for_each_possible_cpu(i) {                                      \
+                arch_spinlock_t *lock;                                  \
+                lock = &per_cpu(name##_lock, i);                        \
+                *lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;     \
+        }                                                               \
+ }                                                                      \
+ EXPORT_SYMBOL(name##_lock_init);                                       \
+                                                                        \
+ void name##_local_lock(void) {                                         \
+        arch_spinlock_t *lock;                                          \
+        preempt_disable();                                              \
+        rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);     \
+        lock = &__get_cpu_var(name##_lock);                             \
+        arch_spin_lock(lock);                                           \
+ }                                                                      \
+ EXPORT_SYMBOL(name##_local_lock);                                      \
+                                                                        \
+ void name##_local_unlock(void) {                                       \
+        arch_spinlock_t *lock;                                          \
+        rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);             \
+        lock = &__get_cpu_var(name##_lock);                             \
+        arch_spin_unlock(lock);                                         \
+        preempt_enable();                                               \
+ }                                                                      \
+ EXPORT_SYMBOL(name##_local_unlock);                                    \
+                                                                        \
+ void name##_local_lock_cpu(int cpu) {                                  \
+        arch_spinlock_t *lock;                                          \
+        preempt_disable();                                              \
+        rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);     \
+        lock = &per_cpu(name##_lock, cpu);                              \
+        arch_spin_lock(lock);                                           \
+ }                                                                      \
+ EXPORT_SYMBOL(name##_local_lock_cpu);                                  \
+                                                                        \
+ void name##_local_unlock_cpu(int cpu) {                                \
+        arch_spinlock_t *lock;                                          \
+        rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);             \
+        lock = &per_cpu(name##_lock, cpu);                              \
+        arch_spin_unlock(lock);                                         \
+        preempt_enable();                                               \
+ }                                                                      \
+ EXPORT_SYMBOL(name##_local_unlock_cpu);                                \
+                                                                        \
+ void name##_global_lock_online(void) {                                 \
+        int i;                                                          \
+        preempt_disable();                                              \
+        rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);           \
+        for_each_online_cpu(i) {                                        \
+                arch_spinlock_t *lock;                                  \
+                lock = &per_cpu(name##_lock, i);                        \
+                arch_spin_lock(lock);                                   \
+        }                                                               \
+ }                                                                      \
+ EXPORT_SYMBOL(name##_global_lock_online);                              \
+                                                                        \
+ void name##_global_unlock_online(void) {                               \
+        int i;                                                          \
+        rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);              \
+        for_each_online_cpu(i) {                                        \
+                arch_spinlock_t *lock;                                  \
+                lock = &per_cpu(name##_lock, i);                        \
+                arch_spin_unlock(lock);                                 \
+        }                                                               \
+        preempt_enable();                                               \
+ }                                                                      \
+ EXPORT_SYMBOL(name##_global_unlock_online);                            \
+                                                                        \
+ void name##_global_lock(void) {                                        \
+        int i;                                                          \
+        preempt_disable();                                              \
+        rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);           \
+        /* Every possible CPU, not just the online ones, so the set */  \
+        /* of locks held cannot change if a CPU comes online.       */  \
+        for_each_possible_cpu(i) {                                      \
+                arch_spinlock_t *lock;                                  \
+                lock = &per_cpu(name##_lock, i);                        \
+                arch_spin_lock(lock);                                   \
+        }                                                               \
+ }                                                                      \
+ EXPORT_SYMBOL(name##_global_lock);                                     \
+                                                                        \
+ void name##_global_unlock(void) {                                      \
+        int i;                                                          \
+        rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);              \
+        /* Must walk the same CPU set as name##_global_lock(). */       \
+        for_each_possible_cpu(i) {                                      \
+                arch_spinlock_t *lock;                                  \
+                lock = &per_cpu(name##_lock, i);                        \
+                arch_spin_unlock(lock);                                 \
+        }                                                               \
+        preempt_enable();                                               \
+ }                                                                      \
+ EXPORT_SYMBOL(name##_global_unlock);
+#endif
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 1437da3ddc62..67d64e6efe7a 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -329,6 +329,13 @@ struct tty_struct {
        struct tty_port *port;
 };
+/* Each of a tty's open files has private_data pointing to tty_file_private */
+struct tty_file_private {
+        struct tty_struct *tty;
+        struct file *file;
+        struct list_head list;
+};
 /* tty magic number */
 #define TTY_MAGIC               0x5401
@@ -458,6 +465,7 @@ extern void proc_clear_tty(struct task_struct *p);
 extern struct tty_struct *get_current_tty(void);
 extern void tty_default_fops(struct file_operations *fops);
 extern struct tty_struct *alloc_tty_struct(void);
+extern void tty_add_file(struct tty_struct *tty, struct file *file);
 extern void free_tty_struct(struct tty_struct *tty);
 extern void initialize_tty_struct(struct tty_struct *tty,
                struct tty_driver *driver, int idx);
@@ -470,6 +478,7 @@ extern struct tty_struct *tty_pair_get_tty(struct tty_struct *tty);
 extern struct tty_struct *tty_pair_get_pty(struct tty_struct *tty);
 extern struct mutex tty_mutex;
+extern spinlock_t tty_files_lock;
 extern void tty_write_unlock(struct tty_struct *tty);
 extern int tty_write_lock(struct tty_struct *tty, int ndelay);
diff --git a/kernel/fork.c b/kernel/fork.c
index 98b450876f93..856eac3ec52e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -752,13 +752,13 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
        struct fs_struct *fs = current->fs;
        if (clone_flags & CLONE_FS) {
                /* tsk->fs is already what we want */
-                write_lock(&fs->lock);
+                spin_lock(&fs->lock);
                if (fs->in_exec) {
-                        write_unlock(&fs->lock);
+                        spin_unlock(&fs->lock);
                        return -EAGAIN;
                }
                fs->users++;
-                write_unlock(&fs->lock);
+                spin_unlock(&fs->lock);
                return 0;
        }
        tsk->fs = copy_fs_struct(fs);
@@ -1676,13 +1676,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
                if (new_fs) {
                        fs = current->fs;
-                        write_lock(&fs->lock);
+                        spin_lock(&fs->lock);
                        current->fs = new_fs;
                        if (--fs->users)
                                new_fs = NULL;
                        else
                                new_fs = fs;
-                        write_unlock(&fs->lock);
+                        spin_unlock(&fs->lock);
                }
                if (new_mm) {
diff --git a/security/apparmor/path.c b/security/apparmor/path.c
index 96bab9469d48..19358dc14605 100644
--- a/security/apparmor/path.c
+++ b/security/apparmor/path.c
@@ -62,19 +62,14 @@ static int d_namespace_path(struct path *path, char *buf, int buflen,
        int deleted, connected;
        int error = 0;
-        /* Get the root we want to resolve too */
+        /* Get the root we want to resolve to, released below */
        if (flags & PATH_CHROOT_REL) {
                /* resolve paths relative to chroot */
-                read_lock(&current->fs->lock);
+                get_fs_root(current->fs, &root);
-                root = current->fs->root;
-                /* released below */
-                path_get(&root);
-                read_unlock(&current->fs->lock);
        } else {
                /* resolve paths relative to namespace */
                root.mnt = current->nsproxy->mnt_ns->root;
                root.dentry = root.mnt->mnt_root;
-                /* released below */
                path_get(&root);
        }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 42043f96e54f..4796ddd4e721 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2170,8 +2170,9 @@ static inline void flush_unauthorized_files(const struct cred *cred,
        tty = get_current_tty();
        if (tty) {
-                file_list_lock();
+                spin_lock(&tty_files_lock);
                if (!list_empty(&tty->tty_files)) {
+                        struct tty_file_private *file_priv;
                        struct inode *inode;
                        /* Revalidate access to controlling tty.
@@ -2179,14 +2180,16 @@ static inline void flush_unauthorized_files(const struct cred *cred,
                           than using file_has_perm, as this particular open
                           file may belong to another process and we are only
                           interested in the inode-based check here. */
-                        file = list_first_entry(&tty->tty_files, struct file, f_u.fu_list);
+                        file_priv = list_first_entry(&tty->tty_files,
+                                                struct tty_file_private, list);
+                        file = file_priv->file;
                        inode = file->f_path.dentry->d_inode;
                        if (inode_has_perm(cred, inode,
                                           FILE__READ | FILE__WRITE, NULL)) {
                                drop_tty = 1;
                        }
                }
-                file_list_unlock();
+                spin_unlock(&tty_files_lock);
                tty_kref_put(tty);
        }
        /* Reset controlling tty. */
author	Linus Torvalds <torvalds@linux-foundation.org>	2010-08-18 12:35:08 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-08-18 12:35:08 -0400
commit	145c3ae46b37993b0debb0b3da6256daea4a6ec5 (patch)
tree	0dbff382ce36b23b3d2dbff87d3eaab73a07a2a4
parent	81ca03a0e2ea0207b2df80e0edcf4c775c07a505 (diff)
parent	99b7db7b8ffd6bb755eb0a175596421a0b581cb2 (diff)