From 7b7b1ace2d9d06d76bce7481a045c22ed75e35dd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 7 Nov 2005 17:13:39 -0500
Subject: [PATCH] saner handling of auto_acct_off() and DQUOT_OFF() in umount

The way we currently deal with quota and process accounting that might
keep vfsmount busy at umount time is inherently broken; we try to turn
them off just in case (not quite correctly, at that) and

  a) pray umount doesn't fail (otherwise they'll stay turned off)
  b) pray nobody does anything funny just as we turn quota off

Moreover, LSM provides hooks for doing the same sort of broken logic.

The proper way to deal with that is to introduce the second kind of
reference to vfsmount.  Semantics:

 - when the last normal reference is dropped, all special ones are
   converted to normal ones and if there had been any, cleanup is done.
 - normal reference can be cloned into a special one
 - special reference can be converted to normal one; that's a no-op if
   we'd already passed the point of no return (i.e.  mntput() had
   converted special references to normal and started cleanup).

The way it works: e.g. starting process accounting converts the vfsmount
reference pinned by the opened file into special one and turns it back
to normal when it gets shut down; acct_auto_close() is done when no
normal references are left.  That way it does *not* obstruct umount(2)
and it silently gets turned off when the last normal reference to
vfsmount is gone.  Which is exactly what we want...

The same should be done by LSM module that holds some internal
references to vfsmount and wants to shut them down on umount - it should
make them special and security_sb_umount_close() will be called exactly
when the last normal reference to vfsmount is gone.

quota handling is even simpler - we don't use normal file IO anymore, so
there's no need to hold vfsmounts at all.  DQUOT_OFF() is done from
deactivate_super(), where it really belongs.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 64 +++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 41 insertions(+), 23 deletions(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index 2fa9fdf7d6f5..1d83302f30c3 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -172,7 +172,7 @@ clone_mnt(struct vfsmount *old, struct dentry *root)
 	return mnt;
 }
 
-void __mntput(struct vfsmount *mnt)
+static inline void __mntput(struct vfsmount *mnt)
 {
 	struct super_block *sb = mnt->mnt_sb;
 	dput(mnt->mnt_root);
@@ -180,7 +180,46 @@ void __mntput(struct vfsmount *mnt)
 	deactivate_super(sb);
 }
 
-EXPORT_SYMBOL(__mntput);
+void mntput_no_expire(struct vfsmount *mnt)
+{
+repeat:
+	if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) {
+		if (likely(!mnt->mnt_pinned)) {
+			spin_unlock(&vfsmount_lock);
+			__mntput(mnt);
+			return;
+		}
+		atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
+		mnt->mnt_pinned = 0;
+		spin_unlock(&vfsmount_lock);
+		acct_auto_close_mnt(mnt);
+		security_sb_umount_close(mnt);
+		goto repeat;
+	}
+}
+
+EXPORT_SYMBOL(mntput_no_expire);
+
+void mnt_pin(struct vfsmount *mnt)
+{
+	spin_lock(&vfsmount_lock);
+	mnt->mnt_pinned++;
+	spin_unlock(&vfsmount_lock);
+}
+
+EXPORT_SYMBOL(mnt_pin);
+
+void mnt_unpin(struct vfsmount *mnt)
+{
+	spin_lock(&vfsmount_lock);
+	if (mnt->mnt_pinned) {
+		atomic_inc(&mnt->mnt_count);
+		mnt->mnt_pinned--;
+	}
+	spin_unlock(&vfsmount_lock);
+}
+
+EXPORT_SYMBOL(mnt_unpin);
 
 /* iterator */
 static void *m_start(struct seq_file *m, loff_t *pos)
@@ -435,16 +474,6 @@ static int do_umount(struct vfsmount *mnt, int flags)
 	down_write(&current->namespace->sem);
 	spin_lock(&vfsmount_lock);
 
-	if (atomic_read(&sb->s_active) == 1) {
-		/* last instance - try to be smart */
-		spin_unlock(&vfsmount_lock);
-		lock_kernel();
-		DQUOT_OFF(sb);
-		acct_auto_close(sb);
-		unlock_kernel();
-		security_sb_umount_close(mnt);
-		spin_lock(&vfsmount_lock);
-	}
 	retval = -EBUSY;
 	if (atomic_read(&mnt->mnt_count) == 2 || flags & MNT_DETACH) {
 		if (!list_empty(&mnt->mnt_list))
@@ -850,17 +879,6 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts)
 		detach_mnt(mnt, &old_nd);
 		spin_unlock(&vfsmount_lock);
 		path_release(&old_nd);
-
-		/*
-		 * Now lay it to rest if this was the last ref on the superblock
-		 */
-		if (atomic_read(&mnt->mnt_sb->s_active) == 1) {
-			/* last instance - try to be smart */
-			lock_kernel();
-			DQUOT_OFF(mnt->mnt_sb);
-			acct_auto_close(mnt->mnt_sb);
-			unlock_kernel();
-		}
 		mntput(mnt);
 	} else {
 		/*
-- 
cgit v1.2.2


From ccd48bc7fac284caf704dcdcafd223a24f70bccf Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 7 Nov 2005 17:15:04 -0500
Subject: [PATCH] cleanups and bug fix in do_loopback()

 - check_mnt() on the source of binding should've been unconditional
   from the very beginning.  My fault - as far as I could trace it,
   that's an old thinko made back in 2001.  Kudos to Miklos for spotting
   it...

   Fixed.

 - code cleaned up.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index 1d83302f30c3..611f777bbd61 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -661,29 +661,32 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
 
 	down_write(&current->namespace->sem);
 	err = -EINVAL;
-	if (check_mnt(nd->mnt) && (!recurse || check_mnt(old_nd.mnt))) {
-		err = -ENOMEM;
-		if (recurse)
-			mnt = copy_tree(old_nd.mnt, old_nd.dentry);
-		else
-			mnt = clone_mnt(old_nd.mnt, old_nd.dentry);
-	}
+	if (!check_mnt(nd->mnt) || !check_mnt(old_nd.mnt))
+		goto out;
 
-	if (mnt) {
-		/* stop bind mounts from expiring */
+	err = -ENOMEM;
+	if (recurse)
+		mnt = copy_tree(old_nd.mnt, old_nd.dentry);
+	else
+		mnt = clone_mnt(old_nd.mnt, old_nd.dentry);
+
+	if (!mnt)
+		goto out;
+
+	/* stop bind mounts from expiring */
+	spin_lock(&vfsmount_lock);
+	list_del_init(&mnt->mnt_expire);
+	spin_unlock(&vfsmount_lock);
+
+	err = graft_tree(mnt, nd);
+	if (err) {
 		spin_lock(&vfsmount_lock);
-		list_del_init(&mnt->mnt_expire);
+		umount_tree(mnt);
 		spin_unlock(&vfsmount_lock);
+	} else
+		mntput(mnt);
 
-		err = graft_tree(mnt, nd);
-		if (err) {
-			spin_lock(&vfsmount_lock);
-			umount_tree(mnt);
-			spin_unlock(&vfsmount_lock);
-		} else
-			mntput(mnt);
-	}
-
+out:
 	up_write(&current->namespace->sem);
 	path_release(&old_nd);
 	return err;
-- 
cgit v1.2.2


From 5addc5dd8836aa061f6efc4a0d9ba6323726297a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 7 Nov 2005 17:15:49 -0500
Subject: [PATCH] make /proc/mounts pollable

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index 611f777bbd61..d1aca685aacf 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -37,7 +37,9 @@ static inline int sysfs_init(void)
 #endif
 
 /* spinlock for vfsmount related operations, inplace of dcache_lock */
- __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
+
+static int event;
 
 static struct list_head *mount_hashtable;
 static int hash_mask __read_mostly, hash_bits __read_mostly;
@@ -111,6 +113,22 @@ static inline int check_mnt(struct vfsmount *mnt)
 	return mnt->mnt_namespace == current->namespace;
 }
 
+static void touch_namespace(struct namespace *ns)
+{
+	if (ns) {
+		ns->event = ++event;
+		wake_up_interruptible(&ns->poll);
+	}
+}
+
+static void __touch_namespace(struct namespace *ns)
+{
+	if (ns && ns->event != event) {
+		ns->event = event;
+		wake_up_interruptible(&ns->poll);
+	}
+}
+
 static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd)
 {
 	old_nd->dentry = mnt->mnt_mountpoint;
@@ -384,6 +402,7 @@ static void umount_tree(struct vfsmount *mnt)
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
 		list_del(&p->mnt_list);
 		list_add(&p->mnt_list, &kill);
+		__touch_namespace(p->mnt_namespace);
 		p->mnt_namespace = NULL;
 	}
 
@@ -473,6 +492,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
 
 	down_write(&current->namespace->sem);
 	spin_lock(&vfsmount_lock);
+	event++;
 
 	retval = -EBUSY;
 	if (atomic_read(&mnt->mnt_count) == 2 || flags & MNT_DETACH) {
@@ -634,6 +654,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
 		list_splice(&head, current->namespace->list.prev);
 		mntget(mnt);
 		err = 0;
+		touch_namespace(current->namespace);
 	}
 	spin_unlock(&vfsmount_lock);
 out_unlock:
@@ -771,6 +792,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name)
 
 	detach_mnt(old_nd.mnt, &parent_nd);
 	attach_mnt(old_nd.mnt, nd);
+	touch_namespace(current->namespace);
 
 	/* if the mount is moved, it should no longer be expire
 	 * automatically */
@@ -877,6 +899,7 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts)
 		struct nameidata old_nd;
 
 		/* delete from the namespace */
+		touch_namespace(mnt->mnt_namespace);
 		list_del_init(&mnt->mnt_list);
 		mnt->mnt_namespace = NULL;
 		detach_mnt(mnt, &old_nd);
@@ -1114,6 +1137,8 @@ int copy_namespace(int flags, struct task_struct *tsk)
 	atomic_set(&new_ns->count, 1);
 	init_rwsem(&new_ns->sem);
 	INIT_LIST_HEAD(&new_ns->list);
+	init_waitqueue_head(&new_ns->poll);
+	new_ns->event = 0;
 
 	down_write(&tsk->namespace->sem);
 	/* First pass: copy the tree topology */
@@ -1377,6 +1402,7 @@ asmlinkage long sys_pivot_root(const char __user *new_root, const char __user *p
 	detach_mnt(user_nd.mnt, &root_parent);
 	attach_mnt(user_nd.mnt, &old_nd);     /* mount old root on put_old */
 	attach_mnt(new_nd.mnt, &root_parent); /* mount new_root on / */
+	touch_namespace(current->namespace);
 	spin_unlock(&vfsmount_lock);
 	chroot_fs_refs(&user_nd, &new_nd);
 	security_sb_post_pivotroot(&user_nd, &new_nd);
@@ -1413,6 +1439,8 @@ static void __init init_mount_tree(void)
 	atomic_set(&namespace->count, 1);
 	INIT_LIST_HEAD(&namespace->list);
 	init_rwsem(&namespace->sem);
+	init_waitqueue_head(&namespace->poll);
+	namespace->event = 0;
 	list_add(&mnt->mnt_list, &namespace->list);
 	namespace->root = mnt;
 	mnt->mnt_namespace = namespace;
-- 
cgit v1.2.2


From b58fed8b1959d6b9e4c951a54adc8960e1401b18 Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram@us.ibm.com>
Date: Mon, 7 Nov 2005 17:16:09 -0500
Subject: [PATCH] lindent fs/namespace.c

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 97 +++++++++++++++++++++++++++++-----------------------------
 1 file changed, 48 insertions(+), 49 deletions(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index d1aca685aacf..685687dccbf1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -43,29 +43,29 @@ static int event;
 
 static struct list_head *mount_hashtable;
 static int hash_mask __read_mostly, hash_bits __read_mostly;
-static kmem_cache_t *mnt_cache; 
+static kmem_cache_t *mnt_cache;
 
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
-	unsigned long tmp = ((unsigned long) mnt / L1_CACHE_BYTES);
-	tmp += ((unsigned long) dentry / L1_CACHE_BYTES);
+	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
+	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
 	tmp = tmp + (tmp >> hash_bits);
 	return tmp & hash_mask;
 }
 
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
-	struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL); 
+	struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
 	if (mnt) {
 		memset(mnt, 0, sizeof(struct vfsmount));
-		atomic_set(&mnt->mnt_count,1);
+		atomic_set(&mnt->mnt_count, 1);
 		INIT_LIST_HEAD(&mnt->mnt_hash);
 		INIT_LIST_HEAD(&mnt->mnt_child);
 		INIT_LIST_HEAD(&mnt->mnt_mounts);
 		INIT_LIST_HEAD(&mnt->mnt_list);
 		INIT_LIST_HEAD(&mnt->mnt_expire);
 		if (name) {
-			int size = strlen(name)+1;
+			int size = strlen(name) + 1;
 			char *newname = kmalloc(size, GFP_KERNEL);
 			if (newname) {
 				memcpy(newname, name, size);
@@ -88,8 +88,8 @@ void free_vfsmnt(struct vfsmount *mnt)
  */
 struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 {
-	struct list_head * head = mount_hashtable + hash(mnt, dentry);
-	struct list_head * tmp = head;
+	struct list_head *head = mount_hashtable + hash(mnt, dentry);
+	struct list_head *tmp = head;
 	struct vfsmount *p, *found = NULL;
 
 	spin_lock(&vfsmount_lock);
@@ -144,7 +144,7 @@ static void attach_mnt(struct vfsmount *mnt, struct nameidata *nd)
 {
 	mnt->mnt_parent = mntget(nd->mnt);
 	mnt->mnt_mountpoint = dget(nd->dentry);
-	list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry));
+	list_add(&mnt->mnt_hash, mount_hashtable + hash(nd->mnt, nd->dentry));
 	list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts);
 	nd->dentry->d_mounted++;
 }
@@ -165,8 +165,7 @@ static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
 	return list_entry(next, struct vfsmount, mnt_child);
 }
 
-static struct vfsmount *
-clone_mnt(struct vfsmount *old, struct dentry *root)
+static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root)
 {
 	struct super_block *sb = old->mnt_sb;
 	struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);
@@ -258,7 +257,7 @@ static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 	struct namespace *n = m->private;
 	struct list_head *p = ((struct vfsmount *)v)->mnt_list.next;
 	(*pos)++;
-	return p==&n->list ? NULL : list_entry(p, struct vfsmount, mnt_list);
+	return p == &n->list ? NULL : list_entry(p, struct vfsmount, mnt_list);
 }
 
 static void m_stop(struct seq_file *m, void *v)
@@ -344,7 +343,8 @@ repeat:
 	next = this_parent->mnt_mounts.next;
 resume:
 	while (next != &this_parent->mnt_mounts) {
-		struct vfsmount *p = list_entry(next, struct vfsmount, mnt_child);
+		struct vfsmount *p =
+		    list_entry(next, struct vfsmount, mnt_child);
 
 		next = next->next;
 
@@ -425,7 +425,7 @@ static void umount_tree(struct vfsmount *mnt)
 
 static int do_umount(struct vfsmount *mnt, int flags)
 {
-	struct super_block * sb = mnt->mnt_sb;
+	struct super_block *sb = mnt->mnt_sb;
 	int retval;
 
 	retval = security_sb_umount(mnt, flags);
@@ -461,7 +461,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
 	 */
 
 	lock_kernel();
-	if( (flags&MNT_FORCE) && sb->s_op->umount_begin)
+	if ((flags & MNT_FORCE) && sb->s_op->umount_begin)
 		sb->s_op->umount_begin(sb);
 	unlock_kernel();
 
@@ -543,12 +543,11 @@ out:
 #ifdef __ARCH_WANT_SYS_OLDUMOUNT
 
 /*
- *	The 2.0 compatible umount. No flags. 
+ *	The 2.0 compatible umount. No flags.
  */
- 
 asmlinkage long sys_oldumount(char __user * name)
 {
-	return sys_umount(name,0);
+	return sys_umount(name, 0);
 }
 
 #endif
@@ -571,8 +570,7 @@ static int mount_is_safe(struct nameidata *nd)
 #endif
 }
 
-static int
-lives_below_in_same_fs(struct dentry *d, struct dentry *dentry)
+static int lives_below_in_same_fs(struct dentry *d, struct dentry *dentry)
 {
 	while (1) {
 		if (d == dentry)
@@ -616,7 +614,7 @@ static struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry)
 		}
 	}
 	return res;
- Enomem:
+Enomem:
 	if (res) {
 		spin_lock(&vfsmount_lock);
 		umount_tree(res);
@@ -718,12 +716,11 @@ out:
  * If you've mounted a non-root directory somewhere and want to do remount
  * on it - tough luck.
  */
-
 static int do_remount(struct nameidata *nd, int flags, int mnt_flags,
 		      void *data)
 {
 	int err;
-	struct super_block * sb = nd->mnt->mnt_sb;
+	struct super_block *sb = nd->mnt->mnt_sb;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -737,7 +734,7 @@ static int do_remount(struct nameidata *nd, int flags, int mnt_flags,
 	down_write(&sb->s_umount);
 	err = do_remount_sb(sb, flags, data, 0);
 	if (!err)
-		nd->mnt->mnt_flags=mnt_flags;
+		nd->mnt->mnt_flags = mnt_flags;
 	up_write(&sb->s_umount);
 	if (!err)
 		security_sb_post_remount(nd->mnt, flags, data);
@@ -758,7 +755,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name)
 		return err;
 
 	down_write(&current->namespace->sem);
-	while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
+	while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
 		;
 	err = -EINVAL;
 	if (!check_mnt(nd->mnt) || !check_mnt(old_nd.mnt))
@@ -785,7 +782,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name)
 		goto out2;
 
 	err = -ELOOP;
-	for (p = nd->mnt; p->mnt_parent!=p; p = p->mnt_parent)
+	for (p = nd->mnt; p->mnt_parent != p; p = p->mnt_parent)
 		if (p == old_nd.mnt)
 			goto out2;
 	err = 0;
@@ -843,7 +840,7 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
 
 	down_write(&current->namespace->sem);
 	/* Something was mounted here while we slept */
-	while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
+	while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
 		;
 	err = -EINVAL;
 	if (!check_mnt(nd->mnt))
@@ -986,8 +983,8 @@ EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
  * Note that this function differs from copy_from_user() in that it will oops
  * on bad values of `to', rather than returning a short copy.
  */
-static long
-exact_copy_from_user(void *to, const void __user *from, unsigned long n)
+static long exact_copy_from_user(void *to, const void __user * from,
+				 unsigned long n)
 {
 	char *t = to;
 	const char __user *f = from;
@@ -1008,12 +1005,12 @@ exact_copy_from_user(void *to, const void __user *from, unsigned long n)
 	return n;
 }
 
-int copy_mount_options(const void __user *data, unsigned long *where)
+int copy_mount_options(const void __user * data, unsigned long *where)
 {
 	int i;
 	unsigned long page;
 	unsigned long size;
-	
+
 	*where = 0;
 	if (!data)
 		return 0;
@@ -1032,7 +1029,7 @@ int copy_mount_options(const void __user *data, unsigned long *where)
 
 	i = size - exact_copy_from_user((void *)page, data, size);
 	if (!i) {
-		free_page(page); 
+		free_page(page);
 		return -EFAULT;
 	}
 	if (i != PAGE_SIZE)
@@ -1055,7 +1052,7 @@ int copy_mount_options(const void __user *data, unsigned long *where)
  * Therefore, if this magic number is present, it carries no information
  * and must be discarded.
  */
-long do_mount(char * dev_name, char * dir_name, char *type_page,
+long do_mount(char *dev_name, char *dir_name, char *type_page,
 		  unsigned long flags, void *data_page)
 {
 	struct nameidata nd;
@@ -1083,7 +1080,7 @@ long do_mount(char * dev_name, char * dir_name, char *type_page,
 		mnt_flags |= MNT_NODEV;
 	if (flags & MS_NOEXEC)
 		mnt_flags |= MNT_NOEXEC;
-	flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_ACTIVE);
+	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE);
 
 	/* ... and get the mountpoint */
 	retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
@@ -1207,7 +1204,7 @@ asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
 	unsigned long dev_page;
 	char *dir_page;
 
-	retval = copy_mount_options (type, &type_page);
+	retval = copy_mount_options(type, &type_page);
 	if (retval < 0)
 		return retval;
 
@@ -1216,17 +1213,17 @@ asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
 	if (IS_ERR(dir_page))
 		goto out1;
 
-	retval = copy_mount_options (dev_name, &dev_page);
+	retval = copy_mount_options(dev_name, &dev_page);
 	if (retval < 0)
 		goto out2;
 
-	retval = copy_mount_options (data, &data_page);
+	retval = copy_mount_options(data, &data_page);
 	if (retval < 0)
 		goto out3;
 
 	lock_kernel();
-	retval = do_mount((char*)dev_page, dir_page, (char*)type_page,
-			  flags, (void*)data_page);
+	retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
+			  flags, (void *)data_page);
 	unlock_kernel();
 	free_page(data_page);
 
@@ -1295,9 +1292,11 @@ static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd)
 		if (fs) {
 			atomic_inc(&fs->count);
 			task_unlock(p);
-			if (fs->root==old_nd->dentry&&fs->rootmnt==old_nd->mnt)
+			if (fs->root == old_nd->dentry
+			    && fs->rootmnt == old_nd->mnt)
 				set_fs_root(fs, new_nd->mnt, new_nd->dentry);
-			if (fs->pwd==old_nd->dentry&&fs->pwdmnt==old_nd->mnt)
+			if (fs->pwd == old_nd->dentry
+			    && fs->pwdmnt == old_nd->mnt)
 				set_fs_pwd(fs, new_nd->mnt, new_nd->dentry);
 			put_fs_struct(fs);
 		} else
@@ -1327,8 +1326,8 @@ static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd)
  *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
  *    first.
  */
-
-asmlinkage long sys_pivot_root(const char __user *new_root, const char __user *put_old)
+asmlinkage long sys_pivot_root(const char __user * new_root,
+			       const char __user * put_old)
 {
 	struct vfsmount *tmp;
 	struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd;
@@ -1339,14 +1338,15 @@ asmlinkage long sys_pivot_root(const char __user *new_root, const char __user *p
 
 	lock_kernel();
 
-	error = __user_walk(new_root, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd);
+	error = __user_walk(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
+			    &new_nd);
 	if (error)
 		goto out0;
 	error = -EINVAL;
 	if (!check_mnt(new_nd.mnt))
 		goto out1;
 
-	error = __user_walk(put_old, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd);
+	error = __user_walk(put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old_nd);
 	if (error)
 		goto out1;
 
@@ -1464,10 +1464,9 @@ void __init mnt_init(unsigned long mempages)
 	int i;
 
 	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
-			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL);
 
-	mount_hashtable = (struct list_head *)
-		__get_free_page(GFP_ATOMIC);
+	mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
 
 	if (!mount_hashtable)
 		panic("Failed to allocate mount hash table\n");
@@ -1489,7 +1488,7 @@ void __init mnt_init(unsigned long mempages)
 	 * from the number of bits we can fit.
 	 */
 	nr_hash = 1UL << hash_bits;
-	hash_mask = nr_hash-1;
+	hash_mask = nr_hash - 1;
 
 	printk("Mount-cache hash table entries: %d\n", nr_hash);
 
-- 
cgit v1.2.2


From 5b83d2c5c0afcf5a3517cf00d9ceb41b8345e01b Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram@us.ibm.com>
Date: Mon, 7 Nov 2005 17:16:29 -0500
Subject: [PATCH] sanitize the interface of graft_tree().

Old semantics: graft_tree() grabs a reference on the vfsmount before
returning success.

New one: graft_tree() leaves that to caller.

All the callers of graft_tree() immediately dropped that reference
anyway.  Changing the interface takes care of this unnecessary overhead.

Idea proposed by Al Viro.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index 685687dccbf1..dfeeab964e84 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -650,7 +650,6 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
 		attach_mnt(mnt, nd);
 		list_add_tail(&head, &mnt->mnt_list);
 		list_splice(&head, current->namespace->list.prev);
-		mntget(mnt);
 		err = 0;
 		touch_namespace(current->namespace);
 	}
@@ -702,8 +701,7 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
 		spin_lock(&vfsmount_lock);
 		umount_tree(mnt);
 		spin_unlock(&vfsmount_lock);
-	} else
-		mntput(mnt);
+	}
 
 out:
 	up_write(&current->namespace->sem);
@@ -857,15 +855,17 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
 		goto unlock;
 
 	newmnt->mnt_flags = mnt_flags;
-	newmnt->mnt_namespace = current->namespace;
-	err = graft_tree(newmnt, nd);
+	if ((err = graft_tree(newmnt, nd)))
+		goto unlock;
 
-	if (err == 0 && fslist) {
+	if (fslist) {
 		/* add to the specified expiration list */
 		spin_lock(&vfsmount_lock);
 		list_add_tail(&newmnt->mnt_expire, fslist);
 		spin_unlock(&vfsmount_lock);
 	}
+	up_write(&current->namespace->sem);
+	return 0;
 
 unlock:
 	up_write(&current->namespace->sem);
-- 
cgit v1.2.2


From 70fbcdf4d252c6b17cc249cb9ac9b220cb0b863d Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram@us.ibm.com>
Date: Mon, 7 Nov 2005 17:17:04 -0500
Subject: [PATCH] umount_tree() locking change

umount is done under the protection of the namespace semaphore.  This
can lead to interesting deadlocks when the last reference to a mount is
released, if filesystem code is in sufficiently nasty state.

This collects all the to-be-released-mounts and releases them after
releasing the namespace semaphore.  That both reduces the time we are
holding the namespace semaphore and makes things more robust.

Idea proposed by Al Viro.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 84 +++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 51 insertions(+), 33 deletions(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index dfeeab964e84..c2ffa0f349fd 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -394,32 +394,45 @@ int may_umount(struct vfsmount *mnt)
 
 EXPORT_SYMBOL(may_umount);
 
-static void umount_tree(struct vfsmount *mnt)
+static void release_mounts(struct list_head *head)
+{
+	struct vfsmount *mnt;
+	while(!list_empty(head)) {
+		mnt = list_entry(head->next, struct vfsmount, mnt_hash);
+		list_del_init(&mnt->mnt_hash);
+		if (mnt->mnt_parent != mnt) {
+			struct dentry *dentry;
+			struct vfsmount *m;
+			spin_lock(&vfsmount_lock);
+			dentry = mnt->mnt_mountpoint;
+			m = mnt->mnt_parent;
+			mnt->mnt_mountpoint = mnt->mnt_root;
+			mnt->mnt_parent = mnt;
+			spin_unlock(&vfsmount_lock);
+			dput(dentry);
+			mntput(m);
+		}
+		mntput(mnt);
+	}
+}
+
+static void umount_tree(struct vfsmount *mnt, struct list_head *kill)
 {
 	struct vfsmount *p;
-	LIST_HEAD(kill);
 
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
-		list_del(&p->mnt_list);
-		list_add(&p->mnt_list, &kill);
-		__touch_namespace(p->mnt_namespace);
-		p->mnt_namespace = NULL;
+		list_del(&p->mnt_hash);
+		list_add(&p->mnt_hash, kill);
 	}
 
-	while (!list_empty(&kill)) {
-		mnt = list_entry(kill.next, struct vfsmount, mnt_list);
-		list_del_init(&mnt->mnt_list);
-		list_del_init(&mnt->mnt_expire);
-		if (mnt->mnt_parent == mnt) {
-			spin_unlock(&vfsmount_lock);
-		} else {
-			struct nameidata old_nd;
-			detach_mnt(mnt, &old_nd);
-			spin_unlock(&vfsmount_lock);
-			path_release(&old_nd);
-		}
-		mntput(mnt);
-		spin_lock(&vfsmount_lock);
+	list_for_each_entry(p, kill, mnt_hash) {
+		list_del_init(&p->mnt_expire);
+		list_del_init(&p->mnt_list);
+		__touch_namespace(p->mnt_namespace);
+		p->mnt_namespace = NULL;
+		list_del_init(&p->mnt_child);
+		if (p->mnt_parent != p)
+			mnt->mnt_mountpoint->d_mounted--;
 	}
 }
 
@@ -427,6 +440,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
 {
 	struct super_block *sb = mnt->mnt_sb;
 	int retval;
+	LIST_HEAD(umount_list);
 
 	retval = security_sb_umount(mnt, flags);
 	if (retval)
@@ -497,13 +511,14 @@ static int do_umount(struct vfsmount *mnt, int flags)
 	retval = -EBUSY;
 	if (atomic_read(&mnt->mnt_count) == 2 || flags & MNT_DETACH) {
 		if (!list_empty(&mnt->mnt_list))
-			umount_tree(mnt);
+			umount_tree(mnt, &umount_list);
 		retval = 0;
 	}
 	spin_unlock(&vfsmount_lock);
 	if (retval)
 		security_sb_umount_busy(mnt);
 	up_write(&current->namespace->sem);
+	release_mounts(&umount_list);
 	return retval;
 }
 
@@ -616,9 +631,11 @@ static struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry)
 	return res;
 Enomem:
 	if (res) {
+		LIST_HEAD(umount_list);
 		spin_lock(&vfsmount_lock);
-		umount_tree(res);
+		umount_tree(res, &umount_list);
 		spin_unlock(&vfsmount_lock);
+		release_mounts(&umount_list);
 	}
 	return NULL;
 }
@@ -698,9 +715,11 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
 
 	err = graft_tree(mnt, nd);
 	if (err) {
+		LIST_HEAD(umount_list);
 		spin_lock(&vfsmount_lock);
-		umount_tree(mnt);
+		umount_tree(mnt, &umount_list);
 		spin_unlock(&vfsmount_lock);
+		release_mounts(&umount_list);
 	}
 
 out:
@@ -875,7 +894,8 @@ unlock:
 
 EXPORT_SYMBOL_GPL(do_add_mount);
 
-static void expire_mount(struct vfsmount *mnt, struct list_head *mounts)
+static void expire_mount(struct vfsmount *mnt, struct list_head *mounts,
+				struct list_head *umounts)
 {
 	spin_lock(&vfsmount_lock);
 
@@ -893,16 +913,12 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts)
 	 * contributed by the vfsmount parent and the mntget above
 	 */
 	if (atomic_read(&mnt->mnt_count) == 2) {
-		struct nameidata old_nd;
-
 		/* delete from the namespace */
 		touch_namespace(mnt->mnt_namespace);
 		list_del_init(&mnt->mnt_list);
 		mnt->mnt_namespace = NULL;
-		detach_mnt(mnt, &old_nd);
+		umount_tree(mnt, umounts);
 		spin_unlock(&vfsmount_lock);
-		path_release(&old_nd);
-		mntput(mnt);
 	} else {
 		/*
 		 * Someone brought it back to life whilst we didn't have any
@@ -951,6 +967,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 	 * - dispose of the corpse
 	 */
 	while (!list_empty(&graveyard)) {
+		LIST_HEAD(umounts);
 		mnt = list_entry(graveyard.next, struct vfsmount, mnt_expire);
 		list_del_init(&mnt->mnt_expire);
 
@@ -963,12 +980,11 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 
 		spin_unlock(&vfsmount_lock);
 		down_write(&namespace->sem);
-		expire_mount(mnt, mounts);
+		expire_mount(mnt, mounts, &umounts);
 		up_write(&namespace->sem);
-
+		release_mounts(&umounts);
 		mntput(mnt);
 		put_namespace(namespace);
-
 		spin_lock(&vfsmount_lock);
 	}
 
@@ -1508,12 +1524,14 @@ void __init mnt_init(unsigned long mempages)
 void __put_namespace(struct namespace *namespace)
 {
 	struct vfsmount *root = namespace->root;
+	LIST_HEAD(umount_list);
 	namespace->root = NULL;
 	spin_unlock(&vfsmount_lock);
 	down_write(&namespace->sem);
 	spin_lock(&vfsmount_lock);
-	umount_tree(root);
+	umount_tree(root, &umount_list);
 	spin_unlock(&vfsmount_lock);
 	up_write(&namespace->sem);
+	release_mounts(&umount_list);
 	kfree(namespace);
 }
-- 
cgit v1.2.2


From 36341f64569b0c4572478237ec5ed318f0762510 Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram@us.ibm.com>
Date: Mon, 7 Nov 2005 17:17:22 -0500
Subject: [PATCH] mount expiry fixes

 - clean up the ugliness in may_umount_tree()

 - fix a bug in do_loopback().  After cloning a tree, do_loopback()
   unlinks only the topmost mount of the cloned tree, leaving behind the
   children mounts on their corresponding expiry list.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 64 ++++++++++++++++++++--------------------------------------
 1 file changed, 22 insertions(+), 42 deletions(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index c2ffa0f349fd..65f9c0ecc21c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -27,6 +27,8 @@
 
 extern int __init init_rootfs(void);
 
+#define CL_EXPIRE 	0x01
+
 #ifdef CONFIG_SYSFS
 extern int __init sysfs_init(void);
 #else
@@ -165,7 +167,8 @@ static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
 	return list_entry(next, struct vfsmount, mnt_child);
 }
 
-static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root)
+static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
+					int flag)
 {
 	struct super_block *sb = old->mnt_sb;
 	struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);
@@ -181,10 +184,12 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root)
 
 		/* stick the duplicate mount on the same expiry list
 		 * as the original if that was on one */
-		spin_lock(&vfsmount_lock);
-		if (!list_empty(&old->mnt_expire))
-			list_add(&mnt->mnt_expire, &old->mnt_expire);
-		spin_unlock(&vfsmount_lock);
+		if (flag & CL_EXPIRE) {
+			spin_lock(&vfsmount_lock);
+			if (!list_empty(&old->mnt_expire))
+				list_add(&mnt->mnt_expire, &old->mnt_expire);
+			spin_unlock(&vfsmount_lock);
+		}
 	}
 	return mnt;
 }
@@ -331,36 +336,14 @@ struct seq_operations mounts_op = {
  */
 int may_umount_tree(struct vfsmount *mnt)
 {
-	struct list_head *next;
-	struct vfsmount *this_parent = mnt;
-	int actual_refs;
-	int minimum_refs;
+	int actual_refs = 0;
+	int minimum_refs = 0;
+	struct vfsmount *p;
 
 	spin_lock(&vfsmount_lock);
-	actual_refs = atomic_read(&mnt->mnt_count);
-	minimum_refs = 2;
-repeat:
-	next = this_parent->mnt_mounts.next;
-resume:
-	while (next != &this_parent->mnt_mounts) {
-		struct vfsmount *p =
-		    list_entry(next, struct vfsmount, mnt_child);
-
-		next = next->next;
-
+	for (p = mnt; p; p = next_mnt(p, mnt)) {
 		actual_refs += atomic_read(&p->mnt_count);
 		minimum_refs += 2;
-
-		if (!list_empty(&p->mnt_mounts)) {
-			this_parent = p;
-			goto repeat;
-		}
-	}
-
-	if (this_parent != mnt) {
-		next = this_parent->mnt_child.next;
-		this_parent = this_parent->mnt_parent;
-		goto resume;
 	}
 	spin_unlock(&vfsmount_lock);
 
@@ -596,12 +579,13 @@ static int lives_below_in_same_fs(struct dentry *d, struct dentry *dentry)
 	}
 }
 
-static struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry)
+static struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
+					int flag)
 {
 	struct vfsmount *res, *p, *q, *r, *s;
 	struct nameidata nd;
 
-	res = q = clone_mnt(mnt, dentry);
+	res = q = clone_mnt(mnt, dentry, flag);
 	if (!q)
 		goto Enomem;
 	q->mnt_mountpoint = mnt->mnt_mountpoint;
@@ -619,7 +603,7 @@ static struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry)
 			p = s;
 			nd.mnt = q;
 			nd.dentry = p->mnt_mountpoint;
-			q = clone_mnt(p, p->mnt_root);
+			q = clone_mnt(p, p->mnt_root, flag);
 			if (!q)
 				goto Enomem;
 			spin_lock(&vfsmount_lock);
@@ -701,18 +685,13 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
 
 	err = -ENOMEM;
 	if (recurse)
-		mnt = copy_tree(old_nd.mnt, old_nd.dentry);
+		mnt = copy_tree(old_nd.mnt, old_nd.dentry, 0);
 	else
-		mnt = clone_mnt(old_nd.mnt, old_nd.dentry);
+		mnt = clone_mnt(old_nd.mnt, old_nd.dentry, 0);
 
 	if (!mnt)
 		goto out;
 
-	/* stop bind mounts from expiring */
-	spin_lock(&vfsmount_lock);
-	list_del_init(&mnt->mnt_expire);
-	spin_unlock(&vfsmount_lock);
-
 	err = graft_tree(mnt, nd);
 	if (err) {
 		LIST_HEAD(umount_list);
@@ -1155,7 +1134,8 @@ int copy_namespace(int flags, struct task_struct *tsk)
 
 	down_write(&tsk->namespace->sem);
 	/* First pass: copy the tree topology */
-	new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root);
+	new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root,
+					CL_EXPIRE);
 	if (!new_ns->root) {
 		up_write(&tsk->namespace->sem);
 		kfree(new_ns);
-- 
cgit v1.2.2


From 390c684367de37e1c2f9005cf92f7a746c69fdd3 Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram@us.ibm.com>
Date: Mon, 7 Nov 2005 17:17:51 -0500
Subject: [PATCH] making namespace_sem global

This removes the per-namespace semaphore in favor of a single global
semaphore.  Since mount operations in all namespaces now serialize on
that one lock, this can reduce namespace scalability.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 46 +++++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index 65f9c0ecc21c..4abee9ab009f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -46,6 +46,7 @@ static int event;
 static struct list_head *mount_hashtable;
 static int hash_mask __read_mostly, hash_bits __read_mostly;
 static kmem_cache_t *mnt_cache;
+static struct rw_semaphore namespace_sem;
 
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
@@ -250,7 +251,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	struct list_head *p;
 	loff_t l = *pos;
 
-	down_read(&n->sem);
+	down_read(&namespace_sem);
 	list_for_each(p, &n->list)
 		if (!l--)
 			return list_entry(p, struct vfsmount, mnt_list);
@@ -267,8 +268,7 @@ static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void m_stop(struct seq_file *m, void *v)
 {
-	struct namespace *n = m->private;
-	up_read(&n->sem);
+	up_read(&namespace_sem);
 }
 
 static inline void mangle(struct seq_file *m, const char *s)
@@ -487,7 +487,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
 		return retval;
 	}
 
-	down_write(&current->namespace->sem);
+	down_write(&namespace_sem);
 	spin_lock(&vfsmount_lock);
 	event++;
 
@@ -500,7 +500,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
 	spin_unlock(&vfsmount_lock);
 	if (retval)
 		security_sb_umount_busy(mnt);
-	up_write(&current->namespace->sem);
+	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 	return retval;
 }
@@ -678,7 +678,7 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
 	if (err)
 		return err;
 
-	down_write(&current->namespace->sem);
+	down_write(&namespace_sem);
 	err = -EINVAL;
 	if (!check_mnt(nd->mnt) || !check_mnt(old_nd.mnt))
 		goto out;
@@ -702,7 +702,7 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
 	}
 
 out:
-	up_write(&current->namespace->sem);
+	up_write(&namespace_sem);
 	path_release(&old_nd);
 	return err;
 }
@@ -750,7 +750,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name)
 	if (err)
 		return err;
 
-	down_write(&current->namespace->sem);
+	down_write(&namespace_sem);
 	while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
 		;
 	err = -EINVAL;
@@ -795,7 +795,7 @@ out2:
 out1:
 	up(&nd->dentry->d_inode->i_sem);
 out:
-	up_write(&current->namespace->sem);
+	up_write(&namespace_sem);
 	if (!err)
 		path_release(&parent_nd);
 	path_release(&old_nd);
@@ -834,7 +834,7 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
 {
 	int err;
 
-	down_write(&current->namespace->sem);
+	down_write(&namespace_sem);
 	/* Something was mounted here while we slept */
 	while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
 		;
@@ -862,11 +862,11 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
 		list_add_tail(&newmnt->mnt_expire, fslist);
 		spin_unlock(&vfsmount_lock);
 	}
-	up_write(&current->namespace->sem);
+	up_write(&namespace_sem);
 	return 0;
 
 unlock:
-	up_write(&current->namespace->sem);
+	up_write(&namespace_sem);
 	mntput(newmnt);
 	return err;
 }
@@ -958,9 +958,9 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 		get_namespace(namespace);
 
 		spin_unlock(&vfsmount_lock);
-		down_write(&namespace->sem);
+		down_write(&namespace_sem);
 		expire_mount(mnt, mounts, &umounts);
-		up_write(&namespace->sem);
+		up_write(&namespace_sem);
 		release_mounts(&umounts);
 		mntput(mnt);
 		put_namespace(namespace);
@@ -1127,17 +1127,16 @@ int copy_namespace(int flags, struct task_struct *tsk)
 		goto out;
 
 	atomic_set(&new_ns->count, 1);
-	init_rwsem(&new_ns->sem);
 	INIT_LIST_HEAD(&new_ns->list);
 	init_waitqueue_head(&new_ns->poll);
 	new_ns->event = 0;
 
-	down_write(&tsk->namespace->sem);
+	down_write(&namespace_sem);
 	/* First pass: copy the tree topology */
 	new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root,
 					CL_EXPIRE);
 	if (!new_ns->root) {
-		up_write(&tsk->namespace->sem);
+		up_write(&namespace_sem);
 		kfree(new_ns);
 		goto out;
 	}
@@ -1171,7 +1170,7 @@ int copy_namespace(int flags, struct task_struct *tsk)
 		p = next_mnt(p, namespace->root);
 		q = next_mnt(q, new_ns->root);
 	}
-	up_write(&tsk->namespace->sem);
+	up_write(&namespace_sem);
 
 	tsk->namespace = new_ns;
 
@@ -1356,7 +1355,7 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
 	user_nd.mnt = mntget(current->fs->rootmnt);
 	user_nd.dentry = dget(current->fs->root);
 	read_unlock(&current->fs->lock);
-	down_write(&current->namespace->sem);
+	down_write(&namespace_sem);
 	down(&old_nd.dentry->d_inode->i_sem);
 	error = -EINVAL;
 	if (!check_mnt(user_nd.mnt))
@@ -1407,7 +1406,7 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
 	path_release(&parent_nd);
 out2:
 	up(&old_nd.dentry->d_inode->i_sem);
-	up_write(&current->namespace->sem);
+	up_write(&namespace_sem);
 	path_release(&user_nd);
 	path_release(&old_nd);
 out1:
@@ -1434,7 +1433,6 @@ static void __init init_mount_tree(void)
 		panic("Can't allocate initial namespace");
 	atomic_set(&namespace->count, 1);
 	INIT_LIST_HEAD(&namespace->list);
-	init_rwsem(&namespace->sem);
 	init_waitqueue_head(&namespace->poll);
 	namespace->event = 0;
 	list_add(&mnt->mnt_list, &namespace->list);
@@ -1459,6 +1457,8 @@ void __init mnt_init(unsigned long mempages)
 	unsigned int nr_hash;
 	int i;
 
+	init_rwsem(&namespace_sem);
+
 	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
 			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL);
 
@@ -1507,11 +1507,11 @@ void __put_namespace(struct namespace *namespace)
 	LIST_HEAD(umount_list);
 	namespace->root = NULL;
 	spin_unlock(&vfsmount_lock);
-	down_write(&namespace->sem);
+	down_write(&namespace_sem);
 	spin_lock(&vfsmount_lock);
 	umount_tree(root, &umount_list);
 	spin_unlock(&vfsmount_lock);
-	up_write(&namespace->sem);
+	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 	kfree(namespace);
 }
-- 
cgit v1.2.2


From 07b20889e3052c7e77d6a6a54e7e83446eb1ba84 Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram@us.ibm.com>
Date: Mon, 7 Nov 2005 17:19:07 -0500
Subject: [PATCH] beginning of the shared-subtree proper

A private mount does not forward or receive propagation.  This patch
gives the user the ability to convert any mount to private.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index 4abee9ab009f..3782923d6d4d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -24,6 +24,7 @@
 #include <linux/mount.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
+#include "pnode.h"
 
 extern int __init init_rootfs(void);
 
@@ -662,6 +663,27 @@ out_unlock:
 	return err;
 }
 
+/*
+ * recursively change the type of the mountpoint.
+ */
+static int do_change_type(struct nameidata *nd, int flag)
+{
+	struct vfsmount *m, *mnt = nd->mnt;
+	int recurse = flag & MS_REC;
+	int type = flag & ~MS_REC;
+
+	if (nd->dentry != nd->mnt->mnt_root)
+		return -EINVAL;
+
+	down_write(&namespace_sem);
+	spin_lock(&vfsmount_lock);
+	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
+		change_mnt_propagation(m, type);
+	spin_unlock(&vfsmount_lock);
+	up_write(&namespace_sem);
+	return 0;
+}
+
 /*
  * do loopback mount.
  */
@@ -1091,6 +1113,8 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 				    data_page);
 	else if (flags & MS_BIND)
 		retval = do_loopback(&nd, dev_name, flags & MS_REC);
+	else if (flags & MS_PRIVATE)
+		retval = do_change_type(&nd, flags);
 	else if (flags & MS_MOVE)
 		retval = do_move_mount(&nd, dev_name);
 	else
-- 
cgit v1.2.2


From 03e06e68ff76294e53ffa898cb844d2a997b043e Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram@us.ibm.com>
Date: Mon, 7 Nov 2005 17:19:33 -0500
Subject: [PATCH] introduce shared mounts

This creates shared mounts.  When a shared mount is bind-mounted to some
mountpoint, the original and the new mount propagate mount/umount events
to each other.  All the shared mounts that propagate events to each
other belong to the same peer-group.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index 3782923d6d4d..f6861a5487df 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -68,6 +68,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_mounts);
 		INIT_LIST_HEAD(&mnt->mnt_list);
 		INIT_LIST_HEAD(&mnt->mnt_expire);
+		INIT_LIST_HEAD(&mnt->mnt_share);
 		if (name) {
 			int size = strlen(name) + 1;
 			char *newname = kmalloc(size, GFP_KERNEL);
@@ -1113,7 +1114,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 				    data_page);
 	else if (flags & MS_BIND)
 		retval = do_loopback(&nd, dev_name, flags & MS_REC);
-	else if (flags & MS_PRIVATE)
+	else if (flags & (MS_SHARED | MS_PRIVATE))
 		retval = do_change_type(&nd, flags);
 	else if (flags & MS_MOVE)
 		retval = do_move_mount(&nd, dev_name);
-- 
cgit v1.2.2


From b90fa9ae8f51f098ee480bbaabd6867992e9fc58 Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram@us.ibm.com>
Date: Mon, 7 Nov 2005 17:19:50 -0500
Subject: [PATCH] shared mount handling: bind and rbind

Implement handling of MS_BIND in the presence of shared mounts (see
Documentation/sharedsubtree.txt at the end of the patch series for a
detailed description).

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 105 insertions(+), 21 deletions(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index f6861a5487df..9f5a084b239f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -28,8 +28,6 @@
 
 extern int __init init_rootfs(void);
 
-#define CL_EXPIRE 	0x01
-
 #ifdef CONFIG_SYSFS
 extern int __init sysfs_init(void);
 #else
@@ -145,13 +143,43 @@ static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd)
 	old_nd->dentry->d_mounted--;
 }
 
+void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
+			struct vfsmount *child_mnt)
+{
+	child_mnt->mnt_parent = mntget(mnt);
+	child_mnt->mnt_mountpoint = dget(dentry);
+	dentry->d_mounted++;
+}
+
 static void attach_mnt(struct vfsmount *mnt, struct nameidata *nd)
 {
-	mnt->mnt_parent = mntget(nd->mnt);
-	mnt->mnt_mountpoint = dget(nd->dentry);
-	list_add(&mnt->mnt_hash, mount_hashtable + hash(nd->mnt, nd->dentry));
+	mnt_set_mountpoint(nd->mnt, nd->dentry, mnt);
+	list_add_tail(&mnt->mnt_hash, mount_hashtable +
+			hash(nd->mnt, nd->dentry));
 	list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts);
-	nd->dentry->d_mounted++;
+}
+
+/*
+ * the caller must hold vfsmount_lock
+ */
+static void commit_tree(struct vfsmount *mnt)
+{
+	struct vfsmount *parent = mnt->mnt_parent;
+	struct vfsmount *m;
+	LIST_HEAD(head);
+	struct namespace *n = parent->mnt_namespace;
+
+	BUG_ON(parent == mnt);
+
+	list_add_tail(&head, &mnt->mnt_list);
+	list_for_each_entry(m, &head, mnt_list)
+		m->mnt_namespace = n;
+	list_splice(&head, n->list.prev);
+
+	list_add_tail(&mnt->mnt_hash, mount_hashtable +
+				hash(parent, mnt->mnt_mountpoint));
+	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+	touch_namespace(n);
 }
 
 static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
@@ -183,7 +211,11 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 		mnt->mnt_root = dget(root);
 		mnt->mnt_mountpoint = mnt->mnt_root;
 		mnt->mnt_parent = mnt;
-		mnt->mnt_namespace = current->namespace;
+
+		if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
+			list_add(&mnt->mnt_share, &old->mnt_share);
+		if (flag & CL_MAKE_SHARED)
+			set_mnt_shared(mnt);
 
 		/* stick the duplicate mount on the same expiry list
 		 * as the original if that was on one */
@@ -379,7 +411,7 @@ int may_umount(struct vfsmount *mnt)
 
 EXPORT_SYMBOL(may_umount);
 
-static void release_mounts(struct list_head *head)
+void release_mounts(struct list_head *head)
 {
 	struct vfsmount *mnt;
 	while(!list_empty(head)) {
@@ -401,7 +433,7 @@ static void release_mounts(struct list_head *head)
 	}
 }
 
-static void umount_tree(struct vfsmount *mnt, struct list_head *kill)
+void umount_tree(struct vfsmount *mnt, struct list_head *kill)
 {
 	struct vfsmount *p;
 
@@ -581,7 +613,7 @@ static int lives_below_in_same_fs(struct dentry *d, struct dentry *dentry)
 	}
 }
 
-static struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
+struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
 					int flag)
 {
 	struct vfsmount *res, *p, *q, *r, *s;
@@ -626,6 +658,67 @@ Enomem:
 	return NULL;
 }
 
+/*
+ *  @source_mnt : mount tree to be attached
+ *  @nd        : place the mount tree @source_mnt is attached
+ *
+ *  NOTE: in the table below explains the semantics when a source mount
+ *  of a given type is attached to a destination mount of a given type.
+ * 	---------------------------------------------
+ * 	|         BIND MOUNT OPERATION              |
+ * 	|********************************************
+ * 	| source-->| shared        |       private  |
+ * 	| dest     |               |                |
+ * 	|   |      |               |                |
+ * 	|   v      |               |                |
+ * 	|********************************************
+ * 	|  shared  | shared (++)   |     shared (+) |
+ * 	|          |               |                |
+ * 	|non-shared| shared (+)    |      private   |
+ * 	*********************************************
+ * A bind operation clones the source mount and mounts the clone on the
+ * destination mount.
+ *
+ * (++)  the cloned mount is propagated to all the mounts in the propagation
+ * 	 tree of the destination mount and the cloned mount is added to
+ * 	 the peer group of the source mount.
+ * (+)   the cloned mount is created under the destination mount and is marked
+ *       as shared. The cloned mount is added to the peer group of the source
+ *       mount.
+ *
+ * if the source mount is a tree, the operations explained above is
+ * applied to each mount in the tree.
+ * Must be called without spinlocks held, since this function can sleep
+ * in allocations.
+ */
+static int attach_recursive_mnt(struct vfsmount *source_mnt,
+				struct nameidata *nd)
+{
+	LIST_HEAD(tree_list);
+	struct vfsmount *dest_mnt = nd->mnt;
+	struct dentry *dest_dentry = nd->dentry;
+	struct vfsmount *child, *p;
+
+	if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list))
+		return -EINVAL;
+
+	if (IS_MNT_SHARED(dest_mnt)) {
+		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
+			set_mnt_shared(p);
+	}
+
+	spin_lock(&vfsmount_lock);
+	mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
+	commit_tree(source_mnt);
+
+	list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
+		list_del_init(&child->mnt_hash);
+		commit_tree(child);
+	}
+	spin_unlock(&vfsmount_lock);
+	return 0;
+}
+
 static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
 {
 	int err;
@@ -646,17 +739,8 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
 		goto out_unlock;
 
 	err = -ENOENT;
-	spin_lock(&vfsmount_lock);
-	if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry)) {
-		struct list_head head;
-
-		attach_mnt(mnt, nd);
-		list_add_tail(&head, &mnt->mnt_list);
-		list_splice(&head, current->namespace->list.prev);
-		err = 0;
-		touch_namespace(current->namespace);
-	}
-	spin_unlock(&vfsmount_lock);
+	if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry))
+		err = attach_recursive_mnt(mnt, nd);
 out_unlock:
 	up(&nd->dentry->d_inode->i_sem);
 	if (!err)
-- 
cgit v1.2.2


From 2144440327fa01b2f3f65e355120a78211685702 Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram@us.ibm.com>
Date: Mon, 7 Nov 2005 17:20:03 -0500
Subject: [PATCH] shared mounts handling: move

Implement handling of mount --move in the presence of shared mounts (see
Documentation/sharedsubtree.txt at the end of the patch series for a
detailed description).

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 63 ++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 46 insertions(+), 17 deletions(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index 9f5a084b239f..1487982dbc24 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -660,7 +660,10 @@ Enomem:
 
 /*
  *  @source_mnt : mount tree to be attached
- *  @nd        : place the mount tree @source_mnt is attached
+ *  @nd         : place the mount tree @source_mnt is attached
+ *  @parent_nd  : if non-null, detach the source_mnt from its parent and
+ *  		   store the parent mount and mountpoint dentry.
+ *  		   (done when source_mnt is moved)
  *
  *  NOTE: in the table below explains the semantics when a source mount
  *  of a given type is attached to a destination mount of a given type.
@@ -685,6 +688,21 @@ Enomem:
  * (+)   the cloned mount is created under the destination mount and is marked
  *       as shared. The cloned mount is added to the peer group of the source
  *       mount.
+ * 	---------------------------------------------
+ * 	|         	MOVE MOUNT OPERATION        |
+ * 	|********************************************
+ * 	| source-->| shared        |       private  |
+ * 	| dest     |               |                |
+ * 	|   |      |               |                |
+ * 	|   v      |               |                |
+ * 	|********************************************
+ * 	|  shared  | shared (+)    |     shared (+) |
+ * 	|          |               |                |
+ * 	|non-shared| shared (+*)   |      private   |
+ * 	*********************************************
+ * (+)  the mount is moved to the destination. And is then propagated to all
+ * 	the mounts in the propagation tree of the destination mount.
+ * (+*)  the mount is moved to the destination.
  *
  * if the source mount is a tree, the operations explained above is
  * applied to each mount in the tree.
@@ -692,7 +710,7 @@ Enomem:
  * in allocations.
  */
 static int attach_recursive_mnt(struct vfsmount *source_mnt,
-				struct nameidata *nd)
+			struct nameidata *nd, struct nameidata *parent_nd)
 {
 	LIST_HEAD(tree_list);
 	struct vfsmount *dest_mnt = nd->mnt;
@@ -708,8 +726,14 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
 	}
 
 	spin_lock(&vfsmount_lock);
-	mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
-	commit_tree(source_mnt);
+	if (parent_nd) {
+		detach_mnt(source_mnt, parent_nd);
+		attach_mnt(source_mnt, nd);
+		touch_namespace(current->namespace);
+	} else {
+		mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
+		commit_tree(source_mnt);
+	}
 
 	list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
 		list_del_init(&child->mnt_hash);
@@ -740,7 +764,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
 
 	err = -ENOENT;
 	if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry))
-		err = attach_recursive_mnt(mnt, nd);
+		err = attach_recursive_mnt(mnt, nd, NULL);
 out_unlock:
 	up(&nd->dentry->d_inode->i_sem);
 	if (!err)
@@ -869,35 +893,36 @@ static int do_move_mount(struct nameidata *nd, char *old_name)
 	if (IS_DEADDIR(nd->dentry->d_inode))
 		goto out1;
 
-	spin_lock(&vfsmount_lock);
 	if (!IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
-		goto out2;
+		goto out1;
 
 	err = -EINVAL;
 	if (old_nd.dentry != old_nd.mnt->mnt_root)
-		goto out2;
+		goto out1;
 
 	if (old_nd.mnt == old_nd.mnt->mnt_parent)
-		goto out2;
+		goto out1;
 
 	if (S_ISDIR(nd->dentry->d_inode->i_mode) !=
 	      S_ISDIR(old_nd.dentry->d_inode->i_mode))
-		goto out2;
-
+		goto out1;
+	/*
+	 * Don't move a mount residing in a shared parent.
+	 */
+	if (old_nd.mnt->mnt_parent && IS_MNT_SHARED(old_nd.mnt->mnt_parent))
+		goto out1;
 	err = -ELOOP;
 	for (p = nd->mnt; p->mnt_parent != p; p = p->mnt_parent)
 		if (p == old_nd.mnt)
-			goto out2;
-	err = 0;
+			goto out1;
 
-	detach_mnt(old_nd.mnt, &parent_nd);
-	attach_mnt(old_nd.mnt, nd);
-	touch_namespace(current->namespace);
+	if ((err = attach_recursive_mnt(old_nd.mnt, nd, &parent_nd)))
+		goto out1;
 
+	spin_lock(&vfsmount_lock);
 	/* if the mount is moved, it should no longer be expire
 	 * automatically */
 	list_del_init(&old_nd.mnt->mnt_expire);
-out2:
 	spin_unlock(&vfsmount_lock);
 out1:
 	up(&nd->dentry->d_inode->i_sem);
@@ -1467,6 +1492,10 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
 	down_write(&namespace_sem);
 	down(&old_nd.dentry->d_inode->i_sem);
 	error = -EINVAL;
+	if (IS_MNT_SHARED(old_nd.mnt) ||
+		IS_MNT_SHARED(new_nd.mnt->mnt_parent) ||
+		IS_MNT_SHARED(user_nd.mnt->mnt_parent))
+		goto out2;
 	if (!check_mnt(user_nd.mnt))
 		goto out2;
 	error = -ENOENT;
-- 
cgit v1.2.2


From a05964f3917c7c55368c229d7985f8e7c9977e97 Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram@us.ibm.com>
Date: Mon, 7 Nov 2005 17:20:17 -0500
Subject: [PATCH] shared mounts handling: umount

Unmounting a mount creates a umount event on its parent.  If the parent
is a shared mount, the event is propagated to all mounts in the parent's
peer group.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 56 ++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 38 insertions(+), 18 deletions(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index 1487982dbc24..4b1af01c2fb4 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -86,31 +86,44 @@ void free_vfsmnt(struct vfsmount *mnt)
 }
 
 /*
- * Now, lookup_mnt increments the ref count before returning
- * the vfsmount struct.
+ * find the first or last mount at @dentry on vfsmount @mnt depending on
+ * @dir. If @dir is set return the first mount else return the last mount.
  */
-struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
+			      int dir)
 {
 	struct list_head *head = mount_hashtable + hash(mnt, dentry);
 	struct list_head *tmp = head;
 	struct vfsmount *p, *found = NULL;
 
-	spin_lock(&vfsmount_lock);
 	for (;;) {
-		tmp = tmp->next;
+		tmp = dir ? tmp->next : tmp->prev;
 		p = NULL;
 		if (tmp == head)
 			break;
 		p = list_entry(tmp, struct vfsmount, mnt_hash);
 		if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
-			found = mntget(p);
+			found = p;
 			break;
 		}
 	}
-	spin_unlock(&vfsmount_lock);
 	return found;
 }
 
+/*
+ * lookup_mnt increments the ref count before returning
+ * the vfsmount struct.
+ */
+struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+{
+	struct vfsmount *child_mnt;
+	spin_lock(&vfsmount_lock);
+	if ((child_mnt = __lookup_mnt(mnt, dentry, 1)))
+		mntget(child_mnt);
+	spin_unlock(&vfsmount_lock);
+	return child_mnt;
+}
+
 static inline int check_mnt(struct vfsmount *mnt)
 {
 	return mnt->mnt_namespace == current->namespace;
@@ -404,9 +417,12 @@ EXPORT_SYMBOL(may_umount_tree);
  */
 int may_umount(struct vfsmount *mnt)
 {
-	if (atomic_read(&mnt->mnt_count) > 2)
-		return -EBUSY;
-	return 0;
+	int ret = 0;
+	spin_lock(&vfsmount_lock);
+	if (propagate_mount_busy(mnt, 2))
+		ret = -EBUSY;
+	spin_unlock(&vfsmount_lock);
+	return ret;
 }
 
 EXPORT_SYMBOL(may_umount);
@@ -433,7 +449,7 @@ void release_mounts(struct list_head *head)
 	}
 }
 
-void umount_tree(struct vfsmount *mnt, struct list_head *kill)
+void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 {
 	struct vfsmount *p;
 
@@ -442,6 +458,9 @@ void umount_tree(struct vfsmount *mnt, struct list_head *kill)
 		list_add(&p->mnt_hash, kill);
 	}
 
+	if (propagate)
+		propagate_umount(kill);
+
 	list_for_each_entry(p, kill, mnt_hash) {
 		list_del_init(&p->mnt_expire);
 		list_del_init(&p->mnt_list);
@@ -450,6 +469,7 @@ void umount_tree(struct vfsmount *mnt, struct list_head *kill)
 		list_del_init(&p->mnt_child);
 		if (p->mnt_parent != p)
 			mnt->mnt_mountpoint->d_mounted--;
+		change_mnt_propagation(p, MS_PRIVATE);
 	}
 }
 
@@ -526,9 +546,9 @@ static int do_umount(struct vfsmount *mnt, int flags)
 	event++;
 
 	retval = -EBUSY;
-	if (atomic_read(&mnt->mnt_count) == 2 || flags & MNT_DETACH) {
+	if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
 		if (!list_empty(&mnt->mnt_list))
-			umount_tree(mnt, &umount_list);
+			umount_tree(mnt, 1, &umount_list);
 		retval = 0;
 	}
 	spin_unlock(&vfsmount_lock);
@@ -651,7 +671,7 @@ Enomem:
 	if (res) {
 		LIST_HEAD(umount_list);
 		spin_lock(&vfsmount_lock);
-		umount_tree(res, &umount_list);
+		umount_tree(res, 0, &umount_list);
 		spin_unlock(&vfsmount_lock);
 		release_mounts(&umount_list);
 	}
@@ -827,7 +847,7 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
 	if (err) {
 		LIST_HEAD(umount_list);
 		spin_lock(&vfsmount_lock);
-		umount_tree(mnt, &umount_list);
+		umount_tree(mnt, 0, &umount_list);
 		spin_unlock(&vfsmount_lock);
 		release_mounts(&umount_list);
 	}
@@ -1023,12 +1043,12 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts,
 	 * Check that it is still dead: the count should now be 2 - as
 	 * contributed by the vfsmount parent and the mntget above
 	 */
-	if (atomic_read(&mnt->mnt_count) == 2) {
+	if (!propagate_mount_busy(mnt, 2)) {
 		/* delete from the namespace */
 		touch_namespace(mnt->mnt_namespace);
 		list_del_init(&mnt->mnt_list);
 		mnt->mnt_namespace = NULL;
-		umount_tree(mnt, umounts);
+		umount_tree(mnt, 1, umounts);
 		spin_unlock(&vfsmount_lock);
 	} else {
 		/*
@@ -1647,7 +1667,7 @@ void __put_namespace(struct namespace *namespace)
 	spin_unlock(&vfsmount_lock);
 	down_write(&namespace_sem);
 	spin_lock(&vfsmount_lock);
-	umount_tree(root, &umount_list);
+	umount_tree(root, 0, &umount_list);
 	spin_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
-- 
cgit v1.2.2


From a58b0eb8e64b78d9315a5491955e78b1391d42e5 Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram@us.ibm.com>
Date: Mon, 7 Nov 2005 17:20:48 -0500
Subject: [PATCH] introduce slave mounts

A slave mount always has a master mount from which it receives
mount/umount events.  Unlike with a shared mount, event propagation does
not flow from the slave mount to the master.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index 4b1af01c2fb4..46f99bc585bd 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -67,6 +67,8 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_list);
 		INIT_LIST_HEAD(&mnt->mnt_expire);
 		INIT_LIST_HEAD(&mnt->mnt_share);
+		INIT_LIST_HEAD(&mnt->mnt_slave_list);
+		INIT_LIST_HEAD(&mnt->mnt_slave);
 		if (name) {
 			int size = strlen(name) + 1;
 			char *newname = kmalloc(size, GFP_KERNEL);
@@ -1243,7 +1245,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 				    data_page);
 	else if (flags & MS_BIND)
 		retval = do_loopback(&nd, dev_name, flags & MS_REC);
-	else if (flags & (MS_SHARED | MS_PRIVATE))
+	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE))
 		retval = do_change_type(&nd, flags);
 	else if (flags & MS_MOVE)
 		retval = do_move_mount(&nd, dev_name);
-- 
cgit v1.2.2


From 5afe00221389998a25d611dc7941c06580c29eb6 Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram@us.ibm.com>
Date: Mon, 7 Nov 2005 17:21:01 -0500
Subject: [PATCH] handling of slave mounts

This makes bind, rbind, move, clone namespace and umount operations
aware of the semantics of slave mounts (see Documentation/sharedsubtree.txt
in the last patch of the series for a detailed description).

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 77 +++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 49 insertions(+), 28 deletions(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index 46f99bc585bd..089670363704 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -227,8 +227,17 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 		mnt->mnt_mountpoint = mnt->mnt_root;
 		mnt->mnt_parent = mnt;
 
-		if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
-			list_add(&mnt->mnt_share, &old->mnt_share);
+		if (flag & CL_SLAVE) {
+			list_add(&mnt->mnt_slave, &old->mnt_slave_list);
+			mnt->mnt_master = old;
+			CLEAR_MNT_SHARED(mnt);
+		} else {
+			if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
+				list_add(&mnt->mnt_share, &old->mnt_share);
+			if (IS_MNT_SLAVE(old))
+				list_add(&mnt->mnt_slave, &old->mnt_slave);
+			mnt->mnt_master = old->mnt_master;
+		}
 		if (flag & CL_MAKE_SHARED)
 			set_mnt_shared(mnt);
 
@@ -689,18 +698,18 @@ Enomem:
  *
  *  NOTE: in the table below explains the semantics when a source mount
  *  of a given type is attached to a destination mount of a given type.
- * 	---------------------------------------------
- * 	|         BIND MOUNT OPERATION              |
- * 	|********************************************
- * 	| source-->| shared        |       private  |
- * 	| dest     |               |                |
- * 	|   |      |               |                |
- * 	|   v      |               |                |
- * 	|********************************************
- * 	|  shared  | shared (++)   |     shared (+) |
- * 	|          |               |                |
- * 	|non-shared| shared (+)    |      private   |
- * 	*********************************************
+ * 	-------------------------------------------------------------
+ * 	|         BIND MOUNT OPERATION                               |
+ * 	|*************************************************************
+ * 	| source-->| shared        |       private  |       slave    |
+ * 	| dest     |               |                |                |
+ * 	|   |      |               |                |                |
+ * 	|   v      |               |                |                |
+ * 	|*************************************************************
+ * 	|  shared  | shared (++)   |     shared (+) |     shared(+++)|
+ * 	|          |               |                |                |
+ * 	|non-shared| shared (+)    |      private   |      slave (*) |
+ * 	**************************************************************
  * A bind operation clones the source mount and mounts the clone on the
  * destination mount.
  *
@@ -710,21 +719,33 @@ Enomem:
  * (+)   the cloned mount is created under the destination mount and is marked
  *       as shared. The cloned mount is added to the peer group of the source
  *       mount.
- * 	---------------------------------------------
- * 	|         	MOVE MOUNT OPERATION        |
- * 	|********************************************
- * 	| source-->| shared        |       private  |
- * 	| dest     |               |                |
- * 	|   |      |               |                |
- * 	|   v      |               |                |
- * 	|********************************************
- * 	|  shared  | shared (+)    |     shared (+) |
- * 	|          |               |                |
- * 	|non-shared| shared (+*)   |      private   |
- * 	*********************************************
- * (+)  the mount is moved to the destination. And is then propagated to all
- * 	the mounts in the propagation tree of the destination mount.
+ * (+++) the mount is propagated to all the mounts in the propagation tree
+ *       of the destination mount and the cloned mount is made slave
+ *       of the same master as that of the source mount. The cloned mount
+ *       is marked as 'shared and slave'.
+ * (*)   the cloned mount is made a slave of the same master as that of the
+ * 	 source mount.
+ *
+ * 	--------------------------------------------------------------
+ * 	|         		MOVE MOUNT OPERATION                 |
+ * 	|*************************************************************
+ * 	| source-->| shared        |       private  |       slave    |
+ * 	| dest     |               |                |                |
+ * 	|   |      |               |                |                |
+ * 	|   v      |               |                |                |
+ * 	|*************************************************************
+ * 	|  shared  | shared (+)    |     shared (+) |    shared(+++) |
+ * 	|          |               |                |                |
+ * 	|non-shared| shared (+*)   |      private   |    slave (*)   |
+ * 	**************************************************************
+ *
+ * (+)  the mount is moved to the destination. And is then propagated to
+ * 	all the mounts in the propagation tree of the destination mount.
  * (+*)  the mount is moved to the destination.
+ * (+++)  the mount is moved to the destination and is then propagated to
+ * 	all the mounts belonging to the destination mount's propagation tree.
+ * 	the mount is marked as 'shared and slave'.
+ * (*)	the mount continues to be a slave at the new location.
  *
  * if the source mount is a tree, the operations explained above is
  * applied to each mount in the tree.
-- 
cgit v1.2.2


From 9676f0c6389b62bd6b24d77d4b3abdbcfa32d0f2 Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram@us.ibm.com>
Date: Mon, 7 Nov 2005 17:21:20 -0500
Subject: [PATCH] unbindable mounts

An unbindable mount does not forward or receive propagation.  Also, an
unbindable mount disallows bind mounts.  The semantics are as follows.

Bind semantics:
  It is invalid to bind mount an unbindable mount.

Move semantics:
  It is invalid to move an unbindable mount under a shared mount.

Clone-namespace semantics:
  If a mount is unbindable in the parent namespace, the corresponding
  cloned mount in the child namespace becomes unbindable too.  Note
  the subtle difference: unbindable mounts cannot be bind mounted
  but can be cloned during clone-namespace.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 88 +++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 62 insertions(+), 26 deletions(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index 089670363704..caa9187f67e5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -213,6 +213,16 @@ static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
 	return list_entry(next, struct vfsmount, mnt_child);
 }
 
+static struct vfsmount *skip_mnt_tree(struct vfsmount *p) /* returns the mount reached by repeatedly taking the last mnt_mounts entry */
+{
+	struct list_head *prev = p->mnt_mounts.prev; /* tail of p's mnt_mounts list */
+	while (prev != &p->mnt_mounts) { /* tail == list head means the list is empty: stop */
+		p = list_entry(prev, struct vfsmount, mnt_child); /* entries are linked through mnt_child */
+		prev = p->mnt_mounts.prev; /* repeat from the newly selected mount */
+	}
+	return p; /* the argument itself when its mnt_mounts list was empty */
+}
+
 static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 					int flag)
 {
@@ -650,6 +660,9 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
 	struct vfsmount *res, *p, *q, *r, *s;
 	struct nameidata nd;
 
+	if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt))
+		return NULL;
+
 	res = q = clone_mnt(mnt, dentry, flag);
 	if (!q)
 		goto Enomem;
@@ -661,6 +674,10 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
 			continue;
 
 		for (s = r; s; s = next_mnt(s, r)) {
+			if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) {
+				s = skip_mnt_tree(s);
+				continue;
+			}
 			while (p != s->mnt_parent) {
 				p = p->mnt_parent;
 				q = q->mnt_parent;
@@ -698,18 +715,18 @@ Enomem:
  *
  *  NOTE: in the table below explains the semantics when a source mount
  *  of a given type is attached to a destination mount of a given type.
- * 	-------------------------------------------------------------
- * 	|         BIND MOUNT OPERATION                               |
- * 	|*************************************************************
- * 	| source-->| shared        |       private  |       slave    |
- * 	| dest     |               |                |                |
- * 	|   |      |               |                |                |
- * 	|   v      |               |                |                |
- * 	|*************************************************************
- * 	|  shared  | shared (++)   |     shared (+) |     shared(+++)|
- * 	|          |               |                |                |
- * 	|non-shared| shared (+)    |      private   |      slave (*) |
- * 	**************************************************************
+ * ---------------------------------------------------------------------------
+ * |         BIND MOUNT OPERATION                                            |
+ * |**************************************************************************
+ * | source-->| shared        |       private  |       slave    | unbindable |
+ * | dest     |               |                |                |            |
+ * |   |      |               |                |                |            |
+ * |   v      |               |                |                |            |
+ * |**************************************************************************
+ * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
+ * |          |               |                |                |            |
+ * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
+ * ***************************************************************************
  * A bind operation clones the source mount and mounts the clone on the
  * destination mount.
  *
@@ -726,18 +743,18 @@ Enomem:
  * (*)   the cloned mount is made a slave of the same master as that of the
  * 	 source mount.
  *
- * 	--------------------------------------------------------------
- * 	|         		MOVE MOUNT OPERATION                 |
- * 	|*************************************************************
- * 	| source-->| shared        |       private  |       slave    |
- * 	| dest     |               |                |                |
- * 	|   |      |               |                |                |
- * 	|   v      |               |                |                |
- * 	|*************************************************************
- * 	|  shared  | shared (+)    |     shared (+) |    shared(+++) |
- * 	|          |               |                |                |
- * 	|non-shared| shared (+*)   |      private   |    slave (*)   |
- * 	**************************************************************
+ * ---------------------------------------------------------------------------
+ * |         		MOVE MOUNT OPERATION                                 |
+ * |**************************************************************************
+ * | source-->| shared        |       private  |       slave    | unbindable |
+ * | dest     |               |                |                |            |
+ * |   |      |               |                |                |            |
+ * |   v      |               |                |                |            |
+ * |**************************************************************************
+ * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
+ * |          |               |                |                |            |
+ * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
+ * ***************************************************************************
  *
  * (+)  the mount is moved to the destination. And is then propagated to
  * 	all the mounts in the propagation tree of the destination mount.
@@ -854,6 +871,9 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
 
 	down_write(&namespace_sem);
 	err = -EINVAL;
+	if (IS_MNT_UNBINDABLE(old_nd.mnt))
+ 		goto out;
+
 	if (!check_mnt(nd->mnt) || !check_mnt(old_nd.mnt))
 		goto out;
 
@@ -911,6 +931,16 @@ static int do_remount(struct nameidata *nd, int flags, int mnt_flags,
 	return err;
 }
 
+static inline int tree_contains_unbindable(struct vfsmount *mnt) /* 1 if mnt or any mount visited below is unbindable, else 0 */
+{
+	struct vfsmount *p;
+	for (p = mnt; p; p = next_mnt(p, mnt)) { /* start at mnt itself; stop when next_mnt() yields NULL */
+		if (IS_MNT_UNBINDABLE(p))
+			return 1; /* first unbindable mount found ends the walk */
+	}
+	return 0; /* walk finished without meeting an unbindable mount */
+}
+
 static int do_move_mount(struct nameidata *nd, char *old_name)
 {
 	struct nameidata old_nd, parent_nd;
@@ -954,6 +984,12 @@ static int do_move_mount(struct nameidata *nd, char *old_name)
 	 */
 	if (old_nd.mnt->mnt_parent && IS_MNT_SHARED(old_nd.mnt->mnt_parent))
 		goto out1;
+	/*
+	 * Don't move a mount tree containing unbindable mounts to a destination
+	 * mount which is shared.
+	 */
+	if (IS_MNT_SHARED(nd->mnt) && tree_contains_unbindable(old_nd.mnt))
+		goto out1;
 	err = -ELOOP;
 	for (p = nd->mnt; p->mnt_parent != p; p = p->mnt_parent)
 		if (p == old_nd.mnt)
@@ -1266,7 +1302,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 				    data_page);
 	else if (flags & MS_BIND)
 		retval = do_loopback(&nd, dev_name, flags & MS_REC);
-	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE))
+	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
 		retval = do_change_type(&nd, flags);
 	else if (flags & MS_MOVE)
 		retval = do_move_mount(&nd, dev_name);
@@ -1311,7 +1347,7 @@ int copy_namespace(int flags, struct task_struct *tsk)
 	down_write(&namespace_sem);
 	/* First pass: copy the tree topology */
 	new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root,
-					CL_EXPIRE);
+					CL_COPY_ALL | CL_EXPIRE);
 	if (!new_ns->root) {
 		up_write(&namespace_sem);
 		kfree(new_ns);
-- 
cgit v1.2.2