aboutsummaryrefslogtreecommitdiffstats
path: root/ipc
diff options
context:
space:
mode:
author Serge E. Hallyn <serue@us.ibm.com> 2009-04-06 22:01:10 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2009-04-07 11:31:09 -0400
commit 7eafd7c74c3f2e67c27621b987b28397110d643f (patch)
tree b4621aab78b6303f20386096c230b993044a4db7 /ipc
parent 614b84cf4e4a920d2af32b8f147ea1e3b8c27ea6 (diff)
namespaces: ipc namespaces: implement support for posix msqueues
Implement multiple mounts of the mqueue file system, and link it to usage of CLONE_NEWIPC. Each ipc ns has a corresponding mqueuefs superblock. When a user does clone(CLONE_NEWIPC) or unshare(CLONE_NEWIPC), the unshare will cause an internal mount of a new mqueuefs sb linked to the new ipc ns. When a user does 'mount -t mqueue mqueue /dev/mqueue', he mounts the mqueuefs superblock. Posix message queues can be worked with both through the mq_* system calls (see mq_overview(7)), and through the VFS through the mqueue mount. Any usage of mq_open() and friends will work with the acting task's ipc namespace. Any actions through the VFS will work with the mqueuefs in which the file was created. So if a user doesn't remount mqueuefs after unshare(CLONE_NEWIPC), mq_open("/ab") will not be reflected in "ls /dev/mqueue". If task a mounts mqueue for ipc_ns:1, then clones task b with a new ipcns, ipcns:2, and then task a is the last task in ipc_ns:1 to exit, then (1) ipc_ns:1 will be freed, (2) its superblock will live on until task b umounts the corresponding mqueuefs, and vfs actions will continue to succeed, but (3) sb->s_fs_info will be NULL for the sb corresponding to the deceased ipc_ns:1. To make this happen, we must protect the ipc reference count when a) a task exits and drops its ipcns->count, since it might be dropping it to 0 and freeing the ipcns b) a task accesses the ipcns through its mqueuefs interface, since it bumps the ipcns refcount and might race with the last task in the ipcns exiting. So the kref is changed to an atomic_t so we can use atomic_dec_and_lock(&ns->count,mq_lock), and every access to the ipcns through ns = mqueuefs_sb->s_fs_info is protected by the same lock. Signed-off-by: Cedric Le Goater <clg@fr.ibm.com> Signed-off-by: Serge E. Hallyn <serue@us.ibm.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'ipc')
-rw-r--r-- ipc/mqueue.c 111
-rw-r--r-- ipc/msgutil.c 9
-rw-r--r-- ipc/namespace.c 41
-rw-r--r-- ipc/util.h 6
4 files changed, 124 insertions, 43 deletions
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index a3673a09069a..c82d7b51ef68 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -88,7 +88,6 @@ static const struct file_operations mqueue_file_operations;
88static struct super_operations mqueue_super_ops; 88static struct super_operations mqueue_super_ops;
89static void remove_notification(struct mqueue_inode_info *info); 89static void remove_notification(struct mqueue_inode_info *info);
90 90
91static spinlock_t mq_lock;
92static struct kmem_cache *mqueue_inode_cachep; 91static struct kmem_cache *mqueue_inode_cachep;
93 92
94static struct ctl_table_header * mq_sysctl_table; 93static struct ctl_table_header * mq_sysctl_table;
@@ -98,27 +97,30 @@ static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
98 return container_of(inode, struct mqueue_inode_info, vfs_inode); 97 return container_of(inode, struct mqueue_inode_info, vfs_inode);
99} 98}
100 99
101void mq_init_ns(struct ipc_namespace *ns) 100/*
101 * This routine should be called with the mq_lock held.
102 */
103static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
102{ 104{
103 ns->mq_queues_count = 0; 105 return get_ipc_ns(inode->i_sb->s_fs_info);
104 ns->mq_queues_max = DFLT_QUEUESMAX;
105 ns->mq_msg_max = DFLT_MSGMAX;
106 ns->mq_msgsize_max = DFLT_MSGSIZEMAX;
107 ns->mq_mnt = mntget(init_ipc_ns.mq_mnt);
108} 106}
109 107
110void mq_exit_ns(struct ipc_namespace *ns) 108static struct ipc_namespace *get_ns_from_inode(struct inode *inode)
111{ 109{
112 /* will need to clear out ns->mq_mnt->mnt_sb->s_fs_info here */ 110 struct ipc_namespace *ns;
113 mntput(ns->mq_mnt); 111
112 spin_lock(&mq_lock);
113 ns = __get_ns_from_inode(inode);
114 spin_unlock(&mq_lock);
115 return ns;
114} 116}
115 117
116static struct inode *mqueue_get_inode(struct super_block *sb, int mode, 118static struct inode *mqueue_get_inode(struct super_block *sb,
117 struct mq_attr *attr) 119 struct ipc_namespace *ipc_ns, int mode,
120 struct mq_attr *attr)
118{ 121{
119 struct user_struct *u = current_user(); 122 struct user_struct *u = current_user();
120 struct inode *inode; 123 struct inode *inode;
121 struct ipc_namespace *ipc_ns = &init_ipc_ns;
122 124
123 inode = new_inode(sb); 125 inode = new_inode(sb);
124 if (inode) { 126 if (inode) {
@@ -193,30 +195,38 @@ out_inode:
193static int mqueue_fill_super(struct super_block *sb, void *data, int silent) 195static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
194{ 196{
195 struct inode *inode; 197 struct inode *inode;
198 struct ipc_namespace *ns = data;
199 int error = 0;
196 200
197 sb->s_blocksize = PAGE_CACHE_SIZE; 201 sb->s_blocksize = PAGE_CACHE_SIZE;
198 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 202 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
199 sb->s_magic = MQUEUE_MAGIC; 203 sb->s_magic = MQUEUE_MAGIC;
200 sb->s_op = &mqueue_super_ops; 204 sb->s_op = &mqueue_super_ops;
201 205
202 inode = mqueue_get_inode(sb, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL); 206 inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO,
203 if (!inode) 207 NULL);
204 return -ENOMEM; 208 if (!inode) {
209 error = -ENOMEM;
210 goto out;
211 }
205 212
206 sb->s_root = d_alloc_root(inode); 213 sb->s_root = d_alloc_root(inode);
207 if (!sb->s_root) { 214 if (!sb->s_root) {
208 iput(inode); 215 iput(inode);
209 return -ENOMEM; 216 error = -ENOMEM;
210 } 217 }
211 218
212 return 0; 219out:
220 return error;
213} 221}
214 222
215static int mqueue_get_sb(struct file_system_type *fs_type, 223static int mqueue_get_sb(struct file_system_type *fs_type,
216 int flags, const char *dev_name, 224 int flags, const char *dev_name,
217 void *data, struct vfsmount *mnt) 225 void *data, struct vfsmount *mnt)
218{ 226{
219 return get_sb_single(fs_type, flags, data, mqueue_fill_super, mnt); 227 if (!(flags & MS_KERNMOUNT))
228 data = current->nsproxy->ipc_ns;
229 return get_sb_ns(fs_type, flags, data, mqueue_fill_super, mnt);
220} 230}
221 231
222static void init_once(void *foo) 232static void init_once(void *foo)
@@ -247,12 +257,13 @@ static void mqueue_delete_inode(struct inode *inode)
247 struct user_struct *user; 257 struct user_struct *user;
248 unsigned long mq_bytes; 258 unsigned long mq_bytes;
249 int i; 259 int i;
250 struct ipc_namespace *ipc_ns = &init_ipc_ns; 260 struct ipc_namespace *ipc_ns;
251 261
252 if (S_ISDIR(inode->i_mode)) { 262 if (S_ISDIR(inode->i_mode)) {
253 clear_inode(inode); 263 clear_inode(inode);
254 return; 264 return;
255 } 265 }
266 ipc_ns = get_ns_from_inode(inode);
256 info = MQUEUE_I(inode); 267 info = MQUEUE_I(inode);
257 spin_lock(&info->lock); 268 spin_lock(&info->lock);
258 for (i = 0; i < info->attr.mq_curmsgs; i++) 269 for (i = 0; i < info->attr.mq_curmsgs; i++)
@@ -268,10 +279,19 @@ static void mqueue_delete_inode(struct inode *inode)
268 if (user) { 279 if (user) {
269 spin_lock(&mq_lock); 280 spin_lock(&mq_lock);
270 user->mq_bytes -= mq_bytes; 281 user->mq_bytes -= mq_bytes;
271 ipc_ns->mq_queues_count--; 282 /*
283 * get_ns_from_inode() ensures that the
284 * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns
285 * to which we now hold a reference, or it is NULL.
286 * We can't put it here under mq_lock, though.
287 */
288 if (ipc_ns)
289 ipc_ns->mq_queues_count--;
272 spin_unlock(&mq_lock); 290 spin_unlock(&mq_lock);
273 free_uid(user); 291 free_uid(user);
274 } 292 }
293 if (ipc_ns)
294 put_ipc_ns(ipc_ns);
275} 295}
276 296
277static int mqueue_create(struct inode *dir, struct dentry *dentry, 297static int mqueue_create(struct inode *dir, struct dentry *dentry,
@@ -280,9 +300,14 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry,
280 struct inode *inode; 300 struct inode *inode;
281 struct mq_attr *attr = dentry->d_fsdata; 301 struct mq_attr *attr = dentry->d_fsdata;
282 int error; 302 int error;
283 struct ipc_namespace *ipc_ns = &init_ipc_ns; 303 struct ipc_namespace *ipc_ns;
284 304
285 spin_lock(&mq_lock); 305 spin_lock(&mq_lock);
306 ipc_ns = __get_ns_from_inode(dir);
307 if (!ipc_ns) {
308 error = -EACCES;
309 goto out_unlock;
310 }
286 if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max && 311 if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
287 !capable(CAP_SYS_RESOURCE)) { 312 !capable(CAP_SYS_RESOURCE)) {
288 error = -ENOSPC; 313 error = -ENOSPC;
@@ -291,7 +316,7 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry,
291 ipc_ns->mq_queues_count++; 316 ipc_ns->mq_queues_count++;
292 spin_unlock(&mq_lock); 317 spin_unlock(&mq_lock);
293 318
294 inode = mqueue_get_inode(dir->i_sb, mode, attr); 319 inode = mqueue_get_inode(dir->i_sb, ipc_ns, mode, attr);
295 if (!inode) { 320 if (!inode) {
296 error = -ENOMEM; 321 error = -ENOMEM;
297 spin_lock(&mq_lock); 322 spin_lock(&mq_lock);
@@ -299,6 +324,7 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry,
299 goto out_unlock; 324 goto out_unlock;
300 } 325 }
301 326
327 put_ipc_ns(ipc_ns);
302 dir->i_size += DIRENT_SIZE; 328 dir->i_size += DIRENT_SIZE;
303 dir->i_ctime = dir->i_mtime = dir->i_atime = CURRENT_TIME; 329 dir->i_ctime = dir->i_mtime = dir->i_atime = CURRENT_TIME;
304 330
@@ -307,6 +333,8 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry,
307 return 0; 333 return 0;
308out_unlock: 334out_unlock:
309 spin_unlock(&mq_lock); 335 spin_unlock(&mq_lock);
336 if (ipc_ns)
337 put_ipc_ns(ipc_ns);
310 return error; 338 return error;
311} 339}
312 340
@@ -668,7 +696,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode,
668 char *name; 696 char *name;
669 struct mq_attr attr; 697 struct mq_attr attr;
670 int fd, error; 698 int fd, error;
671 struct ipc_namespace *ipc_ns = &init_ipc_ns; 699 struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
672 700
673 if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr))) 701 if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr)))
674 return -EFAULT; 702 return -EFAULT;
@@ -738,7 +766,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
738 char *name; 766 char *name;
739 struct dentry *dentry; 767 struct dentry *dentry;
740 struct inode *inode = NULL; 768 struct inode *inode = NULL;
741 struct ipc_namespace *ipc_ns = &init_ipc_ns; 769 struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
742 770
743 name = getname(u_name); 771 name = getname(u_name);
744 if (IS_ERR(name)) 772 if (IS_ERR(name))
@@ -1217,6 +1245,32 @@ static struct file_system_type mqueue_fs_type = {
1217 .kill_sb = kill_litter_super, 1245 .kill_sb = kill_litter_super,
1218}; 1246};
1219 1247
1248int mq_init_ns(struct ipc_namespace *ns)
1249{
1250 ns->mq_queues_count = 0;
1251 ns->mq_queues_max = DFLT_QUEUESMAX;
1252 ns->mq_msg_max = DFLT_MSGMAX;
1253 ns->mq_msgsize_max = DFLT_MSGSIZEMAX;
1254
1255 ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns);
1256 if (IS_ERR(ns->mq_mnt)) {
1257 int err = PTR_ERR(ns->mq_mnt);
1258 ns->mq_mnt = NULL;
1259 return err;
1260 }
1261 return 0;
1262}
1263
1264void mq_clear_sbinfo(struct ipc_namespace *ns)
1265{
1266 ns->mq_mnt->mnt_sb->s_fs_info = NULL;
1267}
1268
1269void mq_put_mnt(struct ipc_namespace *ns)
1270{
1271 mntput(ns->mq_mnt);
1272}
1273
1220static int msg_max_limit_min = MIN_MSGMAX; 1274static int msg_max_limit_min = MIN_MSGMAX;
1221static int msg_max_limit_max = MAX_MSGMAX; 1275static int msg_max_limit_max = MAX_MSGMAX;
1222 1276
@@ -1288,15 +1342,14 @@ static int __init init_mqueue_fs(void)
1288 if (error) 1342 if (error)
1289 goto out_sysctl; 1343 goto out_sysctl;
1290 1344
1291 init_ipc_ns.mq_mnt = kern_mount(&mqueue_fs_type); 1345 spin_lock_init(&mq_lock);
1346
1347 init_ipc_ns.mq_mnt = kern_mount_data(&mqueue_fs_type, &init_ipc_ns);
1292 if (IS_ERR(init_ipc_ns.mq_mnt)) { 1348 if (IS_ERR(init_ipc_ns.mq_mnt)) {
1293 error = PTR_ERR(init_ipc_ns.mq_mnt); 1349 error = PTR_ERR(init_ipc_ns.mq_mnt);
1294 goto out_filesystem; 1350 goto out_filesystem;
1295 } 1351 }
1296 1352
1297 /* internal initialization - not common for vfs */
1298 spin_lock_init(&mq_lock);
1299
1300 return 0; 1353 return 0;
1301 1354
1302out_filesystem: 1355out_filesystem:
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index 73c316cb8613..f095ee268833 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -18,19 +18,16 @@
18 18
19#include "util.h" 19#include "util.h"
20 20
21DEFINE_SPINLOCK(mq_lock);
22
21/* 23/*
22 * The next 2 defines are here bc this is the only file 24 * The next 2 defines are here bc this is the only file
23 * compiled when either CONFIG_SYSVIPC and CONFIG_POSIX_MQUEUE 25 * compiled when either CONFIG_SYSVIPC and CONFIG_POSIX_MQUEUE
24 * and not CONFIG_IPC_NS. 26 * and not CONFIG_IPC_NS.
25 */ 27 */
26struct ipc_namespace init_ipc_ns = { 28struct ipc_namespace init_ipc_ns = {
27 .kref = { 29 .count = ATOMIC_INIT(1),
28 /* It's not for this patch to change, but should this be 1? */
29 .refcount = ATOMIC_INIT(2),
30 },
31#ifdef CONFIG_POSIX_MQUEUE 30#ifdef CONFIG_POSIX_MQUEUE
32 .mq_mnt = NULL,
33 .mq_queues_count = 0,
34 .mq_queues_max = DFLT_QUEUESMAX, 31 .mq_queues_max = DFLT_QUEUESMAX,
35 .mq_msg_max = DFLT_MSGMAX, 32 .mq_msg_max = DFLT_MSGMAX,
36 .mq_msgsize_max = DFLT_MSGSIZEMAX, 33 .mq_msgsize_max = DFLT_MSGSIZEMAX,
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 4b4dc6d847f1..4a5e752a9276 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -9,23 +9,31 @@
9#include <linux/rcupdate.h> 9#include <linux/rcupdate.h>
10#include <linux/nsproxy.h> 10#include <linux/nsproxy.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/fs.h>
13#include <linux/mount.h>
12 14
13#include "util.h" 15#include "util.h"
14 16
15static struct ipc_namespace *clone_ipc_ns(struct ipc_namespace *old_ns) 17static struct ipc_namespace *clone_ipc_ns(struct ipc_namespace *old_ns)
16{ 18{
17 struct ipc_namespace *ns; 19 struct ipc_namespace *ns;
20 int err;
18 21
19 ns = kmalloc(sizeof(struct ipc_namespace), GFP_KERNEL); 22 ns = kmalloc(sizeof(struct ipc_namespace), GFP_KERNEL);
20 if (ns == NULL) 23 if (ns == NULL)
21 return ERR_PTR(-ENOMEM); 24 return ERR_PTR(-ENOMEM);
22 25
26 atomic_set(&ns->count, 1);
27 err = mq_init_ns(ns);
28 if (err) {
29 kfree(ns);
30 return ERR_PTR(err);
31 }
23 atomic_inc(&nr_ipc_ns); 32 atomic_inc(&nr_ipc_ns);
24 33
25 sem_init_ns(ns); 34 sem_init_ns(ns);
26 msg_init_ns(ns); 35 msg_init_ns(ns);
27 shm_init_ns(ns); 36 shm_init_ns(ns);
28 mq_init_ns(ns);
29 37
30 /* 38 /*
31 * msgmni has already been computed for the new ipc ns. 39 * msgmni has already been computed for the new ipc ns.
@@ -35,7 +43,6 @@ static struct ipc_namespace *clone_ipc_ns(struct ipc_namespace *old_ns)
35 ipcns_notify(IPCNS_CREATED); 43 ipcns_notify(IPCNS_CREATED);
36 register_ipcns_notifier(ns); 44 register_ipcns_notifier(ns);
37 45
38 kref_init(&ns->kref);
39 return ns; 46 return ns;
40} 47}
41 48
@@ -85,11 +92,34 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
85 up_write(&ids->rw_mutex); 92 up_write(&ids->rw_mutex);
86} 93}
87 94
88void free_ipc_ns(struct kref *kref) 95/*
96 * put_ipc_ns - drop a reference to an ipc namespace.
97 * @ns: the namespace to put
98 *
99 * If this is the last task in the namespace exiting, and
100 * it is dropping the refcount to 0, then it can race with
101 * a task in another ipc namespace but in a mounts namespace
102 * which has this ipcns's mqueuefs mounted, doing some action
103 * with one of the mqueuefs files. That can raise the refcount.
104 * So dropping the refcount, and raising the refcount when
105 * accessing it through the VFS, are protected with mq_lock.
106 *
107 * (Clearly, a task raising the refcount on its own ipc_ns
108 * needn't take mq_lock since it can't race with the last task
109 * in the ipcns exiting).
110 */
111void put_ipc_ns(struct ipc_namespace *ns)
89{ 112{
90 struct ipc_namespace *ns; 113 if (atomic_dec_and_lock(&ns->count, &mq_lock)) {
114 mq_clear_sbinfo(ns);
115 spin_unlock(&mq_lock);
116 mq_put_mnt(ns);
117 free_ipc_ns(ns);
118 }
119}
91 120
92 ns = container_of(kref, struct ipc_namespace, kref); 121void free_ipc_ns(struct ipc_namespace *ns)
122{
93 /* 123 /*
94 * Unregistering the hotplug notifier at the beginning guarantees 124 * Unregistering the hotplug notifier at the beginning guarantees
95 * that the ipc namespace won't be freed while we are inside the 125 * that the ipc namespace won't be freed while we are inside the
@@ -102,7 +132,6 @@ void free_ipc_ns(struct kref *kref)
102 sem_exit_ns(ns); 132 sem_exit_ns(ns);
103 msg_exit_ns(ns); 133 msg_exit_ns(ns);
104 shm_exit_ns(ns); 134 shm_exit_ns(ns);
105 mq_exit_ns(ns);
106 kfree(ns); 135 kfree(ns);
107 atomic_dec(&nr_ipc_ns); 136 atomic_dec(&nr_ipc_ns);
108 137
diff --git a/ipc/util.h b/ipc/util.h
index 0e7d9223acc1..1187332a89d2 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -21,9 +21,11 @@ void shm_init (void);
21struct ipc_namespace; 21struct ipc_namespace;
22 22
23#ifdef CONFIG_POSIX_MQUEUE 23#ifdef CONFIG_POSIX_MQUEUE
24void mq_exit_ns(struct ipc_namespace *ns); 24extern void mq_clear_sbinfo(struct ipc_namespace *ns);
25extern void mq_put_mnt(struct ipc_namespace *ns);
25#else 26#else
26static inline void mq_exit_ns(struct ipc_namespace *ns) { } 27static inline void mq_clear_sbinfo(struct ipc_namespace *ns) { }
28static inline void mq_put_mnt(struct ipc_namespace *ns) { }
27#endif 29#endif
28 30
29#ifdef CONFIG_SYSVIPC 31#ifdef CONFIG_SYSVIPC