Diffstat (limited to 'ipc/mqueue.c')
 -rw-r--r--   ipc/mqueue.c   292
 1 file changed, 232 insertions(+), 60 deletions(-)
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index a2757d4ab773..8ce57691e7b6 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -24,6 +24,7 @@
 #include <linux/mqueue.h>
 #include <linux/msg.h>
 #include <linux/skbuff.h>
+#include <linux/vmalloc.h>
 #include <linux/netlink.h>
 #include <linux/syscalls.h>
 #include <linux/audit.h>
@@ -49,6 +50,12 @@
 #define STATE_PENDING 1
 #define STATE_READY 2
 
+struct posix_msg_tree_node {
+        struct rb_node rb_node;
+        struct list_head msg_list;
+        int priority;
+};
+
 struct ext_wait_queue { /* queue of sleeping tasks */
         struct task_struct *task;
         struct list_head list;
@@ -61,7 +68,8 @@ struct mqueue_inode_info {
         struct inode vfs_inode;
         wait_queue_head_t wait_q;
 
-        struct msg_msg **messages;
+        struct rb_root msg_tree;
+        struct posix_msg_tree_node *node_cache;
         struct mq_attr attr;
 
         struct sigevent notify;
@@ -109,6 +117,103 @@ static struct ipc_namespace *get_ns_from_inode(struct inode *inode)
         return ns;
 }
 
+/* Auxiliary functions to manipulate messages' list */
+static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
+{
+        struct rb_node **p, *parent = NULL;
+        struct posix_msg_tree_node *leaf;
+
+        p = &info->msg_tree.rb_node;
+        while (*p) {
+                parent = *p;
+                leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);
+
+                if (likely(leaf->priority == msg->m_type))
+                        goto insert_msg;
+                else if (msg->m_type < leaf->priority)
+                        p = &(*p)->rb_left;
+                else
+                        p = &(*p)->rb_right;
+        }
+        if (info->node_cache) {
+                leaf = info->node_cache;
+                info->node_cache = NULL;
+        } else {
+                leaf = kmalloc(sizeof(*leaf), GFP_ATOMIC);
+                if (!leaf)
+                        return -ENOMEM;
+                rb_init_node(&leaf->rb_node);
+                INIT_LIST_HEAD(&leaf->msg_list);
+                info->qsize += sizeof(*leaf);
+        }
+        leaf->priority = msg->m_type;
+        rb_link_node(&leaf->rb_node, parent, p);
+        rb_insert_color(&leaf->rb_node, &info->msg_tree);
+insert_msg:
+        info->attr.mq_curmsgs++;
+        info->qsize += msg->m_ts;
+        list_add_tail(&msg->m_list, &leaf->msg_list);
+        return 0;
+}
+
+static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
+{
+        struct rb_node **p, *parent = NULL;
+        struct posix_msg_tree_node *leaf;
+        struct msg_msg *msg;
+
+try_again:
+        p = &info->msg_tree.rb_node;
+        while (*p) {
+                parent = *p;
+                /*
+                 * During insert, low priorities go to the left and high to the
+                 * right. On receive, we want the highest priorities first, so
+                 * walk all the way to the right.
+                 */
+                p = &(*p)->rb_right;
+        }
+        if (!parent) {
+                if (info->attr.mq_curmsgs) {
+                        pr_warn_once("Inconsistency in POSIX message queue, "
+                                     "no tree element, but supposedly messages "
+                                     "should exist!\n");
+                        info->attr.mq_curmsgs = 0;
+                }
+                return NULL;
+        }
+        leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);
+        if (unlikely(list_empty(&leaf->msg_list))) {
+                pr_warn_once("Inconsistency in POSIX message queue, "
+                             "empty leaf node but we haven't implemented "
+                             "lazy leaf delete!\n");
+                rb_erase(&leaf->rb_node, &info->msg_tree);
+                if (info->node_cache) {
+                        info->qsize -= sizeof(*leaf);
+                        kfree(leaf);
+                } else {
+                        info->node_cache = leaf;
+                }
+                goto try_again;
+        } else {
+                msg = list_first_entry(&leaf->msg_list,
+                                       struct msg_msg, m_list);
+                list_del(&msg->m_list);
+                if (list_empty(&leaf->msg_list)) {
+                        rb_erase(&leaf->rb_node, &info->msg_tree);
+                        if (info->node_cache) {
+                                info->qsize -= sizeof(*leaf);
+                                kfree(leaf);
+                        } else {
+                                info->node_cache = leaf;
+                        }
+                }
+        }
+        info->attr.mq_curmsgs--;
+        info->qsize -= msg->m_ts;
+        return msg;
+}
+
 static struct inode *mqueue_get_inode(struct super_block *sb,
                 struct ipc_namespace *ipc_ns, umode_t mode,
                 struct mq_attr *attr)
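
The msg_insert()/msg_get() pair above keeps one FIFO list of messages per priority in the rbtree, keyed so higher priorities sit to the right. The observable contract is the same as the old sorted array: receive returns the highest-priority message first, FIFO within a priority. A minimal userspace sketch of that ordering (the queue name "/prio_demo" is made up; compile with -lrt):

/* Hedged demo of the ordering semantics msg_insert()/msg_get()
 * implement: highest priority first, FIFO within equal priority. */
#include <fcntl.h>
#include <mqueue.h>
#include <stdio.h>

int main(void)
{
        struct mq_attr attr = { .mq_maxmsg = 10, .mq_msgsize = 64 };
        char buf[64];
        unsigned int prio;
        int i;

        mqd_t q = mq_open("/prio_demo", O_CREAT | O_RDWR, 0600, &attr);
        if (q == (mqd_t)-1)
                return 1;

        mq_send(q, "first-low", 10, 1);   /* lengths include the NUL */
        mq_send(q, "high", 5, 9);
        mq_send(q, "second-low", 11, 1);

        for (i = 0; i < 3; i++) {
                if (mq_receive(q, buf, sizeof(buf), &prio) >= 0)
                        printf("prio %u: %s\n", prio, buf);
        }
        /* Expected: prio 9: high, prio 1: first-low, prio 1: second-low */

        mq_close(q);
        mq_unlink("/prio_demo");
        return 0;
}
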
@@ -129,7 +234,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
 
         if (S_ISREG(mode)) {
                 struct mqueue_inode_info *info;
-                unsigned long mq_bytes, mq_msg_tblsz;
+                unsigned long mq_bytes, mq_treesize;
 
                 inode->i_fop = &mqueue_file_operations;
                 inode->i_size = FILENT_SIZE;
@@ -143,20 +248,36 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
                 info->notify_user_ns = NULL;
                 info->qsize = 0;
                 info->user = NULL; /* set when all is ok */
+                info->msg_tree = RB_ROOT;
+                info->node_cache = NULL;
                 memset(&info->attr, 0, sizeof(info->attr));
-                info->attr.mq_maxmsg = ipc_ns->mq_msg_max;
-                info->attr.mq_msgsize = ipc_ns->mq_msgsize_max;
+                info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max,
+                                           ipc_ns->mq_msg_default);
+                info->attr.mq_msgsize = min(ipc_ns->mq_msgsize_max,
+                                            ipc_ns->mq_msgsize_default);
                 if (attr) {
                         info->attr.mq_maxmsg = attr->mq_maxmsg;
                         info->attr.mq_msgsize = attr->mq_msgsize;
                 }
-                mq_msg_tblsz = info->attr.mq_maxmsg * sizeof(struct msg_msg *);
-                info->messages = kmalloc(mq_msg_tblsz, GFP_KERNEL);
-                if (!info->messages)
-                        goto out_inode;
+                /*
+                 * We used to allocate a static array of pointers and account
+                 * the size of that array as well as one msg_msg struct per
+                 * possible message into the queue size. That's no longer
+                 * accurate as the queue is now an rbtree and will grow and
+                 * shrink depending on usage patterns. We can, however, still
+                 * account one msg_msg struct per message, but the nodes are
+                 * allocated depending on priority usage, and most programs
+                 * only use one, or a handful, of priorities. However, since
+                 * this is pinned memory, we need to assume worst case, so
+                 * that means the min(mq_maxmsg, max_priorities) * struct
+                 * posix_msg_tree_node.
+                 */
+                mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
+                        min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
+                        sizeof(struct posix_msg_tree_node);
 
-                mq_bytes = (mq_msg_tblsz +
-                        (info->attr.mq_maxmsg * info->attr.mq_msgsize));
+                mq_bytes = mq_treesize + (info->attr.mq_maxmsg *
+                                          info->attr.mq_msgsize);
 
                 spin_lock(&mq_lock);
                 if (u->mq_bytes + mq_bytes < u->mq_bytes ||
@@ -247,9 +368,9 @@ static void mqueue_evict_inode(struct inode *inode)
 {
         struct mqueue_inode_info *info;
         struct user_struct *user;
-        unsigned long mq_bytes;
-        int i;
+        unsigned long mq_bytes, mq_treesize;
         struct ipc_namespace *ipc_ns;
+        struct msg_msg *msg;
 
         clear_inode(inode);
 
@@ -259,14 +380,19 @@ static void mqueue_evict_inode(struct inode *inode)
         ipc_ns = get_ns_from_inode(inode);
         info = MQUEUE_I(inode);
         spin_lock(&info->lock);
-        for (i = 0; i < info->attr.mq_curmsgs; i++)
-                free_msg(info->messages[i]);
-        kfree(info->messages);
+        while ((msg = msg_get(info)) != NULL)
+                free_msg(msg);
+        kfree(info->node_cache);
         spin_unlock(&info->lock);
 
         /* Total amount of bytes accounted for the mqueue */
-        mq_bytes = info->attr.mq_maxmsg * (sizeof(struct msg_msg *)
-            + info->attr.mq_msgsize);
+        mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
+                min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
+                sizeof(struct posix_msg_tree_node);
+
+        mq_bytes = mq_treesize + (info->attr.mq_maxmsg *
+                                  info->attr.mq_msgsize);
+
         user = info->user;
         if (user) {
                 spin_lock(&mq_lock);
@@ -300,8 +426,9 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry,
                 error = -EACCES;
                 goto out_unlock;
         }
-        if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
-            !capable(CAP_SYS_RESOURCE)) {
+        if (ipc_ns->mq_queues_count >= HARD_QUEUESMAX ||
+            (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
+             !capable(CAP_SYS_RESOURCE))) {
                 error = -ENOSPC;
                 goto out_unlock;
         }
@@ -485,26 +612,6 @@ static struct ext_wait_queue *wq_get_first_waiter(
         return list_entry(ptr, struct ext_wait_queue, list);
 }
 
-/* Auxiliary functions to manipulate messages' list */
-static void msg_insert(struct msg_msg *ptr, struct mqueue_inode_info *info)
-{
-        int k;
-
-        k = info->attr.mq_curmsgs - 1;
-        while (k >= 0 && info->messages[k]->m_type >= ptr->m_type) {
-                info->messages[k + 1] = info->messages[k];
-                k--;
-        }
-        info->attr.mq_curmsgs++;
-        info->qsize += ptr->m_ts;
-        info->messages[k + 1] = ptr;
-}
-
-static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
-{
-        info->qsize -= info->messages[--info->attr.mq_curmsgs]->m_ts;
-        return info->messages[info->attr.mq_curmsgs];
-}
 
 static inline void set_cookie(struct sk_buff *skb, char code)
 {
@@ -585,24 +692,30 @@ static void remove_notification(struct mqueue_inode_info *info)
 
 static int mq_attr_ok(struct ipc_namespace *ipc_ns, struct mq_attr *attr)
 {
+        int mq_treesize;
+        unsigned long total_size;
+
         if (attr->mq_maxmsg <= 0 || attr->mq_msgsize <= 0)
-                return 0;
+                return -EINVAL;
         if (capable(CAP_SYS_RESOURCE)) {
-                if (attr->mq_maxmsg > HARD_MSGMAX)
-                        return 0;
+                if (attr->mq_maxmsg > HARD_MSGMAX ||
+                    attr->mq_msgsize > HARD_MSGSIZEMAX)
+                        return -EINVAL;
         } else {
                 if (attr->mq_maxmsg > ipc_ns->mq_msg_max ||
                     attr->mq_msgsize > ipc_ns->mq_msgsize_max)
-                        return 0;
+                        return -EINVAL;
         }
         /* check for overflow */
         if (attr->mq_msgsize > ULONG_MAX/attr->mq_maxmsg)
-                return 0;
-        if ((unsigned long)(attr->mq_maxmsg * (attr->mq_msgsize
-            + sizeof (struct msg_msg *))) <
-            (unsigned long)(attr->mq_maxmsg * attr->mq_msgsize))
-                return 0;
-        return 1;
+                return -EOVERFLOW;
+        mq_treesize = attr->mq_maxmsg * sizeof(struct msg_msg) +
+                min_t(unsigned int, attr->mq_maxmsg, MQ_PRIO_MAX) *
+                sizeof(struct posix_msg_tree_node);
+        total_size = attr->mq_maxmsg * attr->mq_msgsize;
+        if (total_size + mq_treesize < total_size)
+                return -EOVERFLOW;
+        return 0;
 }
 
 /*
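
To make the worst-case accounting in mq_attr_ok() and mqueue_get_inode() concrete, here is a hedged back-of-envelope; the kernel-internal struct sizes are illustrative guesses, not ABI facts:

/* Hedged back-of-envelope for the worst-case accounting above.
 * sizeof(struct msg_msg) and sizeof(struct posix_msg_tree_node)
 * are kernel-internal; 48 and 40 bytes are illustrative guesses. */
#include <stdio.h>

#define MQ_PRIO_MAX 32768   /* value from include/linux/mqueue.h */

int main(void)
{
        unsigned long maxmsg = 10, msgsize = 8192;
        unsigned long msg_msg_sz = 48;    /* assumption */
        unsigned long tree_node_sz = 40;  /* assumption */
        unsigned long prios = maxmsg < MQ_PRIO_MAX ? maxmsg : MQ_PRIO_MAX;

        /* mirrors: mq_treesize = maxmsg * sizeof(struct msg_msg) +
         *          min(maxmsg, MQ_PRIO_MAX) * sizeof(tree node)   */
        unsigned long mq_treesize = maxmsg * msg_msg_sz + prios * tree_node_sz;
        unsigned long mq_bytes = mq_treesize + maxmsg * msgsize;

        printf("treesize=%lu, charged against RLIMIT_MSGQUEUE=%lu\n",
               mq_treesize, mq_bytes);
        return 0;
}
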
@@ -617,12 +730,21 @@ static struct file *do_create(struct ipc_namespace *ipc_ns, struct dentry *dir,
         int ret;
 
         if (attr) {
-                if (!mq_attr_ok(ipc_ns, attr)) {
-                        ret = -EINVAL;
+                ret = mq_attr_ok(ipc_ns, attr);
+                if (ret)
                         goto out;
-                }
                 /* store for use during create */
                 dentry->d_fsdata = attr;
+        } else {
+                struct mq_attr def_attr;
+
+                def_attr.mq_maxmsg = min(ipc_ns->mq_msg_max,
+                                         ipc_ns->mq_msg_default);
+                def_attr.mq_msgsize = min(ipc_ns->mq_msgsize_max,
+                                          ipc_ns->mq_msgsize_default);
+                ret = mq_attr_ok(ipc_ns, &def_attr);
+                if (ret)
+                        goto out;
         }
 
         mode &= ~current_umask();
@@ -837,7 +959,8 @@ static inline void pipelined_receive(struct mqueue_inode_info *info)
                 wake_up_interruptible(&info->wait_q);
                 return;
         }
-        msg_insert(sender->msg, info);
+        if (msg_insert(sender->msg, info))
+                return;
         list_del(&sender->list);
         sender->state = STATE_PENDING;
         wake_up_process(sender->task);
@@ -857,7 +980,8 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
         struct mqueue_inode_info *info;
         ktime_t expires, *timeout = NULL;
         struct timespec ts;
-        int ret;
+        struct posix_msg_tree_node *new_leaf = NULL;
+        int ret = 0;
 
         if (u_abs_timeout) {
                 int res = prepare_timeout(u_abs_timeout, &expires, &ts);
@@ -905,34 +1029,60 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
         msg_ptr->m_ts = msg_len;
         msg_ptr->m_type = msg_prio;
 
+        /*
+         * msg_insert really wants us to have a valid, spare node struct so
+         * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will
+         * fall back to that if necessary.
+         */
+        if (!info->node_cache)
+                new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
+
         spin_lock(&info->lock);
 
+        if (!info->node_cache && new_leaf) {
+                /* Save our speculative allocation into the cache */
+                rb_init_node(&new_leaf->rb_node);
+                INIT_LIST_HEAD(&new_leaf->msg_list);
+                info->node_cache = new_leaf;
+                info->qsize += sizeof(*new_leaf);
+                new_leaf = NULL;
+        } else {
+                kfree(new_leaf);
+        }
+
         if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) {
                 if (filp->f_flags & O_NONBLOCK) {
-                        spin_unlock(&info->lock);
                         ret = -EAGAIN;
                 } else {
                         wait.task = current;
                         wait.msg = (void *) msg_ptr;
                         wait.state = STATE_NONE;
                         ret = wq_sleep(info, SEND, timeout, &wait);
+                        /*
+                         * wq_sleep must be called with info->lock held, and
+                         * returns with the lock released
+                         */
+                        goto out_free;
                 }
-                if (ret < 0)
-                        free_msg(msg_ptr);
         } else {
                 receiver = wq_get_first_waiter(info, RECV);
                 if (receiver) {
                         pipelined_send(info, msg_ptr, receiver);
                 } else {
                         /* adds message to the queue */
-                        msg_insert(msg_ptr, info);
+                        ret = msg_insert(msg_ptr, info);
+                        if (ret)
+                                goto out_unlock;
                         __do_notify(info);
                 }
                 inode->i_atime = inode->i_mtime = inode->i_ctime =
                                 CURRENT_TIME;
-                spin_unlock(&info->lock);
-                ret = 0;
         }
+out_unlock:
+        spin_unlock(&info->lock);
+out_free:
+        if (ret)
+                free_msg(msg_ptr);
 out_fput:
         fput(filp);
 out:
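
The speculative node allocation added above is a generic pattern: take the GFP_KERNEL allocation before the spinlock, recheck under the lock, and hand the spare to the one-slot cache so the insert path never needs a GFP_ATOMIC fallback. A stand-alone sketch with made-up names:

/* Stand-alone sketch (made-up names) of the speculative-allocation
 * pattern mq_timedsend()/mq_timedreceive() now use. */
#include <linux/slab.h>
#include <linux/spinlock.h>

struct demo_node {                      /* stand-in for posix_msg_tree_node */
        int payload;
};

struct demo_owner {
        spinlock_t lock;
        struct demo_node *node_cache;   /* one spare, may be NULL */
};

static int demo_insert(struct demo_owner *o)
{
        struct demo_node *spare = NULL;

        if (!o->node_cache)             /* racy peek; rechecked below */
                spare = kmalloc(sizeof(*spare), GFP_KERNEL);

        spin_lock(&o->lock);
        if (!o->node_cache && spare) {
                o->node_cache = spare;  /* won the race: cache it */
                spare = NULL;
        }
        /* ... insert path consumes o->node_cache here ... */
        spin_unlock(&o->lock);

        kfree(spare);                   /* lost the race: kfree(NULL) is ok */
        return 0;
}
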
@@ -951,6 +1101,7 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
         struct ext_wait_queue wait;
         ktime_t expires, *timeout = NULL;
         struct timespec ts;
+        struct posix_msg_tree_node *new_leaf = NULL;
 
         if (u_abs_timeout) {
                 int res = prepare_timeout(u_abs_timeout, &expires, &ts);
@@ -986,7 +1137,26 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
                 goto out_fput;
         }
 
+        /*
+         * msg_insert really wants us to have a valid, spare node struct so
+         * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will
+         * fall back to that if necessary.
+         */
+        if (!info->node_cache)
+                new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
+
         spin_lock(&info->lock);
+
+        if (!info->node_cache && new_leaf) {
+                /* Save our speculative allocation into the cache */
+                rb_init_node(&new_leaf->rb_node);
+                INIT_LIST_HEAD(&new_leaf->msg_list);
+                info->node_cache = new_leaf;
+                info->qsize += sizeof(*new_leaf);
+        } else {
+                kfree(new_leaf);
+        }
+
         if (info->attr.mq_curmsgs == 0) {
                 if (filp->f_flags & O_NONBLOCK) {
                         spin_unlock(&info->lock);
@@ -1251,6 +1421,8 @@ int mq_init_ns(struct ipc_namespace *ns)
         ns->mq_queues_max = DFLT_QUEUESMAX;
         ns->mq_msg_max = DFLT_MSGMAX;
         ns->mq_msgsize_max = DFLT_MSGSIZEMAX;
+        ns->mq_msg_default = DFLT_MSG;
+        ns->mq_msgsize_default = DFLT_MSGSIZE;
 
         ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns);
         if (IS_ERR(ns->mq_mnt)) {
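
With mq_msg_default and mq_msgsize_default in place, a queue created without an explicit attr inherits the namespace defaults rather than the namespace maximums (see the min() calls in mqueue_get_inode above). A hedged userspace probe; the queue name "/defaults_demo" is made up, compile with -lrt:

/* Hedged probe: with the defaults above, mq_open() with a NULL attr
 * should report mq_msg_default/mq_msgsize_default, not the maxima. */
#include <fcntl.h>
#include <mqueue.h>
#include <stdio.h>

int main(void)
{
        struct mq_attr attr;
        mqd_t q = mq_open("/defaults_demo", O_CREAT | O_RDONLY, 0600, NULL);

        if (q != (mqd_t)-1 && mq_getattr(q, &attr) == 0)
                printf("default mq_maxmsg=%ld mq_msgsize=%ld\n",
                       attr.mq_maxmsg, attr.mq_msgsize);

        mq_close(q);
        mq_unlink("/defaults_demo");
        return 0;
}
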