author     Patrick McHardy <kaber@trash.net>       2013-04-17 02:47:01 -0400
committer  David S. Miller <davem@davemloft.net>   2013-04-19 14:57:57 -0400
commit     ccdfcc398594ddf3f77348c5a10938dbe9efefbe (patch)
tree       5458e0eca52d0488e8c24c8587028b5bd29b60de /net
parent     cf0a018ac669955c10e4fca24fa55dde58434e9a (diff)
netlink: mmaped netlink: ring setup
Add support for mmap'ed RX and TX ring setup and teardown based on the
af_packet.c code. The following patches will use this to add the real
mmap'ed receive and transmit functionality.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
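For orientation, here is a minimal userspace sketch of the setup path this
patch creates. The nl_mmap_req layout and the NETLINK_RX_RING option value
mirror the companion UAPI patch and are assumptions here, not part of this
diff; the geometry values are arbitrary but satisfy the checks in
netlink_set_ring() below.

/* Sketch only: assumes the companion UAPI patch. Drop the local struct
 * definition if your <linux/netlink.h> already carries it. */
#include <stdio.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <unistd.h>
#include <linux/netlink.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK	270
#endif
#ifndef NETLINK_RX_RING
#define NETLINK_RX_RING	6	/* assumed value, from the companion UAPI patch */
#endif

struct nl_mmap_req {			/* mirrors the companion UAPI patch */
	unsigned int	nm_block_size;
	unsigned int	nm_block_nr;
	unsigned int	nm_frame_size;
	unsigned int	nm_frame_nr;
};

int main(void)
{
	/* 4 blocks of 16 pages each; 2048-byte frames tile each block fully:
	 * frames_per_block = 65536 / 2048 = 32, so nm_frame_nr = 32 * 4. */
	struct nl_mmap_req req = {
		.nm_block_size	= 16 * 4096,
		.nm_block_nr	= 4,
		.nm_frame_size	= 2048,
		.nm_frame_nr	= 32 * 4,
	};
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
	void *ring;

	if (fd < 0)
		return 1;

	/* Needs CAP_NET_ADMIN; rejected with EINVAL on bad geometry. */
	if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0) {
		perror("setsockopt(NETLINK_RX_RING)");
		return 1;
	}

	/* Map at offset 0; the length must exactly match the ring size
	 * (RX plus TX back to back, if both rings are configured). */
	ring = mmap(NULL, req.nm_block_nr * req.nm_block_size,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	munmap(ring, req.nm_block_nr * req.nm_block_size);
	close(fd);
	return 0;
}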
Diffstat (limited to 'net')
-rw-r--r--  net/Kconfig              |   9
-rw-r--r--  net/netlink/af_netlink.c | 268
-rw-r--r--  net/netlink/af_netlink.h |  20
3 files changed, 295 insertions(+), 2 deletions(-)
diff --git a/net/Kconfig b/net/Kconfig
index 2ddc9046868e..1a2221630e6a 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -23,6 +23,15 @@ menuconfig NET
 
 if NET
 
+config NETLINK_MMAP
+	bool "Netlink: mmaped IO"
+	help
+	  This option enables support for memory mapped netlink IO. This
+	  reduces overhead by avoiding copying data between kernel- and
+	  userspace.
+
+	  If unsure, say N.
+
 config WANT_COMPAT_NETLINK_MESSAGES
 	bool
 	help
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 58b9025978fa..1d3c7128e90e 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -55,6 +55,7 @@
 #include <linux/types.h>
 #include <linux/audit.h>
 #include <linux/mutex.h>
+#include <linux/vmalloc.h>
 
 #include <net/net_namespace.h>
 #include <net/sock.h>
@@ -107,6 +108,234 @@ static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u
 	return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask];
 }
 
+#ifdef CONFIG_NETLINK_MMAP
+static __pure struct page *pgvec_to_page(const void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		return vmalloc_to_page(addr);
+	else
+		return virt_to_page(addr);
+}
+
+static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
+{
+	unsigned int i;
+
+	for (i = 0; i < len; i++) {
+		if (pg_vec[i] != NULL) {
+			if (is_vmalloc_addr(pg_vec[i]))
+				vfree(pg_vec[i]);
+			else
+				free_pages((unsigned long)pg_vec[i], order);
+		}
+	}
+	kfree(pg_vec);
+}
+
+static void *alloc_one_pg_vec_page(unsigned long order)
+{
+	void *buffer;
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
+			  __GFP_NOWARN | __GFP_NORETRY;
+
+	buffer = (void *)__get_free_pages(gfp_flags, order);
+	if (buffer != NULL)
+		return buffer;
+
+	buffer = vzalloc((1 << order) * PAGE_SIZE);
+	if (buffer != NULL)
+		return buffer;
+
+	gfp_flags &= ~__GFP_NORETRY;
+	return (void *)__get_free_pages(gfp_flags, order);
+}
+
+static void **alloc_pg_vec(struct netlink_sock *nlk,
+			   struct nl_mmap_req *req, unsigned int order)
+{
+	unsigned int block_nr = req->nm_block_nr;
+	unsigned int i;
+	void **pg_vec, *ptr;
+
+	pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
+	if (pg_vec == NULL)
+		return NULL;
+
+	for (i = 0; i < block_nr; i++) {
+		pg_vec[i] = ptr = alloc_one_pg_vec_page(order);
+		if (pg_vec[i] == NULL)
+			goto err1;
+	}
+
+	return pg_vec;
+err1:
+	free_pg_vec(pg_vec, order, block_nr);
+	return NULL;
+}
+
+static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
+			    bool closing, bool tx_ring)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+	struct netlink_ring *ring;
+	struct sk_buff_head *queue;
+	void **pg_vec = NULL;
+	unsigned int order = 0;
+	int err;
+
+	ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
+	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
+
+	if (!closing) {
+		if (atomic_read(&nlk->mapped))
+			return -EBUSY;
+		if (atomic_read(&ring->pending))
+			return -EBUSY;
+	}
+
+	if (req->nm_block_nr) {
+		if (ring->pg_vec != NULL)
+			return -EBUSY;
+
+		if ((int)req->nm_block_size <= 0)
+			return -EINVAL;
+		if (!IS_ALIGNED(req->nm_block_size, PAGE_SIZE))
+			return -EINVAL;
+		if (req->nm_frame_size < NL_MMAP_HDRLEN)
+			return -EINVAL;
+		if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
+			return -EINVAL;
+
+		ring->frames_per_block = req->nm_block_size /
+					 req->nm_frame_size;
+		if (ring->frames_per_block == 0)
+			return -EINVAL;
+		if (ring->frames_per_block * req->nm_block_nr !=
+		    req->nm_frame_nr)
+			return -EINVAL;
+
+		order = get_order(req->nm_block_size);
+		pg_vec = alloc_pg_vec(nlk, req, order);
+		if (pg_vec == NULL)
+			return -ENOMEM;
+	} else {
+		if (req->nm_frame_nr)
+			return -EINVAL;
+	}
+
+	err = -EBUSY;
+	mutex_lock(&nlk->pg_vec_lock);
+	if (closing || atomic_read(&nlk->mapped) == 0) {
+		err = 0;
+		spin_lock_bh(&queue->lock);
+
+		ring->frame_max = req->nm_frame_nr - 1;
+		ring->head = 0;
+		ring->frame_size = req->nm_frame_size;
+		ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE;
+
+		swap(ring->pg_vec_len, req->nm_block_nr);
+		swap(ring->pg_vec_order, order);
+		swap(ring->pg_vec, pg_vec);
+
+		__skb_queue_purge(queue);
+		spin_unlock_bh(&queue->lock);
+
+		WARN_ON(atomic_read(&nlk->mapped));
+	}
+	mutex_unlock(&nlk->pg_vec_lock);
+
+	if (pg_vec)
+		free_pg_vec(pg_vec, order, req->nm_block_nr);
+	return err;
+}
+
+static void netlink_mm_open(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct socket *sock = file->private_data;
+	struct sock *sk = sock->sk;
+
+	if (sk)
+		atomic_inc(&nlk_sk(sk)->mapped);
+}
+
+static void netlink_mm_close(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct socket *sock = file->private_data;
+	struct sock *sk = sock->sk;
+
+	if (sk)
+		atomic_dec(&nlk_sk(sk)->mapped);
+}
+
+static const struct vm_operations_struct netlink_mmap_ops = {
+	.open = netlink_mm_open,
+	.close = netlink_mm_close,
+};
+
+static int netlink_mmap(struct file *file, struct socket *sock,
+			struct vm_area_struct *vma)
+{
+	struct sock *sk = sock->sk;
+	struct netlink_sock *nlk = nlk_sk(sk);
+	struct netlink_ring *ring;
+	unsigned long start, size, expected;
+	unsigned int i;
+	int err = -EINVAL;
+
+	if (vma->vm_pgoff)
+		return -EINVAL;
+
+	mutex_lock(&nlk->pg_vec_lock);
+
+	expected = 0;
+	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
+		if (ring->pg_vec == NULL)
+			continue;
+		expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
+	}
+
+	if (expected == 0)
+		goto out;
+
+	size = vma->vm_end - vma->vm_start;
+	if (size != expected)
+		goto out;
+
+	start = vma->vm_start;
+	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
+		if (ring->pg_vec == NULL)
+			continue;
+
+		for (i = 0; i < ring->pg_vec_len; i++) {
+			struct page *page;
+			void *kaddr = ring->pg_vec[i];
+			unsigned int pg_num;
+
+			for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
+				page = pgvec_to_page(kaddr);
+				err = vm_insert_page(vma, start, page);
+				if (err < 0)
+					goto out;
+				start += PAGE_SIZE;
+				kaddr += PAGE_SIZE;
+			}
+		}
+	}
+
+	atomic_inc(&nlk->mapped);
+	vma->vm_ops = &netlink_mmap_ops;
+	err = 0;
+out:
+	mutex_unlock(&nlk->pg_vec_lock);
+	return err;
+}
+#else /* CONFIG_NETLINK_MMAP */
+#define netlink_mmap			sock_no_mmap
+#endif /* CONFIG_NETLINK_MMAP */
+
 static void netlink_destroy_callback(struct netlink_callback *cb)
 {
 	kfree_skb(cb->skb);
@@ -146,6 +375,18 @@ static void netlink_sock_destruct(struct sock *sk)
 	}
 
 	skb_queue_purge(&sk->sk_receive_queue);
+#ifdef CONFIG_NETLINK_MMAP
+	if (1) {
+		struct nl_mmap_req req;
+
+		memset(&req, 0, sizeof(req));
+		if (nlk->rx_ring.pg_vec)
+			netlink_set_ring(sk, &req, true, false);
+		memset(&req, 0, sizeof(req));
+		if (nlk->tx_ring.pg_vec)
+			netlink_set_ring(sk, &req, true, true);
+	}
+#endif /* CONFIG_NETLINK_MMAP */
 
 	if (!sock_flag(sk, SOCK_DEAD)) {
 		printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
@@ -409,6 +650,9 @@ static int __netlink_create(struct net *net, struct socket *sock,
 		mutex_init(nlk->cb_mutex);
 	}
 	init_waitqueue_head(&nlk->wait);
+#ifdef CONFIG_NETLINK_MMAP
+	mutex_init(&nlk->pg_vec_lock);
+#endif
 
 	sk->sk_destruct = netlink_sock_destruct;
 	sk->sk_protocol = protocol;
@@ -1211,7 +1455,8 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 	if (level != SOL_NETLINK)
 		return -ENOPROTOOPT;
 
-	if (optlen >= sizeof(int) &&
+	if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
+	    optlen >= sizeof(int) &&
 	    get_user(val, (unsigned int __user *)optval))
 		return -EFAULT;
 
@@ -1260,6 +1505,25 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 		}
 		err = 0;
 		break;
+#ifdef CONFIG_NETLINK_MMAP
+	case NETLINK_RX_RING:
+	case NETLINK_TX_RING: {
+		struct nl_mmap_req req;
+
+		/* Rings might consume more memory than queue limits, require
+		 * CAP_NET_ADMIN.
+		 */
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		if (optlen < sizeof(req))
+			return -EINVAL;
+		if (copy_from_user(&req, optval, sizeof(req)))
+			return -EFAULT;
+		err = netlink_set_ring(sk, &req, false,
+				       optname == NETLINK_TX_RING);
+		break;
+	}
+#endif /* CONFIG_NETLINK_MMAP */
 	default:
 		err = -ENOPROTOOPT;
 	}
@@ -2093,7 +2357,7 @@ static const struct proto_ops netlink_ops = {
 	.getsockopt =	netlink_getsockopt,
 	.sendmsg =	netlink_sendmsg,
 	.recvmsg =	netlink_recvmsg,
-	.mmap =		sock_no_mmap,
+	.mmap =		netlink_mmap,
 	.sendpage =	sock_no_sendpage,
 };
 
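The geometry checks in netlink_set_ring() above reduce to one predicate: a
positive, page-aligned block size; a frame size that is at least the frame
header and suitably aligned; and a frame count that exactly tiles all
blocks. A sketch of that predicate, where the PAGE_SIZE, NL_MMAP_HDRLEN and
NL_MMAP_MSG_ALIGNMENT values are illustrative assumptions (the latter two
come from the companion UAPI patch, not this diff):

/* Userspace-side sketch of netlink_set_ring()'s geometry rules; the three
 * constant values below are assumptions, not taken from this diff. */
#include <stdbool.h>

#define PAGE_SIZE		4096U
#define NL_MMAP_HDRLEN		32U	/* assumed: aligned sizeof(struct nl_mmap_hdr) */
#define NL_MMAP_MSG_ALIGNMENT	4U	/* assumed: NLMSG_ALIGNTO */

struct nl_mmap_req_sketch {
	unsigned int nm_block_size;
	unsigned int nm_block_nr;
	unsigned int nm_frame_size;
	unsigned int nm_frame_nr;
};

static bool ring_geometry_valid(const struct nl_mmap_req_sketch *req)
{
	unsigned int frames_per_block;

	if ((int)req->nm_block_size <= 0 ||
	    req->nm_block_size % PAGE_SIZE != 0)
		return false;
	if (req->nm_frame_size < NL_MMAP_HDRLEN ||
	    req->nm_frame_size % NL_MMAP_MSG_ALIGNMENT != 0)
		return false;

	frames_per_block = req->nm_block_size / req->nm_frame_size;
	if (frames_per_block == 0)
		return false;

	/* Every block must be fully populated; no partial trailing frames. */
	return frames_per_block * req->nm_block_nr == req->nm_frame_nr;
}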
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index d9acb2a1d855..ed8522265f4e 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -6,6 +6,20 @@
 #define NLGRPSZ(x)	(ALIGN(x, sizeof(unsigned long) * 8) / 8)
 #define NLGRPLONGS(x)	(NLGRPSZ(x)/sizeof(unsigned long))
 
+struct netlink_ring {
+	void		**pg_vec;
+	unsigned int	head;
+	unsigned int	frames_per_block;
+	unsigned int	frame_size;
+	unsigned int	frame_max;
+
+	unsigned int	pg_vec_order;
+	unsigned int	pg_vec_pages;
+	unsigned int	pg_vec_len;
+
+	atomic_t	pending;
+};
+
 struct netlink_sock {
 	/* struct sock has to be the first member of netlink_sock */
 	struct sock	sk;
@@ -24,6 +38,12 @@ struct netlink_sock {
 	void		(*netlink_rcv)(struct sk_buff *skb);
 	void		(*netlink_bind)(int group);
 	struct module	*module;
+#ifdef CONFIG_NETLINK_MMAP
+	struct mutex		pg_vec_lock;
+	struct netlink_ring	rx_ring;
+	struct netlink_ring	tx_ring;
+	atomic_t		mapped;
+#endif /* CONFIG_NETLINK_MMAP */
 };
 
 static inline struct netlink_sock *nlk_sk(struct sock *sk)
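Given the netlink_ring fields above, locating a frame is pure arithmetic
over the page vector. A hypothetical helper sketching the lookup that the
follow-up receive/transmit patches will need (the name and placement are
assumptions; it is not part of this patch):

/* Hypothetical sketch, not from this patch: frame 'pos' lives in block
 * pos / frames_per_block, at byte offset (pos % frames_per_block) *
 * frame_size within that block's contiguous pages. */
static void *ring_lookup_frame(const struct netlink_ring *ring,
			       unsigned int pos)
{
	unsigned int block = pos / ring->frames_per_block;
	unsigned int off = pos % ring->frames_per_block;

	return (char *)ring->pg_vec[block] + off * ring->frame_size;
}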