author:    Patrick McHardy <kaber@trash.net>  2013-04-17 02:47:01 -0400
committer: David S. Miller <davem@davemloft.net>  2013-04-19 14:57:57 -0400
commit:    ccdfcc398594ddf3f77348c5a10938dbe9efefbe
tree:      5458e0eca52d0488e8c24c8587028b5bd29b60de
parent:    cf0a018ac669955c10e4fca24fa55dde58434e9a

netlink: mmaped netlink: ring setup
Add support for mmap'ed RX and TX ring setup and teardown based on the
af_packet.c code. The following patches will use this to add the real
mmap'ed receive and transmit functionality.
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
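
For orientation, the intended userspace flow once the series is complete: configure one or both rings via setsockopt(), then map them with a single mmap() call. Below is a minimal sketch. struct nl_mmap_req and the NETLINK_RX_RING/NETLINK_TX_RING option names come from the uapi header additions of this patch series, which fall outside the 'net'-limited diffstat and are therefore not shown here; the sizes are merely illustrative.

	#include <sys/socket.h>
	#include <unistd.h>
	#include <linux/netlink.h>

	static int setup_rings(int fd)
	{
		unsigned int block_size = 16 * getpagesize();
		struct nl_mmap_req req = {
			.nm_block_size	= block_size,	/* PAGE_SIZE aligned */
			.nm_block_nr	= 4,
			.nm_frame_size	= 2048,		/* NL_MMAP_MSG_ALIGNMENT aligned */
			.nm_frame_nr	= 4 * (block_size / 2048),
		};

		/* both calls require CAP_NET_ADMIN */
		if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0)
			return -1;
		if (setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &req, sizeof(req)) < 0)
			return -1;
		return 0;
	}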
Diffstat (limited to 'net')

 net/Kconfig              |   9 +
 net/netlink/af_netlink.c | 268 +++-
 net/netlink/af_netlink.h |  20 +
 3 files changed, 295 insertions(+), 2 deletions(-)
diff --git a/net/Kconfig b/net/Kconfig
index 2ddc9046868e..1a2221630e6a 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -23,6 +23,15 @@ menuconfig NET
 
 if NET
 
+config NETLINK_MMAP
+	bool "Netlink: mmaped IO"
+	help
+	  This option enables support for memory mapped netlink IO. This
+	  reduces overhead by avoiding copying data between kernel- and
+	  userspace.
+
+	  If unsure, say N.
+
 config WANT_COMPAT_NETLINK_MESSAGES
 	bool
 	help
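
The new code is compiled only when this option is selected and it defaults to off; to experiment with the feature, enable it in the kernel configuration:

	CONFIG_NETLINK_MMAP=y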
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 58b9025978fa..1d3c7128e90e 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -55,6 +55,7 @@
 #include <linux/types.h>
 #include <linux/audit.h>
 #include <linux/mutex.h>
+#include <linux/vmalloc.h>
 
 #include <net/net_namespace.h>
 #include <net/sock.h>
@@ -107,6 +108,234 @@ static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u
 	return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask];
 }
 
+#ifdef CONFIG_NETLINK_MMAP
+static __pure struct page *pgvec_to_page(const void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		return vmalloc_to_page(addr);
+	else
+		return virt_to_page(addr);
+}
+
+static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
+{
+	unsigned int i;
+
+	for (i = 0; i < len; i++) {
+		if (pg_vec[i] != NULL) {
+			if (is_vmalloc_addr(pg_vec[i]))
+				vfree(pg_vec[i]);
+			else
+				free_pages((unsigned long)pg_vec[i], order);
+		}
+	}
+	kfree(pg_vec);
+}
+
+static void *alloc_one_pg_vec_page(unsigned long order)
+{
+	void *buffer;
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
+			  __GFP_NOWARN | __GFP_NORETRY;
+
+	buffer = (void *)__get_free_pages(gfp_flags, order);
+	if (buffer != NULL)
+		return buffer;
+
+	buffer = vzalloc((1 << order) * PAGE_SIZE);
+	if (buffer != NULL)
+		return buffer;
+
+	gfp_flags &= ~__GFP_NORETRY;
+	return (void *)__get_free_pages(gfp_flags, order);
+}
+
+static void **alloc_pg_vec(struct netlink_sock *nlk,
+			   struct nl_mmap_req *req, unsigned int order)
+{
+	unsigned int block_nr = req->nm_block_nr;
+	unsigned int i;
+	void **pg_vec, *ptr;
+
+	pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
+	if (pg_vec == NULL)
+		return NULL;
+
+	for (i = 0; i < block_nr; i++) {
+		pg_vec[i] = ptr = alloc_one_pg_vec_page(order);
+		if (pg_vec[i] == NULL)
+			goto err1;
+	}
+
+	return pg_vec;
+err1:
+	free_pg_vec(pg_vec, order, block_nr);
+	return NULL;
+}
+
+static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
+			    bool closing, bool tx_ring)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+	struct netlink_ring *ring;
+	struct sk_buff_head *queue;
+	void **pg_vec = NULL;
+	unsigned int order = 0;
+	int err;
+
+	ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
+	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
+
+	if (!closing) {
+		if (atomic_read(&nlk->mapped))
+			return -EBUSY;
+		if (atomic_read(&ring->pending))
+			return -EBUSY;
+	}
+
+	if (req->nm_block_nr) {
+		if (ring->pg_vec != NULL)
+			return -EBUSY;
+
+		if ((int)req->nm_block_size <= 0)
+			return -EINVAL;
+		if (!IS_ALIGNED(req->nm_block_size, PAGE_SIZE))
+			return -EINVAL;
+		if (req->nm_frame_size < NL_MMAP_HDRLEN)
+			return -EINVAL;
+		if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
+			return -EINVAL;
+
+		ring->frames_per_block = req->nm_block_size /
+					 req->nm_frame_size;
+		if (ring->frames_per_block == 0)
+			return -EINVAL;
+		if (ring->frames_per_block * req->nm_block_nr !=
+		    req->nm_frame_nr)
+			return -EINVAL;
+
+		order = get_order(req->nm_block_size);
+		pg_vec = alloc_pg_vec(nlk, req, order);
+		if (pg_vec == NULL)
+			return -ENOMEM;
+	} else {
+		if (req->nm_frame_nr)
+			return -EINVAL;
+	}
+
+	err = -EBUSY;
+	mutex_lock(&nlk->pg_vec_lock);
+	if (closing || atomic_read(&nlk->mapped) == 0) {
+		err = 0;
+		spin_lock_bh(&queue->lock);
+
+		ring->frame_max		= req->nm_frame_nr - 1;
+		ring->head		= 0;
+		ring->frame_size	= req->nm_frame_size;
+		ring->pg_vec_pages	= req->nm_block_size / PAGE_SIZE;
+
+		swap(ring->pg_vec_len, req->nm_block_nr);
+		swap(ring->pg_vec_order, order);
+		swap(ring->pg_vec, pg_vec);
+
+		__skb_queue_purge(queue);
+		spin_unlock_bh(&queue->lock);
+
+		WARN_ON(atomic_read(&nlk->mapped));
+	}
+	mutex_unlock(&nlk->pg_vec_lock);
+
+	if (pg_vec)
+		free_pg_vec(pg_vec, order, req->nm_block_nr);
+	return err;
+}
+
+static void netlink_mm_open(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct socket *sock = file->private_data;
+	struct sock *sk = sock->sk;
+
+	if (sk)
+		atomic_inc(&nlk_sk(sk)->mapped);
+}
+
+static void netlink_mm_close(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct socket *sock = file->private_data;
+	struct sock *sk = sock->sk;
+
+	if (sk)
+		atomic_dec(&nlk_sk(sk)->mapped);
+}
+
+static const struct vm_operations_struct netlink_mmap_ops = {
+	.open	= netlink_mm_open,
+	.close	= netlink_mm_close,
+};
+
+static int netlink_mmap(struct file *file, struct socket *sock,
+			struct vm_area_struct *vma)
+{
+	struct sock *sk = sock->sk;
+	struct netlink_sock *nlk = nlk_sk(sk);
+	struct netlink_ring *ring;
+	unsigned long start, size, expected;
+	unsigned int i;
+	int err = -EINVAL;
+
+	if (vma->vm_pgoff)
+		return -EINVAL;
+
+	mutex_lock(&nlk->pg_vec_lock);
+
+	expected = 0;
+	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
+		if (ring->pg_vec == NULL)
+			continue;
+		expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
+	}
+
+	if (expected == 0)
+		goto out;
+
+	size = vma->vm_end - vma->vm_start;
+	if (size != expected)
+		goto out;
+
+	start = vma->vm_start;
+	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
+		if (ring->pg_vec == NULL)
+			continue;
+
+		for (i = 0; i < ring->pg_vec_len; i++) {
+			struct page *page;
+			void *kaddr = ring->pg_vec[i];
+			unsigned int pg_num;
+
+			for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
+				page = pgvec_to_page(kaddr);
+				err = vm_insert_page(vma, start, page);
+				if (err < 0)
+					goto out;
+				start += PAGE_SIZE;
+				kaddr += PAGE_SIZE;
+			}
+		}
+	}
+
+	atomic_inc(&nlk->mapped);
+	vma->vm_ops = &netlink_mmap_ops;
+	err = 0;
+out:
+	mutex_unlock(&nlk->pg_vec_lock);
+	return err;
+}
+#else /* CONFIG_NETLINK_MMAP */
+#define netlink_mmap			sock_no_mmap
+#endif /* CONFIG_NETLINK_MMAP */
+
 static void netlink_destroy_callback(struct netlink_callback *cb)
 {
 	kfree_skb(cb->skb);
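
A worked example of the ring geometry netlink_set_ring() accepts, using the illustrative numbers from the sketch above and assuming 4096-byte pages: nm_block_size = 65536 is PAGE_SIZE aligned, nm_frame_size = 2048 is a multiple of NL_MMAP_MSG_ALIGNMENT and at least NL_MMAP_HDRLEN, so frames_per_block = 65536 / 2048 = 32, and with nm_block_nr = 4 the request is accepted only if nm_frame_nr == 4 * 32 = 128. nm_frame_size need not divide nm_block_size evenly; frames never cross a block boundary, and any remainder at the end of a block is simply left unused.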
@@ -146,6 +375,18 @@ static void netlink_sock_destruct(struct sock *sk)
 	}
 
 	skb_queue_purge(&sk->sk_receive_queue);
+#ifdef CONFIG_NETLINK_MMAP
+	if (1) {
+		struct nl_mmap_req req;
+
+		memset(&req, 0, sizeof(req));
+		if (nlk->rx_ring.pg_vec)
+			netlink_set_ring(sk, &req, true, false);
+		memset(&req, 0, sizeof(req));
+		if (nlk->tx_ring.pg_vec)
+			netlink_set_ring(sk, &req, true, true);
+	}
+#endif /* CONFIG_NETLINK_MMAP */
 
 	if (!sock_flag(sk, SOCK_DEAD)) {
 		printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
@@ -409,6 +650,9 @@ static int __netlink_create(struct net *net, struct socket *sock,
 		mutex_init(nlk->cb_mutex);
 	}
 	init_waitqueue_head(&nlk->wait);
+#ifdef CONFIG_NETLINK_MMAP
+	mutex_init(&nlk->pg_vec_lock);
+#endif
 
 	sk->sk_destruct = netlink_sock_destruct;
 	sk->sk_protocol = protocol;
@@ -1211,7 +1455,8 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 	if (level != SOL_NETLINK)
 		return -ENOPROTOOPT;
 
-	if (optlen >= sizeof(int) &&
+	if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
+	    optlen >= sizeof(int) &&
 	    get_user(val, (unsigned int __user *)optval))
 		return -EFAULT;
 
@@ -1260,6 +1505,25 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 		}
 		err = 0;
 		break;
+#ifdef CONFIG_NETLINK_MMAP
+	case NETLINK_RX_RING:
+	case NETLINK_TX_RING: {
+		struct nl_mmap_req req;
+
+		/* Rings might consume more memory than queue limits, require
+		 * CAP_NET_ADMIN.
+		 */
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		if (optlen < sizeof(req))
+			return -EINVAL;
+		if (copy_from_user(&req, optval, sizeof(req)))
+			return -EFAULT;
+		err = netlink_set_ring(sk, &req, false,
+				       optname == NETLINK_TX_RING);
+		break;
+	}
+#endif /* CONFIG_NETLINK_MMAP */
 	default:
 		err = -ENOPROTOOPT;
 	}
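
Besides the implicit teardown at socket destruction (the netlink_sock_destruct() hunk above), the same option lets userspace release a ring explicitly through the nm_block_nr == 0 branch of netlink_set_ring(). A sketch, reusing fd from the setup example near the top:

	struct nl_mmap_req req;

	/* An all-zero request frees the ring; it fails with EBUSY while
	 * a mapping of the ring still exists. */
	memset(&req, 0, sizeof(req));
	setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req));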
@@ -2093,7 +2357,7 @@ static const struct proto_ops netlink_ops = {
 	.getsockopt =	netlink_getsockopt,
 	.sendmsg =	netlink_sendmsg,
 	.recvmsg =	netlink_recvmsg,
-	.mmap =		sock_no_mmap,
+	.mmap =		netlink_mmap,
 	.sendpage =	sock_no_sendpage,
 };
 
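
With netlink_mmap() wired into netlink_ops, userspace maps both rings in one call. netlink_mmap() rejects a nonzero offset and any length other than the exact sum of both rings, and it inserts the RX ring's pages before the TX ring's. A sketch continuing the setup example (add <sys/mman.h>):

	size_t ring_sz = (size_t)req.nm_block_nr * req.nm_block_size;
	char *base, *rx_ring, *tx_ring;

	base = mmap(NULL, 2 * ring_sz, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);
	if (base == MAP_FAILED)
		return -1;
	rx_ring = base;			/* RX ring pages come first */
	tx_ring = base + ring_sz;	/* the TX ring follows immediately */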
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index d9acb2a1d855..ed8522265f4e 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -6,6 +6,20 @@
 #define NLGRPSZ(x)	(ALIGN(x, sizeof(unsigned long) * 8) / 8)
 #define NLGRPLONGS(x)	(NLGRPSZ(x)/sizeof(unsigned long))
 
+struct netlink_ring {
+	void			**pg_vec;
+	unsigned int		head;
+	unsigned int		frames_per_block;
+	unsigned int		frame_size;
+	unsigned int		frame_max;
+
+	unsigned int		pg_vec_order;
+	unsigned int		pg_vec_pages;
+	unsigned int		pg_vec_len;
+
+	atomic_t		pending;
+};
+
 struct netlink_sock {
 	/* struct sock has to be the first member of netlink_sock */
 	struct sock		sk;
@@ -24,6 +38,12 @@ struct netlink_sock {
 	void			(*netlink_rcv)(struct sk_buff *skb);
 	void			(*netlink_bind)(int group);
 	struct module		*module;
+#ifdef CONFIG_NETLINK_MMAP
+	struct mutex		pg_vec_lock;
+	struct netlink_ring	rx_ring;
+	struct netlink_ring	tx_ring;
+	atomic_t		mapped;
+#endif /* CONFIG_NETLINK_MMAP */
 };
 
 static inline struct netlink_sock *nlk_sk(struct sock *sk)
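
struct netlink_ring only records geometry and state at this point; the frame accessors arrive with the follow-up patches. For orientation, the fields imply a lookup along these lines (an illustrative kernel-side sketch, not part of this patch):

	/* Locate frame 'pos' (0..frame_max). Frames never span blocks, so
	 * the block index and the offset inside the block follow directly
	 * from frames_per_block and frame_size. */
	static void *netlink_ring_frame(const struct netlink_ring *ring,
					unsigned int pos)
	{
		unsigned int block = pos / ring->frames_per_block;
		unsigned int off = pos % ring->frames_per_block;

		return ring->pg_vec[block] + off * ring->frame_size;
	}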