author	Florian Westphal <fw@strlen.de>	2016-02-18 09:03:24 -0500
committer	David S. Miller <davem@davemloft.net>	2016-02-18 11:42:18 -0500
commit	d1b4c689d4130bcfd3532680b64db562300716b6 (patch)
tree	720b4285212d2ecf0bd76ca1f30ccf4e3fc67bf1 /net/netlink
parent	7e6e18fbc033e00a4d4af3d4ea7bad0db6b7ad1b (diff)
netlink: remove mmapped netlink support
mmapped netlink has a number of unresolved issues:

- TX zerocopy support had to be disabled more than a year ago via
  commit 4682a0358639b29cf ("netlink: Always copy on mmap TX.") because
  the content of the mmapped area can change after netlink attribute
  validation but before message processing.

- RX support was implemented mainly to speed up nfqueue dumping packet
  payload to userspace. However, since commit ae08ce0021087a5d812d2
  ("netfilter: nfnetlink_queue: zero copy support") we avoid one copy
  with the socket-based interface too (via the skb_zerocopy helper).

The other problem is that skbs attached to mmapped netlink sockets
behave differently from normal skbs:

- They don't have a shinfo area, so all functions that use skb_shinfo()
  (e.g. skb_clone) cannot be used.

- Reserving headroom prevents userspace from seeing the content, as it
  expects the message to start at skb->head. See for instance
  commit aa3a022094fa ("netlink: not trim skb for mmaped socket when dump").

- skbs handed e.g. to netlink_ack must have a non-NULL skb->sk, else we
  crash because it needs the sk to check if a tx ring is attached. This
  is also not obvious and leads to non-intuitive bug fixes such as
  commit 7c7bdf359 ("netfilter: nfnetlink: use original skbuff when
  acking batches").

mmapped netlink also didn't play nicely with the skb_zerocopy helper
used by nfqueue and openvswitch. Daniel Borkmann fixed this via
commit 6bb0fef489f6 ("netlink, mmap: fix edge-case leakages in nf queue
zero-copy"), but at the cost of also needing to provide the remaining
length to the allocation function.

nfqueue also has problems when used with mmapped rx netlink:

- mmapped netlink doesn't allow use of nfqueue batch verdict messages.
  The problem is that in the mmap case the allocation time also
  determines the ordering in which the frame will be seen by userspace
  (A allocating before B means that A is located in an earlier ring
  slot, but this also means that B might get a lower sequence number
  than A, since the seqno is decided later). To fix this we would need
  to extend the spinlocked region to also cover the allocation and
  message setup, which isn't desirable.

- nfqueue can now be configured to queue large (GSO) skbs to userspace.
  Queueing GSO packets is faster than having to force a software
  segmentation in the kernel, so this is a desirable option. However,
  with an mmap-based ring one has to use 64kb per ring slot element,
  else mmap has to fall back to the socket path (NL_MMAP_STATUS_COPY)
  for all large packets.

  To use the mmap interface, userspace not only has to probe for mmap
  netlink support, it also has to implement a recv/socket receive path
  in order to handle messages that exceed the size of an rx ring
  element (a sketch of this probe-and-fallback pattern is included
  after this message).

Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Ken-ichirou MATSUZAWA <chamaken@gmail.com>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
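To make the last point concrete, the sketch below shows what a pre-4.6 consumer of mmapped netlink had to do: probe NETLINK_RX_RING with setsockopt(), mmap() the ring on success, and still keep an ordinary recv() path both for kernels without the feature and for frames larger than a ring slot. This is an illustration only, not code from this patch; the option value and request layout mirror the old uapi headers (copied locally as nl_mmap_req_compat to stay buildable on current headers), the ring dimensions are arbitrary, and ring setup additionally required CAP_NET_ADMIN.

/*
 * Minimal userspace sketch: probe for the old NETLINK_RX_RING option and
 * fall back to plain recv() when it is absent or refused.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/netlink.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK 270
#endif

#define NL_RX_RING_OPT 6		/* old NETLINK_RX_RING socket option value */

struct nl_mmap_req_compat {		/* local copy of the old struct nl_mmap_req layout */
	unsigned int nm_block_size;
	unsigned int nm_block_nr;
	unsigned int nm_frame_size;
	unsigned int nm_frame_nr;
};

static void *try_setup_rx_ring(int fd, size_t *ring_sz)
{
	struct nl_mmap_req_compat req = {
		.nm_block_size = 16384,	/* must be page aligned */
		.nm_block_nr   = 64,
		.nm_frame_size = 16384,	/* one frame per block */
		.nm_frame_nr   = 64,
	};
	void *ring;

	/* Probe: kernels built without CONFIG_NETLINK_MMAP (and kernels after
	 * this removal) reject the option, and we fall back to recv() below. */
	if (setsockopt(fd, SOL_NETLINK, NL_RX_RING_OPT, &req, sizeof(req)) < 0)
		return NULL;

	*ring_sz = (size_t)req.nm_block_size * req.nm_block_nr;
	ring = mmap(NULL, *ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	return ring == MAP_FAILED ? NULL : ring;
}

int main(void)
{
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	size_t ring_sz = 0;
	void *ring;

	if (fd < 0)
		return 1;

	ring = try_setup_rx_ring(fd, &ring_sz);
	if (!ring) {
		/* The recv() path was always needed anyway: even with a ring,
		 * frames larger than a slot arrive via the regular receive
		 * queue (NL_MMAP_STATUS_COPY). */
		char buf[8192];
		ssize_t n = recv(fd, buf, sizeof(buf), MSG_DONTWAIT);

		printf("recv fallback: %zd (%s)\n",
		       n, n < 0 ? strerror(errno) : "ok");
	} else {
		munmap(ring, ring_sz);
	}

	close(fd);
	return 0;
}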
Diffstat (limited to 'net/netlink')
-rw-r--r--	net/netlink/Kconfig	9
-rw-r--r--	net/netlink/af_netlink.c	754
-rw-r--r--	net/netlink/af_netlink.h	15
-rw-r--r--	net/netlink/diag.c	39
4 files changed, 9 insertions, 808 deletions
diff --git a/net/netlink/Kconfig b/net/netlink/Kconfig
index 2c5e95e9bfbd..5d6e8c05b3d4 100644
--- a/net/netlink/Kconfig
+++ b/net/netlink/Kconfig
@@ -2,15 +2,6 @@
 # Netlink Sockets
 #
 
-config NETLINK_MMAP
-	bool "NETLINK: mmaped IO"
-	---help---
-	  This option enables support for memory mapped netlink IO. This
-	  reduces overhead by avoiding copying data between kernel- and
-	  userspace.
-
-	  If unsure, say N.
-
 config NETLINK_DIAG
 	tristate "NETLINK: socket monitoring interface"
 	default n
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index f1ffb34e253f..85aa6ef86dfd 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -225,7 +225,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
 
 	dev_hold(dev);
 
-	if (netlink_skb_is_mmaped(skb) || is_vmalloc_addr(skb->head))
+	if (is_vmalloc_addr(skb->head))
 		nskb = netlink_to_full_skb(skb, GFP_ATOMIC);
 	else
 		nskb = skb_clone(skb, GFP_ATOMIC);
@@ -300,610 +300,8 @@ static void netlink_rcv_wake(struct sock *sk)
 	wake_up_interruptible(&nlk->wait);
 }
 
-#ifdef CONFIG_NETLINK_MMAP
-static bool netlink_rx_is_mmaped(struct sock *sk)
-{
-	return nlk_sk(sk)->rx_ring.pg_vec != NULL;
-}
-
-static bool netlink_tx_is_mmaped(struct sock *sk)
-{
-	return nlk_sk(sk)->tx_ring.pg_vec != NULL;
-}
-
-static __pure struct page *pgvec_to_page(const void *addr)
-{
-	if (is_vmalloc_addr(addr))
-		return vmalloc_to_page(addr);
-	else
-		return virt_to_page(addr);
-}
-
-static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
-{
-	unsigned int i;
-
-	for (i = 0; i < len; i++) {
-		if (pg_vec[i] != NULL) {
-			if (is_vmalloc_addr(pg_vec[i]))
-				vfree(pg_vec[i]);
-			else
-				free_pages((unsigned long)pg_vec[i], order);
-		}
-	}
-	kfree(pg_vec);
-}
-
-static void *alloc_one_pg_vec_page(unsigned long order)
-{
-	void *buffer;
-	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
-			  __GFP_NOWARN | __GFP_NORETRY;
-
-	buffer = (void *)__get_free_pages(gfp_flags, order);
-	if (buffer != NULL)
-		return buffer;
-
-	buffer = vzalloc((1 << order) * PAGE_SIZE);
-	if (buffer != NULL)
-		return buffer;
-
-	gfp_flags &= ~__GFP_NORETRY;
-	return (void *)__get_free_pages(gfp_flags, order);
-}
-
-static void **alloc_pg_vec(struct netlink_sock *nlk,
-			   struct nl_mmap_req *req, unsigned int order)
-{
-	unsigned int block_nr = req->nm_block_nr;
-	unsigned int i;
-	void **pg_vec;
-
-	pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
-	if (pg_vec == NULL)
-		return NULL;
-
-	for (i = 0; i < block_nr; i++) {
-		pg_vec[i] = alloc_one_pg_vec_page(order);
-		if (pg_vec[i] == NULL)
-			goto err1;
-	}
-
-	return pg_vec;
-err1:
-	free_pg_vec(pg_vec, order, block_nr);
-	return NULL;
-}
-
-
-static void
-__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec,
-		   unsigned int order)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct sk_buff_head *queue;
-	struct netlink_ring *ring;
-
-	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
-	ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
-
-	spin_lock_bh(&queue->lock);
-
-	ring->frame_max = req->nm_frame_nr - 1;
-	ring->head = 0;
-	ring->frame_size = req->nm_frame_size;
-	ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE;
-
-	swap(ring->pg_vec_len, req->nm_block_nr);
-	swap(ring->pg_vec_order, order);
-	swap(ring->pg_vec, pg_vec);
-
-	__skb_queue_purge(queue);
-	spin_unlock_bh(&queue->lock);
-
-	WARN_ON(atomic_read(&nlk->mapped));
-
-	if (pg_vec)
-		free_pg_vec(pg_vec, order, req->nm_block_nr);
-}
-
-static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
-			    bool tx_ring)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring;
-	void **pg_vec = NULL;
-	unsigned int order = 0;
-
-	ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
-
-	if (atomic_read(&nlk->mapped))
-		return -EBUSY;
-	if (atomic_read(&ring->pending))
-		return -EBUSY;
-
-	if (req->nm_block_nr) {
-		if (ring->pg_vec != NULL)
-			return -EBUSY;
-
-		if ((int)req->nm_block_size <= 0)
-			return -EINVAL;
-		if (!PAGE_ALIGNED(req->nm_block_size))
-			return -EINVAL;
-		if (req->nm_frame_size < NL_MMAP_HDRLEN)
-			return -EINVAL;
-		if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
-			return -EINVAL;
-
-		ring->frames_per_block = req->nm_block_size /
-					 req->nm_frame_size;
-		if (ring->frames_per_block == 0)
-			return -EINVAL;
-		if (ring->frames_per_block * req->nm_block_nr !=
-		    req->nm_frame_nr)
-			return -EINVAL;
-
-		order = get_order(req->nm_block_size);
-		pg_vec = alloc_pg_vec(nlk, req, order);
-		if (pg_vec == NULL)
-			return -ENOMEM;
-	} else {
-		if (req->nm_frame_nr)
-			return -EINVAL;
-	}
-
-	mutex_lock(&nlk->pg_vec_lock);
-	if (atomic_read(&nlk->mapped) == 0) {
-		__netlink_set_ring(sk, req, tx_ring, pg_vec, order);
-		mutex_unlock(&nlk->pg_vec_lock);
-		return 0;
-	}
-
-	mutex_unlock(&nlk->pg_vec_lock);
-
-	if (pg_vec)
-		free_pg_vec(pg_vec, order, req->nm_block_nr);
-
-	return -EBUSY;
-}
-
-static void netlink_mm_open(struct vm_area_struct *vma)
-{
-	struct file *file = vma->vm_file;
-	struct socket *sock = file->private_data;
-	struct sock *sk = sock->sk;
-
-	if (sk)
-		atomic_inc(&nlk_sk(sk)->mapped);
-}
-
-static void netlink_mm_close(struct vm_area_struct *vma)
-{
-	struct file *file = vma->vm_file;
-	struct socket *sock = file->private_data;
-	struct sock *sk = sock->sk;
-
-	if (sk)
-		atomic_dec(&nlk_sk(sk)->mapped);
-}
-
-static const struct vm_operations_struct netlink_mmap_ops = {
-	.open = netlink_mm_open,
-	.close = netlink_mm_close,
-};
-
-static int netlink_mmap(struct file *file, struct socket *sock,
-			struct vm_area_struct *vma)
-{
-	struct sock *sk = sock->sk;
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring;
-	unsigned long start, size, expected;
-	unsigned int i;
-	int err = -EINVAL;
-
-	if (vma->vm_pgoff)
-		return -EINVAL;
-
-	mutex_lock(&nlk->pg_vec_lock);
-
-	expected = 0;
-	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
-		if (ring->pg_vec == NULL)
-			continue;
-		expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
-	}
-
-	if (expected == 0)
-		goto out;
-
-	size = vma->vm_end - vma->vm_start;
-	if (size != expected)
-		goto out;
-
-	start = vma->vm_start;
-	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
-		if (ring->pg_vec == NULL)
-			continue;
-
-		for (i = 0; i < ring->pg_vec_len; i++) {
-			struct page *page;
-			void *kaddr = ring->pg_vec[i];
-			unsigned int pg_num;
-
-			for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
-				page = pgvec_to_page(kaddr);
-				err = vm_insert_page(vma, start, page);
-				if (err < 0)
-					goto out;
-				start += PAGE_SIZE;
-				kaddr += PAGE_SIZE;
-			}
-		}
-	}
-
-	atomic_inc(&nlk->mapped);
-	vma->vm_ops = &netlink_mmap_ops;
-	err = 0;
-out:
-	mutex_unlock(&nlk->pg_vec_lock);
-	return err;
-}
-
-static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr, unsigned int nm_len)
-{
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
-	struct page *p_start, *p_end;
-
-	/* First page is flushed through netlink_{get,set}_status */
-	p_start = pgvec_to_page(hdr + PAGE_SIZE);
-	p_end = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + nm_len - 1);
-	while (p_start <= p_end) {
-		flush_dcache_page(p_start);
-		p_start++;
-	}
-#endif
-}
-
-static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr)
-{
-	smp_rmb();
-	flush_dcache_page(pgvec_to_page(hdr));
-	return hdr->nm_status;
-}
-
-static void netlink_set_status(struct nl_mmap_hdr *hdr,
-			       enum nl_mmap_status status)
-{
-	smp_mb();
-	hdr->nm_status = status;
-	flush_dcache_page(pgvec_to_page(hdr));
-}
-
-static struct nl_mmap_hdr *
-__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos)
-{
-	unsigned int pg_vec_pos, frame_off;
-
-	pg_vec_pos = pos / ring->frames_per_block;
-	frame_off = pos % ring->frames_per_block;
-
-	return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size);
-}
-
-static struct nl_mmap_hdr *
-netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos,
-		     enum nl_mmap_status status)
-{
-	struct nl_mmap_hdr *hdr;
-
-	hdr = __netlink_lookup_frame(ring, pos);
-	if (netlink_get_status(hdr) != status)
-		return NULL;
-
-	return hdr;
-}
-
-static struct nl_mmap_hdr *
-netlink_current_frame(const struct netlink_ring *ring,
-		      enum nl_mmap_status status)
-{
-	return netlink_lookup_frame(ring, ring->head, status);
-}
-
-static void netlink_increment_head(struct netlink_ring *ring)
-{
-	ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0;
-}
-
-static void netlink_forward_ring(struct netlink_ring *ring)
-{
-	unsigned int head = ring->head;
-	const struct nl_mmap_hdr *hdr;
-
-	do {
-		hdr = __netlink_lookup_frame(ring, ring->head);
-		if (hdr->nm_status == NL_MMAP_STATUS_UNUSED)
-			break;
-		if (hdr->nm_status != NL_MMAP_STATUS_SKIP)
-			break;
-		netlink_increment_head(ring);
-	} while (ring->head != head);
-}
-
-static bool netlink_has_valid_frame(struct netlink_ring *ring)
-{
-	unsigned int head = ring->head, pos = head;
-	const struct nl_mmap_hdr *hdr;
-
-	do {
-		hdr = __netlink_lookup_frame(ring, pos);
-		if (hdr->nm_status == NL_MMAP_STATUS_VALID)
-			return true;
-		pos = pos != 0 ? pos - 1 : ring->frame_max;
-	} while (pos != head);
-
-	return false;
-}
-
-static bool netlink_dump_space(struct netlink_sock *nlk)
-{
-	struct netlink_ring *ring = &nlk->rx_ring;
-	struct nl_mmap_hdr *hdr;
-	unsigned int n;
-
-	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
-	if (hdr == NULL)
-		return false;
-
-	n = ring->head + ring->frame_max / 2;
-	if (n > ring->frame_max)
-		n -= ring->frame_max;
-
-	hdr = __netlink_lookup_frame(ring, n);
-
-	return hdr->nm_status == NL_MMAP_STATUS_UNUSED;
-}
-
-static unsigned int netlink_poll(struct file *file, struct socket *sock,
-				 poll_table *wait)
-{
-	struct sock *sk = sock->sk;
-	struct netlink_sock *nlk = nlk_sk(sk);
-	unsigned int mask;
-	int err;
-
-	if (nlk->rx_ring.pg_vec != NULL) {
-		/* Memory mapped sockets don't call recvmsg(), so flow control
-		 * for dumps is performed here. A dump is allowed to continue
-		 * if at least half the ring is unused.
-		 */
-		while (nlk->cb_running && netlink_dump_space(nlk)) {
-			err = netlink_dump(sk);
-			if (err < 0) {
-				sk->sk_err = -err;
-				sk->sk_error_report(sk);
-				break;
-			}
-		}
-		netlink_rcv_wake(sk);
-	}
-
-	mask = datagram_poll(file, sock, wait);
-
-	/* We could already have received frames in the normal receive
-	 * queue, that will show up as NL_MMAP_STATUS_COPY in the ring,
-	 * so if mask contains pollin/etc already, there's no point
-	 * walking the ring.
-	 */
-	if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) {
-		spin_lock_bh(&sk->sk_receive_queue.lock);
-		if (nlk->rx_ring.pg_vec) {
-			if (netlink_has_valid_frame(&nlk->rx_ring))
-				mask |= POLLIN | POLLRDNORM;
-		}
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
-	}
-
-	spin_lock_bh(&sk->sk_write_queue.lock);
-	if (nlk->tx_ring.pg_vec) {
-		if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED))
-			mask |= POLLOUT | POLLWRNORM;
-	}
-	spin_unlock_bh(&sk->sk_write_queue.lock);
-
-	return mask;
-}
-
-static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb)
-{
-	return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN);
-}
-
-static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
-				   struct netlink_ring *ring,
-				   struct nl_mmap_hdr *hdr)
-{
-	unsigned int size;
-	void *data;
-
-	size = ring->frame_size - NL_MMAP_HDRLEN;
-	data = (void *)hdr + NL_MMAP_HDRLEN;
-
-	skb->head = data;
-	skb->data = data;
-	skb_reset_tail_pointer(skb);
-	skb->end = skb->tail + size;
-	skb->len = 0;
-
-	skb->destructor = netlink_skb_destructor;
-	NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
-	NETLINK_CB(skb).sk = sk;
-}
-
-static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
-				u32 dst_portid, u32 dst_group,
-				struct scm_cookie *scm)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring;
-	struct nl_mmap_hdr *hdr;
-	struct sk_buff *skb;
-	unsigned int maxlen;
-	int err = 0, len = 0;
-
-	mutex_lock(&nlk->pg_vec_lock);
-
-	ring = &nlk->tx_ring;
-	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
-
-	do {
-		unsigned int nm_len;
-
-		hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
-		if (hdr == NULL) {
-			if (!(msg->msg_flags & MSG_DONTWAIT) &&
-			    atomic_read(&nlk->tx_ring.pending))
-				schedule();
-			continue;
-		}
-
-		nm_len = ACCESS_ONCE(hdr->nm_len);
-		if (nm_len > maxlen) {
-			err = -EINVAL;
-			goto out;
-		}
-
-		netlink_frame_flush_dcache(hdr, nm_len);
-
-		skb = alloc_skb(nm_len, GFP_KERNEL);
-		if (skb == NULL) {
-			err = -ENOBUFS;
-			goto out;
-		}
-		__skb_put(skb, nm_len);
-		memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, nm_len);
-		netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
-
-		netlink_increment_head(ring);
-
-		NETLINK_CB(skb).portid = nlk->portid;
-		NETLINK_CB(skb).dst_group = dst_group;
-		NETLINK_CB(skb).creds = scm->creds;
-
-		err = security_netlink_send(sk, skb);
-		if (err) {
-			kfree_skb(skb);
-			goto out;
-		}
-
-		if (unlikely(dst_group)) {
-			atomic_inc(&skb->users);
-			netlink_broadcast(sk, skb, dst_portid, dst_group,
-					  GFP_KERNEL);
-		}
-		err = netlink_unicast(sk, skb, dst_portid,
-				      msg->msg_flags & MSG_DONTWAIT);
-		if (err < 0)
-			goto out;
-		len += err;
-
-	} while (hdr != NULL ||
-		 (!(msg->msg_flags & MSG_DONTWAIT) &&
-		  atomic_read(&nlk->tx_ring.pending)));
-
-	if (len > 0)
-		err = len;
-out:
-	mutex_unlock(&nlk->pg_vec_lock);
-	return err;
-}
-
-static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb)
-{
-	struct nl_mmap_hdr *hdr;
-
-	hdr = netlink_mmap_hdr(skb);
-	hdr->nm_len = skb->len;
-	hdr->nm_group = NETLINK_CB(skb).dst_group;
-	hdr->nm_pid = NETLINK_CB(skb).creds.pid;
-	hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
-	hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
-	netlink_frame_flush_dcache(hdr, hdr->nm_len);
-	netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
-
-	NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED;
-	kfree_skb(skb);
-}
-
-static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring = &nlk->rx_ring;
-	struct nl_mmap_hdr *hdr;
-
-	spin_lock_bh(&sk->sk_receive_queue.lock);
-	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
-	if (hdr == NULL) {
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
-		kfree_skb(skb);
-		netlink_overrun(sk);
-		return;
-	}
-	netlink_increment_head(ring);
-	__skb_queue_tail(&sk->sk_receive_queue, skb);
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-
-	hdr->nm_len = skb->len;
-	hdr->nm_group = NETLINK_CB(skb).dst_group;
-	hdr->nm_pid = NETLINK_CB(skb).creds.pid;
-	hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
-	hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
-	netlink_set_status(hdr, NL_MMAP_STATUS_COPY);
-}
-
-#else /* CONFIG_NETLINK_MMAP */
-#define netlink_rx_is_mmaped(sk)	false
-#define netlink_tx_is_mmaped(sk)	false
-#define netlink_mmap			sock_no_mmap
-#define netlink_poll			datagram_poll
-#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, scm)	0
-#endif /* CONFIG_NETLINK_MMAP */
-
 static void netlink_skb_destructor(struct sk_buff *skb)
 {
-#ifdef CONFIG_NETLINK_MMAP
-	struct nl_mmap_hdr *hdr;
-	struct netlink_ring *ring;
-	struct sock *sk;
-
-	/* If a packet from the kernel to userspace was freed because of an
-	 * error without being delivered to userspace, the kernel must reset
-	 * the status. In the direction userspace to kernel, the status is
-	 * always reset here after the packet was processed and freed.
-	 */
-	if (netlink_skb_is_mmaped(skb)) {
-		hdr = netlink_mmap_hdr(skb);
-		sk = NETLINK_CB(skb).sk;
-
-		if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
-			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
-			ring = &nlk_sk(sk)->tx_ring;
-		} else {
-			if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
-				hdr->nm_len = 0;
-				netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
-			}
-			ring = &nlk_sk(sk)->rx_ring;
-		}
-
-		WARN_ON(atomic_read(&ring->pending) == 0);
-		atomic_dec(&ring->pending);
-		sock_put(sk);
-
-		skb->head = NULL;
-	}
-#endif
 	if (is_vmalloc_addr(skb->head)) {
 		if (!skb->cloned ||
 		    !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
@@ -937,18 +335,6 @@ static void netlink_sock_destruct(struct sock *sk)
 	}
 
 	skb_queue_purge(&sk->sk_receive_queue);
-#ifdef CONFIG_NETLINK_MMAP
-	if (1) {
-		struct nl_mmap_req req;
-
-		memset(&req, 0, sizeof(req));
-		if (nlk->rx_ring.pg_vec)
-			__netlink_set_ring(sk, &req, false, NULL, 0);
-		memset(&req, 0, sizeof(req));
-		if (nlk->tx_ring.pg_vec)
-			__netlink_set_ring(sk, &req, true, NULL, 0);
-	}
-#endif /* CONFIG_NETLINK_MMAP */
 
 	if (!sock_flag(sk, SOCK_DEAD)) {
 		printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
@@ -1194,9 +580,6 @@ static int __netlink_create(struct net *net, struct socket *sock,
 		mutex_init(nlk->cb_mutex);
 	}
 	init_waitqueue_head(&nlk->wait);
-#ifdef CONFIG_NETLINK_MMAP
-	mutex_init(&nlk->pg_vec_lock);
-#endif
 
 	sk->sk_destruct = netlink_sock_destruct;
 	sk->sk_protocol = protocol;
@@ -1728,8 +1111,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 	nlk = nlk_sk(sk);
 
 	if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-	     test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
-	    !netlink_skb_is_mmaped(skb)) {
+	     test_bit(NETLINK_S_CONGESTED, &nlk->state))) {
 		DECLARE_WAITQUEUE(wait, current);
 		if (!*timeo) {
 			if (!ssk || netlink_is_kernel(ssk))
@@ -1767,14 +1149,7 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
 
 	netlink_deliver_tap(skb);
 
-#ifdef CONFIG_NETLINK_MMAP
-	if (netlink_skb_is_mmaped(skb))
-		netlink_queue_mmaped_skb(sk, skb);
-	else if (netlink_rx_is_mmaped(sk))
-		netlink_ring_set_copied(sk, skb);
-	else
-#endif /* CONFIG_NETLINK_MMAP */
-	skb_queue_tail(&sk->sk_receive_queue, skb);
+	skb_queue_tail(&sk->sk_receive_queue, skb);
 	sk->sk_data_ready(sk);
 	return len;
 }
@@ -1798,9 +1173,6 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
 	int delta;
 
 	WARN_ON(skb->sk != NULL);
-	if (netlink_skb_is_mmaped(skb))
-		return skb;
-
 	delta = skb->end - skb->tail;
 	if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
 		return skb;
@@ -1880,71 +1252,6 @@ struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size,
 				    unsigned int ldiff, u32 dst_portid,
 				    gfp_t gfp_mask)
 {
-#ifdef CONFIG_NETLINK_MMAP
-	unsigned int maxlen, linear_size;
-	struct sock *sk = NULL;
-	struct sk_buff *skb;
-	struct netlink_ring *ring;
-	struct nl_mmap_hdr *hdr;
-
-	sk = netlink_getsockbyportid(ssk, dst_portid);
-	if (IS_ERR(sk))
-		goto out;
-
-	ring = &nlk_sk(sk)->rx_ring;
-	/* fast-path without atomic ops for common case: non-mmaped receiver */
-	if (ring->pg_vec == NULL)
-		goto out_put;
-
-	/* We need to account the full linear size needed as a ring
-	 * slot cannot have non-linear parts.
-	 */
-	linear_size = size + ldiff;
-	if (ring->frame_size - NL_MMAP_HDRLEN < linear_size)
-		goto out_put;
-
-	skb = alloc_skb_head(gfp_mask);
-	if (skb == NULL)
-		goto err1;
-
-	spin_lock_bh(&sk->sk_receive_queue.lock);
-	/* check again under lock */
-	if (ring->pg_vec == NULL)
-		goto out_free;
-
-	/* check again under lock */
-	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
-	if (maxlen < linear_size)
-		goto out_free;
-
-	netlink_forward_ring(ring);
-	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
-	if (hdr == NULL)
-		goto err2;
-
-	netlink_ring_setup_skb(skb, sk, ring, hdr);
-	netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
-	atomic_inc(&ring->pending);
-	netlink_increment_head(ring);
-
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-	return skb;
-
-err2:
-	kfree_skb(skb);
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-	netlink_overrun(sk);
-err1:
-	sock_put(sk);
-	return NULL;
-
-out_free:
-	kfree_skb(skb);
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-out_put:
-	sock_put(sk);
-out:
-#endif
 	return alloc_skb(size, gfp_mask);
 }
 EXPORT_SYMBOL_GPL(__netlink_alloc_skb);
@@ -2225,8 +1532,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 	if (level != SOL_NETLINK)
 		return -ENOPROTOOPT;
 
-	if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
-	    optlen >= sizeof(int) &&
+	if (optlen >= sizeof(int) &&
 	    get_user(val, (unsigned int __user *)optval))
 		return -EFAULT;
 
@@ -2279,25 +1585,6 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 		}
 		err = 0;
 		break;
-#ifdef CONFIG_NETLINK_MMAP
-	case NETLINK_RX_RING:
-	case NETLINK_TX_RING: {
-		struct nl_mmap_req req;
-
-		/* Rings might consume more memory than queue limits, require
-		 * CAP_NET_ADMIN.
-		 */
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
-		if (optlen < sizeof(req))
-			return -EINVAL;
-		if (copy_from_user(&req, optval, sizeof(req)))
-			return -EFAULT;
-		err = netlink_set_ring(sk, &req,
-				       optname == NETLINK_TX_RING);
-		break;
-	}
-#endif /* CONFIG_NETLINK_MMAP */
 	case NETLINK_LISTEN_ALL_NSID:
 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
 			return -EPERM;
@@ -2467,18 +1754,6 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 		smp_rmb();
 	}
 
-	/* It's a really convoluted way for userland to ask for mmaped
-	 * sendmsg(), but that's what we've got...
-	 */
-	if (netlink_tx_is_mmaped(sk) &&
-	    iter_is_iovec(&msg->msg_iter) &&
-	    msg->msg_iter.nr_segs == 1 &&
-	    msg->msg_iter.iov->iov_base == NULL) {
-		err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
-					   &scm);
-		goto out;
-	}
-
 	err = -EMSGSIZE;
 	if (len > sk->sk_sndbuf - 32)
 		goto out;
@@ -2794,8 +2069,7 @@ static int netlink_dump(struct sock *sk)
 		goto errout_skb;
 	}
 
-	if (!netlink_rx_is_mmaped(sk) &&
-	    atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
 		goto errout_skb;
 
 	/* NLMSG_GOODSIZE is small to avoid high order allocations being
@@ -2831,8 +2105,7 @@ static int netlink_dump(struct sock *sk)
 	 * reasonable static buffer based on the expected largest dump of a
 	 * single netdev. The outcome is MSG_TRUNC error.
 	 */
-	if (!netlink_rx_is_mmaped(sk))
-		skb_reserve(skb, skb_tailroom(skb) - alloc_size);
+	skb_reserve(skb, skb_tailroom(skb) - alloc_size);
 	netlink_skb_set_owner_r(skb, sk);
 
 	len = cb->dump(skb, cb);
@@ -2884,16 +2157,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 	struct netlink_sock *nlk;
 	int ret;
 
-	/* Memory mapped dump requests need to be copied to avoid looping
-	 * on the pending state in netlink_mmap_sendmsg() while the CB hold
-	 * a reference to the skb.
-	 */
-	if (netlink_skb_is_mmaped(skb)) {
-		skb = skb_copy(skb, GFP_KERNEL);
-		if (skb == NULL)
-			return -ENOBUFS;
-	} else
-		atomic_inc(&skb->users);
+	atomic_inc(&skb->users);
 
 	sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid);
 	if (sk == NULL) {
@@ -3241,7 +2505,7 @@ static const struct proto_ops netlink_ops = {
 	.socketpair = sock_no_socketpair,
 	.accept = sock_no_accept,
 	.getname = netlink_getname,
-	.poll = netlink_poll,
+	.poll = datagram_poll,
 	.ioctl = sock_no_ioctl,
 	.listen = sock_no_listen,
 	.shutdown = sock_no_shutdown,
@@ -3249,7 +2513,7 @@ static const struct proto_ops netlink_ops = {
 	.getsockopt = netlink_getsockopt,
 	.sendmsg = netlink_sendmsg,
 	.recvmsg = netlink_recvmsg,
-	.mmap = netlink_mmap,
+	.mmap = sock_no_mmap,
 	.sendpage = sock_no_sendpage,
 };
 
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index 14437d9b1965..e68ef9ccd703 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -44,12 +44,6 @@ struct netlink_sock {
 	int (*netlink_bind)(struct net *net, int group);
 	void (*netlink_unbind)(struct net *net, int group);
 	struct module *module;
-#ifdef CONFIG_NETLINK_MMAP
-	struct mutex pg_vec_lock;
-	struct netlink_ring rx_ring;
-	struct netlink_ring tx_ring;
-	atomic_t mapped;
-#endif /* CONFIG_NETLINK_MMAP */
 
 	struct rhash_head node;
 	struct rcu_head rcu;
@@ -60,15 +54,6 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk)
 	return container_of(sk, struct netlink_sock, sk);
 }
 
-static inline bool netlink_skb_is_mmaped(const struct sk_buff *skb)
-{
-#ifdef CONFIG_NETLINK_MMAP
-	return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED;
-#else
-	return false;
-#endif /* CONFIG_NETLINK_MMAP */
-}
-
 struct netlink_table {
 	struct rhashtable hash;
 	struct hlist_head mc_list;
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 3ee63a3cff30..8dd836a8dd60 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -8,41 +8,6 @@
 
 #include "af_netlink.h"
 
-#ifdef CONFIG_NETLINK_MMAP
-static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type,
-			    struct sk_buff *nlskb)
-{
-	struct netlink_diag_ring ndr;
-
-	ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
-	ndr.ndr_block_nr = ring->pg_vec_len;
-	ndr.ndr_frame_size = ring->frame_size;
-	ndr.ndr_frame_nr = ring->frame_max + 1;
-
-	return nla_put(nlskb, nl_type, sizeof(ndr), &ndr);
-}
-
-static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	int ret;
-
-	mutex_lock(&nlk->pg_vec_lock);
-	ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb);
-	if (!ret)
-		ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING,
-				       nlskb);
-	mutex_unlock(&nlk->pg_vec_lock);
-
-	return ret;
-}
-#else
-static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb)
-{
-	return 0;
-}
-#endif
-
 static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
@@ -87,10 +52,6 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 	    sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO))
 		goto out_nlmsg_trim;
 
-	if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) &&
-	    sk_diag_put_rings_cfg(sk, skb))
-		goto out_nlmsg_trim;
-
 	nlmsg_end(skb, nlh);
 	return 0;
 