author     David S. Miller <davem@davemloft.net>  2014-07-17 02:30:25 -0400
committer  David S. Miller <davem@davemloft.net>  2014-07-17 02:30:25 -0400
commit     e18a44590a363b6785f12f87167d46be4421ed5b (patch)
tree       cab09b28d188606139b1b50b661f42da157c52ba
parent     3e1c0f0b06e38b50bfca197a6443d639353bb035 (diff)
parent     2dc41cff7545d55c6294525c811594576f8e119c (diff)
Merge branch 'udp_hash'
David Held says:

====================
udp: Fix multicast performance issues.

Fix performance issues with listening to many different multicast
sockets on different addresses with the same port. Instead of always
using hash1, fall back to a hash2 lookup when the hash1 chain is long.

Patch 1 is a general cleanup and simplification which also makes the
main implementation in Patch 2 simpler.

Eric's recent change 63c6f81cdde5 avoided this being an issue in early
demux. This makes it work for regular delivery as well.

v1->v2
 - updated hash collision detection
v2->v3
 - avoid flushing under lock unnecessarily at ARRAY_SIZE boundary
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--  include/net/sock.h |  14
-rw-r--r--  net/ipv4/udp.c     |  77
-rw-r--r--  net/ipv6/udp.c     | 116
3 files changed, 102 insertions(+), 105 deletions(-)
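The approach in outline, distilled from the IPv4 hunks below (a condensed sketch, not standalone code; the helpers and the ">10" threshold are the ones the patch uses):

	/* When the port-only (hash1) chain is long, look up via the
	 * port+address (hash2) table instead.  Two hash2 slots can hold
	 * matching multicast sockets, the one for the packet's destination
	 * address and the one for the wildcard address, so both are scanned.
	 */
	unsigned int use_hash2 = (hslot->count > 10);

	if (use_hash2) {
		hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
			    udp_table.mask;
		hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;
		/* scan udp_table.hash2[hash2]; if hash2 != hash2_any,
		 * scan udp_table.hash2[hash2_any] as well */
	}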
diff --git a/include/net/sock.h b/include/net/sock.h
index 29e48a6d1ded..28f734601b50 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -660,6 +660,20 @@ static inline void sk_add_bind_node(struct sock *sk,
 #define sk_for_each_bound(__sk, list) \
 	hlist_for_each_entry(__sk, list, sk_bind_node)
 
+/**
+ * sk_nulls_for_each_entry_offset - iterate over a list at a given struct offset
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @offset:	offset of hlist_node within the struct.
+ *
+ */
+#define sk_nulls_for_each_entry_offset(tpos, pos, head, offset)	\
+	for (pos = (head)->first;					\
+	     (!is_a_nulls(pos)) &&					\
+		({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;});\
+	     pos = pos->next)
+
 static inline struct user_namespace *sk_user_ns(struct sock *sk)
 {
 	/* Careful only use this in a context where these parameters
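The deliver paths below walk either a hash1 or a hash2 chain with this macro by passing the offset of whichever hlist_nulls_node links the chain being walked; a usage sketch condensed from the patch:

	struct hlist_nulls_node *node;
	struct sock *sk;
	unsigned int offset = offsetof(typeof(*sk), sk_nulls_node);

	if (use_hash2)	/* hash2 chains link through a different member */
		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);

	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
		/* sk now points at the struct sock enclosing *node */
	}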
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 668af516f094..f31053b90ee0 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -594,26 +594,6 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
 	return true;
 }
 
-static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
-					     __be16 loc_port, __be32 loc_addr,
-					     __be16 rmt_port, __be32 rmt_addr,
-					     int dif)
-{
-	struct hlist_nulls_node *node;
-	unsigned short hnum = ntohs(loc_port);
-
-	sk_nulls_for_each_from(sk, node) {
-		if (__udp_is_mcast_sock(net, sk,
-					loc_port, loc_addr,
-					rmt_port, rmt_addr,
-					dif, hnum))
-			goto found;
-	}
-	sk = NULL;
-found:
-	return sk;
-}
-
 /*
  * This routine is called by the ICMP module when it gets some
  * sort of error condition. If err < 0 then the socket should
@@ -1639,6 +1619,8 @@ static void flush_stack(struct sock **stack, unsigned int count,
 
 		if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
 			skb1 = NULL;
+
+		sock_put(sk);
 	}
 	if (unlikely(skb1))
 		kfree_skb(skb1);
@@ -1667,41 +1649,50 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 				    struct udp_table *udptable)
 {
 	struct sock *sk, *stack[256 / sizeof(struct sock *)];
-	struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
-	int dif;
-	unsigned int i, count = 0;
+	struct hlist_nulls_node *node;
+	unsigned short hnum = ntohs(uh->dest);
+	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+	int dif = skb->dev->ifindex;
+	unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+
+	if (use_hash2) {
+		hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
+			    udp_table.mask;
+		hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+start_lookup:
+		hslot = &udp_table.hash2[hash2];
+		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+	}
 
 	spin_lock(&hslot->lock);
-	sk = sk_nulls_head(&hslot->head);
-	dif = skb->dev->ifindex;
-	sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
-	while (sk) {
-		stack[count++] = sk;
-		sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
-				       daddr, uh->source, saddr, dif);
-		if (unlikely(count == ARRAY_SIZE(stack))) {
-			if (!sk)
-				break;
-			flush_stack(stack, count, skb, ~0);
-			count = 0;
-		}
-	}
-	/*
-	 * before releasing chain lock, we must take a reference on sockets
-	 */
-	for (i = 0; i < count; i++)
-		sock_hold(stack[i]);
+	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
+		if (__udp_is_mcast_sock(net, sk,
+					uh->dest, daddr,
+					uh->source, saddr,
+					dif, hnum)) {
+			if (unlikely(count == ARRAY_SIZE(stack))) {
+				flush_stack(stack, count, skb, ~0);
+				count = 0;
+			}
+			stack[count++] = sk;
+			sock_hold(sk);
+		}
+	}
 
 	spin_unlock(&hslot->lock);
 
+	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
+	if (use_hash2 && hash2 != hash2_any) {
+		hash2 = hash2_any;
+		goto start_lookup;
+	}
+
 	/*
 	 * do the slow work with no lock held
 	 */
 	if (count) {
 		flush_stack(stack, count, skb, count - 1);
-
-		for (i = 0; i < count; i++)
-			sock_put(stack[i]);
 	} else {
 		kfree_skb(skb);
 	}
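Two details of the collection loop above, restated in isolation (here "match" stands in for the __udp_is_mcast_sock() test): the reference on each stacked socket is now taken inline and dropped by flush_stack(), replacing the separate sock_hold()/sock_put() loops, and a full stack is flushed only when another match actually needs the slot, which is the v2->v3 change noted in the cover letter:

	if (match) {
		if (unlikely(count == ARRAY_SIZE(stack))) {
			flush_stack(stack, count, skb, ~0);	/* deliver full batch */
			count = 0;
		}
		stack[count++] = sk;
		sock_hold(sk);	/* dropped in flush_stack() */
	}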
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index b4481df3d5fa..f9d8800bb72f 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -702,43 +702,26 @@ drop:
 	return -1;
 }
 
-static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk,
-				      __be16 loc_port, const struct in6_addr *loc_addr,
-				      __be16 rmt_port, const struct in6_addr *rmt_addr,
-				      int dif)
+static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
+				   __be16 loc_port, const struct in6_addr *loc_addr,
+				   __be16 rmt_port, const struct in6_addr *rmt_addr,
+				   int dif, unsigned short hnum)
 {
-	struct hlist_nulls_node *node;
-	unsigned short num = ntohs(loc_port);
-
-	sk_nulls_for_each_from(sk, node) {
-		struct inet_sock *inet = inet_sk(sk);
-
-		if (!net_eq(sock_net(sk), net))
-			continue;
-
-		if (udp_sk(sk)->udp_port_hash == num &&
-		    sk->sk_family == PF_INET6) {
-			if (inet->inet_dport) {
-				if (inet->inet_dport != rmt_port)
-					continue;
-			}
-			if (!ipv6_addr_any(&sk->sk_v6_daddr) &&
-			    !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr))
-				continue;
-
-			if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
-				continue;
-
-			if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
-				if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))
-					continue;
-			}
-			if (!inet6_mc_check(sk, loc_addr, rmt_addr))
-				continue;
-			return sk;
-		}
-	}
-	return NULL;
+	struct inet_sock *inet = inet_sk(sk);
+
+	if (!net_eq(sock_net(sk), net))
+		return false;
+
+	if (udp_sk(sk)->udp_port_hash != hnum ||
+	    sk->sk_family != PF_INET6 ||
+	    (inet->inet_dport && inet->inet_dport != rmt_port) ||
+	    (!ipv6_addr_any(&sk->sk_v6_daddr) &&
+	     !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
+	    (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+		return false;
+	if (!inet6_mc_check(sk, loc_addr, rmt_addr))
+		return false;
+	return true;
 }
 
 static void flush_stack(struct sock **stack, unsigned int count,
@@ -762,6 +745,7 @@ static void flush_stack(struct sock **stack, unsigned int count,
 
 		if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0)
 			skb1 = NULL;
+		sock_put(sk);
 	}
 	if (unlikely(skb1))
 		kfree_skb(skb1);
@@ -787,43 +771,51 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 {
 	struct sock *sk, *stack[256 / sizeof(struct sock *)];
 	const struct udphdr *uh = udp_hdr(skb);
-	struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
-	int dif;
-	unsigned int i, count = 0;
+	struct hlist_nulls_node *node;
+	unsigned short hnum = ntohs(uh->dest);
+	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+	int dif = inet6_iif(skb);
+	unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+
+	if (use_hash2) {
+		hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
+			    udp_table.mask;
+		hash2 = udp6_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+start_lookup:
+		hslot = &udp_table.hash2[hash2];
+		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+	}
 
 	spin_lock(&hslot->lock);
-	sk = sk_nulls_head(&hslot->head);
-	dif = inet6_iif(skb);
-	sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
-	while (sk) {
-		/* If zero checksum and no_check is not on for
-		 * the socket then skip it.
-		 */
-		if (uh->check || udp_sk(sk)->no_check6_rx)
-			stack[count++] = sk;
-
-		sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr,
-				       uh->source, saddr, dif);
-		if (unlikely(count == ARRAY_SIZE(stack))) {
-			if (!sk)
-				break;
-			flush_stack(stack, count, skb, ~0);
-			count = 0;
-		}
-	}
-	/*
-	 * before releasing the lock, we must take reference on sockets
-	 */
-	for (i = 0; i < count; i++)
-		sock_hold(stack[i]);
+	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
+		if (__udp_v6_is_mcast_sock(net, sk,
+					   uh->dest, daddr,
+					   uh->source, saddr,
+					   dif, hnum) &&
+		    /* If zero checksum and no_check is not on for
+		     * the socket then skip it.
+		     */
+		    (uh->check || udp_sk(sk)->no_check6_rx)) {
+			if (unlikely(count == ARRAY_SIZE(stack))) {
+				flush_stack(stack, count, skb, ~0);
+				count = 0;
+			}
+			stack[count++] = sk;
+			sock_hold(sk);
+		}
+	}
 
 	spin_unlock(&hslot->lock);
 
+	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
+	if (use_hash2 && hash2 != hash2_any) {
+		hash2 = hash2_any;
+		goto start_lookup;
+	}
+
 	if (count) {
 		flush_stack(stack, count, skb, count - 1);
-
-		for (i = 0; i < count; i++)
-			sock_put(stack[i]);
 	} else {
 		kfree_skb(skb);
 	}
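For context, the workload the cover letter describes looks roughly like this from userspace; the group addresses and port are made up for illustration, and error handling is omitted:

	#include <arpa/inet.h>
	#include <netinet/in.h>
	#include <string.h>
	#include <sys/socket.h>

	/* One UDP socket per multicast group, all sharing port 5000.
	 * Every such socket hashes into the same hash1 (port-only)
	 * chain, which is what makes the hash2 fallback worthwhile. */
	static int join_group(const char *group, unsigned short port)
	{
		int fd = socket(AF_INET, SOCK_DGRAM, 0);
		struct sockaddr_in addr;
		struct ip_mreq mreq;

		memset(&addr, 0, sizeof(addr));
		addr.sin_family = AF_INET;
		addr.sin_port = htons(port);
		addr.sin_addr.s_addr = inet_addr(group); /* bind to the group */
		bind(fd, (struct sockaddr *)&addr, sizeof(addr));

		mreq.imr_multiaddr.s_addr = inet_addr(group);
		mreq.imr_interface.s_addr = htonl(INADDR_ANY);
		setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
		return fd;
	}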