author		Salam Noureddine <noureddine@arista.com>	2015-01-27 14:35:48 -0500
committer	David S. Miller <davem@davemloft.net>	2015-01-29 17:41:39 -0500
commit		7866a621043fbaca3d7389e9b9f69dd1a2e5a855 (patch)
tree		d2913952615b3070c6d8f222eed2a84f2f140f85
parent		7b4ce694b2030e7bb41f938cba6a0be4947a5aa5 (diff)
dev: add per net_device packet type chains
When many pf_packet listeners are created on a large number of
interfaces, the current implementation using global packet type lists
scales poorly. This patch adds per net_device packet type lists to
fix this problem.
The patch was originally written by Eric Biederman for linux-2.6.29.
Tested on linux-3.16.
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
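
For context (not part of the patch), the following is a minimal, illustrative
sketch of a device-bound tap using the existing dev_add_pack()/dev_remove_pack()
and struct packet_type API; the names tap_rcv, tap_pt, tap_attach and tap_detach
are hypothetical. With this patch applied, the non-NULL pt->dev makes
ptype_head() return the per-device list (dev->ptype_all here) instead of the
global ptype_all list, so the rx/tx tap loops only walk listeners registered on
that device.

/* Hypothetical example code; only dev_add_pack()/dev_remove_pack() and
 * struct packet_type are existing kernel API.
 */
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Handler invoked for every packet seen on the bound device. */
static int tap_rcv(struct sk_buff *skb, struct net_device *dev,
		   struct packet_type *pt, struct net_device *orig_dev)
{
	/* ... inspect or count the packet here ... */
	kfree_skb(skb);		/* the tap owns this reference */
	return 0;
}

static struct packet_type tap_pt __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),	/* tap all protocols */
	.func = tap_rcv,
};

static void tap_attach(struct net_device *dev)
{
	tap_pt.dev = dev;	/* non-NULL dev: ptype_head() now selects
				 * &dev->ptype_all rather than &ptype_all */
	dev_add_pack(&tap_pt);
}

static void tap_detach(void)
{
	dev_remove_pack(&tap_pt);
}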
 include/linux/netdevice.h |   2
 net/core/dev.c            | 132
 2 files changed, 86 insertions(+), 48 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 642d426a668f..3d37c6eb1732 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1514,6 +1514,8 @@ struct net_device {
 	struct list_head	napi_list;
 	struct list_head	unreg_list;
 	struct list_head	close_list;
+	struct list_head	ptype_all;
+	struct list_head	ptype_specific;
 
 	struct {
 		struct list_head upper;
diff --git a/net/core/dev.c b/net/core/dev.c
index 7f028d441e98..1d564d68e31a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -371,9 +371,10 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 static inline struct list_head *ptype_head(const struct packet_type *pt)
 {
 	if (pt->type == htons(ETH_P_ALL))
-		return &ptype_all;
+		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 	else
-		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+		return pt->dev ? &pt->dev->ptype_specific :
+				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 }
 
 /**
@@ -1734,6 +1735,23 @@ static inline int deliver_skb(struct sk_buff *skb,
 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 }
 
+static inline void deliver_ptype_list_skb(struct sk_buff *skb,
+					  struct packet_type **pt,
+					  struct net_device *dev, __be16 type,
+					  struct list_head *ptype_list)
+{
+	struct packet_type *ptype, *pt_prev = *pt;
+
+	list_for_each_entry_rcu(ptype, ptype_list, list) {
+		if (ptype->type != type)
+			continue;
+		if (pt_prev)
+			deliver_skb(skb, pt_prev, dev);
+		pt_prev = ptype;
+	}
+	*pt = pt_prev;
+}
+
 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
 {
 	if (!ptype->af_packet_priv || !skb->sk)
@@ -1757,45 +1775,54 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 	struct packet_type *ptype;
 	struct sk_buff *skb2 = NULL;
 	struct packet_type *pt_prev = NULL;
+	struct list_head *ptype_list = &ptype_all;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(ptype, &ptype_all, list) {
+again:
+	list_for_each_entry_rcu(ptype, ptype_list, list) {
 		/* Never send packets back to the socket
 		 * they originated from - MvS (miquels@drinkel.ow.org)
 		 */
-		if ((ptype->dev == dev || !ptype->dev) &&
-		    (!skb_loop_sk(ptype, skb))) {
-			if (pt_prev) {
-				deliver_skb(skb2, pt_prev, skb->dev);
-				pt_prev = ptype;
-				continue;
-			}
+		if (skb_loop_sk(ptype, skb))
+			continue;
 
-			skb2 = skb_clone(skb, GFP_ATOMIC);
-			if (!skb2)
-				break;
+		if (pt_prev) {
+			deliver_skb(skb2, pt_prev, skb->dev);
+			pt_prev = ptype;
+			continue;
+		}
 
-			net_timestamp_set(skb2);
+		/* need to clone skb, done only once */
+		skb2 = skb_clone(skb, GFP_ATOMIC);
+		if (!skb2)
+			goto out_unlock;
 
-			/* skb->nh should be correctly
-			   set by sender, so that the second statement is
-			   just protection against buggy protocols.
-			 */
-			skb_reset_mac_header(skb2);
-
-			if (skb_network_header(skb2) < skb2->data ||
-			    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
-				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
-						     ntohs(skb2->protocol),
-						     dev->name);
-				skb_reset_network_header(skb2);
-			}
+		net_timestamp_set(skb2);
 
-			skb2->transport_header = skb2->network_header;
-			skb2->pkt_type = PACKET_OUTGOING;
-			pt_prev = ptype;
+		/* skb->nh should be correctly
+		 * set by sender, so that the second statement is
+		 * just protection against buggy protocols.
+		 */
+		skb_reset_mac_header(skb2);
+
+		if (skb_network_header(skb2) < skb2->data ||
+		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
+			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
+					     ntohs(skb2->protocol),
+					     dev->name);
+			skb_reset_network_header(skb2);
 		}
+
+		skb2->transport_header = skb2->network_header;
+		skb2->pkt_type = PACKET_OUTGOING;
+		pt_prev = ptype;
+	}
+
+	if (ptype_list == &ptype_all) {
+		ptype_list = &dev->ptype_all;
+		goto again;
 	}
+out_unlock:
 	if (pt_prev)
 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 	rcu_read_unlock();
@@ -2617,7 +2644,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev,
 	unsigned int len;
 	int rc;
 
-	if (!list_empty(&ptype_all))
+	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
 		dev_queue_xmit_nit(skb, dev);
 
 	len = skb->len;
@@ -3615,7 +3642,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
 	struct packet_type *ptype, *pt_prev;
 	rx_handler_func_t *rx_handler;
 	struct net_device *orig_dev;
-	struct net_device *null_or_dev;
 	bool deliver_exact = false;
 	int ret = NET_RX_DROP;
 	__be16 type;
@@ -3658,11 +3684,15 @@ another_round:
 		goto skip_taps;
 
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
-		if (!ptype->dev || ptype->dev == skb->dev) {
-			if (pt_prev)
-				ret = deliver_skb(skb, pt_prev, orig_dev);
-			pt_prev = ptype;
-		}
+		if (pt_prev)
+			ret = deliver_skb(skb, pt_prev, orig_dev);
+		pt_prev = ptype;
+	}
+
+	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
+		if (pt_prev)
+			ret = deliver_skb(skb, pt_prev, orig_dev);
+		pt_prev = ptype;
 	}
 
 skip_taps:
@@ -3718,19 +3748,21 @@ ncls:
 		skb->vlan_tci = 0;
 	}
 
+	type = skb->protocol;
+
 	/* deliver only exact match when indicated */
-	null_or_dev = deliver_exact ? skb->dev : NULL;
+	if (likely(!deliver_exact)) {
+		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+				       &ptype_base[ntohs(type) &
+						   PTYPE_HASH_MASK]);
+	}
 
-	type = skb->protocol;
-	list_for_each_entry_rcu(ptype,
-			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
-		if (ptype->type == type &&
-		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
-		     ptype->dev == orig_dev)) {
-			if (pt_prev)
-				ret = deliver_skb(skb, pt_prev, orig_dev);
-			pt_prev = ptype;
-		}
+	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+			       &orig_dev->ptype_specific);
+
+	if (unlikely(skb->dev != orig_dev)) {
+		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+				       &skb->dev->ptype_specific);
 	}
 
 	if (pt_prev) {
@@ -6579,6 +6611,8 @@ void netdev_run_todo(void)
 
 		/* paranoia */
 		BUG_ON(netdev_refcnt_read(dev));
+		BUG_ON(!list_empty(&dev->ptype_all));
+		BUG_ON(!list_empty(&dev->ptype_specific));
 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
 		WARN_ON(dev->dn_ptr);
@@ -6761,6 +6795,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	INIT_LIST_HEAD(&dev->adj_list.lower);
 	INIT_LIST_HEAD(&dev->all_adj_list.upper);
 	INIT_LIST_HEAD(&dev->all_adj_list.lower);
+	INIT_LIST_HEAD(&dev->ptype_all);
+	INIT_LIST_HEAD(&dev->ptype_specific);
 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
 	setup(dev);
 
