author	Jens Axboe <jens.axboe@oracle.com>	2010-05-21 15:27:26 -0400
committer	Jens Axboe <jens.axboe@oracle.com>	2010-05-21 15:27:26 -0400
commit	ee9a3607fb03e804ddf624544105f4e34260c380 (patch)
tree	ce41b6e0fa10982a306f6c142a92dbf3c9961284 /net/core
parent	b492e95be0ae672922f4734acf3f5d35c30be948 (diff)
parent	d515e86e639890b33a09390d062b0831664f04a2 (diff)
Merge branch 'master' into for-2.6.35
Conflicts:
	fs/ext3/fsync.c

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Diffstat (limited to 'net/core')
-rw-r--r--	net/core/Makefile	2
-rw-r--r--	net/core/datagram.c	21
-rw-r--r--	net/core/dev.c	1402
-rw-r--r--	net/core/dev_addr_lists.c	741
-rw-r--r--	net/core/dev_mcast.c	232
-rw-r--r--	net/core/dst.c	45
-rw-r--r--	net/core/ethtool.c	152
-rw-r--r--	net/core/fib_rules.c	31
-rw-r--r--	net/core/filter.c	7
-rw-r--r--	net/core/flow.c	405
-rw-r--r--	net/core/net-sysfs.c	377
-rw-r--r--	net/core/net-sysfs.h	1
-rw-r--r--	net/core/net_namespace.c	95
-rw-r--r--	net/core/netpoll.c	26
-rw-r--r--	net/core/pktgen.c	58
-rw-r--r--	net/core/rtnetlink.c	369
-rw-r--r--	net/core/skbuff.c	33
-rw-r--r--	net/core/sock.c	78
-rw-r--r--	net/core/stream.c	22
-rw-r--r--	net/core/sysctl_net_core.c	75
20 files changed, 2731 insertions, 1441 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 08791ac3e05a..51c3eec850ef 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -7,7 +7,7 @@ obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
 
-obj-y += dev.o ethtool.o dev_mcast.o dst.o netevent.o \
+obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
 	neighbour.o rtnetlink.o utils.o link_watch.o filter.o
 
 obj-$(CONFIG_XFRM) += flow.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 2dccd4ee591b..e0097531417a 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -86,7 +86,7 @@ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
 	int error;
 	DEFINE_WAIT_FUNC(wait, receiver_wake_function);
 
-	prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+	prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 
 	/* Socket errors? */
 	error = sock_error(sk);
@@ -115,7 +115,7 @@ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
 	error = 0;
 	*timeo_p = schedule_timeout(*timeo_p);
 out:
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	return error;
 interrupted:
 	error = sock_intr_errno(*timeo_p);
@@ -229,9 +229,18 @@ EXPORT_SYMBOL(skb_free_datagram);
 
 void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
 {
-	lock_sock(sk);
-	skb_free_datagram(sk, skb);
-	release_sock(sk);
+	if (likely(atomic_read(&skb->users) == 1))
+		smp_rmb();
+	else if (likely(!atomic_dec_and_test(&skb->users)))
+		return;
+
+	lock_sock_bh(sk);
+	skb_orphan(skb);
+	sk_mem_reclaim_partial(sk);
+	unlock_sock_bh(sk);
+
+	/* skb is now orphaned, can be freed outside of locked section */
+	__kfree_skb(skb);
 }
 EXPORT_SYMBOL(skb_free_datagram_locked);
 
@@ -726,7 +735,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
 	struct sock *sk = sock->sk;
 	unsigned int mask;
 
-	sock_poll_wait(file, sk->sk_sleep, wait);
+	sock_poll_wait(file, sk_sleep(sk), wait);
 	mask = 0;
 
 	/* exceptional events? */
diff --git a/net/core/dev.c b/net/core/dev.c
index f769098774b7..d273e4e3ecdc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -130,6 +130,7 @@
 #include <linux/jhash.h>
 #include <linux/random.h>
 #include <trace/events/napi.h>
+#include <linux/pci.h>
 
 #include "net-sysfs.h"
 
@@ -207,6 +208,20 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 }
 
+static inline void rps_lock(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+	spin_lock(&sd->input_pkt_queue.lock);
+#endif
+}
+
+static inline void rps_unlock(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+	spin_unlock(&sd->input_pkt_queue.lock);
+#endif
+}
+
 /* Device list insertion */
 static int list_netdevice(struct net_device *dev)
 {
@@ -249,7 +264,7 @@ static RAW_NOTIFIER_HEAD(netdev_chain);
  * queue in the local softnet handler.
  */
 
-DEFINE_PER_CPU(struct softnet_data, softnet_data);
+DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 EXPORT_PER_CPU_SYMBOL(softnet_data);
 
 #ifdef CONFIG_LOCKDEP
@@ -773,14 +788,17 @@ EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 
 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 {
-	struct net_device *dev;
+	struct net_device *dev, *ret = NULL;
 
-	rtnl_lock();
-	dev = __dev_getfirstbyhwtype(net, type);
-	if (dev)
-		dev_hold(dev);
-	rtnl_unlock();
-	return dev;
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev)
+		if (dev->type == type) {
+			dev_hold(dev);
+			ret = dev;
+			break;
+		}
+	rcu_read_unlock();
+	return ret;
 }
 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 
@@ -984,15 +1002,10 @@ int dev_change_name(struct net_device *dev, const char *newname)
 	return err;
 
 rollback:
-	/* For now only devices in the initial network namespace
-	 * are in sysfs.
-	 */
-	if (net_eq(net, &init_net)) {
-		ret = device_rename(&dev->dev, dev->name);
-		if (ret) {
-			memcpy(dev->name, oldname, IFNAMSIZ);
-			return ret;
-		}
+	ret = device_rename(&dev->dev, dev->name);
+	if (ret) {
+		memcpy(dev->name, oldname, IFNAMSIZ);
+		return ret;
 	}
 
 	write_lock_bh(&dev_base_lock);
@@ -1085,9 +1098,9 @@ void netdev_state_change(struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_state_change);
 
-void netdev_bonding_change(struct net_device *dev, unsigned long event)
+int netdev_bonding_change(struct net_device *dev, unsigned long event)
 {
-	call_netdevice_notifiers(event, dev);
+	return call_netdevice_notifiers(event, dev);
 }
 EXPORT_SYMBOL(netdev_bonding_change);
 
@@ -1417,6 +1430,7 @@ EXPORT_SYMBOL(unregister_netdevice_notifier);
 
 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 {
+	ASSERT_RTNL();
 	return raw_notifier_call_chain(&netdev_chain, val, dev);
 }
 
@@ -1435,7 +1449,7 @@ void net_disable_timestamp(void)
 }
 EXPORT_SYMBOL(net_disable_timestamp);
 
-static inline void net_timestamp(struct sk_buff *skb)
+static inline void net_timestamp_set(struct sk_buff *skb)
 {
 	if (atomic_read(&netstamp_needed))
 		__net_timestamp(skb);
@@ -1443,6 +1457,12 @@ static inline void net_timestamp(struct sk_buff *skb)
 		skb->tstamp.tv64 = 0;
 }
 
+static inline void net_timestamp_check(struct sk_buff *skb)
+{
+	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
+		__net_timestamp(skb);
+}
+
 /**
  * dev_forward_skb - loopback an skb to another netif
  *
@@ -1451,7 +1471,7 @@ static inline void net_timestamp(struct sk_buff *skb)
  *
  * return values:
  * NET_RX_SUCCESS (no congestion)
- * NET_RX_DROP (packet was dropped)
+ * NET_RX_DROP (packet was dropped, but freed)
  *
  * dev_forward_skb can be used for injecting an skb from the
  * start_xmit function of one device into the receive queue
@@ -1465,12 +1485,11 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1465{ 1485{
1466 skb_orphan(skb); 1486 skb_orphan(skb);
1467 1487
1468 if (!(dev->flags & IFF_UP)) 1488 if (!(dev->flags & IFF_UP) ||
1469 return NET_RX_DROP; 1489 (skb->len > (dev->mtu + dev->hard_header_len))) {
1470 1490 kfree_skb(skb);
1471 if (skb->len > (dev->mtu + dev->hard_header_len))
1472 return NET_RX_DROP; 1491 return NET_RX_DROP;
1473 1492 }
1474 skb_set_dev(skb, dev); 1493 skb_set_dev(skb, dev);
1475 skb->tstamp.tv64 = 0; 1494 skb->tstamp.tv64 = 0;
1476 skb->pkt_type = PACKET_HOST; 1495 skb->pkt_type = PACKET_HOST;
@@ -1490,9 +1509,9 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1490 1509
1491#ifdef CONFIG_NET_CLS_ACT 1510#ifdef CONFIG_NET_CLS_ACT
1492 if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS))) 1511 if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1493 net_timestamp(skb); 1512 net_timestamp_set(skb);
1494#else 1513#else
1495 net_timestamp(skb); 1514 net_timestamp_set(skb);
1496#endif 1515#endif
1497 1516
1498 rcu_read_lock(); 1517 rcu_read_lock();
@@ -1538,8 +1557,9 @@ static inline void __netif_reschedule(struct Qdisc *q)
1538 1557
1539 local_irq_save(flags); 1558 local_irq_save(flags);
1540 sd = &__get_cpu_var(softnet_data); 1559 sd = &__get_cpu_var(softnet_data);
1541 q->next_sched = sd->output_queue; 1560 q->next_sched = NULL;
1542 sd->output_queue = q; 1561 *sd->output_queue_tailp = q;
1562 sd->output_queue_tailp = &q->next_sched;
1543 raise_softirq_irqoff(NET_TX_SOFTIRQ); 1563 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1544 local_irq_restore(flags); 1564 local_irq_restore(flags);
1545} 1565}
@@ -1784,18 +1804,27 @@ EXPORT_SYMBOL(netdev_rx_csum_fault);
1784 * 2. No high memory really exists on this machine. 1804 * 2. No high memory really exists on this machine.
1785 */ 1805 */
1786 1806
1787static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) 1807static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1788{ 1808{
1789#ifdef CONFIG_HIGHMEM 1809#ifdef CONFIG_HIGHMEM
1790 int i; 1810 int i;
1811 if (!(dev->features & NETIF_F_HIGHDMA)) {
1812 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1813 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1814 return 1;
1815 }
1791 1816
1792 if (dev->features & NETIF_F_HIGHDMA) 1817 if (PCI_DMA_BUS_IS_PHYS) {
1793 return 0; 1818 struct device *pdev = dev->dev.parent;
1794
1795 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1796 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1797 return 1;
1798 1819
1820 if (!pdev)
1821 return 0;
1822 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1823 dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1824 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1825 return 1;
1826 }
1827 }
1799#endif 1828#endif
1800 return 0; 1829 return 0;
1801} 1830}
@@ -1853,6 +1882,17 @@ static int dev_gso_segment(struct sk_buff *skb)
1853 return 0; 1882 return 0;
1854} 1883}
1855 1884
1885/*
1886 * Try to orphan skb early, right before transmission by the device.
1887 * We cannot orphan skb if tx timestamp is requested, since
1888 * drivers need to call skb_tstamp_tx() to send the timestamp.
1889 */
1890static inline void skb_orphan_try(struct sk_buff *skb)
1891{
1892 if (!skb_tx(skb)->flags)
1893 skb_orphan(skb);
1894}
1895
1856int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, 1896int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1857 struct netdev_queue *txq) 1897 struct netdev_queue *txq)
1858{ 1898{
@@ -1863,13 +1903,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1863 if (!list_empty(&ptype_all)) 1903 if (!list_empty(&ptype_all))
1864 dev_queue_xmit_nit(skb, dev); 1904 dev_queue_xmit_nit(skb, dev);
1865 1905
1866 if (netif_needs_gso(dev, skb)) {
1867 if (unlikely(dev_gso_segment(skb)))
1868 goto out_kfree_skb;
1869 if (skb->next)
1870 goto gso;
1871 }
1872
1873 /* 1906 /*
1874 * If device doesnt need skb->dst, release it right now while 1907 * If device doesnt need skb->dst, release it right now while
1875 * its hot in this cpu cache 1908 * its hot in this cpu cache
@@ -1877,23 +1910,18 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1877 if (dev->priv_flags & IFF_XMIT_DST_RELEASE) 1910 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1878 skb_dst_drop(skb); 1911 skb_dst_drop(skb);
1879 1912
1913 skb_orphan_try(skb);
1914
1915 if (netif_needs_gso(dev, skb)) {
1916 if (unlikely(dev_gso_segment(skb)))
1917 goto out_kfree_skb;
1918 if (skb->next)
1919 goto gso;
1920 }
1921
1880 rc = ops->ndo_start_xmit(skb, dev); 1922 rc = ops->ndo_start_xmit(skb, dev);
1881 if (rc == NETDEV_TX_OK) 1923 if (rc == NETDEV_TX_OK)
1882 txq_trans_update(txq); 1924 txq_trans_update(txq);
1883 /*
1884 * TODO: if skb_orphan() was called by
1885 * dev->hard_start_xmit() (for example, the unmodified
1886 * igb driver does that; bnx2 doesn't), then
1887 * skb_tx_software_timestamp() will be unable to send
1888 * back the time stamp.
1889 *
1890 * How can this be prevented? Always create another
1891 * reference to the socket before calling
1892 * dev->hard_start_xmit()? Prevent that skb_orphan()
1893 * does anything in dev->hard_start_xmit() by clearing
1894 * the skb destructor before the call and restoring it
1895 * afterwards, then doing the skb_orphan() ourselves?
1896 */
1897 return rc; 1925 return rc;
1898 } 1926 }
1899 1927
@@ -1932,7 +1960,7 @@ out_kfree_skb:
1932 return rc; 1960 return rc;
1933} 1961}
1934 1962
1935static u32 skb_tx_hashrnd; 1963static u32 hashrnd __read_mostly;
1936 1964
1937u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) 1965u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1938{ 1966{
@@ -1948,9 +1976,9 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1948 if (skb->sk && skb->sk->sk_hash) 1976 if (skb->sk && skb->sk->sk_hash)
1949 hash = skb->sk->sk_hash; 1977 hash = skb->sk->sk_hash;
1950 else 1978 else
1951 hash = skb->protocol; 1979 hash = (__force u16) skb->protocol;
1952 1980
1953 hash = jhash_1word(hash, skb_tx_hashrnd); 1981 hash = jhash_1word(hash, hashrnd);
1954 1982
1955 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); 1983 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1956} 1984}
@@ -1960,10 +1988,9 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
1960{ 1988{
1961 if (unlikely(queue_index >= dev->real_num_tx_queues)) { 1989 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
1962 if (net_ratelimit()) { 1990 if (net_ratelimit()) {
1963 WARN(1, "%s selects TX queue %d, but " 1991 pr_warning("%s selects TX queue %d, but "
1964 "real number of TX queues is %d\n", 1992 "real number of TX queues is %d\n",
1965 dev->name, queue_index, 1993 dev->name, queue_index, dev->real_num_tx_queues);
1966 dev->real_num_tx_queues);
1967 } 1994 }
1968 return 0; 1995 return 0;
1969 } 1996 }
@@ -1990,7 +2017,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1990 queue_index = skb_tx_hash(dev, skb); 2017 queue_index = skb_tx_hash(dev, skb);
1991 2018
1992 if (sk) { 2019 if (sk) {
1993 struct dst_entry *dst = rcu_dereference_bh(sk->sk_dst_cache); 2020 struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
1994 2021
1995 if (dst && skb_dst(skb) == dst) 2022 if (dst && skb_dst(skb) == dst)
1996 sk_tx_queue_set(sk, queue_index); 2023 sk_tx_queue_set(sk, queue_index);
@@ -2020,6 +2047,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2020 * waiting to be sent out; and the qdisc is not running - 2047 * waiting to be sent out; and the qdisc is not running -
2021 * xmit the skb directly. 2048 * xmit the skb directly.
2022 */ 2049 */
2050 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2051 skb_dst_force(skb);
2023 __qdisc_update_bstats(q, skb->len); 2052 __qdisc_update_bstats(q, skb->len);
2024 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) 2053 if (sch_direct_xmit(skb, q, dev, txq, root_lock))
2025 __qdisc_run(q); 2054 __qdisc_run(q);
@@ -2028,6 +2057,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2028 2057
2029 rc = NET_XMIT_SUCCESS; 2058 rc = NET_XMIT_SUCCESS;
2030 } else { 2059 } else {
2060 skb_dst_force(skb);
2031 rc = qdisc_enqueue_root(skb, q); 2061 rc = qdisc_enqueue_root(skb, q);
2032 qdisc_run(q); 2062 qdisc_run(q);
2033 } 2063 }
@@ -2175,11 +2205,249 @@ EXPORT_SYMBOL(dev_queue_xmit);
2175 =======================================================================*/ 2205 =======================================================================*/
2176 2206
2177int netdev_max_backlog __read_mostly = 1000; 2207int netdev_max_backlog __read_mostly = 1000;
2208int netdev_tstamp_prequeue __read_mostly = 1;
2178int netdev_budget __read_mostly = 300; 2209int netdev_budget __read_mostly = 300;
2179int weight_p __read_mostly = 64; /* old backlog weight */ 2210int weight_p __read_mostly = 64; /* old backlog weight */
2180 2211
2181DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; 2212/* Called with irq disabled */
2213static inline void ____napi_schedule(struct softnet_data *sd,
2214 struct napi_struct *napi)
2215{
2216 list_add_tail(&napi->poll_list, &sd->poll_list);
2217 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2218}
2182 2219
2220#ifdef CONFIG_RPS
2221
2222/* One global table that all flow-based protocols share. */
2223struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
2224EXPORT_SYMBOL(rps_sock_flow_table);
2225
2226/*
2227 * get_rps_cpu is called from netif_receive_skb and returns the target
2228 * CPU from the RPS map of the receiving queue for a given skb.
2229 * rcu_read_lock must be held on entry.
2230 */
2231static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2232 struct rps_dev_flow **rflowp)
2233{
2234 struct ipv6hdr *ip6;
2235 struct iphdr *ip;
2236 struct netdev_rx_queue *rxqueue;
2237 struct rps_map *map;
2238 struct rps_dev_flow_table *flow_table;
2239 struct rps_sock_flow_table *sock_flow_table;
2240 int cpu = -1;
2241 u8 ip_proto;
2242 u16 tcpu;
2243 u32 addr1, addr2, ihl;
2244 union {
2245 u32 v32;
2246 u16 v16[2];
2247 } ports;
2248
2249 if (skb_rx_queue_recorded(skb)) {
2250 u16 index = skb_get_rx_queue(skb);
2251 if (unlikely(index >= dev->num_rx_queues)) {
2252 if (net_ratelimit()) {
2253 pr_warning("%s received packet on queue "
2254 "%u, but number of RX queues is %u\n",
2255 dev->name, index, dev->num_rx_queues);
2256 }
2257 goto done;
2258 }
2259 rxqueue = dev->_rx + index;
2260 } else
2261 rxqueue = dev->_rx;
2262
2263 if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
2264 goto done;
2265
2266 if (skb->rxhash)
2267 goto got_hash; /* Skip hash computation on packet header */
2268
2269 switch (skb->protocol) {
2270 case __constant_htons(ETH_P_IP):
2271 if (!pskb_may_pull(skb, sizeof(*ip)))
2272 goto done;
2273
2274 ip = (struct iphdr *) skb->data;
2275 ip_proto = ip->protocol;
2276 addr1 = (__force u32) ip->saddr;
2277 addr2 = (__force u32) ip->daddr;
2278 ihl = ip->ihl;
2279 break;
2280 case __constant_htons(ETH_P_IPV6):
2281 if (!pskb_may_pull(skb, sizeof(*ip6)))
2282 goto done;
2283
2284 ip6 = (struct ipv6hdr *) skb->data;
2285 ip_proto = ip6->nexthdr;
2286 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2287 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2288 ihl = (40 >> 2);
2289 break;
2290 default:
2291 goto done;
2292 }
2293 switch (ip_proto) {
2294 case IPPROTO_TCP:
2295 case IPPROTO_UDP:
2296 case IPPROTO_DCCP:
2297 case IPPROTO_ESP:
2298 case IPPROTO_AH:
2299 case IPPROTO_SCTP:
2300 case IPPROTO_UDPLITE:
2301 if (pskb_may_pull(skb, (ihl * 4) + 4)) {
2302 ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
2303 if (ports.v16[1] < ports.v16[0])
2304 swap(ports.v16[0], ports.v16[1]);
2305 break;
2306 }
2307 default:
2308 ports.v32 = 0;
2309 break;
2310 }
2311
2312 /* get a consistent hash (same value on both flow directions) */
2313 if (addr2 < addr1)
2314 swap(addr1, addr2);
2315 skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2316 if (!skb->rxhash)
2317 skb->rxhash = 1;
2318
2319got_hash:
2320 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2321 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2322 if (flow_table && sock_flow_table) {
2323 u16 next_cpu;
2324 struct rps_dev_flow *rflow;
2325
2326 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2327 tcpu = rflow->cpu;
2328
2329 next_cpu = sock_flow_table->ents[skb->rxhash &
2330 sock_flow_table->mask];
2331
2332 /*
2333 * If the desired CPU (where last recvmsg was done) is
2334 * different from current CPU (one in the rx-queue flow
2335 * table entry), switch if one of the following holds:
2336 * - Current CPU is unset (equal to RPS_NO_CPU).
2337 * - Current CPU is offline.
2338 * - The current CPU's queue tail has advanced beyond the
2339 * last packet that was enqueued using this table entry.
2340 * This guarantees that all previous packets for the flow
2341 * have been dequeued, thus preserving in order delivery.
2342 */
2343 if (unlikely(tcpu != next_cpu) &&
2344 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2345 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2346 rflow->last_qtail)) >= 0)) {
2347 tcpu = rflow->cpu = next_cpu;
2348 if (tcpu != RPS_NO_CPU)
2349 rflow->last_qtail = per_cpu(softnet_data,
2350 tcpu).input_queue_head;
2351 }
2352 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2353 *rflowp = rflow;
2354 cpu = tcpu;
2355 goto done;
2356 }
2357 }
2358
2359 map = rcu_dereference(rxqueue->rps_map);
2360 if (map) {
2361 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2362
2363 if (cpu_online(tcpu)) {
2364 cpu = tcpu;
2365 goto done;
2366 }
2367 }
2368
2369done:
2370 return cpu;
2371}
2372
2373/* Called from hardirq (IPI) context */
2374static void rps_trigger_softirq(void *data)
2375{
2376 struct softnet_data *sd = data;
2377
2378 ____napi_schedule(sd, &sd->backlog);
2379 sd->received_rps++;
2380}
2381
2382#endif /* CONFIG_RPS */
2383
2384/*
2385 * Check if this softnet_data structure is another cpu one
2386 * If yes, queue it to our IPI list and return 1
2387 * If no, return 0
2388 */
2389static int rps_ipi_queued(struct softnet_data *sd)
2390{
2391#ifdef CONFIG_RPS
2392 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2393
2394 if (sd != mysd) {
2395 sd->rps_ipi_next = mysd->rps_ipi_list;
2396 mysd->rps_ipi_list = sd;
2397
2398 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2399 return 1;
2400 }
2401#endif /* CONFIG_RPS */
2402 return 0;
2403}
2404
2405/*
2406 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2407 * queue (may be a remote CPU queue).
2408 */
2409static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2410 unsigned int *qtail)
2411{
2412 struct softnet_data *sd;
2413 unsigned long flags;
2414
2415 sd = &per_cpu(softnet_data, cpu);
2416
2417 local_irq_save(flags);
2418
2419 rps_lock(sd);
2420 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2421 if (skb_queue_len(&sd->input_pkt_queue)) {
2422enqueue:
2423 __skb_queue_tail(&sd->input_pkt_queue, skb);
2424#ifdef CONFIG_RPS
2425 *qtail = sd->input_queue_head +
2426 skb_queue_len(&sd->input_pkt_queue);
2427#endif
2428 rps_unlock(sd);
2429 local_irq_restore(flags);
2430 return NET_RX_SUCCESS;
2431 }
2432
2433 /* Schedule NAPI for backlog device
2434 * We can use non atomic operation since we own the queue lock
2435 */
2436 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2437 if (!rps_ipi_queued(sd))
2438 ____napi_schedule(sd, &sd->backlog);
2439 }
2440 goto enqueue;
2441 }
2442
2443 sd->dropped++;
2444 rps_unlock(sd);
2445
2446 local_irq_restore(flags);
2447
2448 kfree_skb(skb);
2449 return NET_RX_DROP;
2450}
2183 2451
2184/** 2452/**
2185 * netif_rx - post buffer to the network code 2453 * netif_rx - post buffer to the network code
@@ -2198,41 +2466,38 @@ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
2198 2466
2199int netif_rx(struct sk_buff *skb) 2467int netif_rx(struct sk_buff *skb)
2200{ 2468{
2201 struct softnet_data *queue; 2469 int ret;
2202 unsigned long flags;
2203 2470
2204 /* if netpoll wants it, pretend we never saw it */ 2471 /* if netpoll wants it, pretend we never saw it */
2205 if (netpoll_rx(skb)) 2472 if (netpoll_rx(skb))
2206 return NET_RX_DROP; 2473 return NET_RX_DROP;
2207 2474
2208 if (!skb->tstamp.tv64) 2475 if (netdev_tstamp_prequeue)
2209 net_timestamp(skb); 2476 net_timestamp_check(skb);
2210 2477
2211 /* 2478#ifdef CONFIG_RPS
2212 * The code is rearranged so that the path is the most 2479 {
2213 * short when CPU is congested, but is still operating. 2480 struct rps_dev_flow voidflow, *rflow = &voidflow;
2214 */ 2481 int cpu;
2215 local_irq_save(flags);
2216 queue = &__get_cpu_var(softnet_data);
2217 2482
2218 __get_cpu_var(netdev_rx_stat).total++; 2483 rcu_read_lock();
2219 if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
2220 if (queue->input_pkt_queue.qlen) {
2221enqueue:
2222 __skb_queue_tail(&queue->input_pkt_queue, skb);
2223 local_irq_restore(flags);
2224 return NET_RX_SUCCESS;
2225 }
2226 2484
2227 napi_schedule(&queue->backlog); 2485 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2228 goto enqueue; 2486 if (cpu < 0)
2229 } 2487 cpu = smp_processor_id();
2230 2488
2231 __get_cpu_var(netdev_rx_stat).dropped++; 2489 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2232 local_irq_restore(flags);
2233 2490
2234 kfree_skb(skb); 2491 rcu_read_unlock();
2235 return NET_RX_DROP; 2492 }
2493#else
2494 {
2495 unsigned int qtail;
2496 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2497 put_cpu();
2498 }
2499#endif
2500 return ret;
2236} 2501}
2237EXPORT_SYMBOL(netif_rx); 2502EXPORT_SYMBOL(netif_rx);
2238 2503
@@ -2277,6 +2542,7 @@ static void net_tx_action(struct softirq_action *h)
2277 local_irq_disable(); 2542 local_irq_disable();
2278 head = sd->output_queue; 2543 head = sd->output_queue;
2279 sd->output_queue = NULL; 2544 sd->output_queue = NULL;
2545 sd->output_queue_tailp = &sd->output_queue;
2280 local_irq_enable(); 2546 local_irq_enable();
2281 2547
2282 while (head) { 2548 while (head) {
@@ -2353,7 +2619,8 @@ static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2353#endif 2619#endif
2354 2620
2355#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE) 2621#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2356struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly; 2622struct sk_buff *(*macvlan_handle_frame_hook)(struct macvlan_port *p,
2623 struct sk_buff *skb) __read_mostly;
2357EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook); 2624EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2358 2625
2359static inline struct sk_buff *handle_macvlan(struct sk_buff *skb, 2626static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
@@ -2361,14 +2628,17 @@ static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2361 int *ret, 2628 int *ret,
2362 struct net_device *orig_dev) 2629 struct net_device *orig_dev)
2363{ 2630{
2364 if (skb->dev->macvlan_port == NULL) 2631 struct macvlan_port *port;
2632
2633 port = rcu_dereference(skb->dev->macvlan_port);
2634 if (!port)
2365 return skb; 2635 return skb;
2366 2636
2367 if (*pt_prev) { 2637 if (*pt_prev) {
2368 *ret = deliver_skb(skb, *pt_prev, orig_dev); 2638 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2369 *pt_prev = NULL; 2639 *pt_prev = NULL;
2370 } 2640 }
2371 return macvlan_handle_frame_hook(skb); 2641 return macvlan_handle_frame_hook(port, skb);
2372} 2642}
2373#else 2643#else
2374#define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb) 2644#define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb)
@@ -2469,22 +2739,56 @@ void netif_nit_deliver(struct sk_buff *skb)
2469 rcu_read_unlock(); 2739 rcu_read_unlock();
2470} 2740}
2471 2741
2472/** 2742static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2473 * netif_receive_skb - process receive buffer from network 2743 struct net_device *master)
2474 * @skb: buffer to process 2744{
2475 * 2745 if (skb->pkt_type == PACKET_HOST) {
2476 * netif_receive_skb() is the main receive data processing function. 2746 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2477 * It always succeeds. The buffer may be dropped during processing 2747
2478 * for congestion control or by the protocol layers. 2748 memcpy(dest, master->dev_addr, ETH_ALEN);
2479 * 2749 }
2480 * This function may only be called from softirq context and interrupts 2750}
2481 * should be enabled. 2751
2482 * 2752/* On bonding slaves other than the currently active slave, suppress
2483 * Return values (usually ignored): 2753 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2484 * NET_RX_SUCCESS: no congestion 2754 * ARP on active-backup slaves with arp_validate enabled.
2485 * NET_RX_DROP: packet was dropped
2486 */ 2755 */
2487int netif_receive_skb(struct sk_buff *skb) 2756int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2757{
2758 struct net_device *dev = skb->dev;
2759
2760 if (master->priv_flags & IFF_MASTER_ARPMON)
2761 dev->last_rx = jiffies;
2762
2763 if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
2764 /* Do address unmangle. The local destination address
2765 * will be always the one master has. Provides the right
2766 * functionality in a bridge.
2767 */
2768 skb_bond_set_mac_by_master(skb, master);
2769 }
2770
2771 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2772 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2773 skb->protocol == __cpu_to_be16(ETH_P_ARP))
2774 return 0;
2775
2776 if (master->priv_flags & IFF_MASTER_ALB) {
2777 if (skb->pkt_type != PACKET_BROADCAST &&
2778 skb->pkt_type != PACKET_MULTICAST)
2779 return 0;
2780 }
2781 if (master->priv_flags & IFF_MASTER_8023AD &&
2782 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2783 return 0;
2784
2785 return 1;
2786 }
2787 return 0;
2788}
2789EXPORT_SYMBOL(__skb_bond_should_drop);
2790
2791static int __netif_receive_skb(struct sk_buff *skb)
2488{ 2792{
2489 struct packet_type *ptype, *pt_prev; 2793 struct packet_type *ptype, *pt_prev;
2490 struct net_device *orig_dev; 2794 struct net_device *orig_dev;
@@ -2494,8 +2798,8 @@ int netif_receive_skb(struct sk_buff *skb)
2494 int ret = NET_RX_DROP; 2798 int ret = NET_RX_DROP;
2495 __be16 type; 2799 __be16 type;
2496 2800
2497 if (!skb->tstamp.tv64) 2801 if (!netdev_tstamp_prequeue)
2498 net_timestamp(skb); 2802 net_timestamp_check(skb);
2499 2803
2500 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) 2804 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
2501 return NET_RX_SUCCESS; 2805 return NET_RX_SUCCESS;
@@ -2517,7 +2821,7 @@ int netif_receive_skb(struct sk_buff *skb)
2517 skb->dev = master; 2821 skb->dev = master;
2518 } 2822 }
2519 2823
2520 __get_cpu_var(netdev_rx_stat).total++; 2824 __get_cpu_var(softnet_data).processed++;
2521 2825
2522 skb_reset_network_header(skb); 2826 skb_reset_network_header(skb);
2523 skb_reset_transport_header(skb); 2827 skb_reset_transport_header(skb);
@@ -2595,20 +2899,77 @@ out:
2595 rcu_read_unlock(); 2899 rcu_read_unlock();
2596 return ret; 2900 return ret;
2597} 2901}
2902
2903/**
2904 * netif_receive_skb - process receive buffer from network
2905 * @skb: buffer to process
2906 *
2907 * netif_receive_skb() is the main receive data processing function.
2908 * It always succeeds. The buffer may be dropped during processing
2909 * for congestion control or by the protocol layers.
2910 *
2911 * This function may only be called from softirq context and interrupts
2912 * should be enabled.
2913 *
2914 * Return values (usually ignored):
2915 * NET_RX_SUCCESS: no congestion
2916 * NET_RX_DROP: packet was dropped
2917 */
2918int netif_receive_skb(struct sk_buff *skb)
2919{
2920 if (netdev_tstamp_prequeue)
2921 net_timestamp_check(skb);
2922
2923#ifdef CONFIG_RPS
2924 {
2925 struct rps_dev_flow voidflow, *rflow = &voidflow;
2926 int cpu, ret;
2927
2928 rcu_read_lock();
2929
2930 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2931
2932 if (cpu >= 0) {
2933 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2934 rcu_read_unlock();
2935 } else {
2936 rcu_read_unlock();
2937 ret = __netif_receive_skb(skb);
2938 }
2939
2940 return ret;
2941 }
2942#else
2943 return __netif_receive_skb(skb);
2944#endif
2945}
2598EXPORT_SYMBOL(netif_receive_skb); 2946EXPORT_SYMBOL(netif_receive_skb);
2599 2947
2600/* Network device is going away, flush any packets still pending */ 2948/* Network device is going away, flush any packets still pending
2949 * Called with irqs disabled.
2950 */
2601static void flush_backlog(void *arg) 2951static void flush_backlog(void *arg)
2602{ 2952{
2603 struct net_device *dev = arg; 2953 struct net_device *dev = arg;
2604 struct softnet_data *queue = &__get_cpu_var(softnet_data); 2954 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2605 struct sk_buff *skb, *tmp; 2955 struct sk_buff *skb, *tmp;
2606 2956
2607 skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp) 2957 rps_lock(sd);
2958 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
2608 if (skb->dev == dev) { 2959 if (skb->dev == dev) {
2609 __skb_unlink(skb, &queue->input_pkt_queue); 2960 __skb_unlink(skb, &sd->input_pkt_queue);
2610 kfree_skb(skb); 2961 kfree_skb(skb);
2962 input_queue_head_add(sd, 1);
2611 } 2963 }
2964 }
2965 rps_unlock(sd);
2966
2967 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
2968 if (skb->dev == dev) {
2969 __skb_unlink(skb, &sd->process_queue);
2970 kfree_skb(skb);
2971 }
2972 }
2612} 2973}
2613 2974
2614static int napi_gro_complete(struct sk_buff *skb) 2975static int napi_gro_complete(struct sk_buff *skb)
@@ -2911,27 +3272,85 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)
2911} 3272}
2912EXPORT_SYMBOL(napi_gro_frags); 3273EXPORT_SYMBOL(napi_gro_frags);
2913 3274
3275/*
3276 * net_rps_action sends any pending IPI's for rps.
3277 * Note: called with local irq disabled, but exits with local irq enabled.
3278 */
3279static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3280{
3281#ifdef CONFIG_RPS
3282 struct softnet_data *remsd = sd->rps_ipi_list;
3283
3284 if (remsd) {
3285 sd->rps_ipi_list = NULL;
3286
3287 local_irq_enable();
3288
3289 /* Send pending IPI's to kick RPS processing on remote cpus. */
3290 while (remsd) {
3291 struct softnet_data *next = remsd->rps_ipi_next;
3292
3293 if (cpu_online(remsd->cpu))
3294 __smp_call_function_single(remsd->cpu,
3295 &remsd->csd, 0);
3296 remsd = next;
3297 }
3298 } else
3299#endif
3300 local_irq_enable();
3301}
3302
2914static int process_backlog(struct napi_struct *napi, int quota) 3303static int process_backlog(struct napi_struct *napi, int quota)
2915{ 3304{
2916 int work = 0; 3305 int work = 0;
2917 struct softnet_data *queue = &__get_cpu_var(softnet_data); 3306 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
2918 unsigned long start_time = jiffies;
2919 3307
3308#ifdef CONFIG_RPS
3309 /* Check if we have pending ipi, its better to send them now,
3310 * not waiting net_rx_action() end.
3311 */
3312 if (sd->rps_ipi_list) {
3313 local_irq_disable();
3314 net_rps_action_and_irq_enable(sd);
3315 }
3316#endif
2920 napi->weight = weight_p; 3317 napi->weight = weight_p;
2921 do { 3318 local_irq_disable();
3319 while (work < quota) {
2922 struct sk_buff *skb; 3320 struct sk_buff *skb;
3321 unsigned int qlen;
2923 3322
2924 local_irq_disable(); 3323 while ((skb = __skb_dequeue(&sd->process_queue))) {
2925 skb = __skb_dequeue(&queue->input_pkt_queue);
2926 if (!skb) {
2927 __napi_complete(napi);
2928 local_irq_enable(); 3324 local_irq_enable();
2929 break; 3325 __netif_receive_skb(skb);
3326 if (++work >= quota)
3327 return work;
3328 local_irq_disable();
2930 } 3329 }
2931 local_irq_enable();
2932 3330
2933 netif_receive_skb(skb); 3331 rps_lock(sd);
2934 } while (++work < quota && jiffies == start_time); 3332 qlen = skb_queue_len(&sd->input_pkt_queue);
3333 if (qlen) {
3334 input_queue_head_add(sd, qlen);
3335 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3336 &sd->process_queue);
3337 }
3338 if (qlen < quota - work) {
3339 /*
3340 * Inline a custom version of __napi_complete().
3341 * only current cpu owns and manipulates this napi,
3342 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3343 * we can use a plain write instead of clear_bit(),
3344 * and we dont need an smp_mb() memory barrier.
3345 */
3346 list_del(&napi->poll_list);
3347 napi->state = 0;
3348
3349 quota = work + qlen;
3350 }
3351 rps_unlock(sd);
3352 }
3353 local_irq_enable();
2935 3354
2936 return work; 3355 return work;
2937} 3356}
@@ -2947,8 +3366,7 @@ void __napi_schedule(struct napi_struct *n)
2947 unsigned long flags; 3366 unsigned long flags;
2948 3367
2949 local_irq_save(flags); 3368 local_irq_save(flags);
2950 list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list); 3369 ____napi_schedule(&__get_cpu_var(softnet_data), n);
2951 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2952 local_irq_restore(flags); 3370 local_irq_restore(flags);
2953} 3371}
2954EXPORT_SYMBOL(__napi_schedule); 3372EXPORT_SYMBOL(__napi_schedule);
@@ -3019,17 +3437,16 @@ void netif_napi_del(struct napi_struct *napi)
3019} 3437}
3020EXPORT_SYMBOL(netif_napi_del); 3438EXPORT_SYMBOL(netif_napi_del);
3021 3439
3022
3023static void net_rx_action(struct softirq_action *h) 3440static void net_rx_action(struct softirq_action *h)
3024{ 3441{
3025 struct list_head *list = &__get_cpu_var(softnet_data).poll_list; 3442 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3026 unsigned long time_limit = jiffies + 2; 3443 unsigned long time_limit = jiffies + 2;
3027 int budget = netdev_budget; 3444 int budget = netdev_budget;
3028 void *have; 3445 void *have;
3029 3446
3030 local_irq_disable(); 3447 local_irq_disable();
3031 3448
3032 while (!list_empty(list)) { 3449 while (!list_empty(&sd->poll_list)) {
3033 struct napi_struct *n; 3450 struct napi_struct *n;
3034 int work, weight; 3451 int work, weight;
3035 3452
@@ -3047,7 +3464,7 @@ static void net_rx_action(struct softirq_action *h)
3047 * entries to the tail of this list, and only ->poll() 3464 * entries to the tail of this list, and only ->poll()
3048 * calls can remove this head entry from the list. 3465 * calls can remove this head entry from the list.
3049 */ 3466 */
3050 n = list_first_entry(list, struct napi_struct, poll_list); 3467 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3051 3468
3052 have = netpoll_poll_lock(n); 3469 have = netpoll_poll_lock(n);
3053 3470
@@ -3082,13 +3499,13 @@ static void net_rx_action(struct softirq_action *h)
3082 napi_complete(n); 3499 napi_complete(n);
3083 local_irq_disable(); 3500 local_irq_disable();
3084 } else 3501 } else
3085 list_move_tail(&n->poll_list, list); 3502 list_move_tail(&n->poll_list, &sd->poll_list);
3086 } 3503 }
3087 3504
3088 netpoll_poll_unlock(have); 3505 netpoll_poll_unlock(have);
3089 } 3506 }
3090out: 3507out:
3091 local_irq_enable(); 3508 net_rps_action_and_irq_enable(sd);
3092 3509
3093#ifdef CONFIG_NET_DMA 3510#ifdef CONFIG_NET_DMA
3094 /* 3511 /*
@@ -3101,7 +3518,7 @@ out:
3101 return; 3518 return;
3102 3519
3103softnet_break: 3520softnet_break:
3104 __get_cpu_var(netdev_rx_stat).time_squeeze++; 3521 sd->time_squeeze++;
3105 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3522 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3106 goto out; 3523 goto out;
3107} 3524}
@@ -3302,17 +3719,17 @@ static int dev_seq_show(struct seq_file *seq, void *v)
3302 return 0; 3719 return 0;
3303} 3720}
3304 3721
3305static struct netif_rx_stats *softnet_get_online(loff_t *pos) 3722static struct softnet_data *softnet_get_online(loff_t *pos)
3306{ 3723{
3307 struct netif_rx_stats *rc = NULL; 3724 struct softnet_data *sd = NULL;
3308 3725
3309 while (*pos < nr_cpu_ids) 3726 while (*pos < nr_cpu_ids)
3310 if (cpu_online(*pos)) { 3727 if (cpu_online(*pos)) {
3311 rc = &per_cpu(netdev_rx_stat, *pos); 3728 sd = &per_cpu(softnet_data, *pos);
3312 break; 3729 break;
3313 } else 3730 } else
3314 ++*pos; 3731 ++*pos;
3315 return rc; 3732 return sd;
3316} 3733}
3317 3734
3318static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) 3735static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
@@ -3332,12 +3749,12 @@ static void softnet_seq_stop(struct seq_file *seq, void *v)
3332 3749
3333static int softnet_seq_show(struct seq_file *seq, void *v) 3750static int softnet_seq_show(struct seq_file *seq, void *v)
3334{ 3751{
3335 struct netif_rx_stats *s = v; 3752 struct softnet_data *sd = v;
3336 3753
3337 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", 3754 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3338 s->total, s->dropped, s->time_squeeze, 0, 3755 sd->processed, sd->dropped, sd->time_squeeze, 0,
3339 0, 0, 0, 0, /* was fastroute */ 3756 0, 0, 0, 0, /* was fastroute */
3340 s->cpu_collision); 3757 sd->cpu_collision, sd->received_rps);
3341 return 0; 3758 return 0;
3342} 3759}
3343 3760
@@ -3560,11 +3977,10 @@ int netdev_set_master(struct net_device *slave, struct net_device *master)
3560 3977
3561 slave->master = master; 3978 slave->master = master;
3562 3979
3563 synchronize_net(); 3980 if (old) {
3564 3981 synchronize_net();
3565 if (old)
3566 dev_put(old); 3982 dev_put(old);
3567 3983 }
3568 if (master) 3984 if (master)
3569 slave->flags |= IFF_SLAVE; 3985 slave->flags |= IFF_SLAVE;
3570 else 3986 else
@@ -3741,562 +4157,6 @@ void dev_set_rx_mode(struct net_device *dev)
3741 netif_addr_unlock_bh(dev); 4157 netif_addr_unlock_bh(dev);
3742} 4158}
3743 4159
3744/* hw addresses list handling functions */
3745
3746static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
3747 int addr_len, unsigned char addr_type)
3748{
3749 struct netdev_hw_addr *ha;
3750 int alloc_size;
3751
3752 if (addr_len > MAX_ADDR_LEN)
3753 return -EINVAL;
3754
3755 list_for_each_entry(ha, &list->list, list) {
3756 if (!memcmp(ha->addr, addr, addr_len) &&
3757 ha->type == addr_type) {
3758 ha->refcount++;
3759 return 0;
3760 }
3761 }
3762
3763
3764 alloc_size = sizeof(*ha);
3765 if (alloc_size < L1_CACHE_BYTES)
3766 alloc_size = L1_CACHE_BYTES;
3767 ha = kmalloc(alloc_size, GFP_ATOMIC);
3768 if (!ha)
3769 return -ENOMEM;
3770 memcpy(ha->addr, addr, addr_len);
3771 ha->type = addr_type;
3772 ha->refcount = 1;
3773 ha->synced = false;
3774 list_add_tail_rcu(&ha->list, &list->list);
3775 list->count++;
3776 return 0;
3777}
3778
3779static void ha_rcu_free(struct rcu_head *head)
3780{
3781 struct netdev_hw_addr *ha;
3782
3783 ha = container_of(head, struct netdev_hw_addr, rcu_head);
3784 kfree(ha);
3785}
3786
3787static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr,
3788 int addr_len, unsigned char addr_type)
3789{
3790 struct netdev_hw_addr *ha;
3791
3792 list_for_each_entry(ha, &list->list, list) {
3793 if (!memcmp(ha->addr, addr, addr_len) &&
3794 (ha->type == addr_type || !addr_type)) {
3795 if (--ha->refcount)
3796 return 0;
3797 list_del_rcu(&ha->list);
3798 call_rcu(&ha->rcu_head, ha_rcu_free);
3799 list->count--;
3800 return 0;
3801 }
3802 }
3803 return -ENOENT;
3804}
3805
3806static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list,
3807 struct netdev_hw_addr_list *from_list,
3808 int addr_len,
3809 unsigned char addr_type)
3810{
3811 int err;
3812 struct netdev_hw_addr *ha, *ha2;
3813 unsigned char type;
3814
3815 list_for_each_entry(ha, &from_list->list, list) {
3816 type = addr_type ? addr_type : ha->type;
3817 err = __hw_addr_add(to_list, ha->addr, addr_len, type);
3818 if (err)
3819 goto unroll;
3820 }
3821 return 0;
3822
3823unroll:
3824 list_for_each_entry(ha2, &from_list->list, list) {
3825 if (ha2 == ha)
3826 break;
3827 type = addr_type ? addr_type : ha2->type;
3828 __hw_addr_del(to_list, ha2->addr, addr_len, type);
3829 }
3830 return err;
3831}
3832
3833static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
3834 struct netdev_hw_addr_list *from_list,
3835 int addr_len,
3836 unsigned char addr_type)
3837{
3838 struct netdev_hw_addr *ha;
3839 unsigned char type;
3840
3841 list_for_each_entry(ha, &from_list->list, list) {
3842 type = addr_type ? addr_type : ha->type;
3843 __hw_addr_del(to_list, ha->addr, addr_len, addr_type);
3844 }
3845}
3846
3847static int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
3848 struct netdev_hw_addr_list *from_list,
3849 int addr_len)
3850{
3851 int err = 0;
3852 struct netdev_hw_addr *ha, *tmp;
3853
3854 list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3855 if (!ha->synced) {
3856 err = __hw_addr_add(to_list, ha->addr,
3857 addr_len, ha->type);
3858 if (err)
3859 break;
3860 ha->synced = true;
3861 ha->refcount++;
3862 } else if (ha->refcount == 1) {
3863 __hw_addr_del(to_list, ha->addr, addr_len, ha->type);
3864 __hw_addr_del(from_list, ha->addr, addr_len, ha->type);
3865 }
3866 }
3867 return err;
3868}
3869
3870static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
3871 struct netdev_hw_addr_list *from_list,
3872 int addr_len)
3873{
3874 struct netdev_hw_addr *ha, *tmp;
3875
3876 list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3877 if (ha->synced) {
3878 __hw_addr_del(to_list, ha->addr,
3879 addr_len, ha->type);
3880 ha->synced = false;
3881 __hw_addr_del(from_list, ha->addr,
3882 addr_len, ha->type);
3883 }
3884 }
3885}
3886
3887static void __hw_addr_flush(struct netdev_hw_addr_list *list)
3888{
3889 struct netdev_hw_addr *ha, *tmp;
3890
3891 list_for_each_entry_safe(ha, tmp, &list->list, list) {
3892 list_del_rcu(&ha->list);
3893 call_rcu(&ha->rcu_head, ha_rcu_free);
3894 }
3895 list->count = 0;
3896}
3897
3898static void __hw_addr_init(struct netdev_hw_addr_list *list)
3899{
3900 INIT_LIST_HEAD(&list->list);
3901 list->count = 0;
3902}
3903
3904/* Device addresses handling functions */
3905
3906static void dev_addr_flush(struct net_device *dev)
3907{
3908 /* rtnl_mutex must be held here */
3909
3910 __hw_addr_flush(&dev->dev_addrs);
3911 dev->dev_addr = NULL;
3912}
3913
3914static int dev_addr_init(struct net_device *dev)
3915{
3916 unsigned char addr[MAX_ADDR_LEN];
3917 struct netdev_hw_addr *ha;
3918 int err;
3919
3920 /* rtnl_mutex must be held here */
3921
3922 __hw_addr_init(&dev->dev_addrs);
3923 memset(addr, 0, sizeof(addr));
3924 err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
3925 NETDEV_HW_ADDR_T_LAN);
3926 if (!err) {
3927 /*
3928 * Get the first (previously created) address from the list
3929 * and set dev_addr pointer to this location.
3930 */
3931 ha = list_first_entry(&dev->dev_addrs.list,
3932 struct netdev_hw_addr, list);
3933 dev->dev_addr = ha->addr;
3934 }
3935 return err;
3936}
3937
3938/**
3939 * dev_addr_add - Add a device address
3940 * @dev: device
3941 * @addr: address to add
3942 * @addr_type: address type
3943 *
3944 * Add a device address to the device or increase the reference count if
3945 * it already exists.
3946 *
3947 * The caller must hold the rtnl_mutex.
3948 */
3949int dev_addr_add(struct net_device *dev, unsigned char *addr,
3950 unsigned char addr_type)
3951{
3952 int err;
3953
3954 ASSERT_RTNL();
3955
3956 err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
3957 if (!err)
3958 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3959 return err;
3960}
3961EXPORT_SYMBOL(dev_addr_add);
3962
3963/**
3964 * dev_addr_del - Release a device address.
3965 * @dev: device
3966 * @addr: address to delete
3967 * @addr_type: address type
3968 *
3969 * Release reference to a device address and remove it from the device
3970 * if the reference count drops to zero.
3971 *
3972 * The caller must hold the rtnl_mutex.
3973 */
3974int dev_addr_del(struct net_device *dev, unsigned char *addr,
3975 unsigned char addr_type)
3976{
3977 int err;
3978 struct netdev_hw_addr *ha;
3979
3980 ASSERT_RTNL();
3981
3982 /*
3983 * We can not remove the first address from the list because
3984 * dev->dev_addr points to that.
3985 */
3986 ha = list_first_entry(&dev->dev_addrs.list,
3987 struct netdev_hw_addr, list);
3988 if (ha->addr == dev->dev_addr && ha->refcount == 1)
3989 return -ENOENT;
3990
3991 err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
3992 addr_type);
3993 if (!err)
3994 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3995 return err;
3996}
3997EXPORT_SYMBOL(dev_addr_del);
3998
3999/**
4000 * dev_addr_add_multiple - Add device addresses from another device
4001 * @to_dev: device to which addresses will be added
4002 * @from_dev: device from which addresses will be added
4003 * @addr_type: address type - 0 means type will be used from from_dev
4004 *
4005 * Add device addresses of the one device to another.
4006 **
4007 * The caller must hold the rtnl_mutex.
4008 */
4009int dev_addr_add_multiple(struct net_device *to_dev,
4010 struct net_device *from_dev,
4011 unsigned char addr_type)
4012{
4013 int err;
4014
4015 ASSERT_RTNL();
4016
4017 if (from_dev->addr_len != to_dev->addr_len)
4018 return -EINVAL;
4019 err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
4020 to_dev->addr_len, addr_type);
4021 if (!err)
4022 call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
4023 return err;
4024}
4025EXPORT_SYMBOL(dev_addr_add_multiple);
4026
4027/**
4028 * dev_addr_del_multiple - Delete device addresses by another device
4029 * @to_dev: device where the addresses will be deleted
4030 * @from_dev: device by which addresses the addresses will be deleted
4031 * @addr_type: address type - 0 means type will used from from_dev
4032 *
4033 * Deletes addresses in to device by the list of addresses in from device.
4034 *
4035 * The caller must hold the rtnl_mutex.
4036 */
4037int dev_addr_del_multiple(struct net_device *to_dev,
4038 struct net_device *from_dev,
4039 unsigned char addr_type)
4040{
4041 ASSERT_RTNL();
4042
4043 if (from_dev->addr_len != to_dev->addr_len)
4044 return -EINVAL;
4045 __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
4046 to_dev->addr_len, addr_type);
4047 call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
4048 return 0;
4049}
4050EXPORT_SYMBOL(dev_addr_del_multiple);
4051
4052/* multicast addresses handling functions */
4053
4054int __dev_addr_delete(struct dev_addr_list **list, int *count,
4055 void *addr, int alen, int glbl)
4056{
4057 struct dev_addr_list *da;
4058
4059 for (; (da = *list) != NULL; list = &da->next) {
4060 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
4061 alen == da->da_addrlen) {
4062 if (glbl) {
4063 int old_glbl = da->da_gusers;
4064 da->da_gusers = 0;
4065 if (old_glbl == 0)
4066 break;
4067 }
4068 if (--da->da_users)
4069 return 0;
4070
4071 *list = da->next;
4072 kfree(da);
4073 (*count)--;
4074 return 0;
4075 }
4076 }
4077 return -ENOENT;
4078}
4079
4080int __dev_addr_add(struct dev_addr_list **list, int *count,
4081 void *addr, int alen, int glbl)
4082{
4083 struct dev_addr_list *da;
4084
4085 for (da = *list; da != NULL; da = da->next) {
4086 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
4087 da->da_addrlen == alen) {
4088 if (glbl) {
4089 int old_glbl = da->da_gusers;
4090 da->da_gusers = 1;
4091 if (old_glbl)
4092 return 0;
4093 }
4094 da->da_users++;
4095 return 0;
4096 }
4097 }
4098
4099 da = kzalloc(sizeof(*da), GFP_ATOMIC);
4100 if (da == NULL)
4101 return -ENOMEM;
4102 memcpy(da->da_addr, addr, alen);
4103 da->da_addrlen = alen;
4104 da->da_users = 1;
4105 da->da_gusers = glbl ? 1 : 0;
4106 da->next = *list;
4107 *list = da;
4108 (*count)++;
4109 return 0;
4110}
4111
4112/**
4113 * dev_unicast_delete - Release secondary unicast address.
4114 * @dev: device
4115 * @addr: address to delete
4116 *
4117 * Release reference to a secondary unicast address and remove it
4118 * from the device if the reference count drops to zero.
4119 *
4120 * The caller must hold the rtnl_mutex.
4121 */
4122int dev_unicast_delete(struct net_device *dev, void *addr)
4123{
4124 int err;
4125
4126 ASSERT_RTNL();
4127
4128 netif_addr_lock_bh(dev);
4129 err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
4130 NETDEV_HW_ADDR_T_UNICAST);
4131 if (!err)
4132 __dev_set_rx_mode(dev);
4133 netif_addr_unlock_bh(dev);
4134 return err;
4135}
4136EXPORT_SYMBOL(dev_unicast_delete);
4137
4138/**
4139 * dev_unicast_add - add a secondary unicast address
4140 * @dev: device
4141 * @addr: address to add
4142 *
4143 * Add a secondary unicast address to the device or increase
4144 * the reference count if it already exists.
4145 *
4146 * The caller must hold the rtnl_mutex.
4147 */
4148int dev_unicast_add(struct net_device *dev, void *addr)
4149{
4150 int err;
4151
4152 ASSERT_RTNL();
4153
4154 netif_addr_lock_bh(dev);
4155 err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
4156 NETDEV_HW_ADDR_T_UNICAST);
4157 if (!err)
4158 __dev_set_rx_mode(dev);
4159 netif_addr_unlock_bh(dev);
4160 return err;
4161}
4162EXPORT_SYMBOL(dev_unicast_add);
4163
4164int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
4165 struct dev_addr_list **from, int *from_count)
4166{
4167 struct dev_addr_list *da, *next;
4168 int err = 0;
4169
4170 da = *from;
4171 while (da != NULL) {
4172 next = da->next;
4173 if (!da->da_synced) {
4174 err = __dev_addr_add(to, to_count,
4175 da->da_addr, da->da_addrlen, 0);
4176 if (err < 0)
4177 break;
4178 da->da_synced = 1;
4179 da->da_users++;
4180 } else if (da->da_users == 1) {
4181 __dev_addr_delete(to, to_count,
4182 da->da_addr, da->da_addrlen, 0);
4183 __dev_addr_delete(from, from_count,
4184 da->da_addr, da->da_addrlen, 0);
4185 }
4186 da = next;
4187 }
4188 return err;
4189}
4190EXPORT_SYMBOL_GPL(__dev_addr_sync);
4191
4192void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
4193 struct dev_addr_list **from, int *from_count)
4194{
4195 struct dev_addr_list *da, *next;
4196
4197 da = *from;
4198 while (da != NULL) {
4199 next = da->next;
4200 if (da->da_synced) {
4201 __dev_addr_delete(to, to_count,
4202 da->da_addr, da->da_addrlen, 0);
4203 da->da_synced = 0;
4204 __dev_addr_delete(from, from_count,
4205 da->da_addr, da->da_addrlen, 0);
4206 }
4207 da = next;
4208 }
4209}
4210EXPORT_SYMBOL_GPL(__dev_addr_unsync);
4211
4212/**
4213 * dev_unicast_sync - Synchronize device's unicast list to another device
4214 * @to: destination device
4215 * @from: source device
4216 *
4217 * Add newly added addresses to the destination device and release
4218 * addresses that have no users left. The source device must be
4219 * locked by netif_tx_lock_bh.
4220 *
4221 * This function is intended to be called from the dev->set_rx_mode
4222 * function of layered software devices.
4223 */
4224int dev_unicast_sync(struct net_device *to, struct net_device *from)
4225{
4226 int err = 0;
4227
4228 if (to->addr_len != from->addr_len)
4229 return -EINVAL;
4230
4231 netif_addr_lock_bh(to);
4232 err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
4233 if (!err)
4234 __dev_set_rx_mode(to);
4235 netif_addr_unlock_bh(to);
4236 return err;
4237}
4238EXPORT_SYMBOL(dev_unicast_sync);
4239
4240/**
4241 * dev_unicast_unsync - Remove synchronized addresses from the destination device
4242 * @to: destination device
4243 * @from: source device
4244 *
4245 * Remove all addresses that were added to the destination device by
4246 * dev_unicast_sync(). This function is intended to be called from the
4247 * dev->stop function of layered software devices.
4248 */
4249void dev_unicast_unsync(struct net_device *to, struct net_device *from)
4250{
4251 if (to->addr_len != from->addr_len)
4252 return;
4253
4254 netif_addr_lock_bh(from);
4255 netif_addr_lock(to);
4256 __hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
4257 __dev_set_rx_mode(to);
4258 netif_addr_unlock(to);
4259 netif_addr_unlock_bh(from);
4260}
4261EXPORT_SYMBOL(dev_unicast_unsync);
4262
4263static void dev_unicast_flush(struct net_device *dev)
4264{
4265 netif_addr_lock_bh(dev);
4266 __hw_addr_flush(&dev->uc);
4267 netif_addr_unlock_bh(dev);
4268}
4269
4270static void dev_unicast_init(struct net_device *dev)
4271{
4272 __hw_addr_init(&dev->uc);
4273}
4274
4275
4276static void __dev_addr_discard(struct dev_addr_list **list)
4277{
4278 struct dev_addr_list *tmp;
4279
4280 while (*list != NULL) {
4281 tmp = *list;
4282 *list = tmp->next;
4283 if (tmp->da_users > tmp->da_gusers)
4284 printk("__dev_addr_discard: address leakage! "
4285 "da_users=%d\n", tmp->da_users);
4286 kfree(tmp);
4287 }
4288}
4289
4290static void dev_addr_discard(struct net_device *dev)
4291{
4292 netif_addr_lock_bh(dev);
4293
4294 __dev_addr_discard(&dev->mc_list);
4295 netdev_mc_count(dev) = 0;
4296
4297 netif_addr_unlock_bh(dev);
4298}
4299
4300/** 4160/**
4301 * dev_get_flags - get flags reported to userspace 4161 * dev_get_flags - get flags reported to userspace
4302 * @dev: device 4162 * @dev: device
@@ -4607,8 +4467,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4607 return -EINVAL; 4467 return -EINVAL;
4608 if (!netif_device_present(dev)) 4468 if (!netif_device_present(dev))
4609 return -ENODEV; 4469 return -ENODEV;
4610 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data, 4470 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4611 dev->addr_len, 1);
4612 4471
4613 case SIOCDELMULTI: 4472 case SIOCDELMULTI:
4614 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || 4473 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
@@ -4616,8 +4475,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4616 return -EINVAL; 4475 return -EINVAL;
4617 if (!netif_device_present(dev)) 4476 if (!netif_device_present(dev))
4618 return -ENODEV; 4477 return -ENODEV;
4619 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, 4478 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4620 dev->addr_len, 1);
4621 4479
4622 case SIOCSIFTXQLEN: 4480 case SIOCSIFTXQLEN:
4623 if (ifr->ifr_qlen < 0) 4481 if (ifr->ifr_qlen < 0)
@@ -4924,8 +4782,8 @@ static void rollback_registered_many(struct list_head *head)
4924 /* 4782 /*
4925 * Flush the unicast and multicast chains 4783 * Flush the unicast and multicast chains
4926 */ 4784 */
4927 dev_unicast_flush(dev); 4785 dev_uc_flush(dev);
4928 dev_addr_discard(dev); 4786 dev_mc_flush(dev);
4929 4787
4930 if (dev->netdev_ops->ndo_uninit) 4788 if (dev->netdev_ops->ndo_uninit)
4931 dev->netdev_ops->ndo_uninit(dev); 4789 dev->netdev_ops->ndo_uninit(dev);
@@ -5074,6 +4932,24 @@ int register_netdevice(struct net_device *dev)
5074 4932
5075 dev->iflink = -1; 4933 dev->iflink = -1;
5076 4934
4935#ifdef CONFIG_RPS
4936 if (!dev->num_rx_queues) {
4937 /*
4938 * Allocate a single RX queue if driver never called
4939 * alloc_netdev_mq
4940 */
4941
4942 dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
4943 if (!dev->_rx) {
4944 ret = -ENOMEM;
4945 goto out;
4946 }
4947
4948 dev->_rx->first = dev->_rx;
4949 atomic_set(&dev->_rx->count, 1);
4950 dev->num_rx_queues = 1;
4951 }
4952#endif
5077 /* Init, if this function is available */ 4953 /* Init, if this function is available */
5078 if (dev->netdev_ops->ndo_init) { 4954 if (dev->netdev_ops->ndo_init) {
5079 ret = dev->netdev_ops->ndo_init(dev); 4955 ret = dev->netdev_ops->ndo_init(dev);
@@ -5113,8 +4989,6 @@ int register_netdevice(struct net_device *dev)
5113 if (dev->features & NETIF_F_SG) 4989 if (dev->features & NETIF_F_SG)
5114 dev->features |= NETIF_F_GSO; 4990 dev->features |= NETIF_F_GSO;
5115 4991
5116 netdev_initialize_kobject(dev);
5117
5118 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); 4992 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5119 ret = notifier_to_errno(ret); 4993 ret = notifier_to_errno(ret);
5120 if (ret) 4994 if (ret)
@@ -5434,6 +5308,10 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5434 struct net_device *dev; 5308 struct net_device *dev;
5435 size_t alloc_size; 5309 size_t alloc_size;
5436 struct net_device *p; 5310 struct net_device *p;
5311#ifdef CONFIG_RPS
5312 struct netdev_rx_queue *rx;
5313 int i;
5314#endif
5437 5315
5438 BUG_ON(strlen(name) >= sizeof(dev->name)); 5316 BUG_ON(strlen(name) >= sizeof(dev->name));
5439 5317
@@ -5459,13 +5337,32 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5459 goto free_p; 5337 goto free_p;
5460 } 5338 }
5461 5339
5340#ifdef CONFIG_RPS
5341 rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5342 if (!rx) {
5343 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5344 "rx queues.\n");
5345 goto free_tx;
5346 }
5347
5348 atomic_set(&rx->count, queue_count);
5349
5350 /*
5351 * Set a pointer to first element in the array which holds the
5352 * reference count.
5353 */
5354 for (i = 0; i < queue_count; i++)
5355 rx[i].first = rx;
5356#endif
5357
5462 dev = PTR_ALIGN(p, NETDEV_ALIGN); 5358 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5463 dev->padded = (char *)dev - (char *)p; 5359 dev->padded = (char *)dev - (char *)p;
5464 5360
5465 if (dev_addr_init(dev)) 5361 if (dev_addr_init(dev))
5466 goto free_tx; 5362 goto free_rx;
5467 5363
5468 dev_unicast_init(dev); 5364 dev_mc_init(dev);
5365 dev_uc_init(dev);
5469 5366
5470 dev_net_set(dev, &init_net); 5367 dev_net_set(dev, &init_net);
5471 5368
@@ -5473,6 +5370,11 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5473 dev->num_tx_queues = queue_count; 5370 dev->num_tx_queues = queue_count;
5474 dev->real_num_tx_queues = queue_count; 5371 dev->real_num_tx_queues = queue_count;
5475 5372
5373#ifdef CONFIG_RPS
5374 dev->_rx = rx;
5375 dev->num_rx_queues = queue_count;
5376#endif
5377
5476 dev->gso_max_size = GSO_MAX_SIZE; 5378 dev->gso_max_size = GSO_MAX_SIZE;
5477 5379
5478 netdev_init_queues(dev); 5380 netdev_init_queues(dev);
@@ -5487,9 +5389,12 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5487 strcpy(dev->name, name); 5389 strcpy(dev->name, name);
5488 return dev; 5390 return dev;
5489 5391
5392free_rx:
5393#ifdef CONFIG_RPS
5394 kfree(rx);
5490free_tx: 5395free_tx:
5396#endif
5491 kfree(tx); 5397 kfree(tx);
5492
5493free_p: 5398free_p:
5494 kfree(p); 5399 kfree(p);
5495 return NULL; 5400 return NULL;
@@ -5635,15 +5540,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
5635 if (dev->features & NETIF_F_NETNS_LOCAL) 5540 if (dev->features & NETIF_F_NETNS_LOCAL)
5636 goto out; 5541 goto out;
5637 5542
5638#ifdef CONFIG_SYSFS
5639 /* Don't allow real devices to be moved when sysfs
5640 * is enabled.
5641 */
5642 err = -EINVAL;
5643 if (dev->dev.parent)
5644 goto out;
5645#endif
5646
5647 /* Ensure the device has been registrered */ 5543 /* Ensure the device has been registrered */
5648 err = -EINVAL; 5544 err = -EINVAL;
5649 if (dev->reg_state != NETREG_REGISTERED) 5545 if (dev->reg_state != NETREG_REGISTERED)
@@ -5691,10 +5587,8 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
5691 /* 5587 /*
5692 * Flush the unicast and multicast chains 5588 * Flush the unicast and multicast chains
5693 */ 5589 */
5694 dev_unicast_flush(dev); 5590 dev_uc_flush(dev);
5695 dev_addr_discard(dev); 5591 dev_mc_flush(dev);
5696
5697 netdev_unregister_kobject(dev);
5698 5592
5699 /* Actually switch the network namespace */ 5593 /* Actually switch the network namespace */
5700 dev_net_set(dev, net); 5594 dev_net_set(dev, net);
@@ -5708,7 +5602,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
5708 } 5602 }
5709 5603
5710 /* Fixup kobjects */ 5604 /* Fixup kobjects */
5711 err = netdev_register_kobject(dev); 5605 err = device_rename(&dev->dev, dev->name);
5712 WARN_ON(err); 5606 WARN_ON(err);
5713 5607
5714 /* Add the device back in the hashes */ 5608 /* Add the device back in the hashes */
@@ -5735,7 +5629,6 @@ static int dev_cpu_callback(struct notifier_block *nfb,
5735 void *ocpu) 5629 void *ocpu)
5736{ 5630{
5737 struct sk_buff **list_skb; 5631 struct sk_buff **list_skb;
5738 struct Qdisc **list_net;
5739 struct sk_buff *skb; 5632 struct sk_buff *skb;
5740 unsigned int cpu, oldcpu = (unsigned long)ocpu; 5633 unsigned int cpu, oldcpu = (unsigned long)ocpu;
5741 struct softnet_data *sd, *oldsd; 5634 struct softnet_data *sd, *oldsd;
@@ -5756,19 +5649,23 @@ static int dev_cpu_callback(struct notifier_block *nfb,
5756 *list_skb = oldsd->completion_queue; 5649 *list_skb = oldsd->completion_queue;
5757 oldsd->completion_queue = NULL; 5650 oldsd->completion_queue = NULL;
5758 5651
5759 /* Find end of our output_queue. */
5760 list_net = &sd->output_queue;
5761 while (*list_net)
5762 list_net = &(*list_net)->next_sched;
5763 /* Append output queue from offline CPU. */ 5652 /* Append output queue from offline CPU. */
5764 *list_net = oldsd->output_queue; 5653 if (oldsd->output_queue) {
5765 oldsd->output_queue = NULL; 5654 *sd->output_queue_tailp = oldsd->output_queue;
5655 sd->output_queue_tailp = oldsd->output_queue_tailp;
5656 oldsd->output_queue = NULL;
5657 oldsd->output_queue_tailp = &oldsd->output_queue;
5658 }
5766 5659
5767 raise_softirq_irqoff(NET_TX_SOFTIRQ); 5660 raise_softirq_irqoff(NET_TX_SOFTIRQ);
5768 local_irq_enable(); 5661 local_irq_enable();
5769 5662
5770 /* Process offline CPU's input_pkt_queue */ 5663 /* Process offline CPU's input_pkt_queue */
5771 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) 5664 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
5665 netif_rx(skb);
5666 input_queue_head_add(oldsd, 1);
5667 }
5668 while ((skb = __skb_dequeue(&oldsd->process_queue)))
5772 netif_rx(skb); 5669 netif_rx(skb);
5773 5670
5774 return NOTIFY_OK; 5671 return NOTIFY_OK;
@@ -5985,17 +5882,26 @@ static int __init net_dev_init(void)
5985 */ 5882 */
5986 5883
5987 for_each_possible_cpu(i) { 5884 for_each_possible_cpu(i) {
5988 struct softnet_data *queue; 5885 struct softnet_data *sd = &per_cpu(softnet_data, i);
5989 5886
5990 queue = &per_cpu(softnet_data, i); 5887 memset(sd, 0, sizeof(*sd));
5991 skb_queue_head_init(&queue->input_pkt_queue); 5888 skb_queue_head_init(&sd->input_pkt_queue);
5992 queue->completion_queue = NULL; 5889 skb_queue_head_init(&sd->process_queue);
5993 INIT_LIST_HEAD(&queue->poll_list); 5890 sd->completion_queue = NULL;
5891 INIT_LIST_HEAD(&sd->poll_list);
5892 sd->output_queue = NULL;
5893 sd->output_queue_tailp = &sd->output_queue;
5894#ifdef CONFIG_RPS
5895 sd->csd.func = rps_trigger_softirq;
5896 sd->csd.info = sd;
5897 sd->csd.flags = 0;
5898 sd->cpu = i;
5899#endif
5994 5900
5995 queue->backlog.poll = process_backlog; 5901 sd->backlog.poll = process_backlog;
5996 queue->backlog.weight = weight_p; 5902 sd->backlog.weight = weight_p;
5997 queue->backlog.gro_list = NULL; 5903 sd->backlog.gro_list = NULL;
5998 queue->backlog.gro_count = 0; 5904 sd->backlog.gro_count = 0;
5999 } 5905 }
6000 5906
6001 dev_boot_phase = 0; 5907 dev_boot_phase = 0;
@@ -6030,7 +5936,7 @@ subsys_initcall(net_dev_init);
6030 5936
6031static int __init initialize_hashrnd(void) 5937static int __init initialize_hashrnd(void)
6032{ 5938{
6033 get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd)); 5939 get_random_bytes(&hashrnd, sizeof(hashrnd));
6034 return 0; 5940 return 0;
6035} 5941}
6036 5942
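
The dev_cpu_callback hunk above drops the O(n) walk to the tail of the per-cpu output queue in favour of an O(1) splice through the new output_queue_tailp pointer. A minimal standalone sketch of that tail-pointer pattern, with generic names that are not kernel structures:

#include <stddef.h>

/* Singly linked queue kept appendable in O(1) via a pointer to the
 * location that holds the tail link (the head itself when empty). */
struct node {
	struct node *next;
};

struct queue {
	struct node *head;
	struct node **tailp;	/* &head when empty, &last->next otherwise */
};

static void queue_init(struct queue *q)
{
	q->head = NULL;
	q->tailp = &q->head;
}

/* Move everything from "from" onto the end of "to" without walking "to". */
static void queue_splice(struct queue *to, struct queue *from)
{
	if (!from->head)
		return;
	*to->tailp = from->head;	/* old tail now points at donated head */
	to->tailp = from->tailp;	/* donor's tail link becomes ours */
	from->head = NULL;
	from->tailp = &from->head;
}
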
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
new file mode 100644
index 000000000000..508f9c18992f
--- /dev/null
+++ b/net/core/dev_addr_lists.c
@@ -0,0 +1,741 @@
1/*
2 * net/core/dev_addr_lists.c - Functions for handling net device lists
3 * Copyright (c) 2010 Jiri Pirko <jpirko@redhat.com>
4 *
5 * This file contains functions for working with unicast, multicast and device
6 * addresses lists.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 */
13
14#include <linux/netdevice.h>
15#include <linux/rtnetlink.h>
16#include <linux/list.h>
17#include <linux/proc_fs.h>
18
19/*
20 * General list handling functions
21 */
22
23static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
24 unsigned char *addr, int addr_len,
25 unsigned char addr_type, bool global)
26{
27 struct netdev_hw_addr *ha;
28 int alloc_size;
29
30 if (addr_len > MAX_ADDR_LEN)
31 return -EINVAL;
32
33 list_for_each_entry(ha, &list->list, list) {
34 if (!memcmp(ha->addr, addr, addr_len) &&
35 ha->type == addr_type) {
36 if (global) {
37 /* check if addr is already used as global */
38 if (ha->global_use)
39 return 0;
40 else
41 ha->global_use = true;
42 }
43 ha->refcount++;
44 return 0;
45 }
46 }
47
48
49 alloc_size = sizeof(*ha);
50 if (alloc_size < L1_CACHE_BYTES)
51 alloc_size = L1_CACHE_BYTES;
52 ha = kmalloc(alloc_size, GFP_ATOMIC);
53 if (!ha)
54 return -ENOMEM;
55 memcpy(ha->addr, addr, addr_len);
56 ha->type = addr_type;
57 ha->refcount = 1;
58 ha->global_use = global;
59 ha->synced = false;
60 list_add_tail_rcu(&ha->list, &list->list);
61 list->count++;
62 return 0;
63}
64
65static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
66 int addr_len, unsigned char addr_type)
67{
68 return __hw_addr_add_ex(list, addr, addr_len, addr_type, false);
69}
70
71static void ha_rcu_free(struct rcu_head *head)
72{
73 struct netdev_hw_addr *ha;
74
75 ha = container_of(head, struct netdev_hw_addr, rcu_head);
76 kfree(ha);
77}
78
79static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
80 unsigned char *addr, int addr_len,
81 unsigned char addr_type, bool global)
82{
83 struct netdev_hw_addr *ha;
84
85 list_for_each_entry(ha, &list->list, list) {
86 if (!memcmp(ha->addr, addr, addr_len) &&
87 (ha->type == addr_type || !addr_type)) {
88 if (global) {
89 if (!ha->global_use)
90 break;
91 else
92 ha->global_use = false;
93 }
94 if (--ha->refcount)
95 return 0;
96 list_del_rcu(&ha->list);
97 call_rcu(&ha->rcu_head, ha_rcu_free);
98 list->count--;
99 return 0;
100 }
101 }
102 return -ENOENT;
103}
104
105static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr,
106 int addr_len, unsigned char addr_type)
107{
108 return __hw_addr_del_ex(list, addr, addr_len, addr_type, false);
109}
110
111int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list,
112 struct netdev_hw_addr_list *from_list,
113 int addr_len, unsigned char addr_type)
114{
115 int err;
116 struct netdev_hw_addr *ha, *ha2;
117 unsigned char type;
118
119 list_for_each_entry(ha, &from_list->list, list) {
120 type = addr_type ? addr_type : ha->type;
121 err = __hw_addr_add(to_list, ha->addr, addr_len, type);
122 if (err)
123 goto unroll;
124 }
125 return 0;
126
127unroll:
128 list_for_each_entry(ha2, &from_list->list, list) {
129 if (ha2 == ha)
130 break;
131 type = addr_type ? addr_type : ha2->type;
132 __hw_addr_del(to_list, ha2->addr, addr_len, type);
133 }
134 return err;
135}
136EXPORT_SYMBOL(__hw_addr_add_multiple);
137
138void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
139 struct netdev_hw_addr_list *from_list,
140 int addr_len, unsigned char addr_type)
141{
142 struct netdev_hw_addr *ha;
143 unsigned char type;
144
145 list_for_each_entry(ha, &from_list->list, list) {
146 type = addr_type ? addr_type : ha->type;
 147 		__hw_addr_del(to_list, ha->addr, addr_len, type);
148 }
149}
150EXPORT_SYMBOL(__hw_addr_del_multiple);
151
152int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
153 struct netdev_hw_addr_list *from_list,
154 int addr_len)
155{
156 int err = 0;
157 struct netdev_hw_addr *ha, *tmp;
158
159 list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
160 if (!ha->synced) {
161 err = __hw_addr_add(to_list, ha->addr,
162 addr_len, ha->type);
163 if (err)
164 break;
165 ha->synced = true;
166 ha->refcount++;
167 } else if (ha->refcount == 1) {
168 __hw_addr_del(to_list, ha->addr, addr_len, ha->type);
169 __hw_addr_del(from_list, ha->addr, addr_len, ha->type);
170 }
171 }
172 return err;
173}
174EXPORT_SYMBOL(__hw_addr_sync);
175
176void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
177 struct netdev_hw_addr_list *from_list,
178 int addr_len)
179{
180 struct netdev_hw_addr *ha, *tmp;
181
182 list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
183 if (ha->synced) {
184 __hw_addr_del(to_list, ha->addr,
185 addr_len, ha->type);
186 ha->synced = false;
187 __hw_addr_del(from_list, ha->addr,
188 addr_len, ha->type);
189 }
190 }
191}
192EXPORT_SYMBOL(__hw_addr_unsync);
193
194void __hw_addr_flush(struct netdev_hw_addr_list *list)
195{
196 struct netdev_hw_addr *ha, *tmp;
197
198 list_for_each_entry_safe(ha, tmp, &list->list, list) {
199 list_del_rcu(&ha->list);
200 call_rcu(&ha->rcu_head, ha_rcu_free);
201 }
202 list->count = 0;
203}
204EXPORT_SYMBOL(__hw_addr_flush);
205
206void __hw_addr_init(struct netdev_hw_addr_list *list)
207{
208 INIT_LIST_HEAD(&list->list);
209 list->count = 0;
210}
211EXPORT_SYMBOL(__hw_addr_init);
212
213/*
214 * Device addresses handling functions
215 */
216
217/**
218 * dev_addr_flush - Flush device address list
219 * @dev: device
220 *
221 * Flush device address list and reset ->dev_addr.
222 *
223 * The caller must hold the rtnl_mutex.
224 */
225void dev_addr_flush(struct net_device *dev)
226{
227 /* rtnl_mutex must be held here */
228
229 __hw_addr_flush(&dev->dev_addrs);
230 dev->dev_addr = NULL;
231}
232EXPORT_SYMBOL(dev_addr_flush);
233
234/**
235 * dev_addr_init - Init device address list
236 * @dev: device
237 *
238 * Init device address list and create the first element,
239 * used by ->dev_addr.
240 *
241 * The caller must hold the rtnl_mutex.
242 */
243int dev_addr_init(struct net_device *dev)
244{
245 unsigned char addr[MAX_ADDR_LEN];
246 struct netdev_hw_addr *ha;
247 int err;
248
249 /* rtnl_mutex must be held here */
250
251 __hw_addr_init(&dev->dev_addrs);
252 memset(addr, 0, sizeof(addr));
253 err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
254 NETDEV_HW_ADDR_T_LAN);
255 if (!err) {
256 /*
257 * Get the first (previously created) address from the list
258 * and set dev_addr pointer to this location.
259 */
260 ha = list_first_entry(&dev->dev_addrs.list,
261 struct netdev_hw_addr, list);
262 dev->dev_addr = ha->addr;
263 }
264 return err;
265}
266EXPORT_SYMBOL(dev_addr_init);
267
268/**
269 * dev_addr_add - Add a device address
270 * @dev: device
271 * @addr: address to add
272 * @addr_type: address type
273 *
274 * Add a device address to the device or increase the reference count if
275 * it already exists.
276 *
277 * The caller must hold the rtnl_mutex.
278 */
279int dev_addr_add(struct net_device *dev, unsigned char *addr,
280 unsigned char addr_type)
281{
282 int err;
283
284 ASSERT_RTNL();
285
286 err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
287 if (!err)
288 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
289 return err;
290}
291EXPORT_SYMBOL(dev_addr_add);
292
293/**
294 * dev_addr_del - Release a device address.
295 * @dev: device
296 * @addr: address to delete
297 * @addr_type: address type
298 *
299 * Release reference to a device address and remove it from the device
300 * if the reference count drops to zero.
301 *
302 * The caller must hold the rtnl_mutex.
303 */
304int dev_addr_del(struct net_device *dev, unsigned char *addr,
305 unsigned char addr_type)
306{
307 int err;
308 struct netdev_hw_addr *ha;
309
310 ASSERT_RTNL();
311
312 /*
313 * We can not remove the first address from the list because
314 * dev->dev_addr points to that.
315 */
316 ha = list_first_entry(&dev->dev_addrs.list,
317 struct netdev_hw_addr, list);
318 if (ha->addr == dev->dev_addr && ha->refcount == 1)
319 return -ENOENT;
320
321 err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
322 addr_type);
323 if (!err)
324 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
325 return err;
326}
327EXPORT_SYMBOL(dev_addr_del);
328
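
For illustration only (not part of this patch), a caller holding RTNL could publish an additional hardware address with dev_addr_add(); the helper name and second_addr are invented, and the reverse path would go through dev_addr_del() with the same address type.

/* Hypothetical helper: advertise one extra hardware address. */
static int example_add_second_addr(struct net_device *dev,
				   unsigned char *second_addr)
{
	int err;

	rtnl_lock();	/* dev_addr_add() requires the rtnl_mutex */
	err = dev_addr_add(dev, second_addr, NETDEV_HW_ADDR_T_LAN);
	rtnl_unlock();
	return err;
}
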
329/**
330 * dev_addr_add_multiple - Add device addresses from another device
331 * @to_dev: device to which addresses will be added
332 * @from_dev: device from which addresses will be added
333 * @addr_type: address type - 0 means type will be used from from_dev
334 *
335 * Add device addresses of the one device to another.
 336 *
337 * The caller must hold the rtnl_mutex.
338 */
339int dev_addr_add_multiple(struct net_device *to_dev,
340 struct net_device *from_dev,
341 unsigned char addr_type)
342{
343 int err;
344
345 ASSERT_RTNL();
346
347 if (from_dev->addr_len != to_dev->addr_len)
348 return -EINVAL;
349 err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
350 to_dev->addr_len, addr_type);
351 if (!err)
352 call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
353 return err;
354}
355EXPORT_SYMBOL(dev_addr_add_multiple);
356
357/**
358 * dev_addr_del_multiple - Delete device addresses by another device
359 * @to_dev: device where the addresses will be deleted
 360 * @from_dev: device supplying the addresses to be deleted
 361 * @addr_type: address type - 0 means type will be used from from_dev
 362 *
 363 * Delete the addresses listed in from_dev from to_dev.
364 *
365 * The caller must hold the rtnl_mutex.
366 */
367int dev_addr_del_multiple(struct net_device *to_dev,
368 struct net_device *from_dev,
369 unsigned char addr_type)
370{
371 ASSERT_RTNL();
372
373 if (from_dev->addr_len != to_dev->addr_len)
374 return -EINVAL;
375 __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
376 to_dev->addr_len, addr_type);
377 call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
378 return 0;
379}
380EXPORT_SYMBOL(dev_addr_del_multiple);
381
382/*
383 * Unicast list handling functions
384 */
385
386/**
387 * dev_uc_add - Add a secondary unicast address
388 * @dev: device
389 * @addr: address to add
390 *
391 * Add a secondary unicast address to the device or increase
392 * the reference count if it already exists.
393 */
394int dev_uc_add(struct net_device *dev, unsigned char *addr)
395{
396 int err;
397
398 netif_addr_lock_bh(dev);
399 err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
400 NETDEV_HW_ADDR_T_UNICAST);
401 if (!err)
402 __dev_set_rx_mode(dev);
403 netif_addr_unlock_bh(dev);
404 return err;
405}
406EXPORT_SYMBOL(dev_uc_add);
407
408/**
409 * dev_uc_del - Release secondary unicast address.
410 * @dev: device
411 * @addr: address to delete
412 *
413 * Release reference to a secondary unicast address and remove it
414 * from the device if the reference count drops to zero.
415 */
416int dev_uc_del(struct net_device *dev, unsigned char *addr)
417{
418 int err;
419
420 netif_addr_lock_bh(dev);
421 err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
422 NETDEV_HW_ADDR_T_UNICAST);
423 if (!err)
424 __dev_set_rx_mode(dev);
425 netif_addr_unlock_bh(dev);
426 return err;
427}
428EXPORT_SYMBOL(dev_uc_del);
429
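
A hedged usage sketch (function names invented): a driver that wants to receive on a second, software-defined MAC address can pair dev_uc_add() with dev_uc_del(); both are reference counted, so repeated adds need matching deletes.

/* Illustrative only; "extra_mac" is supplied by the caller. */
static int example_listen_on_extra_mac(struct net_device *dev,
				       unsigned char *extra_mac)
{
	return dev_uc_add(dev, extra_mac);	/* takes one reference */
}

static void example_stop_listening(struct net_device *dev,
				   unsigned char *extra_mac)
{
	dev_uc_del(dev, extra_mac);		/* drops that reference */
}
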
430/**
431 * dev_uc_sync - Synchronize device's unicast list to another device
432 * @to: destination device
433 * @from: source device
434 *
435 * Add newly added addresses to the destination device and release
436 * addresses that have no users left. The source device must be
437 * locked by netif_tx_lock_bh.
438 *
439 * This function is intended to be called from the dev->set_rx_mode
440 * function of layered software devices.
441 */
442int dev_uc_sync(struct net_device *to, struct net_device *from)
443{
444 int err = 0;
445
446 if (to->addr_len != from->addr_len)
447 return -EINVAL;
448
449 netif_addr_lock_bh(to);
450 err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
451 if (!err)
452 __dev_set_rx_mode(to);
453 netif_addr_unlock_bh(to);
454 return err;
455}
456EXPORT_SYMBOL(dev_uc_sync);
457
458/**
459 * dev_uc_unsync - Remove synchronized addresses from the destination device
460 * @to: destination device
461 * @from: source device
462 *
463 * Remove all addresses that were added to the destination device by
464 * dev_uc_sync(). This function is intended to be called from the
465 * dev->stop function of layered software devices.
466 */
467void dev_uc_unsync(struct net_device *to, struct net_device *from)
468{
469 if (to->addr_len != from->addr_len)
470 return;
471
472 netif_addr_lock_bh(from);
473 netif_addr_lock(to);
474 __hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
475 __dev_set_rx_mode(to);
476 netif_addr_unlock(to);
477 netif_addr_unlock_bh(from);
478}
479EXPORT_SYMBOL(dev_uc_unsync);
480
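
As a rough sketch of the intended callers, a stacked device (vlan/macvlan style) would sync its lists down to the lower device from its rx-mode handler and unsync them on stop; struct example_priv and lowerdev are illustrative and not taken from this patch.

struct example_priv {
	struct net_device *lowerdev;
};

static void example_ndo_set_rx_mode(struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	/* Push this device's unicast/multicast lists to the real device. */
	dev_uc_sync(priv->lowerdev, dev);
	dev_mc_sync(priv->lowerdev, dev);
}

static int example_ndo_stop(struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	dev_uc_unsync(priv->lowerdev, dev);
	dev_mc_unsync(priv->lowerdev, dev);
	return 0;
}
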
481/**
482 * dev_uc_flush - Flush unicast addresses
483 * @dev: device
484 *
485 * Flush unicast addresses.
486 */
487void dev_uc_flush(struct net_device *dev)
488{
489 netif_addr_lock_bh(dev);
490 __hw_addr_flush(&dev->uc);
491 netif_addr_unlock_bh(dev);
492}
493EXPORT_SYMBOL(dev_uc_flush);
494
495/**
 496 * dev_uc_init - Init unicast address list
497 * @dev: device
498 *
499 * Init unicast address list.
500 */
501void dev_uc_init(struct net_device *dev)
502{
503 __hw_addr_init(&dev->uc);
504}
505EXPORT_SYMBOL(dev_uc_init);
506
507/*
508 * Multicast list handling functions
509 */
510
511static int __dev_mc_add(struct net_device *dev, unsigned char *addr,
512 bool global)
513{
514 int err;
515
516 netif_addr_lock_bh(dev);
517 err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
518 NETDEV_HW_ADDR_T_MULTICAST, global);
519 if (!err)
520 __dev_set_rx_mode(dev);
521 netif_addr_unlock_bh(dev);
522 return err;
523}
524/**
525 * dev_mc_add - Add a multicast address
526 * @dev: device
527 * @addr: address to add
528 *
529 * Add a multicast address to the device or increase
530 * the reference count if it already exists.
531 */
532int dev_mc_add(struct net_device *dev, unsigned char *addr)
533{
534 return __dev_mc_add(dev, addr, false);
535}
536EXPORT_SYMBOL(dev_mc_add);
537
538/**
539 * dev_mc_add_global - Add a global multicast address
540 * @dev: device
541 * @addr: address to add
542 *
543 * Add a global multicast address to the device.
544 */
545int dev_mc_add_global(struct net_device *dev, unsigned char *addr)
546{
547 return __dev_mc_add(dev, addr, true);
548}
549EXPORT_SYMBOL(dev_mc_add_global);
550
551static int __dev_mc_del(struct net_device *dev, unsigned char *addr,
552 bool global)
553{
554 int err;
555
556 netif_addr_lock_bh(dev);
557 err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len,
558 NETDEV_HW_ADDR_T_MULTICAST, global);
559 if (!err)
560 __dev_set_rx_mode(dev);
561 netif_addr_unlock_bh(dev);
562 return err;
563}
564
565/**
566 * dev_mc_del - Delete a multicast address.
567 * @dev: device
568 * @addr: address to delete
569 *
570 * Release reference to a multicast address and remove it
571 * from the device if the reference count drops to zero.
572 */
573int dev_mc_del(struct net_device *dev, unsigned char *addr)
574{
575 return __dev_mc_del(dev, addr, false);
576}
577EXPORT_SYMBOL(dev_mc_del);
578
579/**
580 * dev_mc_del_global - Delete a global multicast address.
581 * @dev: device
582 * @addr: address to delete
583 *
584 * Release reference to a multicast address and remove it
585 * from the device if the reference count drops to zero.
586 */
587int dev_mc_del_global(struct net_device *dev, unsigned char *addr)
588{
589 return __dev_mc_del(dev, addr, true);
590}
591EXPORT_SYMBOL(dev_mc_del_global);
592
593/**
 594 * dev_mc_sync - Synchronize device's multicast list to another device
595 * @to: destination device
596 * @from: source device
597 *
598 * Add newly added addresses to the destination device and release
599 * addresses that have no users left. The source device must be
600 * locked by netif_tx_lock_bh.
601 *
602 * This function is intended to be called from the dev->set_multicast_list
603 * or dev->set_rx_mode function of layered software devices.
604 */
605int dev_mc_sync(struct net_device *to, struct net_device *from)
606{
607 int err = 0;
608
609 if (to->addr_len != from->addr_len)
610 return -EINVAL;
611
612 netif_addr_lock_bh(to);
613 err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len);
614 if (!err)
615 __dev_set_rx_mode(to);
616 netif_addr_unlock_bh(to);
617 return err;
618}
619EXPORT_SYMBOL(dev_mc_sync);
620
621/**
622 * dev_mc_unsync - Remove synchronized addresses from the destination device
623 * @to: destination device
624 * @from: source device
625 *
626 * Remove all addresses that were added to the destination device by
627 * dev_mc_sync(). This function is intended to be called from the
628 * dev->stop function of layered software devices.
629 */
630void dev_mc_unsync(struct net_device *to, struct net_device *from)
631{
632 if (to->addr_len != from->addr_len)
633 return;
634
635 netif_addr_lock_bh(from);
636 netif_addr_lock(to);
637 __hw_addr_unsync(&to->mc, &from->mc, to->addr_len);
638 __dev_set_rx_mode(to);
639 netif_addr_unlock(to);
640 netif_addr_unlock_bh(from);
641}
642EXPORT_SYMBOL(dev_mc_unsync);
643
644/**
645 * dev_mc_flush - Flush multicast addresses
646 * @dev: device
647 *
648 * Flush multicast addresses.
649 */
650void dev_mc_flush(struct net_device *dev)
651{
652 netif_addr_lock_bh(dev);
653 __hw_addr_flush(&dev->mc);
654 netif_addr_unlock_bh(dev);
655}
656EXPORT_SYMBOL(dev_mc_flush);
657
658/**
 659 * dev_mc_init - Init multicast address list
660 * @dev: device
661 *
662 * Init multicast address list.
663 */
664void dev_mc_init(struct net_device *dev)
665{
666 __hw_addr_init(&dev->mc);
667}
668EXPORT_SYMBOL(dev_mc_init);
669
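
For completeness, a hedged sketch of traversing the new refcounted multicast list from a driver helper that runs outside ndo_set_rx_mode (ndo_set_rx_mode itself is already invoked with the address lock held). The helper only demonstrates the locking and iteration idiom; a real driver would program its hardware filter inside the loop.

static int example_count_mc(struct net_device *dev)
{
	struct netdev_hw_addr *ha;
	int n = 0;

	netif_addr_lock_bh(dev);
	netdev_for_each_mc_addr(ha, dev)
		n++;			/* ha->addr would be written to hardware here */
	netif_addr_unlock_bh(dev);
	return n;
}
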
670#ifdef CONFIG_PROC_FS
671#include <linux/seq_file.h>
672
673static int dev_mc_seq_show(struct seq_file *seq, void *v)
674{
675 struct netdev_hw_addr *ha;
676 struct net_device *dev = v;
677
678 if (v == SEQ_START_TOKEN)
679 return 0;
680
681 netif_addr_lock_bh(dev);
682 netdev_for_each_mc_addr(ha, dev) {
683 int i;
684
685 seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex,
686 dev->name, ha->refcount, ha->global_use);
687
688 for (i = 0; i < dev->addr_len; i++)
689 seq_printf(seq, "%02x", ha->addr[i]);
690
691 seq_putc(seq, '\n');
692 }
693 netif_addr_unlock_bh(dev);
694 return 0;
695}
696
697static const struct seq_operations dev_mc_seq_ops = {
698 .start = dev_seq_start,
699 .next = dev_seq_next,
700 .stop = dev_seq_stop,
701 .show = dev_mc_seq_show,
702};
703
704static int dev_mc_seq_open(struct inode *inode, struct file *file)
705{
706 return seq_open_net(inode, file, &dev_mc_seq_ops,
707 sizeof(struct seq_net_private));
708}
709
710static const struct file_operations dev_mc_seq_fops = {
711 .owner = THIS_MODULE,
712 .open = dev_mc_seq_open,
713 .read = seq_read,
714 .llseek = seq_lseek,
715 .release = seq_release_net,
716};
717
718#endif
719
720static int __net_init dev_mc_net_init(struct net *net)
721{
722 if (!proc_net_fops_create(net, "dev_mcast", 0, &dev_mc_seq_fops))
723 return -ENOMEM;
724 return 0;
725}
726
727static void __net_exit dev_mc_net_exit(struct net *net)
728{
729 proc_net_remove(net, "dev_mcast");
730}
731
732static struct pernet_operations __net_initdata dev_mc_net_ops = {
733 .init = dev_mc_net_init,
734 .exit = dev_mc_net_exit,
735};
736
737void __init dev_mcast_init(void)
738{
739 register_pernet_subsys(&dev_mc_net_ops);
740}
741
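
A hedged before/after sketch of the multicast API change implied by the removal of dev_mcast.c (below) and the SIOCADDMULTI/SIOCDELMULTI hunk in dev.c above; mc_addr and the example_* names are illustrative only.

/* Old API (removed below): length and "global" flag passed explicitly.
 *	dev_mc_add(dev, mc_addr, dev->addr_len, 0);
 *	dev_mc_delete(dev, mc_addr, dev->addr_len, 0);
 *	dev_mc_add(dev, mc_addr, dev->addr_len, 1);	(SIOCADDMULTI path)
 *
 * New API: the length is taken from dev->addr_len internally and the
 * global case has its own entry points. */
static int example_join(struct net_device *dev, unsigned char *mc_addr)
{
	return dev_mc_add(dev, mc_addr);
}

static int example_join_global(struct net_device *dev, unsigned char *mc_addr)
{
	return dev_mc_add_global(dev, mc_addr);	/* what SIOCADDMULTI now calls */
}
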
diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
deleted file mode 100644
index 3dc295beb483..000000000000
--- a/net/core/dev_mcast.c
+++ /dev/null
@@ -1,232 +0,0 @@
1/*
2 * Linux NET3: Multicast List maintenance.
3 *
4 * Authors:
5 * Tim Kordas <tjk@nostromo.eeap.cwru.edu>
6 * Richard Underwood <richard@wuzz.demon.co.uk>
7 *
8 * Stir fried together from the IP multicast and CAP patches above
9 * Alan Cox <alan@lxorguk.ukuu.org.uk>
10 *
11 * Fixes:
12 * Alan Cox : Update the device on a real delete
13 * rather than any time but...
14 * Alan Cox : IFF_ALLMULTI support.
15 * Alan Cox : New format set_multicast_list() calls.
16 * Gleb Natapov : Remove dev_mc_lock.
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24#include <linux/module.h>
25#include <asm/uaccess.h>
26#include <asm/system.h>
27#include <linux/bitops.h>
28#include <linux/types.h>
29#include <linux/kernel.h>
30#include <linux/string.h>
31#include <linux/mm.h>
32#include <linux/socket.h>
33#include <linux/sockios.h>
34#include <linux/in.h>
35#include <linux/errno.h>
36#include <linux/interrupt.h>
37#include <linux/if_ether.h>
38#include <linux/inet.h>
39#include <linux/netdevice.h>
40#include <linux/etherdevice.h>
41#include <linux/proc_fs.h>
42#include <linux/seq_file.h>
43#include <linux/init.h>
44#include <net/net_namespace.h>
45#include <net/ip.h>
46#include <net/route.h>
47#include <linux/skbuff.h>
48#include <net/sock.h>
49#include <net/arp.h>
50
51
52/*
53 * Device multicast list maintenance.
54 *
55 * This is used both by IP and by the user level maintenance functions.
56 * Unlike BSD we maintain a usage count on a given multicast address so
57 * that a casual user application can add/delete multicasts used by
58 * protocols without doing damage to the protocols when it deletes the
59 * entries. It also helps IP as it tracks overlapping maps.
60 *
61 * Device mc lists are changed by bh at least if IPv6 is enabled,
62 * so that it must be bh protected.
63 *
64 * We block accesses to device mc filters with netif_tx_lock.
65 */
66
67/*
68 * Delete a device level multicast
69 */
70
71int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
72{
73 int err;
74
75 netif_addr_lock_bh(dev);
76 err = __dev_addr_delete(&dev->mc_list, &dev->mc_count,
77 addr, alen, glbl);
78 if (!err) {
79 /*
80 * We have altered the list, so the card
81 * loaded filter is now wrong. Fix it
82 */
83
84 __dev_set_rx_mode(dev);
85 }
86 netif_addr_unlock_bh(dev);
87 return err;
88}
89
90/*
91 * Add a device level multicast
92 */
93
94int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
95{
96 int err;
97
98 netif_addr_lock_bh(dev);
99 if (alen != dev->addr_len)
100 err = -EINVAL;
101 else
102 err = __dev_addr_add(&dev->mc_list, &dev->mc_count, addr, alen, glbl);
103 if (!err)
104 __dev_set_rx_mode(dev);
105 netif_addr_unlock_bh(dev);
106 return err;
107}
108
109/**
110 * dev_mc_sync - Synchronize device's multicast list to another device
111 * @to: destination device
112 * @from: source device
113 *
114 * Add newly added addresses to the destination device and release
115 * addresses that have no users left. The source device must be
116 * locked by netif_tx_lock_bh.
117 *
118 * This function is intended to be called from the dev->set_multicast_list
119 * or dev->set_rx_mode function of layered software devices.
120 */
121int dev_mc_sync(struct net_device *to, struct net_device *from)
122{
123 int err = 0;
124
125 netif_addr_lock_bh(to);
126 err = __dev_addr_sync(&to->mc_list, &to->mc_count,
127 &from->mc_list, &from->mc_count);
128 if (!err)
129 __dev_set_rx_mode(to);
130 netif_addr_unlock_bh(to);
131
132 return err;
133}
134EXPORT_SYMBOL(dev_mc_sync);
135
136
137/**
138 * dev_mc_unsync - Remove synchronized addresses from the destination
139 * device
140 * @to: destination device
141 * @from: source device
142 *
143 * Remove all addresses that were added to the destination device by
144 * dev_mc_sync(). This function is intended to be called from the
145 * dev->stop function of layered software devices.
146 */
147void dev_mc_unsync(struct net_device *to, struct net_device *from)
148{
149 netif_addr_lock_bh(from);
150 netif_addr_lock(to);
151
152 __dev_addr_unsync(&to->mc_list, &to->mc_count,
153 &from->mc_list, &from->mc_count);
154 __dev_set_rx_mode(to);
155
156 netif_addr_unlock(to);
157 netif_addr_unlock_bh(from);
158}
159EXPORT_SYMBOL(dev_mc_unsync);
160
161#ifdef CONFIG_PROC_FS
162static int dev_mc_seq_show(struct seq_file *seq, void *v)
163{
164 struct dev_addr_list *m;
165 struct net_device *dev = v;
166
167 if (v == SEQ_START_TOKEN)
168 return 0;
169
170 netif_addr_lock_bh(dev);
171 for (m = dev->mc_list; m; m = m->next) {
172 int i;
173
174 seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex,
175 dev->name, m->dmi_users, m->dmi_gusers);
176
177 for (i = 0; i < m->dmi_addrlen; i++)
178 seq_printf(seq, "%02x", m->dmi_addr[i]);
179
180 seq_putc(seq, '\n');
181 }
182 netif_addr_unlock_bh(dev);
183 return 0;
184}
185
186static const struct seq_operations dev_mc_seq_ops = {
187 .start = dev_seq_start,
188 .next = dev_seq_next,
189 .stop = dev_seq_stop,
190 .show = dev_mc_seq_show,
191};
192
193static int dev_mc_seq_open(struct inode *inode, struct file *file)
194{
195 return seq_open_net(inode, file, &dev_mc_seq_ops,
196 sizeof(struct seq_net_private));
197}
198
199static const struct file_operations dev_mc_seq_fops = {
200 .owner = THIS_MODULE,
201 .open = dev_mc_seq_open,
202 .read = seq_read,
203 .llseek = seq_lseek,
204 .release = seq_release_net,
205};
206
207#endif
208
209static int __net_init dev_mc_net_init(struct net *net)
210{
211 if (!proc_net_fops_create(net, "dev_mcast", 0, &dev_mc_seq_fops))
212 return -ENOMEM;
213 return 0;
214}
215
216static void __net_exit dev_mc_net_exit(struct net *net)
217{
218 proc_net_remove(net, "dev_mcast");
219}
220
221static struct pernet_operations __net_initdata dev_mc_net_ops = {
222 .init = dev_mc_net_init,
223 .exit = dev_mc_net_exit,
224};
225
226void __init dev_mcast_init(void)
227{
228 register_pernet_subsys(&dev_mc_net_ops);
229}
230
231EXPORT_SYMBOL(dev_mc_add);
232EXPORT_SYMBOL(dev_mc_delete);
diff --git a/net/core/dst.c b/net/core/dst.c
index f307bc18f6a0..9920722cc82b 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -44,7 +44,7 @@ static atomic_t dst_total = ATOMIC_INIT(0);
44 */ 44 */
45static struct { 45static struct {
46 spinlock_t lock; 46 spinlock_t lock;
47 struct dst_entry *list; 47 struct dst_entry *list;
48 unsigned long timer_inc; 48 unsigned long timer_inc;
49 unsigned long timer_expires; 49 unsigned long timer_expires;
50} dst_garbage = { 50} dst_garbage = {
@@ -52,7 +52,7 @@ static struct {
52 .timer_inc = DST_GC_MAX, 52 .timer_inc = DST_GC_MAX,
53}; 53};
54static void dst_gc_task(struct work_struct *work); 54static void dst_gc_task(struct work_struct *work);
55static void ___dst_free(struct dst_entry * dst); 55static void ___dst_free(struct dst_entry *dst);
56 56
57static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task); 57static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task);
58 58
@@ -136,8 +136,8 @@ loop:
136 } 136 }
137 expires = dst_garbage.timer_expires; 137 expires = dst_garbage.timer_expires;
138 /* 138 /*
139 * if the next desired timer is more than 4 seconds in the future 139 * if the next desired timer is more than 4 seconds in the
140 * then round the timer to whole seconds 140 * future then round the timer to whole seconds
141 */ 141 */
142 if (expires > 4*HZ) 142 if (expires > 4*HZ)
143 expires = round_jiffies_relative(expires); 143 expires = round_jiffies_relative(expires);
@@ -152,7 +152,8 @@ loop:
152 " expires: %lu elapsed: %lu us\n", 152 " expires: %lu elapsed: %lu us\n",
153 atomic_read(&dst_total), delayed, work_performed, 153 atomic_read(&dst_total), delayed, work_performed,
154 expires, 154 expires,
155 elapsed.tv_sec * USEC_PER_SEC + elapsed.tv_nsec / NSEC_PER_USEC); 155 elapsed.tv_sec * USEC_PER_SEC +
156 elapsed.tv_nsec / NSEC_PER_USEC);
156#endif 157#endif
157} 158}
158 159
@@ -163,9 +164,9 @@ int dst_discard(struct sk_buff *skb)
163} 164}
164EXPORT_SYMBOL(dst_discard); 165EXPORT_SYMBOL(dst_discard);
165 166
166void * dst_alloc(struct dst_ops * ops) 167void *dst_alloc(struct dst_ops *ops)
167{ 168{
168 struct dst_entry * dst; 169 struct dst_entry *dst;
169 170
170 if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) { 171 if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) {
171 if (ops->gc(ops)) 172 if (ops->gc(ops))
@@ -185,19 +186,20 @@ void * dst_alloc(struct dst_ops * ops)
185 atomic_inc(&ops->entries); 186 atomic_inc(&ops->entries);
186 return dst; 187 return dst;
187} 188}
189EXPORT_SYMBOL(dst_alloc);
188 190
189static void ___dst_free(struct dst_entry * dst) 191static void ___dst_free(struct dst_entry *dst)
190{ 192{
191 /* The first case (dev==NULL) is required, when 193 /* The first case (dev==NULL) is required, when
192 protocol module is unloaded. 194 protocol module is unloaded.
193 */ 195 */
194 if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { 196 if (dst->dev == NULL || !(dst->dev->flags&IFF_UP))
195 dst->input = dst->output = dst_discard; 197 dst->input = dst->output = dst_discard;
196 }
197 dst->obsolete = 2; 198 dst->obsolete = 2;
198} 199}
200EXPORT_SYMBOL(__dst_free);
199 201
200void __dst_free(struct dst_entry * dst) 202void __dst_free(struct dst_entry *dst)
201{ 203{
202 spin_lock_bh(&dst_garbage.lock); 204 spin_lock_bh(&dst_garbage.lock);
203 ___dst_free(dst); 205 ___dst_free(dst);
@@ -262,15 +264,16 @@ again:
262 } 264 }
263 return NULL; 265 return NULL;
264} 266}
267EXPORT_SYMBOL(dst_destroy);
265 268
266void dst_release(struct dst_entry *dst) 269void dst_release(struct dst_entry *dst)
267{ 270{
268 if (dst) { 271 if (dst) {
269 int newrefcnt; 272 int newrefcnt;
270 273
271 smp_mb__before_atomic_dec(); 274 smp_mb__before_atomic_dec();
272 newrefcnt = atomic_dec_return(&dst->__refcnt); 275 newrefcnt = atomic_dec_return(&dst->__refcnt);
273 WARN_ON(newrefcnt < 0); 276 WARN_ON(newrefcnt < 0);
274 } 277 }
275} 278}
276EXPORT_SYMBOL(dst_release); 279EXPORT_SYMBOL(dst_release);
@@ -283,8 +286,8 @@ EXPORT_SYMBOL(dst_release);
283 * 286 *
284 * Commented and originally written by Alexey. 287 * Commented and originally written by Alexey.
285 */ 288 */
286static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev, 289static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
287 int unregister) 290 int unregister)
288{ 291{
289 if (dst->ops->ifdown) 292 if (dst->ops->ifdown)
290 dst->ops->ifdown(dst, dev, unregister); 293 dst->ops->ifdown(dst, dev, unregister);
@@ -306,7 +309,8 @@ static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
306 } 309 }
307} 310}
308 311
309static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr) 312static int dst_dev_event(struct notifier_block *this, unsigned long event,
313 void *ptr)
310{ 314{
311 struct net_device *dev = ptr; 315 struct net_device *dev = ptr;
312 struct dst_entry *dst, *last = NULL; 316 struct dst_entry *dst, *last = NULL;
@@ -329,9 +333,8 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event, void
329 last->next = dst; 333 last->next = dst;
330 else 334 else
331 dst_busy_list = dst; 335 dst_busy_list = dst;
332 for (; dst; dst = dst->next) { 336 for (; dst; dst = dst->next)
333 dst_ifdown(dst, dev, event != NETDEV_DOWN); 337 dst_ifdown(dst, dev, event != NETDEV_DOWN);
334 }
335 mutex_unlock(&dst_gc_mutex); 338 mutex_unlock(&dst_gc_mutex);
336 break; 339 break;
337 } 340 }
@@ -346,7 +349,3 @@ void __init dst_init(void)
346{ 349{
347 register_netdevice_notifier(&dst_dev_notifier); 350 register_netdevice_notifier(&dst_dev_notifier);
348} 351}
349
350EXPORT_SYMBOL(__dst_free);
351EXPORT_SYMBOL(dst_alloc);
352EXPORT_SYMBOL(dst_destroy);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 9d55c57f318a..a0f4964033d2 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -18,8 +18,8 @@
18#include <linux/ethtool.h> 18#include <linux/ethtool.h>
19#include <linux/netdevice.h> 19#include <linux/netdevice.h>
20#include <linux/bitops.h> 20#include <linux/bitops.h>
21#include <linux/uaccess.h>
21#include <linux/slab.h> 22#include <linux/slab.h>
22#include <asm/uaccess.h>
23 23
24/* 24/*
25 * Some useful ethtool_ops methods that're device independent. 25 * Some useful ethtool_ops methods that're device independent.
@@ -31,6 +31,7 @@ u32 ethtool_op_get_link(struct net_device *dev)
31{ 31{
32 return netif_carrier_ok(dev) ? 1 : 0; 32 return netif_carrier_ok(dev) ? 1 : 0;
33} 33}
34EXPORT_SYMBOL(ethtool_op_get_link);
34 35
35u32 ethtool_op_get_rx_csum(struct net_device *dev) 36u32 ethtool_op_get_rx_csum(struct net_device *dev)
36{ 37{
@@ -63,6 +64,7 @@ int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data)
63 64
64 return 0; 65 return 0;
65} 66}
67EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum);
66 68
67int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data) 69int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data)
68{ 70{
@@ -73,11 +75,13 @@ int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data)
73 75
74 return 0; 76 return 0;
75} 77}
78EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum);
76 79
77u32 ethtool_op_get_sg(struct net_device *dev) 80u32 ethtool_op_get_sg(struct net_device *dev)
78{ 81{
79 return (dev->features & NETIF_F_SG) != 0; 82 return (dev->features & NETIF_F_SG) != 0;
80} 83}
84EXPORT_SYMBOL(ethtool_op_get_sg);
81 85
82int ethtool_op_set_sg(struct net_device *dev, u32 data) 86int ethtool_op_set_sg(struct net_device *dev, u32 data)
83{ 87{
@@ -88,11 +92,13 @@ int ethtool_op_set_sg(struct net_device *dev, u32 data)
88 92
89 return 0; 93 return 0;
90} 94}
95EXPORT_SYMBOL(ethtool_op_set_sg);
91 96
92u32 ethtool_op_get_tso(struct net_device *dev) 97u32 ethtool_op_get_tso(struct net_device *dev)
93{ 98{
94 return (dev->features & NETIF_F_TSO) != 0; 99 return (dev->features & NETIF_F_TSO) != 0;
95} 100}
101EXPORT_SYMBOL(ethtool_op_get_tso);
96 102
97int ethtool_op_set_tso(struct net_device *dev, u32 data) 103int ethtool_op_set_tso(struct net_device *dev, u32 data)
98{ 104{
@@ -103,11 +109,13 @@ int ethtool_op_set_tso(struct net_device *dev, u32 data)
103 109
104 return 0; 110 return 0;
105} 111}
112EXPORT_SYMBOL(ethtool_op_set_tso);
106 113
107u32 ethtool_op_get_ufo(struct net_device *dev) 114u32 ethtool_op_get_ufo(struct net_device *dev)
108{ 115{
109 return (dev->features & NETIF_F_UFO) != 0; 116 return (dev->features & NETIF_F_UFO) != 0;
110} 117}
118EXPORT_SYMBOL(ethtool_op_get_ufo);
111 119
112int ethtool_op_set_ufo(struct net_device *dev, u32 data) 120int ethtool_op_set_ufo(struct net_device *dev, u32 data)
113{ 121{
@@ -117,12 +125,13 @@ int ethtool_op_set_ufo(struct net_device *dev, u32 data)
117 dev->features &= ~NETIF_F_UFO; 125 dev->features &= ~NETIF_F_UFO;
118 return 0; 126 return 0;
119} 127}
128EXPORT_SYMBOL(ethtool_op_set_ufo);
120 129
121/* the following list of flags are the same as their associated 130/* the following list of flags are the same as their associated
122 * NETIF_F_xxx values in include/linux/netdevice.h 131 * NETIF_F_xxx values in include/linux/netdevice.h
123 */ 132 */
124static const u32 flags_dup_features = 133static const u32 flags_dup_features =
125 (ETH_FLAG_LRO | ETH_FLAG_NTUPLE); 134 (ETH_FLAG_LRO | ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH);
126 135
127u32 ethtool_op_get_flags(struct net_device *dev) 136u32 ethtool_op_get_flags(struct net_device *dev)
128{ 137{
@@ -133,6 +142,7 @@ u32 ethtool_op_get_flags(struct net_device *dev)
133 142
134 return dev->features & flags_dup_features; 143 return dev->features & flags_dup_features;
135} 144}
145EXPORT_SYMBOL(ethtool_op_get_flags);
136 146
137int ethtool_op_set_flags(struct net_device *dev, u32 data) 147int ethtool_op_set_flags(struct net_device *dev, u32 data)
138{ 148{
@@ -153,9 +163,15 @@ int ethtool_op_set_flags(struct net_device *dev, u32 data)
153 features &= ~NETIF_F_NTUPLE; 163 features &= ~NETIF_F_NTUPLE;
154 } 164 }
155 165
166 if (data & ETH_FLAG_RXHASH)
167 features |= NETIF_F_RXHASH;
168 else
169 features &= ~NETIF_F_RXHASH;
170
156 dev->features = features; 171 dev->features = features;
157 return 0; 172 return 0;
158} 173}
174EXPORT_SYMBOL(ethtool_op_set_flags);
159 175
160void ethtool_ntuple_flush(struct net_device *dev) 176void ethtool_ntuple_flush(struct net_device *dev)
161{ 177{
@@ -201,7 +217,8 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
201 return dev->ethtool_ops->set_settings(dev, &cmd); 217 return dev->ethtool_ops->set_settings(dev, &cmd);
202} 218}
203 219
204static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) 220static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
221 void __user *useraddr)
205{ 222{
206 struct ethtool_drvinfo info; 223 struct ethtool_drvinfo info;
207 const struct ethtool_ops *ops = dev->ethtool_ops; 224 const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -241,7 +258,7 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, void _
241} 258}
242 259
243static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev, 260static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev,
244 void __user *useraddr) 261 void __user *useraddr)
245{ 262{
246 struct ethtool_sset_info info; 263 struct ethtool_sset_info info;
247 const struct ethtool_ops *ops = dev->ethtool_ops; 264 const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -300,7 +317,8 @@ out:
300 return ret; 317 return ret;
301} 318}
302 319
303static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, void __user *useraddr) 320static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
321 void __user *useraddr)
304{ 322{
305 struct ethtool_rxnfc cmd; 323 struct ethtool_rxnfc cmd;
306 324
@@ -313,7 +331,8 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, void __u
313 return dev->ethtool_ops->set_rxnfc(dev, &cmd); 331 return dev->ethtool_ops->set_rxnfc(dev, &cmd);
314} 332}
315 333
316static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr) 334static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
335 void __user *useraddr)
317{ 336{
318 struct ethtool_rxnfc info; 337 struct ethtool_rxnfc info;
319 const struct ethtool_ops *ops = dev->ethtool_ops; 338 const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -358,8 +377,8 @@ err_out:
358} 377}
359 378
360static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list, 379static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list,
361 struct ethtool_rx_ntuple_flow_spec *spec, 380 struct ethtool_rx_ntuple_flow_spec *spec,
362 struct ethtool_rx_ntuple_flow_spec_container *fsc) 381 struct ethtool_rx_ntuple_flow_spec_container *fsc)
363{ 382{
364 383
365 /* don't add filters forever */ 384 /* don't add filters forever */
@@ -385,7 +404,8 @@ static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list,
385 list->count++; 404 list->count++;
386} 405}
387 406
388static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev, void __user *useraddr) 407static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
408 void __user *useraddr)
389{ 409{
390 struct ethtool_rx_ntuple cmd; 410 struct ethtool_rx_ntuple cmd;
391 const struct ethtool_ops *ops = dev->ethtool_ops; 411 const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -502,7 +522,7 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr)
502 p += ETH_GSTRING_LEN; 522 p += ETH_GSTRING_LEN;
503 num_strings++; 523 num_strings++;
504 goto unknown_filter; 524 goto unknown_filter;
505 }; 525 }
506 526
507 /* now the rest of the filters */ 527 /* now the rest of the filters */
508 switch (fsc->fs.flow_type) { 528 switch (fsc->fs.flow_type) {
@@ -510,125 +530,125 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr)
510 case UDP_V4_FLOW: 530 case UDP_V4_FLOW:
511 case SCTP_V4_FLOW: 531 case SCTP_V4_FLOW:
512 sprintf(p, "\tSrc IP addr: 0x%x\n", 532 sprintf(p, "\tSrc IP addr: 0x%x\n",
513 fsc->fs.h_u.tcp_ip4_spec.ip4src); 533 fsc->fs.h_u.tcp_ip4_spec.ip4src);
514 p += ETH_GSTRING_LEN; 534 p += ETH_GSTRING_LEN;
515 num_strings++; 535 num_strings++;
516 sprintf(p, "\tSrc IP mask: 0x%x\n", 536 sprintf(p, "\tSrc IP mask: 0x%x\n",
517 fsc->fs.m_u.tcp_ip4_spec.ip4src); 537 fsc->fs.m_u.tcp_ip4_spec.ip4src);
518 p += ETH_GSTRING_LEN; 538 p += ETH_GSTRING_LEN;
519 num_strings++; 539 num_strings++;
520 sprintf(p, "\tDest IP addr: 0x%x\n", 540 sprintf(p, "\tDest IP addr: 0x%x\n",
521 fsc->fs.h_u.tcp_ip4_spec.ip4dst); 541 fsc->fs.h_u.tcp_ip4_spec.ip4dst);
522 p += ETH_GSTRING_LEN; 542 p += ETH_GSTRING_LEN;
523 num_strings++; 543 num_strings++;
524 sprintf(p, "\tDest IP mask: 0x%x\n", 544 sprintf(p, "\tDest IP mask: 0x%x\n",
525 fsc->fs.m_u.tcp_ip4_spec.ip4dst); 545 fsc->fs.m_u.tcp_ip4_spec.ip4dst);
526 p += ETH_GSTRING_LEN; 546 p += ETH_GSTRING_LEN;
527 num_strings++; 547 num_strings++;
528 sprintf(p, "\tSrc Port: %d, mask: 0x%x\n", 548 sprintf(p, "\tSrc Port: %d, mask: 0x%x\n",
529 fsc->fs.h_u.tcp_ip4_spec.psrc, 549 fsc->fs.h_u.tcp_ip4_spec.psrc,
530 fsc->fs.m_u.tcp_ip4_spec.psrc); 550 fsc->fs.m_u.tcp_ip4_spec.psrc);
531 p += ETH_GSTRING_LEN; 551 p += ETH_GSTRING_LEN;
532 num_strings++; 552 num_strings++;
533 sprintf(p, "\tDest Port: %d, mask: 0x%x\n", 553 sprintf(p, "\tDest Port: %d, mask: 0x%x\n",
534 fsc->fs.h_u.tcp_ip4_spec.pdst, 554 fsc->fs.h_u.tcp_ip4_spec.pdst,
535 fsc->fs.m_u.tcp_ip4_spec.pdst); 555 fsc->fs.m_u.tcp_ip4_spec.pdst);
536 p += ETH_GSTRING_LEN; 556 p += ETH_GSTRING_LEN;
537 num_strings++; 557 num_strings++;
538 sprintf(p, "\tTOS: %d, mask: 0x%x\n", 558 sprintf(p, "\tTOS: %d, mask: 0x%x\n",
539 fsc->fs.h_u.tcp_ip4_spec.tos, 559 fsc->fs.h_u.tcp_ip4_spec.tos,
540 fsc->fs.m_u.tcp_ip4_spec.tos); 560 fsc->fs.m_u.tcp_ip4_spec.tos);
541 p += ETH_GSTRING_LEN; 561 p += ETH_GSTRING_LEN;
542 num_strings++; 562 num_strings++;
543 break; 563 break;
544 case AH_ESP_V4_FLOW: 564 case AH_ESP_V4_FLOW:
545 case ESP_V4_FLOW: 565 case ESP_V4_FLOW:
546 sprintf(p, "\tSrc IP addr: 0x%x\n", 566 sprintf(p, "\tSrc IP addr: 0x%x\n",
547 fsc->fs.h_u.ah_ip4_spec.ip4src); 567 fsc->fs.h_u.ah_ip4_spec.ip4src);
548 p += ETH_GSTRING_LEN; 568 p += ETH_GSTRING_LEN;
549 num_strings++; 569 num_strings++;
550 sprintf(p, "\tSrc IP mask: 0x%x\n", 570 sprintf(p, "\tSrc IP mask: 0x%x\n",
551 fsc->fs.m_u.ah_ip4_spec.ip4src); 571 fsc->fs.m_u.ah_ip4_spec.ip4src);
552 p += ETH_GSTRING_LEN; 572 p += ETH_GSTRING_LEN;
553 num_strings++; 573 num_strings++;
554 sprintf(p, "\tDest IP addr: 0x%x\n", 574 sprintf(p, "\tDest IP addr: 0x%x\n",
555 fsc->fs.h_u.ah_ip4_spec.ip4dst); 575 fsc->fs.h_u.ah_ip4_spec.ip4dst);
556 p += ETH_GSTRING_LEN; 576 p += ETH_GSTRING_LEN;
557 num_strings++; 577 num_strings++;
558 sprintf(p, "\tDest IP mask: 0x%x\n", 578 sprintf(p, "\tDest IP mask: 0x%x\n",
559 fsc->fs.m_u.ah_ip4_spec.ip4dst); 579 fsc->fs.m_u.ah_ip4_spec.ip4dst);
560 p += ETH_GSTRING_LEN; 580 p += ETH_GSTRING_LEN;
561 num_strings++; 581 num_strings++;
562 sprintf(p, "\tSPI: %d, mask: 0x%x\n", 582 sprintf(p, "\tSPI: %d, mask: 0x%x\n",
563 fsc->fs.h_u.ah_ip4_spec.spi, 583 fsc->fs.h_u.ah_ip4_spec.spi,
564 fsc->fs.m_u.ah_ip4_spec.spi); 584 fsc->fs.m_u.ah_ip4_spec.spi);
565 p += ETH_GSTRING_LEN; 585 p += ETH_GSTRING_LEN;
566 num_strings++; 586 num_strings++;
567 sprintf(p, "\tTOS: %d, mask: 0x%x\n", 587 sprintf(p, "\tTOS: %d, mask: 0x%x\n",
568 fsc->fs.h_u.ah_ip4_spec.tos, 588 fsc->fs.h_u.ah_ip4_spec.tos,
569 fsc->fs.m_u.ah_ip4_spec.tos); 589 fsc->fs.m_u.ah_ip4_spec.tos);
570 p += ETH_GSTRING_LEN; 590 p += ETH_GSTRING_LEN;
571 num_strings++; 591 num_strings++;
572 break; 592 break;
573 case IP_USER_FLOW: 593 case IP_USER_FLOW:
574 sprintf(p, "\tSrc IP addr: 0x%x\n", 594 sprintf(p, "\tSrc IP addr: 0x%x\n",
575 fsc->fs.h_u.raw_ip4_spec.ip4src); 595 fsc->fs.h_u.raw_ip4_spec.ip4src);
576 p += ETH_GSTRING_LEN; 596 p += ETH_GSTRING_LEN;
577 num_strings++; 597 num_strings++;
578 sprintf(p, "\tSrc IP mask: 0x%x\n", 598 sprintf(p, "\tSrc IP mask: 0x%x\n",
579 fsc->fs.m_u.raw_ip4_spec.ip4src); 599 fsc->fs.m_u.raw_ip4_spec.ip4src);
580 p += ETH_GSTRING_LEN; 600 p += ETH_GSTRING_LEN;
581 num_strings++; 601 num_strings++;
582 sprintf(p, "\tDest IP addr: 0x%x\n", 602 sprintf(p, "\tDest IP addr: 0x%x\n",
583 fsc->fs.h_u.raw_ip4_spec.ip4dst); 603 fsc->fs.h_u.raw_ip4_spec.ip4dst);
584 p += ETH_GSTRING_LEN; 604 p += ETH_GSTRING_LEN;
585 num_strings++; 605 num_strings++;
586 sprintf(p, "\tDest IP mask: 0x%x\n", 606 sprintf(p, "\tDest IP mask: 0x%x\n",
587 fsc->fs.m_u.raw_ip4_spec.ip4dst); 607 fsc->fs.m_u.raw_ip4_spec.ip4dst);
588 p += ETH_GSTRING_LEN; 608 p += ETH_GSTRING_LEN;
589 num_strings++; 609 num_strings++;
590 break; 610 break;
591 case IPV4_FLOW: 611 case IPV4_FLOW:
592 sprintf(p, "\tSrc IP addr: 0x%x\n", 612 sprintf(p, "\tSrc IP addr: 0x%x\n",
593 fsc->fs.h_u.usr_ip4_spec.ip4src); 613 fsc->fs.h_u.usr_ip4_spec.ip4src);
594 p += ETH_GSTRING_LEN; 614 p += ETH_GSTRING_LEN;
595 num_strings++; 615 num_strings++;
596 sprintf(p, "\tSrc IP mask: 0x%x\n", 616 sprintf(p, "\tSrc IP mask: 0x%x\n",
597 fsc->fs.m_u.usr_ip4_spec.ip4src); 617 fsc->fs.m_u.usr_ip4_spec.ip4src);
598 p += ETH_GSTRING_LEN; 618 p += ETH_GSTRING_LEN;
599 num_strings++; 619 num_strings++;
600 sprintf(p, "\tDest IP addr: 0x%x\n", 620 sprintf(p, "\tDest IP addr: 0x%x\n",
601 fsc->fs.h_u.usr_ip4_spec.ip4dst); 621 fsc->fs.h_u.usr_ip4_spec.ip4dst);
602 p += ETH_GSTRING_LEN; 622 p += ETH_GSTRING_LEN;
603 num_strings++; 623 num_strings++;
604 sprintf(p, "\tDest IP mask: 0x%x\n", 624 sprintf(p, "\tDest IP mask: 0x%x\n",
605 fsc->fs.m_u.usr_ip4_spec.ip4dst); 625 fsc->fs.m_u.usr_ip4_spec.ip4dst);
606 p += ETH_GSTRING_LEN; 626 p += ETH_GSTRING_LEN;
607 num_strings++; 627 num_strings++;
608 sprintf(p, "\tL4 bytes: 0x%x, mask: 0x%x\n", 628 sprintf(p, "\tL4 bytes: 0x%x, mask: 0x%x\n",
609 fsc->fs.h_u.usr_ip4_spec.l4_4_bytes, 629 fsc->fs.h_u.usr_ip4_spec.l4_4_bytes,
610 fsc->fs.m_u.usr_ip4_spec.l4_4_bytes); 630 fsc->fs.m_u.usr_ip4_spec.l4_4_bytes);
611 p += ETH_GSTRING_LEN; 631 p += ETH_GSTRING_LEN;
612 num_strings++; 632 num_strings++;
613 sprintf(p, "\tTOS: %d, mask: 0x%x\n", 633 sprintf(p, "\tTOS: %d, mask: 0x%x\n",
614 fsc->fs.h_u.usr_ip4_spec.tos, 634 fsc->fs.h_u.usr_ip4_spec.tos,
615 fsc->fs.m_u.usr_ip4_spec.tos); 635 fsc->fs.m_u.usr_ip4_spec.tos);
616 p += ETH_GSTRING_LEN; 636 p += ETH_GSTRING_LEN;
617 num_strings++; 637 num_strings++;
618 sprintf(p, "\tIP Version: %d, mask: 0x%x\n", 638 sprintf(p, "\tIP Version: %d, mask: 0x%x\n",
619 fsc->fs.h_u.usr_ip4_spec.ip_ver, 639 fsc->fs.h_u.usr_ip4_spec.ip_ver,
620 fsc->fs.m_u.usr_ip4_spec.ip_ver); 640 fsc->fs.m_u.usr_ip4_spec.ip_ver);
621 p += ETH_GSTRING_LEN; 641 p += ETH_GSTRING_LEN;
622 num_strings++; 642 num_strings++;
623 sprintf(p, "\tProtocol: %d, mask: 0x%x\n", 643 sprintf(p, "\tProtocol: %d, mask: 0x%x\n",
624 fsc->fs.h_u.usr_ip4_spec.proto, 644 fsc->fs.h_u.usr_ip4_spec.proto,
625 fsc->fs.m_u.usr_ip4_spec.proto); 645 fsc->fs.m_u.usr_ip4_spec.proto);
626 p += ETH_GSTRING_LEN; 646 p += ETH_GSTRING_LEN;
627 num_strings++; 647 num_strings++;
628 break; 648 break;
629 }; 649 }
630 sprintf(p, "\tVLAN: %d, mask: 0x%x\n", 650 sprintf(p, "\tVLAN: %d, mask: 0x%x\n",
631 fsc->fs.vlan_tag, fsc->fs.vlan_tag_mask); 651 fsc->fs.vlan_tag, fsc->fs.vlan_tag_mask);
632 p += ETH_GSTRING_LEN; 652 p += ETH_GSTRING_LEN;
633 num_strings++; 653 num_strings++;
634 sprintf(p, "\tUser-defined: 0x%Lx\n", fsc->fs.data); 654 sprintf(p, "\tUser-defined: 0x%Lx\n", fsc->fs.data);
@@ -641,7 +661,7 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr)
641 sprintf(p, "\tAction: Drop\n"); 661 sprintf(p, "\tAction: Drop\n");
642 else 662 else
643 sprintf(p, "\tAction: Direct to queue %d\n", 663 sprintf(p, "\tAction: Direct to queue %d\n",
644 fsc->fs.action); 664 fsc->fs.action);
645 p += ETH_GSTRING_LEN; 665 p += ETH_GSTRING_LEN;
646 num_strings++; 666 num_strings++;
647unknown_filter: 667unknown_filter:
@@ -853,7 +873,8 @@ static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr)
853 return ret; 873 return ret;
854} 874}
855 875
856static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) 876static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev,
877 void __user *useraddr)
857{ 878{
858 struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; 879 struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE };
859 880
@@ -867,7 +888,8 @@ static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, void
867 return 0; 888 return 0;
868} 889}
869 890
870static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr) 891static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev,
892 void __user *useraddr)
871{ 893{
872 struct ethtool_coalesce coalesce; 894 struct ethtool_coalesce coalesce;
873 895
@@ -971,6 +993,7 @@ static int ethtool_set_tx_csum(struct net_device *dev, char __user *useraddr)
971 993
972 return dev->ethtool_ops->set_tx_csum(dev, edata.data); 994 return dev->ethtool_ops->set_tx_csum(dev, edata.data);
973} 995}
996EXPORT_SYMBOL(ethtool_op_set_tx_csum);
974 997
975static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr) 998static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr)
976{ 999{
@@ -1042,7 +1065,7 @@ static int ethtool_get_gso(struct net_device *dev, char __user *useraddr)
1042 1065
1043 edata.data = dev->features & NETIF_F_GSO; 1066 edata.data = dev->features & NETIF_F_GSO;
1044 if (copy_to_user(useraddr, &edata, sizeof(edata))) 1067 if (copy_to_user(useraddr, &edata, sizeof(edata)))
1045 return -EFAULT; 1068 return -EFAULT;
1046 return 0; 1069 return 0;
1047} 1070}
1048 1071
@@ -1065,7 +1088,7 @@ static int ethtool_get_gro(struct net_device *dev, char __user *useraddr)
1065 1088
1066 edata.data = dev->features & NETIF_F_GRO; 1089 edata.data = dev->features & NETIF_F_GRO;
1067 if (copy_to_user(useraddr, &edata, sizeof(edata))) 1090 if (copy_to_user(useraddr, &edata, sizeof(edata)))
1068 return -EFAULT; 1091 return -EFAULT;
1069 return 0; 1092 return 0;
1070} 1093}
1071 1094
@@ -1277,7 +1300,8 @@ static int ethtool_set_value(struct net_device *dev, char __user *useraddr,
1277 return actor(dev, edata.data); 1300 return actor(dev, edata.data);
1278} 1301}
1279 1302
1280static noinline_for_stack int ethtool_flash_device(struct net_device *dev, char __user *useraddr) 1303static noinline_for_stack int ethtool_flash_device(struct net_device *dev,
1304 char __user *useraddr)
1281{ 1305{
1282 struct ethtool_flash efl; 1306 struct ethtool_flash efl;
1283 1307
@@ -1306,11 +1330,11 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
1306 if (!dev->ethtool_ops) 1330 if (!dev->ethtool_ops)
1307 return -EOPNOTSUPP; 1331 return -EOPNOTSUPP;
1308 1332
1309 if (copy_from_user(&ethcmd, useraddr, sizeof (ethcmd))) 1333 if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
1310 return -EFAULT; 1334 return -EFAULT;
1311 1335
1312 /* Allow some commands to be done by anyone */ 1336 /* Allow some commands to be done by anyone */
1313 switch(ethcmd) { 1337 switch (ethcmd) {
1314 case ETHTOOL_GDRVINFO: 1338 case ETHTOOL_GDRVINFO:
1315 case ETHTOOL_GMSGLVL: 1339 case ETHTOOL_GMSGLVL:
1316 case ETHTOOL_GCOALESCE: 1340 case ETHTOOL_GCOALESCE:
@@ -1338,10 +1362,11 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
1338 return -EPERM; 1362 return -EPERM;
1339 } 1363 }
1340 1364
1341 if (dev->ethtool_ops->begin) 1365 if (dev->ethtool_ops->begin) {
1342 if ((rc = dev->ethtool_ops->begin(dev)) < 0) 1366 rc = dev->ethtool_ops->begin(dev);
1367 if (rc < 0)
1343 return rc; 1368 return rc;
1344 1369 }
1345 old_features = dev->features; 1370 old_features = dev->features;
1346 1371
1347 switch (ethcmd) { 1372 switch (ethcmd) {
@@ -1531,16 +1556,3 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
1531 1556
1532 return rc; 1557 return rc;
1533} 1558}
1534
1535EXPORT_SYMBOL(ethtool_op_get_link);
1536EXPORT_SYMBOL(ethtool_op_get_sg);
1537EXPORT_SYMBOL(ethtool_op_get_tso);
1538EXPORT_SYMBOL(ethtool_op_set_sg);
1539EXPORT_SYMBOL(ethtool_op_set_tso);
1540EXPORT_SYMBOL(ethtool_op_set_tx_csum);
1541EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum);
1542EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum);
1543EXPORT_SYMBOL(ethtool_op_set_ufo);
1544EXPORT_SYMBOL(ethtool_op_get_ufo);
1545EXPORT_SYMBOL(ethtool_op_set_flags);
1546EXPORT_SYMBOL(ethtool_op_get_flags);
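Note: the ethtool.c hunks above are mostly style and structure cleanups (wrapped noinline_for_stack signatures, EXPORT_SYMBOL moved next to its definition, explicit error handling around ethtool_ops->begin); all of these commands are still reached through the single SIOCETHTOOL ioctl that dev_ethtool() dispatches. For reference, a minimal userspace sketch of that path, reading coalescing parameters; the interface name "eth0" is an assumed example and error handling is kept short.

/* Minimal SIOCETHTOOL example: issue ETHTOOL_GCOALESCE against "eth0".
 * "eth0" is an assumed interface name, not something this patch defines. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
        struct ethtool_coalesce ecoal = { .cmd = ETHTOOL_GCOALESCE };
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
                return 1;
        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
        ifr.ifr_data = (void *)&ecoal;  /* dev_ethtool() reads the cmd word first */
        if (ioctl(fd, SIOCETHTOOL, &ifr) == 0)
                printf("rx-usecs: %u\n", ecoal.rx_coalesce_usecs);
        else
                perror("ETHTOOL_GCOALESCE");
        close(fd);
        return 0;
}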
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index d2c3e7dc2e5f..42e84e08a1be 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -39,6 +39,24 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
39} 39}
40EXPORT_SYMBOL(fib_default_rule_add); 40EXPORT_SYMBOL(fib_default_rule_add);
41 41
42u32 fib_default_rule_pref(struct fib_rules_ops *ops)
43{
44 struct list_head *pos;
45 struct fib_rule *rule;
46
47 if (!list_empty(&ops->rules_list)) {
48 pos = ops->rules_list.next;
49 if (pos->next != &ops->rules_list) {
50 rule = list_entry(pos->next, struct fib_rule, list);
51 if (rule->pref)
52 return rule->pref - 1;
53 }
54 }
55
56 return 0;
57}
58EXPORT_SYMBOL(fib_default_rule_pref);
59
42static void notify_rule_change(int event, struct fib_rule *rule, 60static void notify_rule_change(int event, struct fib_rule *rule,
43 struct fib_rules_ops *ops, struct nlmsghdr *nlh, 61 struct fib_rules_ops *ops, struct nlmsghdr *nlh,
44 u32 pid); 62 u32 pid);
@@ -104,12 +122,12 @@ errout:
104} 122}
105 123
106struct fib_rules_ops * 124struct fib_rules_ops *
107fib_rules_register(struct fib_rules_ops *tmpl, struct net *net) 125fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net)
108{ 126{
109 struct fib_rules_ops *ops; 127 struct fib_rules_ops *ops;
110 int err; 128 int err;
111 129
112 ops = kmemdup(tmpl, sizeof (*ops), GFP_KERNEL); 130 ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL);
113 if (ops == NULL) 131 if (ops == NULL)
114 return ERR_PTR(-ENOMEM); 132 return ERR_PTR(-ENOMEM);
115 133
@@ -124,7 +142,6 @@ fib_rules_register(struct fib_rules_ops *tmpl, struct net *net)
124 142
125 return ops; 143 return ops;
126} 144}
127
128EXPORT_SYMBOL_GPL(fib_rules_register); 145EXPORT_SYMBOL_GPL(fib_rules_register);
129 146
130void fib_rules_cleanup_ops(struct fib_rules_ops *ops) 147void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
@@ -158,7 +175,6 @@ void fib_rules_unregister(struct fib_rules_ops *ops)
158 175
159 call_rcu(&ops->rcu, fib_rules_put_rcu); 176 call_rcu(&ops->rcu, fib_rules_put_rcu);
160} 177}
161
162EXPORT_SYMBOL_GPL(fib_rules_unregister); 178EXPORT_SYMBOL_GPL(fib_rules_unregister);
163 179
164static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, 180static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
@@ -221,7 +237,6 @@ out:
221 237
222 return err; 238 return err;
223} 239}
224
225EXPORT_SYMBOL_GPL(fib_rules_lookup); 240EXPORT_SYMBOL_GPL(fib_rules_lookup);
226 241
227static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb, 242static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb,
@@ -520,6 +535,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
520 return -EMSGSIZE; 535 return -EMSGSIZE;
521 536
522 frh = nlmsg_data(nlh); 537 frh = nlmsg_data(nlh);
538 frh->family = ops->family;
523 frh->table = rule->table; 539 frh->table = rule->table;
524 NLA_PUT_U32(skb, FRA_TABLE, rule->table); 540 NLA_PUT_U32(skb, FRA_TABLE, rule->table);
525 frh->res1 = 0; 541 frh->res1 = 0;
@@ -614,7 +630,7 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
614 break; 630 break;
615 631
616 cb->args[1] = 0; 632 cb->args[1] = 0;
617 skip: 633skip:
618 idx++; 634 idx++;
619 } 635 }
620 rcu_read_unlock(); 636 rcu_read_unlock();
@@ -686,7 +702,6 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event,
686 struct fib_rules_ops *ops; 702 struct fib_rules_ops *ops;
687 703
688 ASSERT_RTNL(); 704 ASSERT_RTNL();
689 rcu_read_lock();
690 705
691 switch (event) { 706 switch (event) {
692 case NETDEV_REGISTER: 707 case NETDEV_REGISTER:
@@ -700,8 +715,6 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event,
700 break; 715 break;
701 } 716 }
702 717
703 rcu_read_unlock();
704
705 return NOTIFY_DONE; 718 return NOTIFY_DONE;
706} 719}
707 720
diff --git a/net/core/filter.c b/net/core/filter.c
index ff943bed21af..da69fb728d32 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -302,6 +302,8 @@ load_b:
302 A = skb->pkt_type; 302 A = skb->pkt_type;
303 continue; 303 continue;
304 case SKF_AD_IFINDEX: 304 case SKF_AD_IFINDEX:
305 if (!skb->dev)
306 return 0;
305 A = skb->dev->ifindex; 307 A = skb->dev->ifindex;
306 continue; 308 continue;
307 case SKF_AD_MARK: 309 case SKF_AD_MARK:
@@ -310,6 +312,11 @@ load_b:
310 case SKF_AD_QUEUE: 312 case SKF_AD_QUEUE:
311 A = skb->queue_mapping; 313 A = skb->queue_mapping;
312 continue; 314 continue;
315 case SKF_AD_HATYPE:
316 if (!skb->dev)
317 return 0;
318 A = skb->dev->type;
319 continue;
313 case SKF_AD_NLATTR: { 320 case SKF_AD_NLATTR: {
314 struct nlattr *nla; 321 struct nlattr *nla;
315 322
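Note: the filter.c change guards the ancillary loads that dereference skb->dev and adds SKF_AD_HATYPE, so a classic BPF program can test the ARP hardware type of the receiving device (a NULL skb->dev now makes the filter return 0, i.e. drop). Below is a small userspace sketch of a socket filter using the new ancillary offset; it needs CAP_NET_RAW, and the "accept Ethernet only" policy is just an illustrative choice.

/* Attach a classic BPF filter that accepts packets only when the receiving
 * device's ARP hardware type is ARPHRD_ETHER, via the SKF_AD_HATYPE load. */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/filter.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>

int main(void)
{
        struct sock_filter insns[] = {
                /* A = skb->dev->type (filter returns 0 / drop if dev is NULL) */
                BPF_STMT(BPF_LD | BPF_W | BPF_ABS, SKF_AD_OFF + SKF_AD_HATYPE),
                BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ARPHRD_ETHER, 0, 1),
                BPF_STMT(BPF_RET | BPF_K, 0xffff),      /* accept */
                BPF_STMT(BPF_RET | BPF_K, 0),           /* drop */
        };
        struct sock_fprog prog = {
                .len = sizeof(insns) / sizeof(insns[0]),
                .filter = insns,
        };
        int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

        if (fd < 0 ||
            setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog)) < 0) {
                perror("SO_ATTACH_FILTER");
                return 1;
        }
        printf("filter attached\n");
        close(fd);
        return 0;
}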
diff --git a/net/core/flow.c b/net/core/flow.c
index 96015871ecea..161900674009 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -26,113 +26,158 @@
26#include <linux/security.h> 26#include <linux/security.h>
27 27
28struct flow_cache_entry { 28struct flow_cache_entry {
29 struct flow_cache_entry *next; 29 union {
30 u16 family; 30 struct hlist_node hlist;
31 u8 dir; 31 struct list_head gc_list;
32 u32 genid; 32 } u;
33 struct flowi key; 33 u16 family;
34 void *object; 34 u8 dir;
35 atomic_t *object_ref; 35 u32 genid;
36 struct flowi key;
37 struct flow_cache_object *object;
36}; 38};
37 39
38atomic_t flow_cache_genid = ATOMIC_INIT(0); 40struct flow_cache_percpu {
39 41 struct hlist_head *hash_table;
40static u32 flow_hash_shift; 42 int hash_count;
41#define flow_hash_size (1 << flow_hash_shift) 43 u32 hash_rnd;
42static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; 44 int hash_rnd_recalc;
43 45 struct tasklet_struct flush_tasklet;
44#define flow_table(cpu) (per_cpu(flow_tables, cpu)) 46};
45
46static struct kmem_cache *flow_cachep __read_mostly;
47 47
48static int flow_lwm, flow_hwm; 48struct flow_flush_info {
49 struct flow_cache *cache;
50 atomic_t cpuleft;
51 struct completion completion;
52};
49 53
50struct flow_percpu_info { 54struct flow_cache {
51 int hash_rnd_recalc; 55 u32 hash_shift;
52 u32 hash_rnd; 56 unsigned long order;
53 int count; 57 struct flow_cache_percpu *percpu;
58 struct notifier_block hotcpu_notifier;
59 int low_watermark;
60 int high_watermark;
61 struct timer_list rnd_timer;
54}; 62};
55static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 };
56 63
57#define flow_hash_rnd_recalc(cpu) \ 64atomic_t flow_cache_genid = ATOMIC_INIT(0);
58 (per_cpu(flow_hash_info, cpu).hash_rnd_recalc) 65static struct flow_cache flow_cache_global;
59#define flow_hash_rnd(cpu) \ 66static struct kmem_cache *flow_cachep;
60 (per_cpu(flow_hash_info, cpu).hash_rnd)
61#define flow_count(cpu) \
62 (per_cpu(flow_hash_info, cpu).count)
63 67
64static struct timer_list flow_hash_rnd_timer; 68static DEFINE_SPINLOCK(flow_cache_gc_lock);
69static LIST_HEAD(flow_cache_gc_list);
65 70
66#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) 71#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift)
67 72#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
68struct flow_flush_info {
69 atomic_t cpuleft;
70 struct completion completion;
71};
72static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL };
73
74#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu))
75 73
76static void flow_cache_new_hashrnd(unsigned long arg) 74static void flow_cache_new_hashrnd(unsigned long arg)
77{ 75{
76 struct flow_cache *fc = (void *) arg;
78 int i; 77 int i;
79 78
80 for_each_possible_cpu(i) 79 for_each_possible_cpu(i)
81 flow_hash_rnd_recalc(i) = 1; 80 per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;
82 81
83 flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; 82 fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
84 add_timer(&flow_hash_rnd_timer); 83 add_timer(&fc->rnd_timer);
84}
85
86static int flow_entry_valid(struct flow_cache_entry *fle)
87{
88 if (atomic_read(&flow_cache_genid) != fle->genid)
89 return 0;
90 if (fle->object && !fle->object->ops->check(fle->object))
91 return 0;
92 return 1;
85} 93}
86 94
87static void flow_entry_kill(int cpu, struct flow_cache_entry *fle) 95static void flow_entry_kill(struct flow_cache_entry *fle)
88{ 96{
89 if (fle->object) 97 if (fle->object)
90 atomic_dec(fle->object_ref); 98 fle->object->ops->delete(fle->object);
91 kmem_cache_free(flow_cachep, fle); 99 kmem_cache_free(flow_cachep, fle);
92 flow_count(cpu)--;
93} 100}
94 101
95static void __flow_cache_shrink(int cpu, int shrink_to) 102static void flow_cache_gc_task(struct work_struct *work)
96{ 103{
97 struct flow_cache_entry *fle, **flp; 104 struct list_head gc_list;
98 int i; 105 struct flow_cache_entry *fce, *n;
99 106
100 for (i = 0; i < flow_hash_size; i++) { 107 INIT_LIST_HEAD(&gc_list);
101 int k = 0; 108 spin_lock_bh(&flow_cache_gc_lock);
109 list_splice_tail_init(&flow_cache_gc_list, &gc_list);
110 spin_unlock_bh(&flow_cache_gc_lock);
102 111
103 flp = &flow_table(cpu)[i]; 112 list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
104 while ((fle = *flp) != NULL && k < shrink_to) { 113 flow_entry_kill(fce);
105 k++; 114}
106 flp = &fle->next; 115static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task);
107 } 116
108 while ((fle = *flp) != NULL) { 117static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
109 *flp = fle->next; 118 int deleted, struct list_head *gc_list)
110 flow_entry_kill(cpu, fle); 119{
111 } 120 if (deleted) {
121 fcp->hash_count -= deleted;
122 spin_lock_bh(&flow_cache_gc_lock);
123 list_splice_tail(gc_list, &flow_cache_gc_list);
124 spin_unlock_bh(&flow_cache_gc_lock);
125 schedule_work(&flow_cache_gc_work);
112 } 126 }
113} 127}
114 128
115static void flow_cache_shrink(int cpu) 129static void __flow_cache_shrink(struct flow_cache *fc,
130 struct flow_cache_percpu *fcp,
131 int shrink_to)
116{ 132{
117 int shrink_to = flow_lwm / flow_hash_size; 133 struct flow_cache_entry *fle;
134 struct hlist_node *entry, *tmp;
135 LIST_HEAD(gc_list);
136 int i, deleted = 0;
137
138 for (i = 0; i < flow_cache_hash_size(fc); i++) {
139 int saved = 0;
140
141 hlist_for_each_entry_safe(fle, entry, tmp,
142 &fcp->hash_table[i], u.hlist) {
143 if (saved < shrink_to &&
144 flow_entry_valid(fle)) {
145 saved++;
146 } else {
147 deleted++;
148 hlist_del(&fle->u.hlist);
149 list_add_tail(&fle->u.gc_list, &gc_list);
150 }
151 }
152 }
118 153
119 __flow_cache_shrink(cpu, shrink_to); 154 flow_cache_queue_garbage(fcp, deleted, &gc_list);
120} 155}
121 156
122static void flow_new_hash_rnd(int cpu) 157static void flow_cache_shrink(struct flow_cache *fc,
158 struct flow_cache_percpu *fcp)
123{ 159{
124 get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32)); 160 int shrink_to = fc->low_watermark / flow_cache_hash_size(fc);
125 flow_hash_rnd_recalc(cpu) = 0;
126 161
127 __flow_cache_shrink(cpu, 0); 162 __flow_cache_shrink(fc, fcp, shrink_to);
128} 163}
129 164
130static u32 flow_hash_code(struct flowi *key, int cpu) 165static void flow_new_hash_rnd(struct flow_cache *fc,
166 struct flow_cache_percpu *fcp)
167{
168 get_random_bytes(&fcp->hash_rnd, sizeof(u32));
169 fcp->hash_rnd_recalc = 0;
170 __flow_cache_shrink(fc, fcp, 0);
171}
172
173static u32 flow_hash_code(struct flow_cache *fc,
174 struct flow_cache_percpu *fcp,
175 struct flowi *key)
131{ 176{
132 u32 *k = (u32 *) key; 177 u32 *k = (u32 *) key;
133 178
134 return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) & 179 return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd)
135 (flow_hash_size - 1)); 180 & (flow_cache_hash_size(fc) - 1));
136} 181}
137 182
138#if (BITS_PER_LONG == 64) 183#if (BITS_PER_LONG == 64)
@@ -165,114 +210,117 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
165 return 0; 210 return 0;
166} 211}
167 212
168void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, 213struct flow_cache_object *
169 flow_resolve_t resolver) 214flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
215 flow_resolve_t resolver, void *ctx)
170{ 216{
171 struct flow_cache_entry *fle, **head; 217 struct flow_cache *fc = &flow_cache_global;
218 struct flow_cache_percpu *fcp;
219 struct flow_cache_entry *fle, *tfle;
220 struct hlist_node *entry;
221 struct flow_cache_object *flo;
172 unsigned int hash; 222 unsigned int hash;
173 int cpu;
174 223
175 local_bh_disable(); 224 local_bh_disable();
176 cpu = smp_processor_id(); 225 fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
177 226
178 fle = NULL; 227 fle = NULL;
228 flo = NULL;
179 /* Packet really early in init? Making flow_cache_init a 229 /* Packet really early in init? Making flow_cache_init a
180 * pre-smp initcall would solve this. --RR */ 230 * pre-smp initcall would solve this. --RR */
181 if (!flow_table(cpu)) 231 if (!fcp->hash_table)
182 goto nocache; 232 goto nocache;
183 233
184 if (flow_hash_rnd_recalc(cpu)) 234 if (fcp->hash_rnd_recalc)
185 flow_new_hash_rnd(cpu); 235 flow_new_hash_rnd(fc, fcp);
186 hash = flow_hash_code(key, cpu);
187 236
188 head = &flow_table(cpu)[hash]; 237 hash = flow_hash_code(fc, fcp, key);
189 for (fle = *head; fle; fle = fle->next) { 238 hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) {
190 if (fle->family == family && 239 if (tfle->family == family &&
191 fle->dir == dir && 240 tfle->dir == dir &&
192 flow_key_compare(key, &fle->key) == 0) { 241 flow_key_compare(key, &tfle->key) == 0) {
193 if (fle->genid == atomic_read(&flow_cache_genid)) { 242 fle = tfle;
194 void *ret = fle->object;
195
196 if (ret)
197 atomic_inc(fle->object_ref);
198 local_bh_enable();
199
200 return ret;
201 }
202 break; 243 break;
203 } 244 }
204 } 245 }
205 246
206 if (!fle) { 247 if (unlikely(!fle)) {
207 if (flow_count(cpu) > flow_hwm) 248 if (fcp->hash_count > fc->high_watermark)
208 flow_cache_shrink(cpu); 249 flow_cache_shrink(fc, fcp);
209 250
210 fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC); 251 fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
211 if (fle) { 252 if (fle) {
212 fle->next = *head;
213 *head = fle;
214 fle->family = family; 253 fle->family = family;
215 fle->dir = dir; 254 fle->dir = dir;
216 memcpy(&fle->key, key, sizeof(*key)); 255 memcpy(&fle->key, key, sizeof(*key));
217 fle->object = NULL; 256 fle->object = NULL;
218 flow_count(cpu)++; 257 hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
258 fcp->hash_count++;
219 } 259 }
260 } else if (likely(fle->genid == atomic_read(&flow_cache_genid))) {
261 flo = fle->object;
262 if (!flo)
263 goto ret_object;
264 flo = flo->ops->get(flo);
265 if (flo)
266 goto ret_object;
267 } else if (fle->object) {
268 flo = fle->object;
269 flo->ops->delete(flo);
270 fle->object = NULL;
220 } 271 }
221 272
222nocache: 273nocache:
223 { 274 flo = NULL;
224 int err; 275 if (fle) {
225 void *obj; 276 flo = fle->object;
226 atomic_t *obj_ref; 277 fle->object = NULL;
227
228 err = resolver(net, key, family, dir, &obj, &obj_ref);
229
230 if (fle && !err) {
231 fle->genid = atomic_read(&flow_cache_genid);
232
233 if (fle->object)
234 atomic_dec(fle->object_ref);
235
236 fle->object = obj;
237 fle->object_ref = obj_ref;
238 if (obj)
239 atomic_inc(fle->object_ref);
240 }
241 local_bh_enable();
242
243 if (err)
244 obj = ERR_PTR(err);
245 return obj;
246 } 278 }
279 flo = resolver(net, key, family, dir, flo, ctx);
280 if (fle) {
281 fle->genid = atomic_read(&flow_cache_genid);
282 if (!IS_ERR(flo))
283 fle->object = flo;
284 else
285 fle->genid--;
286 } else {
287 if (flo && !IS_ERR(flo))
288 flo->ops->delete(flo);
289 }
290ret_object:
291 local_bh_enable();
292 return flo;
247} 293}
248 294
249static void flow_cache_flush_tasklet(unsigned long data) 295static void flow_cache_flush_tasklet(unsigned long data)
250{ 296{
251 struct flow_flush_info *info = (void *)data; 297 struct flow_flush_info *info = (void *)data;
252 int i; 298 struct flow_cache *fc = info->cache;
253 int cpu; 299 struct flow_cache_percpu *fcp;
254 300 struct flow_cache_entry *fle;
255 cpu = smp_processor_id(); 301 struct hlist_node *entry, *tmp;
256 for (i = 0; i < flow_hash_size; i++) { 302 LIST_HEAD(gc_list);
257 struct flow_cache_entry *fle; 303 int i, deleted = 0;
258 304
259 fle = flow_table(cpu)[i]; 305 fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
260 for (; fle; fle = fle->next) { 306 for (i = 0; i < flow_cache_hash_size(fc); i++) {
261 unsigned genid = atomic_read(&flow_cache_genid); 307 hlist_for_each_entry_safe(fle, entry, tmp,
262 308 &fcp->hash_table[i], u.hlist) {
263 if (!fle->object || fle->genid == genid) 309 if (flow_entry_valid(fle))
264 continue; 310 continue;
265 311
266 fle->object = NULL; 312 deleted++;
267 atomic_dec(fle->object_ref); 313 hlist_del(&fle->u.hlist);
314 list_add_tail(&fle->u.gc_list, &gc_list);
268 } 315 }
269 } 316 }
270 317
318 flow_cache_queue_garbage(fcp, deleted, &gc_list);
319
271 if (atomic_dec_and_test(&info->cpuleft)) 320 if (atomic_dec_and_test(&info->cpuleft))
272 complete(&info->completion); 321 complete(&info->completion);
273} 322}
274 323
275static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__));
276static void flow_cache_flush_per_cpu(void *data) 324static void flow_cache_flush_per_cpu(void *data)
277{ 325{
278 struct flow_flush_info *info = data; 326 struct flow_flush_info *info = data;
@@ -280,8 +328,7 @@ static void flow_cache_flush_per_cpu(void *data)
280 struct tasklet_struct *tasklet; 328 struct tasklet_struct *tasklet;
281 329
282 cpu = smp_processor_id(); 330 cpu = smp_processor_id();
283 331 tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet;
284 tasklet = flow_flush_tasklet(cpu);
285 tasklet->data = (unsigned long)info; 332 tasklet->data = (unsigned long)info;
286 tasklet_schedule(tasklet); 333 tasklet_schedule(tasklet);
287} 334}
@@ -294,6 +341,7 @@ void flow_cache_flush(void)
294 /* Don't want cpus going down or up during this. */ 341 /* Don't want cpus going down or up during this. */
295 get_online_cpus(); 342 get_online_cpus();
296 mutex_lock(&flow_flush_sem); 343 mutex_lock(&flow_flush_sem);
344 info.cache = &flow_cache_global;
297 atomic_set(&info.cpuleft, num_online_cpus()); 345 atomic_set(&info.cpuleft, num_online_cpus());
298 init_completion(&info.completion); 346 init_completion(&info.completion);
299 347
@@ -307,62 +355,75 @@ void flow_cache_flush(void)
307 put_online_cpus(); 355 put_online_cpus();
308} 356}
309 357
310static void __init flow_cache_cpu_prepare(int cpu) 358static void __init flow_cache_cpu_prepare(struct flow_cache *fc,
359 struct flow_cache_percpu *fcp)
311{ 360{
312 struct tasklet_struct *tasklet; 361 fcp->hash_table = (struct hlist_head *)
313 unsigned long order; 362 __get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order);
314 363 if (!fcp->hash_table)
315 for (order = 0; 364 panic("NET: failed to allocate flow cache order %lu\n", fc->order);
316 (PAGE_SIZE << order) < 365
317 (sizeof(struct flow_cache_entry *)*flow_hash_size); 366 fcp->hash_rnd_recalc = 1;
318 order++) 367 fcp->hash_count = 0;
319 /* NOTHING */; 368 tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
320
321 flow_table(cpu) = (struct flow_cache_entry **)
322 __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
323 if (!flow_table(cpu))
324 panic("NET: failed to allocate flow cache order %lu\n", order);
325
326 flow_hash_rnd_recalc(cpu) = 1;
327 flow_count(cpu) = 0;
328
329 tasklet = flow_flush_tasklet(cpu);
330 tasklet_init(tasklet, flow_cache_flush_tasklet, 0);
331} 369}
332 370
333static int flow_cache_cpu(struct notifier_block *nfb, 371static int flow_cache_cpu(struct notifier_block *nfb,
334 unsigned long action, 372 unsigned long action,
335 void *hcpu) 373 void *hcpu)
336{ 374{
375 struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier);
376 int cpu = (unsigned long) hcpu;
377 struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
378
337 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) 379 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
338 __flow_cache_shrink((unsigned long)hcpu, 0); 380 __flow_cache_shrink(fc, fcp, 0);
339 return NOTIFY_OK; 381 return NOTIFY_OK;
340} 382}
341 383
342static int __init flow_cache_init(void) 384static int flow_cache_init(struct flow_cache *fc)
343{ 385{
386 unsigned long order;
344 int i; 387 int i;
345 388
346 flow_cachep = kmem_cache_create("flow_cache", 389 fc->hash_shift = 10;
347 sizeof(struct flow_cache_entry), 390 fc->low_watermark = 2 * flow_cache_hash_size(fc);
348 0, SLAB_PANIC, 391 fc->high_watermark = 4 * flow_cache_hash_size(fc);
349 NULL); 392
350 flow_hash_shift = 10; 393 for (order = 0;
351 flow_lwm = 2 * flow_hash_size; 394 (PAGE_SIZE << order) <
352 flow_hwm = 4 * flow_hash_size; 395 (sizeof(struct hlist_head)*flow_cache_hash_size(fc));
396 order++)
397 /* NOTHING */;
398 fc->order = order;
399 fc->percpu = alloc_percpu(struct flow_cache_percpu);
353 400
354 setup_timer(&flow_hash_rnd_timer, flow_cache_new_hashrnd, 0); 401 setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
355 flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; 402 (unsigned long) fc);
356 add_timer(&flow_hash_rnd_timer); 403 fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
404 add_timer(&fc->rnd_timer);
357 405
358 for_each_possible_cpu(i) 406 for_each_possible_cpu(i)
359 flow_cache_cpu_prepare(i); 407 flow_cache_cpu_prepare(fc, per_cpu_ptr(fc->percpu, i));
408
409 fc->hotcpu_notifier = (struct notifier_block){
410 .notifier_call = flow_cache_cpu,
411 };
412 register_hotcpu_notifier(&fc->hotcpu_notifier);
360 413
361 hotcpu_notifier(flow_cache_cpu, 0);
362 return 0; 414 return 0;
363} 415}
364 416
365module_init(flow_cache_init); 417static int __init flow_cache_init_global(void)
418{
419 flow_cachep = kmem_cache_create("flow_cache",
420 sizeof(struct flow_cache_entry),
421 0, SLAB_PANIC, NULL);
422
423 return flow_cache_init(&flow_cache_global);
424}
425
426module_init(flow_cache_init_global);
366 427
367EXPORT_SYMBOL(flow_cache_genid); 428EXPORT_SYMBOL(flow_cache_genid);
368EXPORT_SYMBOL(flow_cache_lookup); 429EXPORT_SYMBOL(flow_cache_lookup);
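Note: the flow.c rewrite replaces the cached void pointer plus external refcount with a struct flow_cache_object whose ops supply get (take a reference, may refuse), check (validity test used by shrink and flush) and delete (release). The fragment below is only a stripped-down userspace illustration of that ownership contract, with invented names (mock_flow_object and friends) rather than the kernel structures, to show why flow_cache_lookup can hand a stale object back to the resolver instead of juggling refcounts itself.

/* Illustration only: a mock of the get/check/delete contract encoded by the
 * new flow_cache_object ops. All names here are invented for this sketch. */
#include <stdio.h>
#include <stdlib.h>

struct mock_flow_object;

struct mock_flow_ops {
        struct mock_flow_object *(*get)(struct mock_flow_object *);  /* ref++ or NULL if dying */
        int (*check)(struct mock_flow_object *);                     /* still valid? */
        void (*delete)(struct mock_flow_object *);                   /* drop one reference */
};

struct mock_flow_object {
        const struct mock_flow_ops *ops;
        int refcnt;
        int dead;
};

static struct mock_flow_object *mock_get(struct mock_flow_object *o)
{
        if (o->dead)
                return NULL;            /* caller must resolve a fresh object */
        o->refcnt++;
        return o;
}

static int mock_check(struct mock_flow_object *o)
{
        return !o->dead;
}

static void mock_delete(struct mock_flow_object *o)
{
        if (--o->refcnt == 0)
                free(o);
}

static const struct mock_flow_ops mock_ops = { mock_get, mock_check, mock_delete };

int main(void)
{
        struct mock_flow_object *o = calloc(1, sizeof(*o));

        o->ops = &mock_ops;
        o->refcnt = 1;                  /* the cache's reference */
        if (o->ops->get(o))             /* lookup hit: take a user reference */
                printf("lookup returned a live object (refcnt=%d)\n", o->refcnt);
        o->ops->delete(o);              /* user is done with it */
        o->ops->delete(o);              /* cache entry reclaimed */
        return 0;
}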
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 59cfc7d8fc45..99e7052d7323 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -14,9 +14,12 @@
14#include <linux/netdevice.h> 14#include <linux/netdevice.h>
15#include <linux/if_arp.h> 15#include <linux/if_arp.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/nsproxy.h>
17#include <net/sock.h> 18#include <net/sock.h>
19#include <net/net_namespace.h>
18#include <linux/rtnetlink.h> 20#include <linux/rtnetlink.h>
19#include <linux/wireless.h> 21#include <linux/wireless.h>
22#include <linux/vmalloc.h>
20#include <net/wext.h> 23#include <net/wext.h>
21 24
22#include "net-sysfs.h" 25#include "net-sysfs.h"
@@ -466,18 +469,345 @@ static struct attribute_group wireless_group = {
466 .attrs = wireless_attrs, 469 .attrs = wireless_attrs,
467}; 470};
468#endif 471#endif
469
470#endif /* CONFIG_SYSFS */ 472#endif /* CONFIG_SYSFS */
471 473
474#ifdef CONFIG_RPS
475/*
476 * RX queue sysfs structures and functions.
477 */
478struct rx_queue_attribute {
479 struct attribute attr;
480 ssize_t (*show)(struct netdev_rx_queue *queue,
481 struct rx_queue_attribute *attr, char *buf);
482 ssize_t (*store)(struct netdev_rx_queue *queue,
483 struct rx_queue_attribute *attr, const char *buf, size_t len);
484};
485#define to_rx_queue_attr(_attr) container_of(_attr, \
486 struct rx_queue_attribute, attr)
487
488#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj)
489
490static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr,
491 char *buf)
492{
493 struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
494 struct netdev_rx_queue *queue = to_rx_queue(kobj);
495
496 if (!attribute->show)
497 return -EIO;
498
499 return attribute->show(queue, attribute, buf);
500}
501
502static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
503 const char *buf, size_t count)
504{
505 struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
506 struct netdev_rx_queue *queue = to_rx_queue(kobj);
507
508 if (!attribute->store)
509 return -EIO;
510
511 return attribute->store(queue, attribute, buf, count);
512}
513
514static struct sysfs_ops rx_queue_sysfs_ops = {
515 .show = rx_queue_attr_show,
516 .store = rx_queue_attr_store,
517};
518
519static ssize_t show_rps_map(struct netdev_rx_queue *queue,
520 struct rx_queue_attribute *attribute, char *buf)
521{
522 struct rps_map *map;
523 cpumask_var_t mask;
524 size_t len = 0;
525 int i;
526
527 if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
528 return -ENOMEM;
529
530 rcu_read_lock();
531 map = rcu_dereference(queue->rps_map);
532 if (map)
533 for (i = 0; i < map->len; i++)
534 cpumask_set_cpu(map->cpus[i], mask);
535
536 len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
537 if (PAGE_SIZE - len < 3) {
538 rcu_read_unlock();
539 free_cpumask_var(mask);
540 return -EINVAL;
541 }
542 rcu_read_unlock();
543
544 free_cpumask_var(mask);
545 len += sprintf(buf + len, "\n");
546 return len;
547}
548
549static void rps_map_release(struct rcu_head *rcu)
550{
551 struct rps_map *map = container_of(rcu, struct rps_map, rcu);
552
553 kfree(map);
554}
555
556static ssize_t store_rps_map(struct netdev_rx_queue *queue,
557 struct rx_queue_attribute *attribute,
558 const char *buf, size_t len)
559{
560 struct rps_map *old_map, *map;
561 cpumask_var_t mask;
562 int err, cpu, i;
563 static DEFINE_SPINLOCK(rps_map_lock);
564
565 if (!capable(CAP_NET_ADMIN))
566 return -EPERM;
567
568 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
569 return -ENOMEM;
570
571 err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
572 if (err) {
573 free_cpumask_var(mask);
574 return err;
575 }
576
577 map = kzalloc(max_t(unsigned,
578 RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
579 GFP_KERNEL);
580 if (!map) {
581 free_cpumask_var(mask);
582 return -ENOMEM;
583 }
584
585 i = 0;
586 for_each_cpu_and(cpu, mask, cpu_online_mask)
587 map->cpus[i++] = cpu;
588
589 if (i)
590 map->len = i;
591 else {
592 kfree(map);
593 map = NULL;
594 }
595
596 spin_lock(&rps_map_lock);
597 old_map = queue->rps_map;
598 rcu_assign_pointer(queue->rps_map, map);
599 spin_unlock(&rps_map_lock);
600
601 if (old_map)
602 call_rcu(&old_map->rcu, rps_map_release);
603
604 free_cpumask_var(mask);
605 return len;
606}
607
608static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
609 struct rx_queue_attribute *attr,
610 char *buf)
611{
612 struct rps_dev_flow_table *flow_table;
613 unsigned int val = 0;
614
615 rcu_read_lock();
616 flow_table = rcu_dereference(queue->rps_flow_table);
617 if (flow_table)
618 val = flow_table->mask + 1;
619 rcu_read_unlock();
620
621 return sprintf(buf, "%u\n", val);
622}
623
624static void rps_dev_flow_table_release_work(struct work_struct *work)
625{
626 struct rps_dev_flow_table *table = container_of(work,
627 struct rps_dev_flow_table, free_work);
628
629 vfree(table);
630}
631
632static void rps_dev_flow_table_release(struct rcu_head *rcu)
633{
634 struct rps_dev_flow_table *table = container_of(rcu,
635 struct rps_dev_flow_table, rcu);
636
637 INIT_WORK(&table->free_work, rps_dev_flow_table_release_work);
638 schedule_work(&table->free_work);
639}
640
641static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
642 struct rx_queue_attribute *attr,
643 const char *buf, size_t len)
644{
645 unsigned int count;
646 char *endp;
647 struct rps_dev_flow_table *table, *old_table;
648 static DEFINE_SPINLOCK(rps_dev_flow_lock);
649
650 if (!capable(CAP_NET_ADMIN))
651 return -EPERM;
652
653 count = simple_strtoul(buf, &endp, 0);
654 if (endp == buf)
655 return -EINVAL;
656
657 if (count) {
658 int i;
659
660 if (count > 1<<30) {
661 /* Enforce a limit to prevent overflow */
662 return -EINVAL;
663 }
664 count = roundup_pow_of_two(count);
665 table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count));
666 if (!table)
667 return -ENOMEM;
668
669 table->mask = count - 1;
670 for (i = 0; i < count; i++)
671 table->flows[i].cpu = RPS_NO_CPU;
672 } else
673 table = NULL;
674
675 spin_lock(&rps_dev_flow_lock);
676 old_table = queue->rps_flow_table;
677 rcu_assign_pointer(queue->rps_flow_table, table);
678 spin_unlock(&rps_dev_flow_lock);
679
680 if (old_table)
681 call_rcu(&old_table->rcu, rps_dev_flow_table_release);
682
683 return len;
684}
685
686static struct rx_queue_attribute rps_cpus_attribute =
687 __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
688
689
690static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =
691 __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
692 show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
693
694static struct attribute *rx_queue_default_attrs[] = {
695 &rps_cpus_attribute.attr,
696 &rps_dev_flow_table_cnt_attribute.attr,
697 NULL
698};
699
700static void rx_queue_release(struct kobject *kobj)
701{
702 struct netdev_rx_queue *queue = to_rx_queue(kobj);
703 struct netdev_rx_queue *first = queue->first;
704
705 if (queue->rps_map)
706 call_rcu(&queue->rps_map->rcu, rps_map_release);
707
708 if (queue->rps_flow_table)
709 call_rcu(&queue->rps_flow_table->rcu,
710 rps_dev_flow_table_release);
711
712 if (atomic_dec_and_test(&first->count))
713 kfree(first);
714}
715
716static struct kobj_type rx_queue_ktype = {
717 .sysfs_ops = &rx_queue_sysfs_ops,
718 .release = rx_queue_release,
719 .default_attrs = rx_queue_default_attrs,
720};
721
722static int rx_queue_add_kobject(struct net_device *net, int index)
723{
724 struct netdev_rx_queue *queue = net->_rx + index;
725 struct kobject *kobj = &queue->kobj;
726 int error = 0;
727
728 kobj->kset = net->queues_kset;
729 error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
730 "rx-%u", index);
731 if (error) {
732 kobject_put(kobj);
733 return error;
734 }
735
736 kobject_uevent(kobj, KOBJ_ADD);
737
738 return error;
739}
740
741static int rx_queue_register_kobjects(struct net_device *net)
742{
743 int i;
744 int error = 0;
745
746 net->queues_kset = kset_create_and_add("queues",
747 NULL, &net->dev.kobj);
748 if (!net->queues_kset)
749 return -ENOMEM;
750 for (i = 0; i < net->num_rx_queues; i++) {
751 error = rx_queue_add_kobject(net, i);
752 if (error)
753 break;
754 }
755
756 if (error)
757 while (--i >= 0)
758 kobject_put(&net->_rx[i].kobj);
759
760 return error;
761}
762
763static void rx_queue_remove_kobjects(struct net_device *net)
764{
765 int i;
766
767 for (i = 0; i < net->num_rx_queues; i++)
768 kobject_put(&net->_rx[i].kobj);
769 kset_unregister(net->queues_kset);
770}
771#endif /* CONFIG_RPS */
772
773static const void *net_current_ns(void)
774{
775 return current->nsproxy->net_ns;
776}
777
778static const void *net_initial_ns(void)
779{
780 return &init_net;
781}
782
783static const void *net_netlink_ns(struct sock *sk)
784{
785 return sock_net(sk);
786}
787
788static struct kobj_ns_type_operations net_ns_type_operations = {
789 .type = KOBJ_NS_TYPE_NET,
790 .current_ns = net_current_ns,
791 .netlink_ns = net_netlink_ns,
792 .initial_ns = net_initial_ns,
793};
794
795static void net_kobj_ns_exit(struct net *net)
796{
797 kobj_ns_exit(KOBJ_NS_TYPE_NET, net);
798}
799
800static struct pernet_operations kobj_net_ops = {
801 .exit = net_kobj_ns_exit,
802};
803
804
472#ifdef CONFIG_HOTPLUG 805#ifdef CONFIG_HOTPLUG
473static int netdev_uevent(struct device *d, struct kobj_uevent_env *env) 806static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)
474{ 807{
475 struct net_device *dev = to_net_dev(d); 808 struct net_device *dev = to_net_dev(d);
476 int retval; 809 int retval;
477 810
478 if (!net_eq(dev_net(dev), &init_net))
479 return 0;
480
481 /* pass interface to uevent. */ 811 /* pass interface to uevent. */
482 retval = add_uevent_var(env, "INTERFACE=%s", dev->name); 812 retval = add_uevent_var(env, "INTERFACE=%s", dev->name);
483 if (retval) 813 if (retval)
@@ -507,6 +837,13 @@ static void netdev_release(struct device *d)
507 kfree((char *)dev - dev->padded); 837 kfree((char *)dev - dev->padded);
508} 838}
509 839
840static const void *net_namespace(struct device *d)
841{
842 struct net_device *dev;
843 dev = container_of(d, struct net_device, dev);
844 return dev_net(dev);
845}
846
510static struct class net_class = { 847static struct class net_class = {
511 .name = "net", 848 .name = "net",
512 .dev_release = netdev_release, 849 .dev_release = netdev_release,
@@ -516,6 +853,8 @@ static struct class net_class = {
516#ifdef CONFIG_HOTPLUG 853#ifdef CONFIG_HOTPLUG
517 .dev_uevent = netdev_uevent, 854 .dev_uevent = netdev_uevent,
518#endif 855#endif
856 .ns_type = &net_ns_type_operations,
857 .namespace = net_namespace,
519}; 858};
520 859
521/* Delete sysfs entries but hold kobject reference until after all 860/* Delete sysfs entries but hold kobject reference until after all
@@ -527,8 +866,9 @@ void netdev_unregister_kobject(struct net_device * net)
527 866
528 kobject_get(&dev->kobj); 867 kobject_get(&dev->kobj);
529 868
530 if (!net_eq(dev_net(net), &init_net)) 869#ifdef CONFIG_RPS
531 return; 870 rx_queue_remove_kobjects(net);
871#endif
532 872
533 device_del(dev); 873 device_del(dev);
534} 874}
@@ -538,7 +878,9 @@ int netdev_register_kobject(struct net_device *net)
538{ 878{
539 struct device *dev = &(net->dev); 879 struct device *dev = &(net->dev);
540 const struct attribute_group **groups = net->sysfs_groups; 880 const struct attribute_group **groups = net->sysfs_groups;
881 int error = 0;
541 882
883 device_initialize(dev);
542 dev->class = &net_class; 884 dev->class = &net_class;
543 dev->platform_data = net; 885 dev->platform_data = net;
544 dev->groups = groups; 886 dev->groups = groups;
@@ -561,10 +903,19 @@ int netdev_register_kobject(struct net_device *net)
561#endif 903#endif
562#endif /* CONFIG_SYSFS */ 904#endif /* CONFIG_SYSFS */
563 905
564 if (!net_eq(dev_net(net), &init_net)) 906 error = device_add(dev);
565 return 0; 907 if (error)
908 return error;
909
910#ifdef CONFIG_RPS
911 error = rx_queue_register_kobjects(net);
912 if (error) {
913 device_del(dev);
914 return error;
915 }
916#endif
566 917
567 return device_add(dev); 918 return error;
568} 919}
569 920
570int netdev_class_create_file(struct class_attribute *class_attr) 921int netdev_class_create_file(struct class_attribute *class_attr)
@@ -580,13 +931,9 @@ void netdev_class_remove_file(struct class_attribute *class_attr)
580EXPORT_SYMBOL(netdev_class_create_file); 931EXPORT_SYMBOL(netdev_class_create_file);
581EXPORT_SYMBOL(netdev_class_remove_file); 932EXPORT_SYMBOL(netdev_class_remove_file);
582 933
583void netdev_initialize_kobject(struct net_device *net)
584{
585 struct device *device = &(net->dev);
586 device_initialize(device);
587}
588
589int netdev_kobject_init(void) 934int netdev_kobject_init(void)
590{ 935{
936 kobj_ns_type_register(&net_ns_type_operations);
937 register_pernet_subsys(&kobj_net_ops);
591 return class_register(&net_class); 938 return class_register(&net_class);
592} 939}
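Note: the RPS attributes registered above appear under a per-device "queues" kset, so each receive queue exposes rx-<n>/rps_cpus (a hex CPU mask parsed by store_rps_map) and rx-<n>/rps_flow_cnt. A minimal sketch that steers queue 0 of an assumed eth0 to CPUs 0-1; it needs CAP_NET_ADMIN and is equivalent to echoing "3" into the file.

/* Write an RPS CPU mask to the sysfs attribute created above.
 * The interface name and mask are example values. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        const char *path = "/sys/class/net/eth0/queues/rx-0/rps_cpus";
        const char *mask = "3\n";       /* CPUs 0 and 1 */
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror(path);
                return 1;
        }
        if (write(fd, mask, strlen(mask)) < 0)
                perror("write");
        close(fd);
        return 0;
}

Writing "0" back clears the map and disables RPS for that queue, which corresponds to the map == NULL path in store_rps_map().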
diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h
index 14e7524260b3..805555e8b187 100644
--- a/net/core/net-sysfs.h
+++ b/net/core/net-sysfs.h
@@ -4,5 +4,4 @@
4int netdev_kobject_init(void); 4int netdev_kobject_init(void);
5int netdev_register_kobject(struct net_device *); 5int netdev_register_kobject(struct net_device *);
6void netdev_unregister_kobject(struct net_device *); 6void netdev_unregister_kobject(struct net_device *);
7void netdev_initialize_kobject(struct net_device *);
8#endif 7#endif
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index bd8c4712ea24..c988e685433a 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -27,6 +27,51 @@ EXPORT_SYMBOL(init_net);
27 27
28#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ 28#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
29 29
30static void net_generic_release(struct rcu_head *rcu)
31{
32 struct net_generic *ng;
33
34 ng = container_of(rcu, struct net_generic, rcu);
35 kfree(ng);
36}
37
38static int net_assign_generic(struct net *net, int id, void *data)
39{
40 struct net_generic *ng, *old_ng;
41
42 BUG_ON(!mutex_is_locked(&net_mutex));
43 BUG_ON(id == 0);
44
45 ng = old_ng = net->gen;
46 if (old_ng->len >= id)
47 goto assign;
48
49 ng = kzalloc(sizeof(struct net_generic) +
50 id * sizeof(void *), GFP_KERNEL);
51 if (ng == NULL)
52 return -ENOMEM;
53
54 /*
55 * Some synchronisation notes:
56 *
57 * The net_generic explores the net->gen array inside rcu
58 * read section. Besides once set the net->gen->ptr[x]
59 * pointer never changes (see rules in netns/generic.h).
60 *
61 * That said, we simply duplicate this array and schedule
62 * the old copy for kfree after a grace period.
63 */
64
65 ng->len = id;
66 memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
67
68 rcu_assign_pointer(net->gen, ng);
69 call_rcu(&old_ng->rcu, net_generic_release);
70assign:
71 ng->ptr[id - 1] = data;
72 return 0;
73}
74
30static int ops_init(const struct pernet_operations *ops, struct net *net) 75static int ops_init(const struct pernet_operations *ops, struct net *net)
31{ 76{
32 int err; 77 int err;
@@ -469,10 +514,10 @@ EXPORT_SYMBOL_GPL(register_pernet_subsys);
469 * addition run the exit method for all existing network 514 * addition run the exit method for all existing network
470 * namespaces. 515 * namespaces.
471 */ 516 */
472void unregister_pernet_subsys(struct pernet_operations *module) 517void unregister_pernet_subsys(struct pernet_operations *ops)
473{ 518{
474 mutex_lock(&net_mutex); 519 mutex_lock(&net_mutex);
475 unregister_pernet_operations(module); 520 unregister_pernet_operations(ops);
476 mutex_unlock(&net_mutex); 521 mutex_unlock(&net_mutex);
477} 522}
478EXPORT_SYMBOL_GPL(unregister_pernet_subsys); 523EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
@@ -526,49 +571,3 @@ void unregister_pernet_device(struct pernet_operations *ops)
526 mutex_unlock(&net_mutex); 571 mutex_unlock(&net_mutex);
527} 572}
528EXPORT_SYMBOL_GPL(unregister_pernet_device); 573EXPORT_SYMBOL_GPL(unregister_pernet_device);
529
530static void net_generic_release(struct rcu_head *rcu)
531{
532 struct net_generic *ng;
533
534 ng = container_of(rcu, struct net_generic, rcu);
535 kfree(ng);
536}
537
538int net_assign_generic(struct net *net, int id, void *data)
539{
540 struct net_generic *ng, *old_ng;
541
542 BUG_ON(!mutex_is_locked(&net_mutex));
543 BUG_ON(id == 0);
544
545 ng = old_ng = net->gen;
546 if (old_ng->len >= id)
547 goto assign;
548
549 ng = kzalloc(sizeof(struct net_generic) +
550 id * sizeof(void *), GFP_KERNEL);
551 if (ng == NULL)
552 return -ENOMEM;
553
554 /*
555 * Some synchronisation notes:
556 *
557 * The net_generic explores the net->gen array inside rcu
558 * read section. Besides once set the net->gen->ptr[x]
559 * pointer never changes (see rules in netns/generic.h).
560 *
561 * That said, we simply duplicate this array and schedule
562 * the old copy for kfree after a grace period.
563 */
564
565 ng->len = id;
566 memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
567
568 rcu_assign_pointer(net->gen, ng);
569 call_rcu(&old_ng->rcu, net_generic_release);
570assign:
571 ng->ptr[id - 1] = data;
572 return 0;
573}
574EXPORT_SYMBOL_GPL(net_assign_generic);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a58f59b97597..94825b109551 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -179,9 +179,8 @@ static void service_arp_queue(struct netpoll_info *npi)
179 } 179 }
180} 180}
181 181
182void netpoll_poll(struct netpoll *np) 182void netpoll_poll_dev(struct net_device *dev)
183{ 183{
184 struct net_device *dev = np->dev;
185 const struct net_device_ops *ops; 184 const struct net_device_ops *ops;
186 185
187 if (!dev || !netif_running(dev)) 186 if (!dev || !netif_running(dev))
@@ -201,6 +200,11 @@ void netpoll_poll(struct netpoll *np)
201 zap_completion_queue(); 200 zap_completion_queue();
202} 201}
203 202
203void netpoll_poll(struct netpoll *np)
204{
205 netpoll_poll_dev(np->dev);
206}
207
204static void refill_skbs(void) 208static void refill_skbs(void)
205{ 209{
206 struct sk_buff *skb; 210 struct sk_buff *skb;
@@ -282,7 +286,7 @@ static int netpoll_owner_active(struct net_device *dev)
282 return 0; 286 return 0;
283} 287}
284 288
285static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) 289void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
286{ 290{
287 int status = NETDEV_TX_BUSY; 291 int status = NETDEV_TX_BUSY;
288 unsigned long tries; 292 unsigned long tries;
@@ -308,7 +312,9 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
308 tries > 0; --tries) { 312 tries > 0; --tries) {
309 if (__netif_tx_trylock(txq)) { 313 if (__netif_tx_trylock(txq)) {
310 if (!netif_tx_queue_stopped(txq)) { 314 if (!netif_tx_queue_stopped(txq)) {
315 dev->priv_flags |= IFF_IN_NETPOLL;
311 status = ops->ndo_start_xmit(skb, dev); 316 status = ops->ndo_start_xmit(skb, dev);
317 dev->priv_flags &= ~IFF_IN_NETPOLL;
312 if (status == NETDEV_TX_OK) 318 if (status == NETDEV_TX_OK)
313 txq_trans_update(txq); 319 txq_trans_update(txq);
314 } 320 }
@@ -756,7 +762,10 @@ int netpoll_setup(struct netpoll *np)
756 atomic_inc(&npinfo->refcnt); 762 atomic_inc(&npinfo->refcnt);
757 } 763 }
758 764
759 if (!ndev->netdev_ops->ndo_poll_controller) { 765 npinfo->netpoll = np;
766
767 if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||
768 !ndev->netdev_ops->ndo_poll_controller) {
760 printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n", 769 printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
761 np->name, np->dev_name); 770 np->name, np->dev_name);
762 err = -ENOTSUPP; 771 err = -ENOTSUPP;
@@ -878,6 +887,7 @@ void netpoll_cleanup(struct netpoll *np)
878 } 887 }
879 888
880 if (atomic_dec_and_test(&npinfo->refcnt)) { 889 if (atomic_dec_and_test(&npinfo->refcnt)) {
890 const struct net_device_ops *ops;
881 skb_queue_purge(&npinfo->arp_tx); 891 skb_queue_purge(&npinfo->arp_tx);
882 skb_queue_purge(&npinfo->txq); 892 skb_queue_purge(&npinfo->txq);
883 cancel_rearming_delayed_work(&npinfo->tx_work); 893 cancel_rearming_delayed_work(&npinfo->tx_work);
@@ -885,7 +895,11 @@ void netpoll_cleanup(struct netpoll *np)
885 /* clean after last, unfinished work */ 895 /* clean after last, unfinished work */
886 __skb_queue_purge(&npinfo->txq); 896 __skb_queue_purge(&npinfo->txq);
887 kfree(npinfo); 897 kfree(npinfo);
888 np->dev->npinfo = NULL; 898 ops = np->dev->netdev_ops;
899 if (ops->ndo_netpoll_cleanup)
900 ops->ndo_netpoll_cleanup(np->dev);
901 else
902 np->dev->npinfo = NULL;
889 } 903 }
890 } 904 }
891 905
@@ -908,6 +922,7 @@ void netpoll_set_trap(int trap)
908 atomic_dec(&trapped); 922 atomic_dec(&trapped);
909} 923}
910 924
925EXPORT_SYMBOL(netpoll_send_skb);
911EXPORT_SYMBOL(netpoll_set_trap); 926EXPORT_SYMBOL(netpoll_set_trap);
912EXPORT_SYMBOL(netpoll_trap); 927EXPORT_SYMBOL(netpoll_trap);
913EXPORT_SYMBOL(netpoll_print_options); 928EXPORT_SYMBOL(netpoll_print_options);
@@ -915,4 +930,5 @@ EXPORT_SYMBOL(netpoll_parse_options);
915EXPORT_SYMBOL(netpoll_setup); 930EXPORT_SYMBOL(netpoll_setup);
916EXPORT_SYMBOL(netpoll_cleanup); 931EXPORT_SYMBOL(netpoll_cleanup);
917EXPORT_SYMBOL(netpoll_send_udp); 932EXPORT_SYMBOL(netpoll_send_udp);
933EXPORT_SYMBOL(netpoll_poll_dev);
918EXPORT_SYMBOL(netpoll_poll); 934EXPORT_SYMBOL(netpoll_poll);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 43923811bd6a..2ad68da418df 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -169,7 +169,7 @@
169#include <asm/dma.h> 169#include <asm/dma.h>
170#include <asm/div64.h> /* do_div */ 170#include <asm/div64.h> /* do_div */
171 171
172#define VERSION "2.72" 172#define VERSION "2.73"
173#define IP_NAME_SZ 32 173#define IP_NAME_SZ 32
174#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */ 174#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */
175#define MPLS_STACK_BOTTOM htonl(0x00000100) 175#define MPLS_STACK_BOTTOM htonl(0x00000100)
@@ -190,6 +190,7 @@
190#define F_IPSEC_ON (1<<12) /* ipsec on for flows */ 190#define F_IPSEC_ON (1<<12) /* ipsec on for flows */
191#define F_QUEUE_MAP_RND (1<<13) /* queue map Random */ 191#define F_QUEUE_MAP_RND (1<<13) /* queue map Random */
192#define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */ 192#define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */
193#define F_NODE (1<<15) /* Node memory alloc*/
193 194
194/* Thread control flag bits */ 195/* Thread control flag bits */
195#define T_STOP (1<<0) /* Stop run */ 196#define T_STOP (1<<0) /* Stop run */
@@ -372,6 +373,7 @@ struct pktgen_dev {
372 373
373 u16 queue_map_min; 374 u16 queue_map_min;
374 u16 queue_map_max; 375 u16 queue_map_max;
376 int node; /* Memory node */
375 377
376#ifdef CONFIG_XFRM 378#ifdef CONFIG_XFRM
377 __u8 ipsmode; /* IPSEC mode (config) */ 379 __u8 ipsmode; /* IPSEC mode (config) */
@@ -607,6 +609,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
607 if (pkt_dev->traffic_class) 609 if (pkt_dev->traffic_class)
608 seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class); 610 seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class);
609 611
612 if (pkt_dev->node >= 0)
613 seq_printf(seq, " node: %d\n", pkt_dev->node);
614
610 seq_printf(seq, " Flags: "); 615 seq_printf(seq, " Flags: ");
611 616
612 if (pkt_dev->flags & F_IPV6) 617 if (pkt_dev->flags & F_IPV6)
@@ -660,6 +665,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
660 if (pkt_dev->flags & F_SVID_RND) 665 if (pkt_dev->flags & F_SVID_RND)
661 seq_printf(seq, "SVID_RND "); 666 seq_printf(seq, "SVID_RND ");
662 667
668 if (pkt_dev->flags & F_NODE)
669 seq_printf(seq, "NODE_ALLOC ");
670
663 seq_puts(seq, "\n"); 671 seq_puts(seq, "\n");
664 672
665 /* not really stopped, more like last-running-at */ 673 /* not really stopped, more like last-running-at */
@@ -1074,6 +1082,21 @@ static ssize_t pktgen_if_write(struct file *file,
1074 pkt_dev->dst_mac_count); 1082 pkt_dev->dst_mac_count);
1075 return count; 1083 return count;
1076 } 1084 }
1085 if (!strcmp(name, "node")) {
1086 len = num_arg(&user_buffer[i], 10, &value);
1087 if (len < 0)
1088 return len;
1089
1090 i += len;
1091
1092 if (node_possible(value)) {
1093 pkt_dev->node = value;
1094 sprintf(pg_result, "OK: node=%d", pkt_dev->node);
1095 }
1096 else
1097 sprintf(pg_result, "ERROR: node not possible");
1098 return count;
1099 }
1077 if (!strcmp(name, "flag")) { 1100 if (!strcmp(name, "flag")) {
1078 char f[32]; 1101 char f[32];
1079 memset(f, 0, 32); 1102 memset(f, 0, 32);
@@ -1166,12 +1189,18 @@ static ssize_t pktgen_if_write(struct file *file,
1166 else if (strcmp(f, "!IPV6") == 0) 1189 else if (strcmp(f, "!IPV6") == 0)
1167 pkt_dev->flags &= ~F_IPV6; 1190 pkt_dev->flags &= ~F_IPV6;
1168 1191
1192 else if (strcmp(f, "NODE_ALLOC") == 0)
1193 pkt_dev->flags |= F_NODE;
1194
1195 else if (strcmp(f, "!NODE_ALLOC") == 0)
1196 pkt_dev->flags &= ~F_NODE;
1197
1169 else { 1198 else {
1170 sprintf(pg_result, 1199 sprintf(pg_result,
1171 "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", 1200 "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
1172 f, 1201 f,
1173 "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, " 1202 "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, "
1174 "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC\n"); 1203 "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC, NODE_ALLOC\n");
1175 return count; 1204 return count;
1176 } 1205 }
1177 sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags); 1206 sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
@@ -2572,9 +2601,27 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
2572 mod_cur_headers(pkt_dev); 2601 mod_cur_headers(pkt_dev);
2573 2602
2574 datalen = (odev->hard_header_len + 16) & ~0xf; 2603 datalen = (odev->hard_header_len + 16) & ~0xf;
2575 skb = __netdev_alloc_skb(odev, 2604
2576 pkt_dev->cur_pkt_size + 64 2605 if (pkt_dev->flags & F_NODE) {
2577 + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT); 2606 int node;
2607
2608 if (pkt_dev->node >= 0)
2609 node = pkt_dev->node;
2610 else
2611 node = numa_node_id();
2612
2613 skb = __alloc_skb(NET_SKB_PAD + pkt_dev->cur_pkt_size + 64
2614 + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT, 0, node);
2615 if (likely(skb)) {
2616 skb_reserve(skb, NET_SKB_PAD);
2617 skb->dev = odev;
2618 }
2619 }
2620 else
2621 skb = __netdev_alloc_skb(odev,
2622 pkt_dev->cur_pkt_size + 64
2623 + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT);
2624
2578 if (!skb) { 2625 if (!skb) {
2579 sprintf(pkt_dev->result, "No memory"); 2626 sprintf(pkt_dev->result, "No memory");
2580 return NULL; 2627 return NULL;
@@ -3674,6 +3721,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
3674 pkt_dev->svlan_p = 0; 3721 pkt_dev->svlan_p = 0;
3675 pkt_dev->svlan_cfi = 0; 3722 pkt_dev->svlan_cfi = 0;
3676 pkt_dev->svlan_id = 0xffff; 3723 pkt_dev->svlan_id = 0xffff;
3724 pkt_dev->node = -1;
3677 3725
3678 err = pktgen_setup_dev(pkt_dev, ifname); 3726 err = pktgen_setup_dev(pkt_dev, ifname);
3679 if (err) 3727 if (err)
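Note: pktgen's new NODE_ALLOC flag and "node" parameter are driven through the existing /proc/net/pktgen interface. A short sketch that pins skb allocation for an already-added pktgen device to NUMA node 0; the device path and node number are example values.

/* Configure the NODE_ALLOC flag and memory node added above via procfs. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

static int pg_write(const char *path, const char *cmd)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0 || write(fd, cmd, strlen(cmd)) < 0) {
                perror(path);
                if (fd >= 0)
                        close(fd);
                return -1;
        }
        close(fd);
        return 0;
}

int main(void)
{
        const char *dev = "/proc/net/pktgen/eth0";      /* example device */

        pg_write(dev, "flag NODE_ALLOC\n");     /* sets F_NODE */
        pg_write(dev, "node 0\n");              /* pkt_dev->node = 0 */
        return 0;
}

If the requested node is not in node_possible_map, the "node" write above reports "ERROR: node not possible" and leaves pkt_dev->node unchanged.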
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index fe776c9ddeca..e4b9870e4706 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -98,7 +98,7 @@ int lockdep_rtnl_is_held(void)
98EXPORT_SYMBOL(lockdep_rtnl_is_held); 98EXPORT_SYMBOL(lockdep_rtnl_is_held);
99#endif /* #ifdef CONFIG_PROVE_LOCKING */ 99#endif /* #ifdef CONFIG_PROVE_LOCKING */
100 100
101static struct rtnl_link *rtnl_msg_handlers[NPROTO]; 101static struct rtnl_link *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
102 102
103static inline int rtm_msgindex(int msgtype) 103static inline int rtm_msgindex(int msgtype)
104{ 104{
@@ -118,7 +118,11 @@ static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex)
118{ 118{
119 struct rtnl_link *tab; 119 struct rtnl_link *tab;
120 120
121 tab = rtnl_msg_handlers[protocol]; 121 if (protocol <= RTNL_FAMILY_MAX)
122 tab = rtnl_msg_handlers[protocol];
123 else
124 tab = NULL;
125
122 if (tab == NULL || tab[msgindex].doit == NULL) 126 if (tab == NULL || tab[msgindex].doit == NULL)
123 tab = rtnl_msg_handlers[PF_UNSPEC]; 127 tab = rtnl_msg_handlers[PF_UNSPEC];
124 128
@@ -129,7 +133,11 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)
129{ 133{
130 struct rtnl_link *tab; 134 struct rtnl_link *tab;
131 135
132 tab = rtnl_msg_handlers[protocol]; 136 if (protocol <= RTNL_FAMILY_MAX)
137 tab = rtnl_msg_handlers[protocol];
138 else
139 tab = NULL;
140
133 if (tab == NULL || tab[msgindex].dumpit == NULL) 141 if (tab == NULL || tab[msgindex].dumpit == NULL)
134 tab = rtnl_msg_handlers[PF_UNSPEC]; 142 tab = rtnl_msg_handlers[PF_UNSPEC];
135 143
@@ -159,7 +167,7 @@ int __rtnl_register(int protocol, int msgtype,
159 struct rtnl_link *tab; 167 struct rtnl_link *tab;
160 int msgindex; 168 int msgindex;
161 169
162 BUG_ON(protocol < 0 || protocol >= NPROTO); 170 BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
163 msgindex = rtm_msgindex(msgtype); 171 msgindex = rtm_msgindex(msgtype);
164 172
165 tab = rtnl_msg_handlers[protocol]; 173 tab = rtnl_msg_handlers[protocol];
@@ -211,7 +219,7 @@ int rtnl_unregister(int protocol, int msgtype)
211{ 219{
212 int msgindex; 220 int msgindex;
213 221
214 BUG_ON(protocol < 0 || protocol >= NPROTO); 222 BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
215 msgindex = rtm_msgindex(msgtype); 223 msgindex = rtm_msgindex(msgtype);
216 224
217 if (rtnl_msg_handlers[protocol] == NULL) 225 if (rtnl_msg_handlers[protocol] == NULL)
@@ -233,7 +241,7 @@ EXPORT_SYMBOL_GPL(rtnl_unregister);
233 */ 241 */
234void rtnl_unregister_all(int protocol) 242void rtnl_unregister_all(int protocol)
235{ 243{
236 BUG_ON(protocol < 0 || protocol >= NPROTO); 244 BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
237 245
238 kfree(rtnl_msg_handlers[protocol]); 246 kfree(rtnl_msg_handlers[protocol]);
239 rtnl_msg_handlers[protocol] = NULL; 247 rtnl_msg_handlers[protocol] = NULL;
@@ -600,17 +608,83 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
600 608
601 a->rx_compressed = b->rx_compressed; 609 a->rx_compressed = b->rx_compressed;
602 a->tx_compressed = b->tx_compressed; 610 a->tx_compressed = b->tx_compressed;
603}; 611}
604 612
613static void copy_rtnl_link_stats64(void *v, const struct net_device_stats *b)
614{
615 struct rtnl_link_stats64 a;
616
617 a.rx_packets = b->rx_packets;
618 a.tx_packets = b->tx_packets;
619 a.rx_bytes = b->rx_bytes;
620 a.tx_bytes = b->tx_bytes;
621 a.rx_errors = b->rx_errors;
622 a.tx_errors = b->tx_errors;
623 a.rx_dropped = b->rx_dropped;
624 a.tx_dropped = b->tx_dropped;
625
626 a.multicast = b->multicast;
627 a.collisions = b->collisions;
628
629 a.rx_length_errors = b->rx_length_errors;
630 a.rx_over_errors = b->rx_over_errors;
631 a.rx_crc_errors = b->rx_crc_errors;
632 a.rx_frame_errors = b->rx_frame_errors;
633 a.rx_fifo_errors = b->rx_fifo_errors;
634 a.rx_missed_errors = b->rx_missed_errors;
635
636 a.tx_aborted_errors = b->tx_aborted_errors;
637 a.tx_carrier_errors = b->tx_carrier_errors;
638 a.tx_fifo_errors = b->tx_fifo_errors;
639 a.tx_heartbeat_errors = b->tx_heartbeat_errors;
640 a.tx_window_errors = b->tx_window_errors;
641
642 a.rx_compressed = b->rx_compressed;
643 a.tx_compressed = b->tx_compressed;
644 memcpy(v, &a, sizeof(a));
645}
646
647/* All VF info */
605static inline int rtnl_vfinfo_size(const struct net_device *dev) 648static inline int rtnl_vfinfo_size(const struct net_device *dev)
606{ 649{
607 if (dev->dev.parent && dev_is_pci(dev->dev.parent)) 650 if (dev->dev.parent && dev_is_pci(dev->dev.parent)) {
608 return dev_num_vf(dev->dev.parent) * 651
609 sizeof(struct ifla_vf_info); 652 int num_vfs = dev_num_vf(dev->dev.parent);
610 else 653 size_t size = nlmsg_total_size(sizeof(struct nlattr));
654 size += nlmsg_total_size(num_vfs * sizeof(struct nlattr));
655 size += num_vfs * (sizeof(struct ifla_vf_mac) +
656 sizeof(struct ifla_vf_vlan) +
657 sizeof(struct ifla_vf_tx_rate));
658 return size;
659 } else
611 return 0; 660 return 0;
612} 661}
613 662
663static size_t rtnl_port_size(const struct net_device *dev)
664{
665 size_t port_size = nla_total_size(4) /* PORT_VF */
666 + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */
667 + nla_total_size(sizeof(struct ifla_port_vsi))
668 /* PORT_VSI_TYPE */
669 + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */
670 + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */
671 + nla_total_size(1) /* PROT_VDP_REQUEST */
672 + nla_total_size(2); /* PORT_VDP_RESPONSE */
673 size_t vf_ports_size = nla_total_size(sizeof(struct nlattr));
674 size_t vf_port_size = nla_total_size(sizeof(struct nlattr))
675 + port_size;
676 size_t port_self_size = nla_total_size(sizeof(struct nlattr))
677 + port_size;
678
679 if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent)
680 return 0;
681 if (dev_num_vf(dev->dev.parent))
682 return port_self_size + vf_ports_size +
683 vf_port_size * dev_num_vf(dev->dev.parent);
684 else
685 return port_self_size;
686}
687
614static inline size_t if_nlmsg_size(const struct net_device *dev) 688static inline size_t if_nlmsg_size(const struct net_device *dev)
615{ 689{
616 return NLMSG_ALIGN(sizeof(struct ifinfomsg)) 690 return NLMSG_ALIGN(sizeof(struct ifinfomsg))
@@ -619,6 +693,7 @@ static inline size_t if_nlmsg_size(const struct net_device *dev)
619 + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ 693 + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */
620 + nla_total_size(sizeof(struct rtnl_link_ifmap)) 694 + nla_total_size(sizeof(struct rtnl_link_ifmap))
621 + nla_total_size(sizeof(struct rtnl_link_stats)) 695 + nla_total_size(sizeof(struct rtnl_link_stats))
696 + nla_total_size(sizeof(struct rtnl_link_stats64))
622 + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ 697 + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
623 + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ 698 + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */
624 + nla_total_size(4) /* IFLA_TXQLEN */ 699 + nla_total_size(4) /* IFLA_TXQLEN */
@@ -629,10 +704,83 @@ static inline size_t if_nlmsg_size(const struct net_device *dev)
629 + nla_total_size(1) /* IFLA_OPERSTATE */ 704 + nla_total_size(1) /* IFLA_OPERSTATE */
630 + nla_total_size(1) /* IFLA_LINKMODE */ 705 + nla_total_size(1) /* IFLA_LINKMODE */
631 + nla_total_size(4) /* IFLA_NUM_VF */ 706 + nla_total_size(4) /* IFLA_NUM_VF */
632 + nla_total_size(rtnl_vfinfo_size(dev)) /* IFLA_VFINFO */ 707 + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */
708 + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
633 + rtnl_link_get_size(dev); /* IFLA_LINKINFO */ 709 + rtnl_link_get_size(dev); /* IFLA_LINKINFO */
634} 710}
635 711
712static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
713{
714 struct nlattr *vf_ports;
715 struct nlattr *vf_port;
716 int vf;
717 int err;
718
719 vf_ports = nla_nest_start(skb, IFLA_VF_PORTS);
720 if (!vf_ports)
721 return -EMSGSIZE;
722
723 for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) {
724 vf_port = nla_nest_start(skb, IFLA_VF_PORT);
725 if (!vf_port) {
726 nla_nest_cancel(skb, vf_ports);
727 return -EMSGSIZE;
728 }
729 NLA_PUT_U32(skb, IFLA_PORT_VF, vf);
730 err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb);
731 if (err) {
732nla_put_failure:
733 nla_nest_cancel(skb, vf_port);
734 continue;
735 }
736 nla_nest_end(skb, vf_port);
737 }
738
739 nla_nest_end(skb, vf_ports);
740
741 return 0;
742}
743
744static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev)
745{
746 struct nlattr *port_self;
747 int err;
748
749 port_self = nla_nest_start(skb, IFLA_PORT_SELF);
750 if (!port_self)
751 return -EMSGSIZE;
752
753 err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb);
754 if (err) {
755 nla_nest_cancel(skb, port_self);
756 return err;
757 }
758
759 nla_nest_end(skb, port_self);
760
761 return 0;
762}
763
764static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev)
765{
766 int err;
767
768 if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent)
769 return 0;
770
771 err = rtnl_port_self_fill(skb, dev);
772 if (err)
773 return err;
774
775 if (dev_num_vf(dev->dev.parent)) {
776 err = rtnl_vf_ports_fill(skb, dev);
777 if (err)
778 return err;
779 }
780
781 return 0;
782}
783
636static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, 784static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
637 int type, u32 pid, u32 seq, u32 change, 785 int type, u32 pid, u32 seq, u32 change,
638 unsigned int flags) 786 unsigned int flags)
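The VF-port helpers added above all follow the standard netlink nesting pattern: open a nest with nla_nest_start(), emit attributes, close it with nla_nest_end(), and roll the whole nest back with nla_nest_cancel() on error. A condensed sketch of that pattern (hypothetical function name, a single VF, attribute types as introduced by this patch):

    #include <net/netlink.h>
    #include <linux/if_link.h>

    static int vf_port_nest_sketch(struct sk_buff *skb)
    {
            struct nlattr *ports, *port;

            ports = nla_nest_start(skb, IFLA_VF_PORTS);
            if (!ports)
                    return -EMSGSIZE;

            port = nla_nest_start(skb, IFLA_VF_PORT);
            if (!port)
                    goto nla_put_failure;

            NLA_PUT_U32(skb, IFLA_PORT_VF, 0);  /* jumps to nla_put_failure on error */

            nla_nest_end(skb, port);
            nla_nest_end(skb, ports);
            return 0;

    nla_put_failure:
            nla_nest_cancel(skb, ports);        /* discards the partial nest */
            return -EMSGSIZE;
    }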
@@ -698,17 +846,52 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
698 stats = dev_get_stats(dev); 846 stats = dev_get_stats(dev);
699 copy_rtnl_link_stats(nla_data(attr), stats); 847 copy_rtnl_link_stats(nla_data(attr), stats);
700 848
849 attr = nla_reserve(skb, IFLA_STATS64,
850 sizeof(struct rtnl_link_stats64));
851 if (attr == NULL)
852 goto nla_put_failure;
853 copy_rtnl_link_stats64(nla_data(attr), stats);
854
855 if (dev->dev.parent)
856 NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent));
857
701 if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) { 858 if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) {
702 int i; 859 int i;
703 struct ifla_vf_info ivi;
704 860
705 NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)); 861 struct nlattr *vfinfo, *vf;
706 for (i = 0; i < dev_num_vf(dev->dev.parent); i++) { 862 int num_vfs = dev_num_vf(dev->dev.parent);
863
864 vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST);
865 if (!vfinfo)
866 goto nla_put_failure;
867 for (i = 0; i < num_vfs; i++) {
868 struct ifla_vf_info ivi;
869 struct ifla_vf_mac vf_mac;
870 struct ifla_vf_vlan vf_vlan;
871 struct ifla_vf_tx_rate vf_tx_rate;
707 if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi)) 872 if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi))
708 break; 873 break;
709 NLA_PUT(skb, IFLA_VFINFO, sizeof(ivi), &ivi); 874 vf_mac.vf = vf_vlan.vf = vf_tx_rate.vf = ivi.vf;
875 memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
876 vf_vlan.vlan = ivi.vlan;
877 vf_vlan.qos = ivi.qos;
878 vf_tx_rate.rate = ivi.tx_rate;
879 vf = nla_nest_start(skb, IFLA_VF_INFO);
880 if (!vf) {
881 nla_nest_cancel(skb, vfinfo);
882 goto nla_put_failure;
883 }
884 NLA_PUT(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac);
885 NLA_PUT(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan);
886 NLA_PUT(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate);
887 nla_nest_end(skb, vf);
710 } 888 }
889 nla_nest_end(skb, vfinfo);
711 } 890 }
891
892 if (rtnl_port_fill(skb, dev))
893 goto nla_put_failure;
894
712 if (dev->rtnl_link_ops) { 895 if (dev->rtnl_link_ops) {
713 if (rtnl_link_fill(skb, dev) < 0) 896 if (rtnl_link_fill(skb, dev) < 0)
714 goto nla_put_failure; 897 goto nla_put_failure;
@@ -769,6 +952,22 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
769 [IFLA_LINKINFO] = { .type = NLA_NESTED }, 952 [IFLA_LINKINFO] = { .type = NLA_NESTED },
770 [IFLA_NET_NS_PID] = { .type = NLA_U32 }, 953 [IFLA_NET_NS_PID] = { .type = NLA_U32 },
771 [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, 954 [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 },
955 [IFLA_VFINFO_LIST] = {. type = NLA_NESTED },
956 [IFLA_VF_PORTS] = { .type = NLA_NESTED },
957 [IFLA_PORT_SELF] = { .type = NLA_NESTED },
958};
959EXPORT_SYMBOL(ifla_policy);
960
961static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
962 [IFLA_INFO_KIND] = { .type = NLA_STRING },
963 [IFLA_INFO_DATA] = { .type = NLA_NESTED },
964};
965
966static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = {
967 [IFLA_VF_INFO] = { .type = NLA_NESTED },
968};
969
970static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
772 [IFLA_VF_MAC] = { .type = NLA_BINARY, 971 [IFLA_VF_MAC] = { .type = NLA_BINARY,
773 .len = sizeof(struct ifla_vf_mac) }, 972 .len = sizeof(struct ifla_vf_mac) },
774 [IFLA_VF_VLAN] = { .type = NLA_BINARY, 973 [IFLA_VF_VLAN] = { .type = NLA_BINARY,
@@ -776,11 +975,19 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
776 [IFLA_VF_TX_RATE] = { .type = NLA_BINARY, 975 [IFLA_VF_TX_RATE] = { .type = NLA_BINARY,
777 .len = sizeof(struct ifla_vf_tx_rate) }, 976 .len = sizeof(struct ifla_vf_tx_rate) },
778}; 977};
779EXPORT_SYMBOL(ifla_policy);
780 978
781static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { 979static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
782 [IFLA_INFO_KIND] = { .type = NLA_STRING }, 980 [IFLA_PORT_VF] = { .type = NLA_U32 },
783 [IFLA_INFO_DATA] = { .type = NLA_NESTED }, 981 [IFLA_PORT_PROFILE] = { .type = NLA_STRING,
982 .len = PORT_PROFILE_MAX },
983 [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY,
984 .len = sizeof(struct ifla_port_vsi)},
985 [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY,
986 .len = PORT_UUID_MAX },
987 [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING,
988 .len = PORT_UUID_MAX },
989 [IFLA_PORT_REQUEST] = { .type = NLA_U8, },
990 [IFLA_PORT_RESPONSE] = { .type = NLA_U16, },
784}; 991};
785 992
786struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) 993struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
@@ -812,6 +1019,52 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
812 return 0; 1019 return 0;
813} 1020}
814 1021
1022static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)
1023{
1024 int rem, err = -EINVAL;
1025 struct nlattr *vf;
1026 const struct net_device_ops *ops = dev->netdev_ops;
1027
1028 nla_for_each_nested(vf, attr, rem) {
1029 switch (nla_type(vf)) {
1030 case IFLA_VF_MAC: {
1031 struct ifla_vf_mac *ivm;
1032 ivm = nla_data(vf);
1033 err = -EOPNOTSUPP;
1034 if (ops->ndo_set_vf_mac)
1035 err = ops->ndo_set_vf_mac(dev, ivm->vf,
1036 ivm->mac);
1037 break;
1038 }
1039 case IFLA_VF_VLAN: {
1040 struct ifla_vf_vlan *ivv;
1041 ivv = nla_data(vf);
1042 err = -EOPNOTSUPP;
1043 if (ops->ndo_set_vf_vlan)
1044 err = ops->ndo_set_vf_vlan(dev, ivv->vf,
1045 ivv->vlan,
1046 ivv->qos);
1047 break;
1048 }
1049 case IFLA_VF_TX_RATE: {
1050 struct ifla_vf_tx_rate *ivt;
1051 ivt = nla_data(vf);
1052 err = -EOPNOTSUPP;
1053 if (ops->ndo_set_vf_tx_rate)
1054 err = ops->ndo_set_vf_tx_rate(dev, ivt->vf,
1055 ivt->rate);
1056 break;
1057 }
1058 default:
1059 err = -EINVAL;
1060 break;
1061 }
1062 if (err)
1063 break;
1064 }
1065 return err;
1066}
1067
815static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, 1068static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
816 struct nlattr **tb, char *ifname, int modified) 1069 struct nlattr **tb, char *ifname, int modified)
817{ 1070{
@@ -942,37 +1195,61 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
942 write_unlock_bh(&dev_base_lock); 1195 write_unlock_bh(&dev_base_lock);
943 } 1196 }
944 1197
945 if (tb[IFLA_VF_MAC]) { 1198 if (tb[IFLA_VFINFO_LIST]) {
946 struct ifla_vf_mac *ivm; 1199 struct nlattr *attr;
947 ivm = nla_data(tb[IFLA_VF_MAC]); 1200 int rem;
948 err = -EOPNOTSUPP; 1201 nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) {
949 if (ops->ndo_set_vf_mac) 1202 if (nla_type(attr) != IFLA_VF_INFO)
950 err = ops->ndo_set_vf_mac(dev, ivm->vf, ivm->mac); 1203 goto errout;
951 if (err < 0) 1204 err = do_setvfinfo(dev, attr);
952 goto errout; 1205 if (err < 0)
953 modified = 1; 1206 goto errout;
1207 modified = 1;
1208 }
954 } 1209 }
1210 err = 0;
1211
1212 if (tb[IFLA_VF_PORTS]) {
1213 struct nlattr *port[IFLA_PORT_MAX+1];
1214 struct nlattr *attr;
1215 int vf;
1216 int rem;
955 1217
956 if (tb[IFLA_VF_VLAN]) {
957 struct ifla_vf_vlan *ivv;
958 ivv = nla_data(tb[IFLA_VF_VLAN]);
959 err = -EOPNOTSUPP; 1218 err = -EOPNOTSUPP;
960 if (ops->ndo_set_vf_vlan) 1219 if (!ops->ndo_set_vf_port)
961 err = ops->ndo_set_vf_vlan(dev, ivv->vf,
962 ivv->vlan,
963 ivv->qos);
964 if (err < 0)
965 goto errout; 1220 goto errout;
966 modified = 1; 1221
1222 nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) {
1223 if (nla_type(attr) != IFLA_VF_PORT)
1224 continue;
1225 err = nla_parse_nested(port, IFLA_PORT_MAX,
1226 attr, ifla_port_policy);
1227 if (err < 0)
1228 goto errout;
1229 if (!port[IFLA_PORT_VF]) {
1230 err = -EOPNOTSUPP;
1231 goto errout;
1232 }
1233 vf = nla_get_u32(port[IFLA_PORT_VF]);
1234 err = ops->ndo_set_vf_port(dev, vf, port);
1235 if (err < 0)
1236 goto errout;
1237 modified = 1;
1238 }
967 } 1239 }
968 err = 0; 1240 err = 0;
969 1241
970 if (tb[IFLA_VF_TX_RATE]) { 1242 if (tb[IFLA_PORT_SELF]) {
971 struct ifla_vf_tx_rate *ivt; 1243 struct nlattr *port[IFLA_PORT_MAX+1];
972 ivt = nla_data(tb[IFLA_VF_TX_RATE]); 1244
1245 err = nla_parse_nested(port, IFLA_PORT_MAX,
1246 tb[IFLA_PORT_SELF], ifla_port_policy);
1247 if (err < 0)
1248 goto errout;
1249
973 err = -EOPNOTSUPP; 1250 err = -EOPNOTSUPP;
974 if (ops->ndo_set_vf_tx_rate) 1251 if (ops->ndo_set_vf_port)
975 err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, ivt->rate); 1252 err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port);
976 if (err < 0) 1253 if (err < 0)
977 goto errout; 1254 goto errout;
978 modified = 1; 1255 modified = 1;
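Both new do_setlink() branches rely on nla_parse_nested() with ifla_port_policy and then hand the parsed attribute table to ndo_set_vf_port(). A compact sketch of the IFLA_PORT_SELF case (hypothetical function name; ifla_port_policy and PORT_SELF_VF are the ones defined by this patch):

    #include <net/netlink.h>
    #include <linux/netdevice.h>
    #include <linux/if_link.h>

    static int set_port_self_sketch(struct net_device *dev, struct nlattr *attr)
    {
            struct nlattr *port[IFLA_PORT_MAX + 1];
            int err;

            err = nla_parse_nested(port, IFLA_PORT_MAX, attr, ifla_port_policy);
            if (err < 0)
                    return err;

            if (!dev->netdev_ops->ndo_set_vf_port)
                    return -EOPNOTSUPP;

            return dev->netdev_ops->ndo_set_vf_port(dev, PORT_SELF_VF, port);
    }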
@@ -1336,7 +1613,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
1336 1613
1337 if (s_idx == 0) 1614 if (s_idx == 0)
1338 s_idx = 1; 1615 s_idx = 1;
1339 for (idx = 1; idx < NPROTO; idx++) { 1616 for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) {
1340 int type = cb->nlh->nlmsg_type-RTM_BASE; 1617 int type = cb->nlh->nlmsg_type-RTM_BASE;
1341 if (idx < s_idx || idx == PF_PACKET) 1618 if (idx < s_idx || idx == PF_PACKET)
1342 continue; 1619 continue;
@@ -1404,9 +1681,6 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
1404 return 0; 1681 return 0;
1405 1682
1406 family = ((struct rtgenmsg *)NLMSG_DATA(nlh))->rtgen_family; 1683 family = ((struct rtgenmsg *)NLMSG_DATA(nlh))->rtgen_family;
1407 if (family >= NPROTO)
1408 return -EAFNOSUPPORT;
1409
1410 sz_idx = type>>2; 1684 sz_idx = type>>2;
1411 kind = type&3; 1685 kind = type&3;
1412 1686
@@ -1474,6 +1748,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
1474 case NETDEV_POST_INIT: 1748 case NETDEV_POST_INIT:
1475 case NETDEV_REGISTER: 1749 case NETDEV_REGISTER:
1476 case NETDEV_CHANGE: 1750 case NETDEV_CHANGE:
1751 case NETDEV_PRE_TYPE_CHANGE:
1477 case NETDEV_GOING_DOWN: 1752 case NETDEV_GOING_DOWN:
1478 case NETDEV_UNREGISTER: 1753 case NETDEV_UNREGISTER:
1479 case NETDEV_UNREGISTER_BATCH: 1754 case NETDEV_UNREGISTER_BATCH:
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 931981774b1a..66d9c416851e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -117,7 +117,7 @@ static const struct pipe_buf_operations sock_pipe_buf_ops = {
117 * 117 *
118 * Out of line support code for skb_put(). Not user callable. 118 * Out of line support code for skb_put(). Not user callable.
119 */ 119 */
120void skb_over_panic(struct sk_buff *skb, int sz, void *here) 120static void skb_over_panic(struct sk_buff *skb, int sz, void *here)
121{ 121{
122 printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p " 122 printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p "
123 "data:%p tail:%#lx end:%#lx dev:%s\n", 123 "data:%p tail:%#lx end:%#lx dev:%s\n",
@@ -126,7 +126,6 @@ void skb_over_panic(struct sk_buff *skb, int sz, void *here)
126 skb->dev ? skb->dev->name : "<NULL>"); 126 skb->dev ? skb->dev->name : "<NULL>");
127 BUG(); 127 BUG();
128} 128}
129EXPORT_SYMBOL(skb_over_panic);
130 129
131/** 130/**
132 * skb_under_panic - private function 131 * skb_under_panic - private function
@@ -137,7 +136,7 @@ EXPORT_SYMBOL(skb_over_panic);
137 * Out of line support code for skb_push(). Not user callable. 136 * Out of line support code for skb_push(). Not user callable.
138 */ 137 */
139 138
140void skb_under_panic(struct sk_buff *skb, int sz, void *here) 139static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
141{ 140{
142 printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p " 141 printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p "
143 "data:%p tail:%#lx end:%#lx dev:%s\n", 142 "data:%p tail:%#lx end:%#lx dev:%s\n",
@@ -146,7 +145,6 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
146 skb->dev ? skb->dev->name : "<NULL>"); 145 skb->dev ? skb->dev->name : "<NULL>");
147 BUG(); 146 BUG();
148} 147}
149EXPORT_SYMBOL(skb_under_panic);
150 148
151/* Allocate a new skbuff. We do this ourselves so we can fill in a few 149/* Allocate a new skbuff. We do this ourselves so we can fill in a few
152 * 'private' fields and also do memory statistics to find all the 150 * 'private' fields and also do memory statistics to find all the
@@ -183,12 +181,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
183 skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); 181 skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
184 if (!skb) 182 if (!skb)
185 goto out; 183 goto out;
184 prefetchw(skb);
186 185
187 size = SKB_DATA_ALIGN(size); 186 size = SKB_DATA_ALIGN(size);
188 data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info), 187 data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
189 gfp_mask, node); 188 gfp_mask, node);
190 if (!data) 189 if (!data)
191 goto nodata; 190 goto nodata;
191 prefetchw(data + size);
192 192
193 /* 193 /*
194 * Only clear those fields we need to clear, not those that we will 194 * Only clear those fields we need to clear, not those that we will
@@ -210,15 +210,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
210 210
211 /* make sure we initialize shinfo sequentially */ 211 /* make sure we initialize shinfo sequentially */
212 shinfo = skb_shinfo(skb); 212 shinfo = skb_shinfo(skb);
213 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
213 atomic_set(&shinfo->dataref, 1); 214 atomic_set(&shinfo->dataref, 1);
214 shinfo->nr_frags = 0;
215 shinfo->gso_size = 0;
216 shinfo->gso_segs = 0;
217 shinfo->gso_type = 0;
218 shinfo->ip6_frag_id = 0;
219 shinfo->tx_flags.flags = 0;
220 skb_frag_list_init(skb);
221 memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps));
222 215
223 if (fclone) { 216 if (fclone) {
224 struct sk_buff *child = skb + 1; 217 struct sk_buff *child = skb + 1;
@@ -507,16 +500,10 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size)
507 return 0; 500 return 0;
508 501
509 skb_release_head_state(skb); 502 skb_release_head_state(skb);
503
510 shinfo = skb_shinfo(skb); 504 shinfo = skb_shinfo(skb);
505 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
511 atomic_set(&shinfo->dataref, 1); 506 atomic_set(&shinfo->dataref, 1);
512 shinfo->nr_frags = 0;
513 shinfo->gso_size = 0;
514 shinfo->gso_segs = 0;
515 shinfo->gso_type = 0;
516 shinfo->ip6_frag_id = 0;
517 shinfo->tx_flags.flags = 0;
518 skb_frag_list_init(skb);
519 memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps));
520 507
521 memset(skb, 0, offsetof(struct sk_buff, tail)); 508 memset(skb, 0, offsetof(struct sk_buff, tail));
522 skb->data = skb->head + NET_SKB_PAD; 509 skb->data = skb->head + NET_SKB_PAD;
@@ -533,7 +520,8 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
533 new->transport_header = old->transport_header; 520 new->transport_header = old->transport_header;
534 new->network_header = old->network_header; 521 new->network_header = old->network_header;
535 new->mac_header = old->mac_header; 522 new->mac_header = old->mac_header;
536 skb_dst_set(new, dst_clone(skb_dst(old))); 523 skb_dst_copy(new, old);
524 new->rxhash = old->rxhash;
537#ifdef CONFIG_XFRM 525#ifdef CONFIG_XFRM
538 new->sp = secpath_get(old->sp); 526 new->sp = secpath_get(old->sp);
539#endif 527#endif
@@ -581,6 +569,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
581 C(len); 569 C(len);
582 C(data_len); 570 C(data_len);
583 C(mac_len); 571 C(mac_len);
572 C(rxhash);
584 n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; 573 n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
585 n->cloned = 1; 574 n->cloned = 1;
586 n->nohdr = 0; 575 n->nohdr = 0;
@@ -1051,7 +1040,7 @@ EXPORT_SYMBOL(skb_push);
1051 */ 1040 */
1052unsigned char *skb_pull(struct sk_buff *skb, unsigned int len) 1041unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
1053{ 1042{
1054 return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len); 1043 return skb_pull_inline(skb, len);
1055} 1044}
1056EXPORT_SYMBOL(skb_pull); 1045EXPORT_SYMBOL(skb_pull);
1057 1046
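skb_pull() above now delegates to skb_pull_inline(); the header side of that change lives outside net/core and is therefore not in this diff, but it is assumed to carry the old body, roughly:

    /* Assumption: the inline added to include/linux/skbuff.h by this series
     * keeps the previous skb_pull() logic. */
    static inline unsigned char *skb_pull_inline(struct sk_buff *skb,
                                                 unsigned int len)
    {
            return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len);
    }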
diff --git a/net/core/sock.c b/net/core/sock.c
index c5812bbc2cc9..bf88a167c8f2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -307,6 +307,11 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
307 */ 307 */
308 skb_len = skb->len; 308 skb_len = skb->len;
309 309
310 /* we escape from rcu protected region, make sure we dont leak
311 * a norefcounted dst
312 */
313 skb_dst_force(skb);
314
310 spin_lock_irqsave(&list->lock, flags); 315 spin_lock_irqsave(&list->lock, flags);
311 skb->dropcount = atomic_read(&sk->sk_drops); 316 skb->dropcount = atomic_read(&sk->sk_drops);
312 __skb_queue_tail(list, skb); 317 __skb_queue_tail(list, skb);
@@ -327,6 +332,10 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
327 332
328 skb->dev = NULL; 333 skb->dev = NULL;
329 334
335 if (sk_rcvqueues_full(sk, skb)) {
336 atomic_inc(&sk->sk_drops);
337 goto discard_and_relse;
338 }
330 if (nested) 339 if (nested)
331 bh_lock_sock_nested(sk); 340 bh_lock_sock_nested(sk);
332 else 341 else
@@ -364,11 +373,11 @@ EXPORT_SYMBOL(sk_reset_txq);
364 373
365struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 374struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
366{ 375{
367 struct dst_entry *dst = sk->sk_dst_cache; 376 struct dst_entry *dst = __sk_dst_get(sk);
368 377
369 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { 378 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
370 sk_tx_queue_clear(sk); 379 sk_tx_queue_clear(sk);
371 sk->sk_dst_cache = NULL; 380 rcu_assign_pointer(sk->sk_dst_cache, NULL);
372 dst_release(dst); 381 dst_release(dst);
373 return NULL; 382 return NULL;
374 } 383 }
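__sk_dst_check() above switches from reading sk->sk_dst_cache directly to __sk_dst_get() and clears it with rcu_assign_pointer(), i.e. the cached dst is now RCU-managed. A sketch of what a lockless reader looks like under that scheme (hypothetical helper; callers that keep the dst past the RCU section must take their own reference):

    #include <net/sock.h>
    #include <net/dst.h>

    static bool sk_dst_is_stale(struct sock *sk, u32 cookie)
    {
            struct dst_entry *dst;
            bool stale;

            rcu_read_lock();
            dst = __sk_dst_get(sk);
            stale = dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL;
            rcu_read_unlock();

            return stale;
    }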
@@ -1157,7 +1166,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1157 skb_queue_head_init(&newsk->sk_async_wait_queue); 1166 skb_queue_head_init(&newsk->sk_async_wait_queue);
1158#endif 1167#endif
1159 1168
1160 rwlock_init(&newsk->sk_dst_lock); 1169 spin_lock_init(&newsk->sk_dst_lock);
1161 rwlock_init(&newsk->sk_callback_lock); 1170 rwlock_init(&newsk->sk_callback_lock);
1162 lockdep_set_class_and_name(&newsk->sk_callback_lock, 1171 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1163 af_callback_keys + newsk->sk_family, 1172 af_callback_keys + newsk->sk_family,
@@ -1207,7 +1216,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1207 */ 1216 */
1208 sk_refcnt_debug_inc(newsk); 1217 sk_refcnt_debug_inc(newsk);
1209 sk_set_socket(newsk, NULL); 1218 sk_set_socket(newsk, NULL);
1210 newsk->sk_sleep = NULL; 1219 newsk->sk_wq = NULL;
1211 1220
1212 if (newsk->sk_prot->sockets_allocated) 1221 if (newsk->sk_prot->sockets_allocated)
1213 percpu_counter_inc(newsk->sk_prot->sockets_allocated); 1222 percpu_counter_inc(newsk->sk_prot->sockets_allocated);
@@ -1227,6 +1236,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1227 sk->sk_route_caps = dst->dev->features; 1236 sk->sk_route_caps = dst->dev->features;
1228 if (sk->sk_route_caps & NETIF_F_GSO) 1237 if (sk->sk_route_caps & NETIF_F_GSO)
1229 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 1238 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1239 sk->sk_route_caps &= ~sk->sk_route_nocaps;
1230 if (sk_can_gso(sk)) { 1240 if (sk_can_gso(sk)) {
1231 if (dst->header_len) { 1241 if (dst->header_len) {
1232 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 1242 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
@@ -1395,7 +1405,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
1395 if (signal_pending(current)) 1405 if (signal_pending(current))
1396 break; 1406 break;
1397 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 1407 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1398 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 1408 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1399 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) 1409 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1400 break; 1410 break;
1401 if (sk->sk_shutdown & SEND_SHUTDOWN) 1411 if (sk->sk_shutdown & SEND_SHUTDOWN)
@@ -1404,7 +1414,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
1404 break; 1414 break;
1405 timeo = schedule_timeout(timeo); 1415 timeo = schedule_timeout(timeo);
1406 } 1416 }
1407 finish_wait(sk->sk_sleep, &wait); 1417 finish_wait(sk_sleep(sk), &wait);
1408 return timeo; 1418 return timeo;
1409} 1419}
1410 1420
@@ -1531,6 +1541,7 @@ static void __release_sock(struct sock *sk)
1531 do { 1541 do {
1532 struct sk_buff *next = skb->next; 1542 struct sk_buff *next = skb->next;
1533 1543
1544 WARN_ON_ONCE(skb_dst_is_noref(skb));
1534 skb->next = NULL; 1545 skb->next = NULL;
1535 sk_backlog_rcv(sk, skb); 1546 sk_backlog_rcv(sk, skb);
1536 1547
@@ -1570,11 +1581,11 @@ int sk_wait_data(struct sock *sk, long *timeo)
1570 int rc; 1581 int rc;
1571 DEFINE_WAIT(wait); 1582 DEFINE_WAIT(wait);
1572 1583
1573 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 1584 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1574 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); 1585 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1575 rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue)); 1586 rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1576 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); 1587 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1577 finish_wait(sk->sk_sleep, &wait); 1588 finish_wait(sk_sleep(sk), &wait);
1578 return rc; 1589 return rc;
1579} 1590}
1580EXPORT_SYMBOL(sk_wait_data); 1591EXPORT_SYMBOL(sk_wait_data);
@@ -1796,41 +1807,53 @@ EXPORT_SYMBOL(sock_no_sendpage);
1796 1807
1797static void sock_def_wakeup(struct sock *sk) 1808static void sock_def_wakeup(struct sock *sk)
1798{ 1809{
1799 read_lock(&sk->sk_callback_lock); 1810 struct socket_wq *wq;
1800 if (sk_has_sleeper(sk)) 1811
1801 wake_up_interruptible_all(sk->sk_sleep); 1812 rcu_read_lock();
1802 read_unlock(&sk->sk_callback_lock); 1813 wq = rcu_dereference(sk->sk_wq);
1814 if (wq_has_sleeper(wq))
1815 wake_up_interruptible_all(&wq->wait);
1816 rcu_read_unlock();
1803} 1817}
1804 1818
1805static void sock_def_error_report(struct sock *sk) 1819static void sock_def_error_report(struct sock *sk)
1806{ 1820{
1807 read_lock(&sk->sk_callback_lock); 1821 struct socket_wq *wq;
1808 if (sk_has_sleeper(sk)) 1822
1809 wake_up_interruptible_poll(sk->sk_sleep, POLLERR); 1823 rcu_read_lock();
1824 wq = rcu_dereference(sk->sk_wq);
1825 if (wq_has_sleeper(wq))
1826 wake_up_interruptible_poll(&wq->wait, POLLERR);
1810 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 1827 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1811 read_unlock(&sk->sk_callback_lock); 1828 rcu_read_unlock();
1812} 1829}
1813 1830
1814static void sock_def_readable(struct sock *sk, int len) 1831static void sock_def_readable(struct sock *sk, int len)
1815{ 1832{
1816 read_lock(&sk->sk_callback_lock); 1833 struct socket_wq *wq;
1817 if (sk_has_sleeper(sk)) 1834
1818 wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN | 1835 rcu_read_lock();
1836 wq = rcu_dereference(sk->sk_wq);
1837 if (wq_has_sleeper(wq))
1838 wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
1819 POLLRDNORM | POLLRDBAND); 1839 POLLRDNORM | POLLRDBAND);
1820 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 1840 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1821 read_unlock(&sk->sk_callback_lock); 1841 rcu_read_unlock();
1822} 1842}
1823 1843
1824static void sock_def_write_space(struct sock *sk) 1844static void sock_def_write_space(struct sock *sk)
1825{ 1845{
1826 read_lock(&sk->sk_callback_lock); 1846 struct socket_wq *wq;
1847
1848 rcu_read_lock();
1827 1849
1828 /* Do not wake up a writer until he can make "significant" 1850 /* Do not wake up a writer until he can make "significant"
1829 * progress. --DaveM 1851 * progress. --DaveM
1830 */ 1852 */
1831 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { 1853 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1832 if (sk_has_sleeper(sk)) 1854 wq = rcu_dereference(sk->sk_wq);
1833 wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT | 1855 if (wq_has_sleeper(wq))
1856 wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
1834 POLLWRNORM | POLLWRBAND); 1857 POLLWRNORM | POLLWRBAND);
1835 1858
1836 /* Should agree with poll, otherwise some programs break */ 1859 /* Should agree with poll, otherwise some programs break */
@@ -1838,7 +1861,7 @@ static void sock_def_write_space(struct sock *sk)
1838 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 1861 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1839 } 1862 }
1840 1863
1841 read_unlock(&sk->sk_callback_lock); 1864 rcu_read_unlock();
1842} 1865}
1843 1866
1844static void sock_def_destruct(struct sock *sk) 1867static void sock_def_destruct(struct sock *sk)
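All four sock_def_*() callbacks above now share one wakeup idiom: look up the socket's wait queue through the RCU-managed sk_wq pointer, test it with wq_has_sleeper(), and only then wake. A stripped-down sketch of that idiom (hypothetical function name):

    #include <net/sock.h>
    #include <linux/poll.h>

    static void demo_wake_readers(struct sock *sk)
    {
            struct socket_wq *wq;

            rcu_read_lock();
            wq = rcu_dereference(sk->sk_wq);
            if (wq_has_sleeper(wq))         /* avoids a useless wake_up */
                    wake_up_interruptible_poll(&wq->wait, POLLIN);
            rcu_read_unlock();
    }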
@@ -1885,7 +1908,6 @@ void sock_init_data(struct socket *sock, struct sock *sk)
1885 sk->sk_allocation = GFP_KERNEL; 1908 sk->sk_allocation = GFP_KERNEL;
1886 sk->sk_rcvbuf = sysctl_rmem_default; 1909 sk->sk_rcvbuf = sysctl_rmem_default;
1887 sk->sk_sndbuf = sysctl_wmem_default; 1910 sk->sk_sndbuf = sysctl_wmem_default;
1888 sk->sk_backlog.limit = sk->sk_rcvbuf << 1;
1889 sk->sk_state = TCP_CLOSE; 1911 sk->sk_state = TCP_CLOSE;
1890 sk_set_socket(sk, sock); 1912 sk_set_socket(sk, sock);
1891 1913
@@ -1893,12 +1915,12 @@ void sock_init_data(struct socket *sock, struct sock *sk)
1893 1915
1894 if (sock) { 1916 if (sock) {
1895 sk->sk_type = sock->type; 1917 sk->sk_type = sock->type;
1896 sk->sk_sleep = &sock->wait; 1918 sk->sk_wq = sock->wq;
1897 sock->sk = sk; 1919 sock->sk = sk;
1898 } else 1920 } else
1899 sk->sk_sleep = NULL; 1921 sk->sk_wq = NULL;
1900 1922
1901 rwlock_init(&sk->sk_dst_lock); 1923 spin_lock_init(&sk->sk_dst_lock);
1902 rwlock_init(&sk->sk_callback_lock); 1924 rwlock_init(&sk->sk_callback_lock);
1903 lockdep_set_class_and_name(&sk->sk_callback_lock, 1925 lockdep_set_class_and_name(&sk->sk_callback_lock,
1904 af_callback_keys + sk->sk_family, 1926 af_callback_keys + sk->sk_family,
diff --git a/net/core/stream.c b/net/core/stream.c
index a37debfeb1b2..cc196f42b8d8 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -28,15 +28,19 @@
28void sk_stream_write_space(struct sock *sk) 28void sk_stream_write_space(struct sock *sk)
29{ 29{
30 struct socket *sock = sk->sk_socket; 30 struct socket *sock = sk->sk_socket;
31 struct socket_wq *wq;
31 32
32 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) { 33 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) {
33 clear_bit(SOCK_NOSPACE, &sock->flags); 34 clear_bit(SOCK_NOSPACE, &sock->flags);
34 35
35 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 36 rcu_read_lock();
36 wake_up_interruptible_poll(sk->sk_sleep, POLLOUT | 37 wq = rcu_dereference(sk->sk_wq);
38 if (wq_has_sleeper(wq))
39 wake_up_interruptible_poll(&wq->wait, POLLOUT |
37 POLLWRNORM | POLLWRBAND); 40 POLLWRNORM | POLLWRBAND);
38 if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) 41 if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
39 sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT); 42 sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
43 rcu_read_unlock();
40 } 44 }
41} 45}
42 46
@@ -66,13 +70,13 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
66 if (signal_pending(tsk)) 70 if (signal_pending(tsk))
67 return sock_intr_errno(*timeo_p); 71 return sock_intr_errno(*timeo_p);
68 72
69 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 73 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
70 sk->sk_write_pending++; 74 sk->sk_write_pending++;
71 done = sk_wait_event(sk, timeo_p, 75 done = sk_wait_event(sk, timeo_p,
72 !sk->sk_err && 76 !sk->sk_err &&
73 !((1 << sk->sk_state) & 77 !((1 << sk->sk_state) &
74 ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); 78 ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
75 finish_wait(sk->sk_sleep, &wait); 79 finish_wait(sk_sleep(sk), &wait);
76 sk->sk_write_pending--; 80 sk->sk_write_pending--;
77 } while (!done); 81 } while (!done);
78 return 0; 82 return 0;
@@ -96,13 +100,13 @@ void sk_stream_wait_close(struct sock *sk, long timeout)
96 DEFINE_WAIT(wait); 100 DEFINE_WAIT(wait);
97 101
98 do { 102 do {
99 prepare_to_wait(sk->sk_sleep, &wait, 103 prepare_to_wait(sk_sleep(sk), &wait,
100 TASK_INTERRUPTIBLE); 104 TASK_INTERRUPTIBLE);
101 if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk))) 105 if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk)))
102 break; 106 break;
103 } while (!signal_pending(current) && timeout); 107 } while (!signal_pending(current) && timeout);
104 108
105 finish_wait(sk->sk_sleep, &wait); 109 finish_wait(sk_sleep(sk), &wait);
106 } 110 }
107} 111}
108 112
@@ -126,7 +130,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
126 while (1) { 130 while (1) {
127 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 131 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
128 132
129 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 133 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
130 134
131 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 135 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
132 goto do_error; 136 goto do_error;
@@ -157,7 +161,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
157 *timeo_p = current_timeo; 161 *timeo_p = current_timeo;
158 } 162 }
159out: 163out:
160 finish_wait(sk->sk_sleep, &wait); 164 finish_wait(sk_sleep(sk), &wait);
161 return err; 165 return err;
162 166
163do_error: 167do_error:
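The repeated sk->sk_sleep to sk_sleep(sk) substitutions in sock.c and stream.c lean on a helper defined in include/net/sock.h, which is outside this diff. Paraphrased, and simplified since the real helper fetches sk_wq with the appropriate RCU accessor, it is assumed to look roughly like:

    /* Assumption: simplified paraphrase of the sk_sleep() helper. */
    static inline wait_queue_head_t *sk_sleep(struct sock *sk)
    {
            return &sk->sk_wq->wait;
    }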
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index b7b6b8208f75..01eee5d984be 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -11,12 +11,72 @@
11#include <linux/socket.h> 11#include <linux/socket.h>
12#include <linux/netdevice.h> 12#include <linux/netdevice.h>
13#include <linux/ratelimit.h> 13#include <linux/ratelimit.h>
14#include <linux/vmalloc.h>
14#include <linux/init.h> 15#include <linux/init.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16 17
17#include <net/ip.h> 18#include <net/ip.h>
18#include <net/sock.h> 19#include <net/sock.h>
19 20
21#ifdef CONFIG_RPS
22static int rps_sock_flow_sysctl(ctl_table *table, int write,
23 void __user *buffer, size_t *lenp, loff_t *ppos)
24{
25 unsigned int orig_size, size;
26 int ret, i;
27 ctl_table tmp = {
28 .data = &size,
29 .maxlen = sizeof(size),
30 .mode = table->mode
31 };
32 struct rps_sock_flow_table *orig_sock_table, *sock_table;
33 static DEFINE_MUTEX(sock_flow_mutex);
34
35 mutex_lock(&sock_flow_mutex);
36
37 orig_sock_table = rps_sock_flow_table;
38 size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
39
40 ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
41
42 if (write) {
43 if (size) {
44 if (size > 1<<30) {
45 /* Enforce limit to prevent overflow */
46 mutex_unlock(&sock_flow_mutex);
47 return -EINVAL;
48 }
49 size = roundup_pow_of_two(size);
50 if (size != orig_size) {
51 sock_table =
52 vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
53 if (!sock_table) {
54 mutex_unlock(&sock_flow_mutex);
55 return -ENOMEM;
56 }
57
58 sock_table->mask = size - 1;
59 } else
60 sock_table = orig_sock_table;
61
62 for (i = 0; i < size; i++)
63 sock_table->ents[i] = RPS_NO_CPU;
64 } else
65 sock_table = NULL;
66
67 if (sock_table != orig_sock_table) {
68 rcu_assign_pointer(rps_sock_flow_table, sock_table);
69 synchronize_rcu();
70 vfree(orig_sock_table);
71 }
72 }
73
74 mutex_unlock(&sock_flow_mutex);
75
76 return ret;
77}
78#endif /* CONFIG_RPS */
79
20static struct ctl_table net_core_table[] = { 80static struct ctl_table net_core_table[] = {
21#ifdef CONFIG_NET 81#ifdef CONFIG_NET
22 { 82 {
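The rps_sock_flow_sysctl() handler added above rounds the requested entry count up to a power of two (writing 1000, for instance, yields 1024 entries with mask 1023) and then swaps the table in with the classic RCU publish-then-free sequence. That sequence in isolation, as a sketch with a hypothetical helper:

    #include <linux/netdevice.h>
    #include <linux/rcupdate.h>
    #include <linux/vmalloc.h>

    /* Sketch: publish a new flow table, wait out a grace period, then
     * free the table readers might still have been using. */
    static void rps_flow_table_replace(struct rps_sock_flow_table **slot,
                                       struct rps_sock_flow_table *new_table)
    {
            struct rps_sock_flow_table *old = *slot;

            rcu_assign_pointer(*slot, new_table);
            synchronize_rcu();      /* all readers of 'old' are done */
            vfree(old);             /* vfree(NULL) is a no-op */
    }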
@@ -62,6 +122,13 @@ static struct ctl_table net_core_table[] = {
62 .proc_handler = proc_dointvec 122 .proc_handler = proc_dointvec
63 }, 123 },
64 { 124 {
125 .procname = "netdev_tstamp_prequeue",
126 .data = &netdev_tstamp_prequeue,
127 .maxlen = sizeof(int),
128 .mode = 0644,
129 .proc_handler = proc_dointvec
130 },
131 {
65 .procname = "message_cost", 132 .procname = "message_cost",
66 .data = &net_ratelimit_state.interval, 133 .data = &net_ratelimit_state.interval,
67 .maxlen = sizeof(int), 134 .maxlen = sizeof(int),
@@ -82,6 +149,14 @@ static struct ctl_table net_core_table[] = {
82 .mode = 0644, 149 .mode = 0644,
83 .proc_handler = proc_dointvec 150 .proc_handler = proc_dointvec
84 }, 151 },
152#ifdef CONFIG_RPS
153 {
154 .procname = "rps_sock_flow_entries",
155 .maxlen = sizeof(int),
156 .mode = 0644,
157 .proc_handler = rps_sock_flow_sysctl
158 },
159#endif
85#endif /* CONFIG_NET */ 160#endif /* CONFIG_NET */
86 { 161 {
87 .procname = "netdev_budget", 162 .procname = "netdev_budget",