Diffstat (limited to 'net/core/dev.c')
-rw-r--r--  net/core/dev.c | 2115
1 file changed, 1139 insertions(+), 976 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index be9924f60ec3..660dd41aaaa6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -80,6 +80,7 @@
80#include <linux/types.h> 80#include <linux/types.h>
81#include <linux/kernel.h> 81#include <linux/kernel.h>
82#include <linux/hash.h> 82#include <linux/hash.h>
83#include <linux/slab.h>
83#include <linux/sched.h> 84#include <linux/sched.h>
84#include <linux/mutex.h> 85#include <linux/mutex.h>
85#include <linux/string.h> 86#include <linux/string.h>
@@ -100,8 +101,6 @@
100#include <linux/proc_fs.h> 101#include <linux/proc_fs.h>
101#include <linux/seq_file.h> 102#include <linux/seq_file.h>
102#include <linux/stat.h> 103#include <linux/stat.h>
103#include <linux/if_bridge.h>
104#include <linux/if_macvlan.h>
105#include <net/dst.h> 104#include <net/dst.h>
106#include <net/pkt_sched.h> 105#include <net/pkt_sched.h>
107#include <net/checksum.h> 106#include <net/checksum.h>
@@ -129,6 +128,7 @@
129#include <linux/jhash.h> 128#include <linux/jhash.h>
130#include <linux/random.h> 129#include <linux/random.h>
131#include <trace/events/napi.h> 130#include <trace/events/napi.h>
131#include <linux/pci.h>
132 132
133#include "net-sysfs.h" 133#include "net-sysfs.h"
134 134
@@ -206,6 +206,20 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; 206 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
207} 207}
208 208
209static inline void rps_lock(struct softnet_data *sd)
210{
211#ifdef CONFIG_RPS
212 spin_lock(&sd->input_pkt_queue.lock);
213#endif
214}
215
216static inline void rps_unlock(struct softnet_data *sd)
217{
218#ifdef CONFIG_RPS
219 spin_unlock(&sd->input_pkt_queue.lock);
220#endif
221}
222
209/* Device list insertion */ 223/* Device list insertion */
210static int list_netdevice(struct net_device *dev) 224static int list_netdevice(struct net_device *dev)
211{ 225{
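
The rps_lock()/rps_unlock() helpers added above only take the input_pkt_queue spinlock when CONFIG_RPS is set, so non-RPS builds keep the old lock-free backlog path. A minimal userspace sketch of the same compile-time-conditional locking pattern, with toy names, a pthread mutex standing in for the kernel spinlock, and USE_RPS standing in for CONFIG_RPS:

    #include <pthread.h>
    #include <stdio.h>

    /* Stand-in for struct softnet_data: a per-CPU queue with its own lock. */
    struct backlog {
        pthread_mutex_t lock;
        int qlen;
    };

    /* Only touch the lock when the feature that needs it is compiled in,
     * mirroring the #ifdef CONFIG_RPS guards in rps_lock()/rps_unlock(). */
    static void backlog_lock(struct backlog *b)
    {
    #ifdef USE_RPS
        pthread_mutex_lock(&b->lock);
    #endif
        (void)b;
    }

    static void backlog_unlock(struct backlog *b)
    {
    #ifdef USE_RPS
        pthread_mutex_unlock(&b->lock);
    #endif
        (void)b;
    }

    int main(void)
    {
        struct backlog b = { PTHREAD_MUTEX_INITIALIZER, 0 };

        backlog_lock(&b);
        b.qlen++;                      /* enqueue one packet */
        backlog_unlock(&b);

        printf("qlen=%d\n", b.qlen);
        return 0;
    }
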
@@ -248,7 +262,7 @@ static RAW_NOTIFIER_HEAD(netdev_chain);
248 * queue in the local softnet handler. 262 * queue in the local softnet handler.
249 */ 263 */
250 264
251DEFINE_PER_CPU(struct softnet_data, softnet_data); 265DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
252EXPORT_PER_CPU_SYMBOL(softnet_data); 266EXPORT_PER_CPU_SYMBOL(softnet_data);
253 267
254#ifdef CONFIG_LOCKDEP 268#ifdef CONFIG_LOCKDEP
@@ -772,47 +786,46 @@ EXPORT_SYMBOL(__dev_getfirstbyhwtype);
772 786
773struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) 787struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
774{ 788{
775 struct net_device *dev; 789 struct net_device *dev, *ret = NULL;
776 790
777 rtnl_lock(); 791 rcu_read_lock();
778 dev = __dev_getfirstbyhwtype(net, type); 792 for_each_netdev_rcu(net, dev)
779 if (dev) 793 if (dev->type == type) {
780 dev_hold(dev); 794 dev_hold(dev);
781 rtnl_unlock(); 795 ret = dev;
782 return dev; 796 break;
797 }
798 rcu_read_unlock();
799 return ret;
783} 800}
784EXPORT_SYMBOL(dev_getfirstbyhwtype); 801EXPORT_SYMBOL(dev_getfirstbyhwtype);
785 802
786/** 803/**
787 * dev_get_by_flags - find any device with given flags 804 * dev_get_by_flags_rcu - find any device with given flags
788 * @net: the applicable net namespace 805 * @net: the applicable net namespace
789 * @if_flags: IFF_* values 806 * @if_flags: IFF_* values
790 * @mask: bitmask of bits in if_flags to check 807 * @mask: bitmask of bits in if_flags to check
791 * 808 *
792 * Search for any interface with the given flags. Returns NULL if a device 809 * Search for any interface with the given flags. Returns NULL if a device
793 * is not found or a pointer to the device. The device returned has 810 * is not found or a pointer to the device. Must be called inside
794 * had a reference added and the pointer is safe until the user calls 811 * rcu_read_lock(), and result refcount is unchanged.
795 * dev_put to indicate they have finished with it.
796 */ 812 */
797 813
798struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags, 814struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
799 unsigned short mask) 815 unsigned short mask)
800{ 816{
801 struct net_device *dev, *ret; 817 struct net_device *dev, *ret;
802 818
803 ret = NULL; 819 ret = NULL;
804 rcu_read_lock();
805 for_each_netdev_rcu(net, dev) { 820 for_each_netdev_rcu(net, dev) {
806 if (((dev->flags ^ if_flags) & mask) == 0) { 821 if (((dev->flags ^ if_flags) & mask) == 0) {
807 dev_hold(dev);
808 ret = dev; 822 ret = dev;
809 break; 823 break;
810 } 824 }
811 } 825 }
812 rcu_read_unlock();
813 return ret; 826 return ret;
814} 827}
815EXPORT_SYMBOL(dev_get_by_flags); 828EXPORT_SYMBOL(dev_get_by_flags_rcu);
816 829
817/** 830/**
818 * dev_valid_name - check if name is okay for network device 831 * dev_valid_name - check if name is okay for network device
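
Note the contract change in the block above: dev_get_by_flags_rcu() no longer takes a device reference, so the caller must stay inside rcu_read_lock() for as long as it uses the returned pointer and must not call dev_put(). An illustrative caller, pieced together only from the kdoc above and not part of this patch:

    rcu_read_lock();
    dev = dev_get_by_flags_rcu(net, IFF_UP, IFF_UP);
    if (dev)
            pr_info("first UP device: %s\n", dev->name);
    /* no dev_put(): dev_get_by_flags_rcu() did not take a reference */
    rcu_read_unlock();
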
@@ -935,18 +948,22 @@ int dev_alloc_name(struct net_device *dev, const char *name)
935} 948}
936EXPORT_SYMBOL(dev_alloc_name); 949EXPORT_SYMBOL(dev_alloc_name);
937 950
938static int dev_get_valid_name(struct net *net, const char *name, char *buf, 951static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
939 bool fmt)
940{ 952{
953 struct net *net;
954
955 BUG_ON(!dev_net(dev));
956 net = dev_net(dev);
957
941 if (!dev_valid_name(name)) 958 if (!dev_valid_name(name))
942 return -EINVAL; 959 return -EINVAL;
943 960
944 if (fmt && strchr(name, '%')) 961 if (fmt && strchr(name, '%'))
945 return __dev_alloc_name(net, name, buf); 962 return dev_alloc_name(dev, name);
946 else if (__dev_get_by_name(net, name)) 963 else if (__dev_get_by_name(net, name))
947 return -EEXIST; 964 return -EEXIST;
948 else if (buf != name) 965 else if (dev->name != name)
949 strlcpy(buf, name, IFNAMSIZ); 966 strlcpy(dev->name, name, IFNAMSIZ);
950 967
951 return 0; 968 return 0;
952} 969}
@@ -978,20 +995,15 @@ int dev_change_name(struct net_device *dev, const char *newname)
978 995
979 memcpy(oldname, dev->name, IFNAMSIZ); 996 memcpy(oldname, dev->name, IFNAMSIZ);
980 997
981 err = dev_get_valid_name(net, newname, dev->name, 1); 998 err = dev_get_valid_name(dev, newname, 1);
982 if (err < 0) 999 if (err < 0)
983 return err; 1000 return err;
984 1001
985rollback: 1002rollback:
986 /* For now only devices in the initial network namespace 1003 ret = device_rename(&dev->dev, dev->name);
987 * are in sysfs. 1004 if (ret) {
988 */ 1005 memcpy(dev->name, oldname, IFNAMSIZ);
989 if (net_eq(net, &init_net)) { 1006 return ret;
990 ret = device_rename(&dev->dev, dev->name);
991 if (ret) {
992 memcpy(dev->name, oldname, IFNAMSIZ);
993 return ret;
994 }
995 } 1007 }
996 1008
997 write_lock_bh(&dev_base_lock); 1009 write_lock_bh(&dev_base_lock);
@@ -1084,9 +1096,9 @@ void netdev_state_change(struct net_device *dev)
1084} 1096}
1085EXPORT_SYMBOL(netdev_state_change); 1097EXPORT_SYMBOL(netdev_state_change);
1086 1098
1087void netdev_bonding_change(struct net_device *dev, unsigned long event) 1099int netdev_bonding_change(struct net_device *dev, unsigned long event)
1088{ 1100{
1089 call_netdevice_notifiers(event, dev); 1101 return call_netdevice_notifiers(event, dev);
1090} 1102}
1091EXPORT_SYMBOL(netdev_bonding_change); 1103EXPORT_SYMBOL(netdev_bonding_change);
1092 1104
@@ -1113,19 +1125,7 @@ void dev_load(struct net *net, const char *name)
1113} 1125}
1114EXPORT_SYMBOL(dev_load); 1126EXPORT_SYMBOL(dev_load);
1115 1127
1116/** 1128static int __dev_open(struct net_device *dev)
1117 * dev_open - prepare an interface for use.
1118 * @dev: device to open
1119 *
1120 * Takes a device from down to up state. The device's private open
1121 * function is invoked and then the multicast lists are loaded. Finally
1122 * the device is moved into the up state and a %NETDEV_UP message is
1123 * sent to the netdev notifier chain.
1124 *
1125 * Calling this function on an active interface is a nop. On a failure
1126 * a negative errno code is returned.
1127 */
1128int dev_open(struct net_device *dev)
1129{ 1129{
1130 const struct net_device_ops *ops = dev->netdev_ops; 1130 const struct net_device_ops *ops = dev->netdev_ops;
1131 int ret; 1131 int ret;
@@ -1133,13 +1133,6 @@ int dev_open(struct net_device *dev)
1133 ASSERT_RTNL(); 1133 ASSERT_RTNL();
1134 1134
1135 /* 1135 /*
1136 * Is it already up?
1137 */
1138
1139 if (dev->flags & IFF_UP)
1140 return 0;
1141
1142 /*
1143 * Is it even present? 1136 * Is it even present?
1144 */ 1137 */
1145 if (!netif_device_present(dev)) 1138 if (!netif_device_present(dev))
@@ -1187,36 +1180,57 @@ int dev_open(struct net_device *dev)
1187 * Wakeup transmit queue engine 1180 * Wakeup transmit queue engine
1188 */ 1181 */
1189 dev_activate(dev); 1182 dev_activate(dev);
1190
1191 /*
1192 * ... and announce new interface.
1193 */
1194 call_netdevice_notifiers(NETDEV_UP, dev);
1195 } 1183 }
1196 1184
1197 return ret; 1185 return ret;
1198} 1186}
1199EXPORT_SYMBOL(dev_open);
1200 1187
1201/** 1188/**
1202 * dev_close - shutdown an interface. 1189 * dev_open - prepare an interface for use.
1203 * @dev: device to shutdown 1190 * @dev: device to open
1204 * 1191 *
1205 * This function moves an active device into down state. A 1192 * Takes a device from down to up state. The device's private open
1206 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device 1193 * function is invoked and then the multicast lists are loaded. Finally
1207 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier 1194 * the device is moved into the up state and a %NETDEV_UP message is
1208 * chain. 1195 * sent to the netdev notifier chain.
1196 *
1197 * Calling this function on an active interface is a nop. On a failure
1198 * a negative errno code is returned.
1209 */ 1199 */
1210int dev_close(struct net_device *dev) 1200int dev_open(struct net_device *dev)
1201{
1202 int ret;
1203
1204 /*
1205 * Is it already up?
1206 */
1207 if (dev->flags & IFF_UP)
1208 return 0;
1209
1210 /*
1211 * Open device
1212 */
1213 ret = __dev_open(dev);
1214 if (ret < 0)
1215 return ret;
1216
1217 /*
1218 * ... and announce new interface.
1219 */
1220 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1221 call_netdevice_notifiers(NETDEV_UP, dev);
1222
1223 return ret;
1224}
1225EXPORT_SYMBOL(dev_open);
1226
1227static int __dev_close(struct net_device *dev)
1211{ 1228{
1212 const struct net_device_ops *ops = dev->netdev_ops; 1229 const struct net_device_ops *ops = dev->netdev_ops;
1213 ASSERT_RTNL();
1214 1230
1231 ASSERT_RTNL();
1215 might_sleep(); 1232 might_sleep();
1216 1233
1217 if (!(dev->flags & IFF_UP))
1218 return 0;
1219
1220 /* 1234 /*
1221 * Tell people we are going down, so that they can 1235 * Tell people we are going down, so that they can
1222 * prepare to death, when device is still operating. 1236 * prepare to death, when device is still operating.
@@ -1252,14 +1266,34 @@ int dev_close(struct net_device *dev)
1252 dev->flags &= ~IFF_UP; 1266 dev->flags &= ~IFF_UP;
1253 1267
1254 /* 1268 /*
1255 * Tell people we are down 1269 * Shutdown NET_DMA
1256 */ 1270 */
1257 call_netdevice_notifiers(NETDEV_DOWN, dev); 1271 net_dmaengine_put();
1272
1273 return 0;
1274}
1275
1276/**
1277 * dev_close - shutdown an interface.
1278 * @dev: device to shutdown
1279 *
1280 * This function moves an active device into down state. A
1281 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1282 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1283 * chain.
1284 */
1285int dev_close(struct net_device *dev)
1286{
1287 if (!(dev->flags & IFF_UP))
1288 return 0;
1289
1290 __dev_close(dev);
1258 1291
1259 /* 1292 /*
1260 * Shutdown NET_DMA 1293 * Tell people we are down
1261 */ 1294 */
1262 net_dmaengine_put(); 1295 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1296 call_netdevice_notifiers(NETDEV_DOWN, dev);
1263 1297
1264 return 0; 1298 return 0;
1265} 1299}
@@ -1394,6 +1428,7 @@ EXPORT_SYMBOL(unregister_netdevice_notifier);
1394 1428
1395int call_netdevice_notifiers(unsigned long val, struct net_device *dev) 1429int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1396{ 1430{
1431 ASSERT_RTNL();
1397 return raw_notifier_call_chain(&netdev_chain, val, dev); 1432 return raw_notifier_call_chain(&netdev_chain, val, dev);
1398} 1433}
1399 1434
@@ -1412,7 +1447,7 @@ void net_disable_timestamp(void)
1412} 1447}
1413EXPORT_SYMBOL(net_disable_timestamp); 1448EXPORT_SYMBOL(net_disable_timestamp);
1414 1449
1415static inline void net_timestamp(struct sk_buff *skb) 1450static inline void net_timestamp_set(struct sk_buff *skb)
1416{ 1451{
1417 if (atomic_read(&netstamp_needed)) 1452 if (atomic_read(&netstamp_needed))
1418 __net_timestamp(skb); 1453 __net_timestamp(skb);
@@ -1420,6 +1455,12 @@ static inline void net_timestamp(struct sk_buff *skb)
1420 skb->tstamp.tv64 = 0; 1455 skb->tstamp.tv64 = 0;
1421} 1456}
1422 1457
1458static inline void net_timestamp_check(struct sk_buff *skb)
1459{
1460 if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1461 __net_timestamp(skb);
1462}
1463
1423/** 1464/**
1424 * dev_forward_skb - loopback an skb to another netif 1465 * dev_forward_skb - loopback an skb to another netif
1425 * 1466 *
@@ -1428,7 +1469,7 @@ static inline void net_timestamp(struct sk_buff *skb)
1428 * 1469 *
1429 * return values: 1470 * return values:
1430 * NET_RX_SUCCESS (no congestion) 1471 * NET_RX_SUCCESS (no congestion)
1431 * NET_RX_DROP (packet was dropped) 1472 * NET_RX_DROP (packet was dropped, but freed)
1432 * 1473 *
1433 * dev_forward_skb can be used for injecting an skb from the 1474 * dev_forward_skb can be used for injecting an skb from the
1434 * start_xmit function of one device into the receive queue 1475 * start_xmit function of one device into the receive queue
@@ -1441,20 +1482,17 @@ static inline void net_timestamp(struct sk_buff *skb)
1441int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) 1482int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1442{ 1483{
1443 skb_orphan(skb); 1484 skb_orphan(skb);
1485 nf_reset(skb);
1444 1486
1445 if (!(dev->flags & IFF_UP)) 1487 if (!(dev->flags & IFF_UP) ||
1446 return NET_RX_DROP; 1488 (skb->len > (dev->mtu + dev->hard_header_len))) {
1447 1489 kfree_skb(skb);
1448 if (skb->len > (dev->mtu + dev->hard_header_len))
1449 return NET_RX_DROP; 1490 return NET_RX_DROP;
1450 1491 }
1451 skb_dst_drop(skb); 1492 skb_set_dev(skb, dev);
1452 skb->tstamp.tv64 = 0; 1493 skb->tstamp.tv64 = 0;
1453 skb->pkt_type = PACKET_HOST; 1494 skb->pkt_type = PACKET_HOST;
1454 skb->protocol = eth_type_trans(skb, dev); 1495 skb->protocol = eth_type_trans(skb, dev);
1455 skb->mark = 0;
1456 secpath_reset(skb);
1457 nf_reset(skb);
1458 return netif_rx(skb); 1496 return netif_rx(skb);
1459} 1497}
1460EXPORT_SYMBOL_GPL(dev_forward_skb); 1498EXPORT_SYMBOL_GPL(dev_forward_skb);
@@ -1470,9 +1508,9 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1470 1508
1471#ifdef CONFIG_NET_CLS_ACT 1509#ifdef CONFIG_NET_CLS_ACT
1472 if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS))) 1510 if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1473 net_timestamp(skb); 1511 net_timestamp_set(skb);
1474#else 1512#else
1475 net_timestamp(skb); 1513 net_timestamp_set(skb);
1476#endif 1514#endif
1477 1515
1478 rcu_read_lock(); 1516 rcu_read_lock();
@@ -1498,7 +1536,8 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1498 if (net_ratelimit()) 1536 if (net_ratelimit())
1499 printk(KERN_CRIT "protocol %04x is " 1537 printk(KERN_CRIT "protocol %04x is "
1500 "buggy, dev %s\n", 1538 "buggy, dev %s\n",
1501 skb2->protocol, dev->name); 1539 ntohs(skb2->protocol),
1540 dev->name);
1502 skb_reset_network_header(skb2); 1541 skb_reset_network_header(skb2);
1503 } 1542 }
1504 1543
@@ -1510,6 +1549,24 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1510 rcu_read_unlock(); 1549 rcu_read_unlock();
1511} 1550}
1512 1551
1552/*
1553 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1554 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1555 */
1556void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1557{
1558 unsigned int real_num = dev->real_num_tx_queues;
1559
1560 if (unlikely(txq > dev->num_tx_queues))
1561 ;
1562 else if (txq > real_num)
1563 dev->real_num_tx_queues = txq;
1564 else if (txq < real_num) {
1565 dev->real_num_tx_queues = txq;
1566 qdisc_reset_all_tx_gt(dev, txq);
1567 }
1568}
1569EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1513 1570
1514static inline void __netif_reschedule(struct Qdisc *q) 1571static inline void __netif_reschedule(struct Qdisc *q)
1515{ 1572{
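
netif_set_real_num_tx_queues(), added above, silently ignores out-of-range values, simply raises the limit when growing, and flushes the now-unreachable queues when shrinking. A runnable toy model of that shrink-and-flush rule; toy_dev, MAX_TXQ and the per-queue counters are all made up for illustration:

    #include <stdio.h>

    #define MAX_TXQ 8

    /* Toy model of a multiqueue device: MAX_TXQ allocated queues,
     * "real_num" of them currently active, each with queued packets. */
    struct toy_dev {
        unsigned int real_num;
        unsigned int qlen[MAX_TXQ];
    };

    /* Mirror of the logic in netif_set_real_num_tx_queues(): growing just
     * raises the limit, shrinking also flushes queues above the new limit
     * so no stale packets sit on queues the stack will never pick again. */
    static void set_real_num_txq(struct toy_dev *dev, unsigned int txq)
    {
        unsigned int i;

        if (txq > MAX_TXQ)
            return;                       /* out of range: ignore */
        if (txq >= dev->real_num) {
            dev->real_num = txq;          /* growing is trivial */
            return;
        }
        dev->real_num = txq;              /* shrinking: flush the rest */
        for (i = txq; i < MAX_TXQ; i++)
            dev->qlen[i] = 0;             /* ~ qdisc_reset_all_tx_gt() */
    }

    int main(void)
    {
        struct toy_dev dev = { .real_num = 4, .qlen = { 3, 1, 5, 2 } };

        set_real_num_txq(&dev, 2);
        printf("active=%u q2=%u q3=%u\n", dev.real_num, dev.qlen[2], dev.qlen[3]);
        return 0;
    }
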
@@ -1518,8 +1575,9 @@ static inline void __netif_reschedule(struct Qdisc *q)
1518 1575
1519 local_irq_save(flags); 1576 local_irq_save(flags);
1520 sd = &__get_cpu_var(softnet_data); 1577 sd = &__get_cpu_var(softnet_data);
1521 q->next_sched = sd->output_queue; 1578 q->next_sched = NULL;
1522 sd->output_queue = q; 1579 *sd->output_queue_tailp = q;
1580 sd->output_queue_tailp = &q->next_sched;
1523 raise_softirq_irqoff(NET_TX_SOFTIRQ); 1581 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1524 local_irq_restore(flags); 1582 local_irq_restore(flags);
1525} 1583}
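
__netif_reschedule() now appends the qdisc through softnet_data's new output_queue_tailp instead of pushing it onto the head, so net_tx_action() sees qdiscs in FIFO order and each append stays O(1). A small self-contained sketch of the head-plus-tail-pointer idiom:

    #include <stdio.h>
    #include <stddef.h>

    struct node {
        int id;
        struct node *next;
    };

    /* Queue kept as a head pointer plus a pointer to the last "next" slot,
     * like softnet_data.output_queue / output_queue_tailp. */
    struct queue {
        struct node *head;
        struct node **tailp;
    };

    static void queue_init(struct queue *q)
    {
        q->head = NULL;
        q->tailp = &q->head;
    }

    /* O(1) FIFO append: write through the tail slot, then advance it. */
    static void queue_append(struct queue *q, struct node *n)
    {
        n->next = NULL;
        *q->tailp = n;
        q->tailp = &n->next;
    }

    int main(void)
    {
        struct queue q;
        struct node a = { 1, NULL }, b = { 2, NULL }, c = { 3, NULL };
        struct node *n;

        queue_init(&q);
        queue_append(&q, &a);
        queue_append(&q, &b);
        queue_append(&q, &c);

        for (n = q.head; n; n = n->next)
            printf("%d ", n->id);   /* prints 1 2 3: FIFO order preserved */
        printf("\n");
        return 0;
    }
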
@@ -1614,6 +1672,36 @@ static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1614 return false; 1672 return false;
1615} 1673}
1616 1674
1675/**
1676 * skb_dev_set -- assign a new device to a buffer
1677 * @skb: buffer for the new device
1678 * @dev: network device
1679 *
1680 * If an skb is owned by a device already, we have to reset
1681 * all data private to the namespace a device belongs to
1682 * before assigning it a new device.
1683 */
1684#ifdef CONFIG_NET_NS
1685void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1686{
1687 skb_dst_drop(skb);
1688 if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1689 secpath_reset(skb);
1690 nf_reset(skb);
1691 skb_init_secmark(skb);
1692 skb->mark = 0;
1693 skb->priority = 0;
1694 skb->nf_trace = 0;
1695 skb->ipvs_property = 0;
1696#ifdef CONFIG_NET_SCHED
1697 skb->tc_index = 0;
1698#endif
1699 }
1700 skb->dev = dev;
1701}
1702EXPORT_SYMBOL(skb_set_dev);
1703#endif /* CONFIG_NET_NS */
1704
1617/* 1705/*
1618 * Invalidate hardware checksum when packet is to be mangled, and 1706 * Invalidate hardware checksum when packet is to be mangled, and
1619 * complete checksum manually on outgoing path. 1707 * complete checksum manually on outgoing path.
@@ -1734,18 +1822,27 @@ EXPORT_SYMBOL(netdev_rx_csum_fault);
1734 * 2. No high memory really exists on this machine. 1822 * 2. No high memory really exists on this machine.
1735 */ 1823 */
1736 1824
1737static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) 1825static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1738{ 1826{
1739#ifdef CONFIG_HIGHMEM 1827#ifdef CONFIG_HIGHMEM
1740 int i; 1828 int i;
1829 if (!(dev->features & NETIF_F_HIGHDMA)) {
1830 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1831 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1832 return 1;
1833 }
1741 1834
1742 if (dev->features & NETIF_F_HIGHDMA) 1835 if (PCI_DMA_BUS_IS_PHYS) {
1743 return 0; 1836 struct device *pdev = dev->dev.parent;
1744
1745 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1746 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1747 return 1;
1748 1837
1838 if (!pdev)
1839 return 0;
1840 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1841 dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1842 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1843 return 1;
1844 }
1845 }
1749#endif 1846#endif
1750 return 0; 1847 return 0;
1751} 1848}
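
The reworked illegal_highdma() gains a second reason to refuse a fragment: on PCI_DMA_BUS_IS_PHYS systems the page must also sit below the parent device's dma_mask. A standalone sketch of just that mask test, with made-up addresses and a 32-bit mask as example inputs:

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    #define TOY_PAGE_SIZE 4096ULL

    /* True if a page starting at phys_addr cannot be reached by a device
     * whose DMA mask is dma_mask: the same "addr + PAGE_SIZE - 1 > mask"
     * test the patch adds to illegal_highdma(). */
    static bool frag_exceeds_dma_mask(uint64_t phys_addr, uint64_t dma_mask)
    {
        return phys_addr + TOY_PAGE_SIZE - 1 > dma_mask;
    }

    int main(void)
    {
        uint64_t mask32 = 0xffffffffULL;   /* 32-bit DMA-only device */

        /* A page just below 4GB is fine, a page above 4GB is not. */
        printf("%d\n", frag_exceeds_dma_mask(0xfffff000ULL - TOY_PAGE_SIZE, mask32));
        printf("%d\n", frag_exceeds_dma_mask(0x100000000ULL, mask32));
        return 0;
    }
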
@@ -1803,6 +1900,41 @@ static int dev_gso_segment(struct sk_buff *skb)
1803 return 0; 1900 return 0;
1804} 1901}
1805 1902
1903/*
1904 * Try to orphan skb early, right before transmission by the device.
1905 * We cannot orphan skb if tx timestamp is requested, since
1906 * drivers need to call skb_tstamp_tx() to send the timestamp.
1907 */
1908static inline void skb_orphan_try(struct sk_buff *skb)
1909{
1910 struct sock *sk = skb->sk;
1911
1912 if (sk && !skb_tx(skb)->flags) {
1913 /* skb_tx_hash() wont be able to get sk.
1914 * We copy sk_hash into skb->rxhash
1915 */
1916 if (!skb->rxhash)
1917 skb->rxhash = sk->sk_hash;
1918 skb_orphan(skb);
1919 }
1920}
1921
1922/*
1923 * Returns true if either:
1924 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
1925 * 2. skb is fragmented and the device does not support SG, or if
1926 * at least one of fragments is in highmem and device does not
1927 * support DMA from it.
1928 */
1929static inline int skb_needs_linearize(struct sk_buff *skb,
1930 struct net_device *dev)
1931{
1932 return skb_is_nonlinear(skb) &&
1933 ((skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
1934 (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
1935 illegal_highdma(dev, skb))));
1936}
1937
1806int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, 1938int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1807 struct netdev_queue *txq) 1939 struct netdev_queue *txq)
1808{ 1940{
@@ -1813,13 +1945,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1813 if (!list_empty(&ptype_all)) 1945 if (!list_empty(&ptype_all))
1814 dev_queue_xmit_nit(skb, dev); 1946 dev_queue_xmit_nit(skb, dev);
1815 1947
1816 if (netif_needs_gso(dev, skb)) {
1817 if (unlikely(dev_gso_segment(skb)))
1818 goto out_kfree_skb;
1819 if (skb->next)
1820 goto gso;
1821 }
1822
1823 /* 1948 /*
1824 * If device doesnt need skb->dst, release it right now while 1949 * If device doesnt need skb->dst, release it right now while
1825 * its hot in this cpu cache 1950 * its hot in this cpu cache
@@ -1827,23 +1952,34 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1827 if (dev->priv_flags & IFF_XMIT_DST_RELEASE) 1952 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1828 skb_dst_drop(skb); 1953 skb_dst_drop(skb);
1829 1954
1955 skb_orphan_try(skb);
1956
1957 if (netif_needs_gso(dev, skb)) {
1958 if (unlikely(dev_gso_segment(skb)))
1959 goto out_kfree_skb;
1960 if (skb->next)
1961 goto gso;
1962 } else {
1963 if (skb_needs_linearize(skb, dev) &&
1964 __skb_linearize(skb))
1965 goto out_kfree_skb;
1966
1967 /* If packet is not checksummed and device does not
1968 * support checksumming for this protocol, complete
1969 * checksumming here.
1970 */
1971 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1972 skb_set_transport_header(skb, skb->csum_start -
1973 skb_headroom(skb));
1974 if (!dev_can_checksum(dev, skb) &&
1975 skb_checksum_help(skb))
1976 goto out_kfree_skb;
1977 }
1978 }
1979
1830 rc = ops->ndo_start_xmit(skb, dev); 1980 rc = ops->ndo_start_xmit(skb, dev);
1831 if (rc == NETDEV_TX_OK) 1981 if (rc == NETDEV_TX_OK)
1832 txq_trans_update(txq); 1982 txq_trans_update(txq);
1833 /*
1834 * TODO: if skb_orphan() was called by
1835 * dev->hard_start_xmit() (for example, the unmodified
1836 * igb driver does that; bnx2 doesn't), then
1837 * skb_tx_software_timestamp() will be unable to send
1838 * back the time stamp.
1839 *
1840 * How can this be prevented? Always create another
1841 * reference to the socket before calling
1842 * dev->hard_start_xmit()? Prevent that skb_orphan()
1843 * does anything in dev->hard_start_xmit() by clearing
1844 * the skb destructor before the call and restoring it
1845 * afterwards, then doing the skb_orphan() ourselves?
1846 */
1847 return rc; 1983 return rc;
1848 } 1984 }
1849 1985
@@ -1853,6 +1989,14 @@ gso:
1853 1989
1854 skb->next = nskb->next; 1990 skb->next = nskb->next;
1855 nskb->next = NULL; 1991 nskb->next = NULL;
1992
1993 /*
1994 * If device doesnt need nskb->dst, release it right now while
1995 * its hot in this cpu cache
1996 */
1997 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1998 skb_dst_drop(nskb);
1999
1856 rc = ops->ndo_start_xmit(nskb, dev); 2000 rc = ops->ndo_start_xmit(nskb, dev);
1857 if (unlikely(rc != NETDEV_TX_OK)) { 2001 if (unlikely(rc != NETDEV_TX_OK)) {
1858 if (rc & ~NETDEV_TX_MASK) 2002 if (rc & ~NETDEV_TX_MASK)
@@ -1874,7 +2018,7 @@ out_kfree_skb:
1874 return rc; 2018 return rc;
1875} 2019}
1876 2020
1877static u32 skb_tx_hashrnd; 2021static u32 hashrnd __read_mostly;
1878 2022
1879u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) 2023u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1880{ 2024{
@@ -1890,9 +2034,8 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1890 if (skb->sk && skb->sk->sk_hash) 2034 if (skb->sk && skb->sk->sk_hash)
1891 hash = skb->sk->sk_hash; 2035 hash = skb->sk->sk_hash;
1892 else 2036 else
1893 hash = skb->protocol; 2037 hash = (__force u16) skb->protocol ^ skb->rxhash;
1894 2038 hash = jhash_1word(hash, hashrnd);
1895 hash = jhash_1word(hash, skb_tx_hashrnd);
1896 2039
1897 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); 2040 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1898} 2041}
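
skb_tx_hash() now folds skb->rxhash (which skb_orphan_try() seeds from sk->sk_hash before the socket is detached) into the hash, and still spreads the result over real_num_tx_queues with a multiply-and-shift instead of a modulo. The mapping trick in isolation, with an arbitrary 32-bit mixer standing in for jhash_1word():

    #include <stdio.h>
    #include <stdint.h>

    /* Cheap stand-in for jhash_1word(): any decent 32-bit mixer works here. */
    static uint32_t mix32(uint32_t x)
    {
        x ^= x >> 16;
        x *= 0x7feb352dU;
        x ^= x >> 15;
        x *= 0x846ca68bU;
        x ^= x >> 16;
        return x;
    }

    /* Map a 32-bit hash uniformly onto [0, nqueues) without a divide,
     * exactly like "((u64)hash * real_num_tx_queues) >> 32". */
    static uint16_t pick_tx_queue(uint32_t hash, uint16_t nqueues)
    {
        return (uint16_t)(((uint64_t)mix32(hash) * nqueues) >> 32);
    }

    int main(void)
    {
        uint16_t nqueues = 8;
        uint32_t flow;

        for (flow = 1; flow <= 4; flow++)
            printf("flow %u -> txq %u\n", (unsigned)flow,
                   (unsigned)pick_tx_queue(flow, nqueues));
        return 0;
    }
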
@@ -1902,10 +2045,9 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
1902{ 2045{
1903 if (unlikely(queue_index >= dev->real_num_tx_queues)) { 2046 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
1904 if (net_ratelimit()) { 2047 if (net_ratelimit()) {
1905 WARN(1, "%s selects TX queue %d, but " 2048 pr_warning("%s selects TX queue %d, but "
1906 "real number of TX queues is %d\n", 2049 "real number of TX queues is %d\n",
1907 dev->name, queue_index, 2050 dev->name, queue_index, dev->real_num_tx_queues);
1908 dev->real_num_tx_queues);
1909 } 2051 }
1910 return 0; 2052 return 0;
1911 } 2053 }
@@ -1915,24 +2057,27 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
1915static struct netdev_queue *dev_pick_tx(struct net_device *dev, 2057static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1916 struct sk_buff *skb) 2058 struct sk_buff *skb)
1917{ 2059{
1918 u16 queue_index; 2060 int queue_index;
1919 struct sock *sk = skb->sk; 2061 const struct net_device_ops *ops = dev->netdev_ops;
1920 2062
1921 if (sk_tx_queue_recorded(sk)) { 2063 if (ops->ndo_select_queue) {
1922 queue_index = sk_tx_queue_get(sk); 2064 queue_index = ops->ndo_select_queue(dev, skb);
2065 queue_index = dev_cap_txqueue(dev, queue_index);
1923 } else { 2066 } else {
1924 const struct net_device_ops *ops = dev->netdev_ops; 2067 struct sock *sk = skb->sk;
2068 queue_index = sk_tx_queue_get(sk);
2069 if (queue_index < 0) {
1925 2070
1926 if (ops->ndo_select_queue) {
1927 queue_index = ops->ndo_select_queue(dev, skb);
1928 queue_index = dev_cap_txqueue(dev, queue_index);
1929 } else {
1930 queue_index = 0; 2071 queue_index = 0;
1931 if (dev->real_num_tx_queues > 1) 2072 if (dev->real_num_tx_queues > 1)
1932 queue_index = skb_tx_hash(dev, skb); 2073 queue_index = skb_tx_hash(dev, skb);
1933 2074
1934 if (sk && sk->sk_dst_cache) 2075 if (sk) {
1935 sk_tx_queue_set(sk, queue_index); 2076 struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
2077
2078 if (dst && skb_dst(skb) == dst)
2079 sk_tx_queue_set(sk, queue_index);
2080 }
1936 } 2081 }
1937 } 2082 }
1938 2083
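
dev_pick_tx() now tries the driver's ndo_select_queue() first (clamped by dev_cap_txqueue()), then the queue index cached on the socket, and only then falls back to skb_tx_hash(); the cached index is also only written back when the socket's dst still matches the skb's. A toy of the selection precedence only, with made-up types and the dst check left out:

    #include <stdio.h>

    struct toy_skb { unsigned int hash; int sk_queue; };   /* sk_queue < 0: not cached */

    struct toy_dev {
        unsigned int real_num_tx_queues;
        int (*select_queue)(const struct toy_skb *skb);    /* ~ ndo_select_queue */
    };

    static unsigned int cap_txqueue(const struct toy_dev *dev, unsigned int q)
    {
        return q < dev->real_num_tx_queues ? q : 0;        /* out of range: queue 0 */
    }

    static unsigned int pick_tx(const struct toy_dev *dev, const struct toy_skb *skb)
    {
        if (dev->select_queue)                             /* 1. driver knows best */
            return cap_txqueue(dev, (unsigned int)dev->select_queue(skb));
        if (skb->sk_queue >= 0)                            /* 2. cached on the socket */
            return (unsigned int)skb->sk_queue;
        /* 3. hash the flow over the active queues (~ skb_tx_hash()) */
        return skb->hash % dev->real_num_tx_queues;
    }

    static int always_queue_3(const struct toy_skb *skb) { (void)skb; return 3; }

    int main(void)
    {
        struct toy_dev plain = { 8, NULL };
        struct toy_dev smart = { 8, always_queue_3 };
        struct toy_skb skb = { 0xdeadbeefu, -1 };

        printf("driver choice: %u\n", pick_tx(&smart, &skb));
        printf("hash fallback: %u\n", pick_tx(&plain, &skb));
        skb.sk_queue = 5;
        printf("socket cache:  %u\n", pick_tx(&plain, &skb));
        return 0;
    }
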
@@ -1945,32 +2090,56 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
1945 struct netdev_queue *txq) 2090 struct netdev_queue *txq)
1946{ 2091{
1947 spinlock_t *root_lock = qdisc_lock(q); 2092 spinlock_t *root_lock = qdisc_lock(q);
2093 bool contended = qdisc_is_running(q);
1948 int rc; 2094 int rc;
1949 2095
2096 /*
2097 * Heuristic to force contended enqueues to serialize on a
2098 * separate lock before trying to get qdisc main lock.
2099 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2100 * and dequeue packets faster.
2101 */
2102 if (unlikely(contended))
2103 spin_lock(&q->busylock);
2104
1950 spin_lock(root_lock); 2105 spin_lock(root_lock);
1951 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { 2106 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1952 kfree_skb(skb); 2107 kfree_skb(skb);
1953 rc = NET_XMIT_DROP; 2108 rc = NET_XMIT_DROP;
1954 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && 2109 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
1955 !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) { 2110 qdisc_run_begin(q)) {
1956 /* 2111 /*
1957 * This is a work-conserving queue; there are no old skbs 2112 * This is a work-conserving queue; there are no old skbs
1958 * waiting to be sent out; and the qdisc is not running - 2113 * waiting to be sent out; and the qdisc is not running -
1959 * xmit the skb directly. 2114 * xmit the skb directly.
1960 */ 2115 */
2116 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2117 skb_dst_force(skb);
1961 __qdisc_update_bstats(q, skb->len); 2118 __qdisc_update_bstats(q, skb->len);
1962 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) 2119 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2120 if (unlikely(contended)) {
2121 spin_unlock(&q->busylock);
2122 contended = false;
2123 }
1963 __qdisc_run(q); 2124 __qdisc_run(q);
1964 else 2125 } else
1965 clear_bit(__QDISC_STATE_RUNNING, &q->state); 2126 qdisc_run_end(q);
1966 2127
1967 rc = NET_XMIT_SUCCESS; 2128 rc = NET_XMIT_SUCCESS;
1968 } else { 2129 } else {
2130 skb_dst_force(skb);
1969 rc = qdisc_enqueue_root(skb, q); 2131 rc = qdisc_enqueue_root(skb, q);
1970 qdisc_run(q); 2132 if (qdisc_run_begin(q)) {
2133 if (unlikely(contended)) {
2134 spin_unlock(&q->busylock);
2135 contended = false;
2136 }
2137 __qdisc_run(q);
2138 }
1971 } 2139 }
1972 spin_unlock(root_lock); 2140 spin_unlock(root_lock);
1973 2141 if (unlikely(contended))
2142 spin_unlock(&q->busylock);
1974 return rc; 2143 return rc;
1975} 2144}
1976 2145
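
The busylock heuristic in __dev_xmit_skb() above makes enqueuers that observe the qdisc already running serialize on a secondary lock first, so the CPU that owns __QDISC_STATE_RUNNING reacquires the root lock against at most one competitor. A compact sketch of the locking shape, assuming toy names and a C11 atomic flag in place of qdisc_is_running():

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Toy qdisc: a main lock protecting the queue, a flag saying whether
     * some CPU is currently dequeueing, and a secondary "busylock" that
     * contended enqueuers serialize on first. */
    struct toy_qdisc {
        pthread_mutex_t lock;          /* ~ qdisc root lock */
        pthread_mutex_t busylock;      /* extra lock for contended enqueues */
        atomic_bool running;           /* ~ __QDISC_STATE_RUNNING */
        int qlen;
    };

    static void toy_enqueue(struct toy_qdisc *q)
    {
        bool contended = atomic_load(&q->running);   /* heuristic, just a hint */

        if (contended)
            pthread_mutex_lock(&q->busylock);

        pthread_mutex_lock(&q->lock);
        q->qlen++;                                   /* enqueue one packet */
        pthread_mutex_unlock(&q->lock);

        if (contended)
            pthread_mutex_unlock(&q->busylock);
    }

    int main(void)
    {
        struct toy_qdisc q = {
            PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, false, 0
        };

        toy_enqueue(&q);                        /* uncontended: root lock only */
        atomic_store(&q.running, true);         /* pretend another CPU is dequeueing */
        toy_enqueue(&q);                        /* contended: busylock first */
        atomic_store(&q.running, false);

        printf("qlen=%d\n", q.qlen);
        return 0;
    }
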
@@ -2006,42 +2175,13 @@ int dev_queue_xmit(struct sk_buff *skb)
2006 struct Qdisc *q; 2175 struct Qdisc *q;
2007 int rc = -ENOMEM; 2176 int rc = -ENOMEM;
2008 2177
2009 /* GSO will handle the following emulations directly. */
2010 if (netif_needs_gso(dev, skb))
2011 goto gso;
2012
2013 if (skb_has_frags(skb) &&
2014 !(dev->features & NETIF_F_FRAGLIST) &&
2015 __skb_linearize(skb))
2016 goto out_kfree_skb;
2017
2018 /* Fragmented skb is linearized if device does not support SG,
2019 * or if at least one of fragments is in highmem and device
2020 * does not support DMA from it.
2021 */
2022 if (skb_shinfo(skb)->nr_frags &&
2023 (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
2024 __skb_linearize(skb))
2025 goto out_kfree_skb;
2026
2027 /* If packet is not checksummed and device does not support
2028 * checksumming for this protocol, complete checksumming here.
2029 */
2030 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2031 skb_set_transport_header(skb, skb->csum_start -
2032 skb_headroom(skb));
2033 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
2034 goto out_kfree_skb;
2035 }
2036
2037gso:
2038 /* Disable soft irqs for various locks below. Also 2178 /* Disable soft irqs for various locks below. Also
2039 * stops preemption for RCU. 2179 * stops preemption for RCU.
2040 */ 2180 */
2041 rcu_read_lock_bh(); 2181 rcu_read_lock_bh();
2042 2182
2043 txq = dev_pick_tx(dev, skb); 2183 txq = dev_pick_tx(dev, skb);
2044 q = rcu_dereference(txq->qdisc); 2184 q = rcu_dereference_bh(txq->qdisc);
2045 2185
2046#ifdef CONFIG_NET_CLS_ACT 2186#ifdef CONFIG_NET_CLS_ACT
2047 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); 2187 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
@@ -2093,7 +2233,6 @@ gso:
2093 rc = -ENETDOWN; 2233 rc = -ENETDOWN;
2094 rcu_read_unlock_bh(); 2234 rcu_read_unlock_bh();
2095 2235
2096out_kfree_skb:
2097 kfree_skb(skb); 2236 kfree_skb(skb);
2098 return rc; 2237 return rc;
2099out: 2238out:
@@ -2108,11 +2247,244 @@ EXPORT_SYMBOL(dev_queue_xmit);
2108 =======================================================================*/ 2247 =======================================================================*/
2109 2248
2110int netdev_max_backlog __read_mostly = 1000; 2249int netdev_max_backlog __read_mostly = 1000;
2250int netdev_tstamp_prequeue __read_mostly = 1;
2111int netdev_budget __read_mostly = 300; 2251int netdev_budget __read_mostly = 300;
2112int weight_p __read_mostly = 64; /* old backlog weight */ 2252int weight_p __read_mostly = 64; /* old backlog weight */
2113 2253
2114DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; 2254/* Called with irq disabled */
2255static inline void ____napi_schedule(struct softnet_data *sd,
2256 struct napi_struct *napi)
2257{
2258 list_add_tail(&napi->poll_list, &sd->poll_list);
2259 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2260}
2261
2262#ifdef CONFIG_RPS
2263
2264/* One global table that all flow-based protocols share. */
2265struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
2266EXPORT_SYMBOL(rps_sock_flow_table);
2267
2268/*
2269 * get_rps_cpu is called from netif_receive_skb and returns the target
2270 * CPU from the RPS map of the receiving queue for a given skb.
2271 * rcu_read_lock must be held on entry.
2272 */
2273static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2274 struct rps_dev_flow **rflowp)
2275{
2276 struct ipv6hdr *ip6;
2277 struct iphdr *ip;
2278 struct netdev_rx_queue *rxqueue;
2279 struct rps_map *map;
2280 struct rps_dev_flow_table *flow_table;
2281 struct rps_sock_flow_table *sock_flow_table;
2282 int cpu = -1;
2283 u8 ip_proto;
2284 u16 tcpu;
2285 u32 addr1, addr2, ihl;
2286 union {
2287 u32 v32;
2288 u16 v16[2];
2289 } ports;
2290
2291 if (skb_rx_queue_recorded(skb)) {
2292 u16 index = skb_get_rx_queue(skb);
2293 if (unlikely(index >= dev->num_rx_queues)) {
2294 WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
2295 "on queue %u, but number of RX queues is %u\n",
2296 dev->name, index, dev->num_rx_queues);
2297 goto done;
2298 }
2299 rxqueue = dev->_rx + index;
2300 } else
2301 rxqueue = dev->_rx;
2302
2303 if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
2304 goto done;
2305
2306 if (skb->rxhash)
2307 goto got_hash; /* Skip hash computation on packet header */
2308
2309 switch (skb->protocol) {
2310 case __constant_htons(ETH_P_IP):
2311 if (!pskb_may_pull(skb, sizeof(*ip)))
2312 goto done;
2313
2314 ip = (struct iphdr *) skb->data;
2315 ip_proto = ip->protocol;
2316 addr1 = (__force u32) ip->saddr;
2317 addr2 = (__force u32) ip->daddr;
2318 ihl = ip->ihl;
2319 break;
2320 case __constant_htons(ETH_P_IPV6):
2321 if (!pskb_may_pull(skb, sizeof(*ip6)))
2322 goto done;
2323
2324 ip6 = (struct ipv6hdr *) skb->data;
2325 ip_proto = ip6->nexthdr;
2326 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2327 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2328 ihl = (40 >> 2);
2329 break;
2330 default:
2331 goto done;
2332 }
2333 switch (ip_proto) {
2334 case IPPROTO_TCP:
2335 case IPPROTO_UDP:
2336 case IPPROTO_DCCP:
2337 case IPPROTO_ESP:
2338 case IPPROTO_AH:
2339 case IPPROTO_SCTP:
2340 case IPPROTO_UDPLITE:
2341 if (pskb_may_pull(skb, (ihl * 4) + 4)) {
2342 ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
2343 if (ports.v16[1] < ports.v16[0])
2344 swap(ports.v16[0], ports.v16[1]);
2345 break;
2346 }
2347 default:
2348 ports.v32 = 0;
2349 break;
2350 }
2351
2352 /* get a consistent hash (same value on both flow directions) */
2353 if (addr2 < addr1)
2354 swap(addr1, addr2);
2355 skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2356 if (!skb->rxhash)
2357 skb->rxhash = 1;
2358
2359got_hash:
2360 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2361 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2362 if (flow_table && sock_flow_table) {
2363 u16 next_cpu;
2364 struct rps_dev_flow *rflow;
2365
2366 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2367 tcpu = rflow->cpu;
2368
2369 next_cpu = sock_flow_table->ents[skb->rxhash &
2370 sock_flow_table->mask];
2371
2372 /*
2373 * If the desired CPU (where last recvmsg was done) is
2374 * different from current CPU (one in the rx-queue flow
2375 * table entry), switch if one of the following holds:
2376 * - Current CPU is unset (equal to RPS_NO_CPU).
2377 * - Current CPU is offline.
2378 * - The current CPU's queue tail has advanced beyond the
2379 * last packet that was enqueued using this table entry.
2380 * This guarantees that all previous packets for the flow
2381 * have been dequeued, thus preserving in order delivery.
2382 */
2383 if (unlikely(tcpu != next_cpu) &&
2384 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2385 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2386 rflow->last_qtail)) >= 0)) {
2387 tcpu = rflow->cpu = next_cpu;
2388 if (tcpu != RPS_NO_CPU)
2389 rflow->last_qtail = per_cpu(softnet_data,
2390 tcpu).input_queue_head;
2391 }
2392 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2393 *rflowp = rflow;
2394 cpu = tcpu;
2395 goto done;
2396 }
2397 }
2115 2398
2399 map = rcu_dereference(rxqueue->rps_map);
2400 if (map) {
2401 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2402
2403 if (cpu_online(tcpu)) {
2404 cpu = tcpu;
2405 goto done;
2406 }
2407 }
2408
2409done:
2410 return cpu;
2411}
2412
2413/* Called from hardirq (IPI) context */
2414static void rps_trigger_softirq(void *data)
2415{
2416 struct softnet_data *sd = data;
2417
2418 ____napi_schedule(sd, &sd->backlog);
2419 sd->received_rps++;
2420}
2421
2422#endif /* CONFIG_RPS */
2423
2424/*
2425 * Check if this softnet_data structure is another cpu one
2426 * If yes, queue it to our IPI list and return 1
2427 * If no, return 0
2428 */
2429static int rps_ipi_queued(struct softnet_data *sd)
2430{
2431#ifdef CONFIG_RPS
2432 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2433
2434 if (sd != mysd) {
2435 sd->rps_ipi_next = mysd->rps_ipi_list;
2436 mysd->rps_ipi_list = sd;
2437
2438 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2439 return 1;
2440 }
2441#endif /* CONFIG_RPS */
2442 return 0;
2443}
2444
2445/*
2446 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2447 * queue (may be a remote CPU queue).
2448 */
2449static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2450 unsigned int *qtail)
2451{
2452 struct softnet_data *sd;
2453 unsigned long flags;
2454
2455 sd = &per_cpu(softnet_data, cpu);
2456
2457 local_irq_save(flags);
2458
2459 rps_lock(sd);
2460 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2461 if (skb_queue_len(&sd->input_pkt_queue)) {
2462enqueue:
2463 __skb_queue_tail(&sd->input_pkt_queue, skb);
2464 input_queue_tail_incr_save(sd, qtail);
2465 rps_unlock(sd);
2466 local_irq_restore(flags);
2467 return NET_RX_SUCCESS;
2468 }
2469
2470 /* Schedule NAPI for backlog device
2471 * We can use non atomic operation since we own the queue lock
2472 */
2473 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2474 if (!rps_ipi_queued(sd))
2475 ____napi_schedule(sd, &sd->backlog);
2476 }
2477 goto enqueue;
2478 }
2479
2480 sd->dropped++;
2481 rps_unlock(sd);
2482
2483 local_irq_restore(flags);
2484
2485 kfree_skb(skb);
2486 return NET_RX_DROP;
2487}
2116 2488
2117/** 2489/**
2118 * netif_rx - post buffer to the network code 2490 * netif_rx - post buffer to the network code
@@ -2131,41 +2503,40 @@ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
2131 2503
2132int netif_rx(struct sk_buff *skb) 2504int netif_rx(struct sk_buff *skb)
2133{ 2505{
2134 struct softnet_data *queue; 2506 int ret;
2135 unsigned long flags;
2136 2507
2137 /* if netpoll wants it, pretend we never saw it */ 2508 /* if netpoll wants it, pretend we never saw it */
2138 if (netpoll_rx(skb)) 2509 if (netpoll_rx(skb))
2139 return NET_RX_DROP; 2510 return NET_RX_DROP;
2140 2511
2141 if (!skb->tstamp.tv64) 2512 if (netdev_tstamp_prequeue)
2142 net_timestamp(skb); 2513 net_timestamp_check(skb);
2143 2514
2144 /* 2515#ifdef CONFIG_RPS
2145 * The code is rearranged so that the path is the most 2516 {
2146 * short when CPU is congested, but is still operating. 2517 struct rps_dev_flow voidflow, *rflow = &voidflow;
2147 */ 2518 int cpu;
2148 local_irq_save(flags);
2149 queue = &__get_cpu_var(softnet_data);
2150 2519
2151 __get_cpu_var(netdev_rx_stat).total++; 2520 preempt_disable();
2152 if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { 2521 rcu_read_lock();
2153 if (queue->input_pkt_queue.qlen) {
2154enqueue:
2155 __skb_queue_tail(&queue->input_pkt_queue, skb);
2156 local_irq_restore(flags);
2157 return NET_RX_SUCCESS;
2158 }
2159 2522
2160 napi_schedule(&queue->backlog); 2523 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2161 goto enqueue; 2524 if (cpu < 0)
2162 } 2525 cpu = smp_processor_id();
2163 2526
2164 __get_cpu_var(netdev_rx_stat).dropped++; 2527 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2165 local_irq_restore(flags);
2166 2528
2167 kfree_skb(skb); 2529 rcu_read_unlock();
2168 return NET_RX_DROP; 2530 preempt_enable();
2531 }
2532#else
2533 {
2534 unsigned int qtail;
2535 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2536 put_cpu();
2537 }
2538#endif
2539 return ret;
2169} 2540}
2170EXPORT_SYMBOL(netif_rx); 2541EXPORT_SYMBOL(netif_rx);
2171 2542
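
get_rps_cpu() builds skb->rxhash as a direction-independent flow key: the two addresses and the two ports are each put in a canonical order before hashing, so both halves of a connection land on the same CPU, and a zero hash is bumped to 1 because zero means "not computed yet". A runnable sketch of that canonicalisation, with a trivial mixer standing in for jhash_3words():

    #include <stdio.h>
    #include <stdint.h>

    /* Order the two addresses and the two ports before hashing, as
     * get_rps_cpu() does, so A:p1 -> B:p2 and B:p2 -> A:p1 produce the
     * same flow key. */
    static uint32_t flow_key(uint32_t saddr, uint32_t daddr,
                             uint16_t sport, uint16_t dport)
    {
        uint32_t a1 = saddr, a2 = daddr;
        uint16_t p1 = sport, p2 = dport;
        uint32_t tmp, ports, hash;

        if (p2 < p1) { uint16_t t = p1; p1 = p2; p2 = t; }
        if (a2 < a1) { tmp = a1; a1 = a2; a2 = tmp; }

        ports = ((uint32_t)p1 << 16) | p2;
        hash = a1 ^ (a2 * 0x9e3779b1U) ^ (ports * 0x85ebca6bU);
        return hash ? hash : 1;        /* 0 is reserved for "no hash yet" */
    }

    int main(void)
    {
        uint32_t h1 = flow_key(0x0a000001, 0x0a000002, 12345, 80);
        uint32_t h2 = flow_key(0x0a000002, 0x0a000001, 80, 12345);

        printf("forward=%08x reverse=%08x same=%d\n",
               (unsigned)h1, (unsigned)h2, h1 == h2);
        return 0;
    }
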
@@ -2210,6 +2581,7 @@ static void net_tx_action(struct softirq_action *h)
2210 local_irq_disable(); 2581 local_irq_disable();
2211 head = sd->output_queue; 2582 head = sd->output_queue;
2212 sd->output_queue = NULL; 2583 sd->output_queue = NULL;
2584 sd->output_queue_tailp = &sd->output_queue;
2213 local_irq_enable(); 2585 local_irq_enable();
2214 2586
2215 while (head) { 2587 while (head) {
@@ -2247,66 +2619,14 @@ static inline int deliver_skb(struct sk_buff *skb,
2247 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 2619 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2248} 2620}
2249 2621
2250#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) 2622#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2251 2623 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2252#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
2253/* This hook is defined here for ATM LANE */ 2624/* This hook is defined here for ATM LANE */
2254int (*br_fdb_test_addr_hook)(struct net_device *dev, 2625int (*br_fdb_test_addr_hook)(struct net_device *dev,
2255 unsigned char *addr) __read_mostly; 2626 unsigned char *addr) __read_mostly;
2256EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); 2627EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2257#endif 2628#endif
2258 2629
2259/*
2260 * If bridge module is loaded call bridging hook.
2261 * returns NULL if packet was consumed.
2262 */
2263struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2264 struct sk_buff *skb) __read_mostly;
2265EXPORT_SYMBOL_GPL(br_handle_frame_hook);
2266
2267static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2268 struct packet_type **pt_prev, int *ret,
2269 struct net_device *orig_dev)
2270{
2271 struct net_bridge_port *port;
2272
2273 if (skb->pkt_type == PACKET_LOOPBACK ||
2274 (port = rcu_dereference(skb->dev->br_port)) == NULL)
2275 return skb;
2276
2277 if (*pt_prev) {
2278 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2279 *pt_prev = NULL;
2280 }
2281
2282 return br_handle_frame_hook(port, skb);
2283}
2284#else
2285#define handle_bridge(skb, pt_prev, ret, orig_dev) (skb)
2286#endif
2287
2288#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2289struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2290EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2291
2292static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2293 struct packet_type **pt_prev,
2294 int *ret,
2295 struct net_device *orig_dev)
2296{
2297 if (skb->dev->macvlan_port == NULL)
2298 return skb;
2299
2300 if (*pt_prev) {
2301 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2302 *pt_prev = NULL;
2303 }
2304 return macvlan_handle_frame_hook(skb);
2305}
2306#else
2307#define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb)
2308#endif
2309
2310#ifdef CONFIG_NET_CLS_ACT 2630#ifdef CONFIG_NET_CLS_ACT
2311/* TODO: Maybe we should just force sch_ingress to be compiled in 2631/* TODO: Maybe we should just force sch_ingress to be compiled in
2312 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions 2632 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
@@ -2324,10 +2644,10 @@ static int ing_filter(struct sk_buff *skb)
2324 int result = TC_ACT_OK; 2644 int result = TC_ACT_OK;
2325 struct Qdisc *q; 2645 struct Qdisc *q;
2326 2646
2327 if (MAX_RED_LOOP < ttl++) { 2647 if (unlikely(MAX_RED_LOOP < ttl++)) {
2328 printk(KERN_WARNING 2648 if (net_ratelimit())
2329 "Redir loop detected Dropping packet (%d->%d)\n", 2649 pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
2330 skb->skb_iif, dev->ifindex); 2650 skb->skb_iif, dev->ifindex);
2331 return TC_ACT_SHOT; 2651 return TC_ACT_SHOT;
2332 } 2652 }
2333 2653
@@ -2357,9 +2677,6 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2357 if (*pt_prev) { 2677 if (*pt_prev) {
2358 *ret = deliver_skb(skb, *pt_prev, orig_dev); 2678 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2359 *pt_prev = NULL; 2679 *pt_prev = NULL;
2360 } else {
2361 /* Huh? Why does turning on AF_PACKET affect this? */
2362 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2363 } 2680 }
2364 2681
2365 switch (ing_filter(skb)) { 2682 switch (ing_filter(skb)) {
@@ -2403,30 +2720,113 @@ void netif_nit_deliver(struct sk_buff *skb)
2403} 2720}
2404 2721
2405/** 2722/**
2406 * netif_receive_skb - process receive buffer from network 2723 * netdev_rx_handler_register - register receive handler
2407 * @skb: buffer to process 2724 * @dev: device to register a handler for
2725 * @rx_handler: receive handler to register
2726 * @rx_handler_data: data pointer that is used by rx handler
2408 * 2727 *
2409 * netif_receive_skb() is the main receive data processing function. 2728 * Register a receive hander for a device. This handler will then be
2410 * It always succeeds. The buffer may be dropped during processing 2729 * called from __netif_receive_skb. A negative errno code is returned
2411 * for congestion control or by the protocol layers. 2730 * on a failure.
2412 * 2731 *
2413 * This function may only be called from softirq context and interrupts 2732 * The caller must hold the rtnl_mutex.
2414 * should be enabled. 2733 */
2734int netdev_rx_handler_register(struct net_device *dev,
2735 rx_handler_func_t *rx_handler,
2736 void *rx_handler_data)
2737{
2738 ASSERT_RTNL();
2739
2740 if (dev->rx_handler)
2741 return -EBUSY;
2742
2743 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
2744 rcu_assign_pointer(dev->rx_handler, rx_handler);
2745
2746 return 0;
2747}
2748EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
2749
2750/**
2751 * netdev_rx_handler_unregister - unregister receive handler
2752 * @dev: device to unregister a handler from
2415 * 2753 *
2416 * Return values (usually ignored): 2754 * Unregister a receive hander from a device.
2417 * NET_RX_SUCCESS: no congestion 2755 *
2418 * NET_RX_DROP: packet was dropped 2756 * The caller must hold the rtnl_mutex.
2419 */ 2757 */
2420int netif_receive_skb(struct sk_buff *skb) 2758void netdev_rx_handler_unregister(struct net_device *dev)
2759{
2760
2761 ASSERT_RTNL();
2762 rcu_assign_pointer(dev->rx_handler, NULL);
2763 rcu_assign_pointer(dev->rx_handler_data, NULL);
2764}
2765EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2766
2767static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2768 struct net_device *master)
2769{
2770 if (skb->pkt_type == PACKET_HOST) {
2771 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2772
2773 memcpy(dest, master->dev_addr, ETH_ALEN);
2774 }
2775}
2776
2777/* On bonding slaves other than the currently active slave, suppress
2778 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2779 * ARP on active-backup slaves with arp_validate enabled.
2780 */
2781int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2782{
2783 struct net_device *dev = skb->dev;
2784
2785 if (master->priv_flags & IFF_MASTER_ARPMON)
2786 dev->last_rx = jiffies;
2787
2788 if ((master->priv_flags & IFF_MASTER_ALB) &&
2789 (master->priv_flags & IFF_BRIDGE_PORT)) {
2790 /* Do address unmangle. The local destination address
2791 * will be always the one master has. Provides the right
2792 * functionality in a bridge.
2793 */
2794 skb_bond_set_mac_by_master(skb, master);
2795 }
2796
2797 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2798 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2799 skb->protocol == __cpu_to_be16(ETH_P_ARP))
2800 return 0;
2801
2802 if (master->priv_flags & IFF_MASTER_ALB) {
2803 if (skb->pkt_type != PACKET_BROADCAST &&
2804 skb->pkt_type != PACKET_MULTICAST)
2805 return 0;
2806 }
2807 if (master->priv_flags & IFF_MASTER_8023AD &&
2808 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2809 return 0;
2810
2811 return 1;
2812 }
2813 return 0;
2814}
2815EXPORT_SYMBOL(__skb_bond_should_drop);
2816
2817static int __netif_receive_skb(struct sk_buff *skb)
2421{ 2818{
2422 struct packet_type *ptype, *pt_prev; 2819 struct packet_type *ptype, *pt_prev;
2820 rx_handler_func_t *rx_handler;
2423 struct net_device *orig_dev; 2821 struct net_device *orig_dev;
2822 struct net_device *master;
2424 struct net_device *null_or_orig; 2823 struct net_device *null_or_orig;
2824 struct net_device *orig_or_bond;
2425 int ret = NET_RX_DROP; 2825 int ret = NET_RX_DROP;
2426 __be16 type; 2826 __be16 type;
2427 2827
2428 if (!skb->tstamp.tv64) 2828 if (!netdev_tstamp_prequeue)
2429 net_timestamp(skb); 2829 net_timestamp_check(skb);
2430 2830
2431 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) 2831 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
2432 return NET_RX_SUCCESS; 2832 return NET_RX_SUCCESS;
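
netdev_rx_handler_register()/unregister() above replace the hard-coded bridge and macvlan hooks with one per-device handler slot: registration fails with -EBUSY if a handler is already attached, and __netif_receive_skb() stops processing when the handler returns NULL (packet consumed). A userspace toy of that single-slot contract; unlike the kernel handler, which looks up dev->rx_handler_data itself, the toy passes the data pointer explicitly:

    #include <stdio.h>
    #include <stddef.h>
    #include <errno.h>

    struct pkt { int len; };
    typedef struct pkt *(*rx_handler_fn)(struct pkt *pkt, void *data);

    /* One handler slot per device, like dev->rx_handler/rx_handler_data. */
    struct toy_netdev {
        rx_handler_fn rx_handler;
        void *rx_handler_data;
    };

    static int rx_handler_register(struct toy_netdev *dev,
                                   rx_handler_fn fn, void *data)
    {
        if (dev->rx_handler)
            return -EBUSY;             /* something is already attached */
        dev->rx_handler_data = data;
        dev->rx_handler = fn;
        return 0;
    }

    static void rx_handler_unregister(struct toy_netdev *dev)
    {
        dev->rx_handler = NULL;
        dev->rx_handler_data = NULL;
    }

    /* A handler that consumes short packets (returns NULL) and hands the
     * rest back to the caller, the contract __netif_receive_skb relies on. */
    static struct pkt *drop_short(struct pkt *pkt, void *data)
    {
        int min_len = *(int *)data;

        return pkt->len < min_len ? NULL : pkt;
    }

    int main(void)
    {
        struct toy_netdev dev = { NULL, NULL };
        int min_len = 60;
        struct pkt small = { 20 }, big = { 1500 };

        printf("first register:  %d\n", rx_handler_register(&dev, drop_short, &min_len));
        printf("second register: %d\n", rx_handler_register(&dev, drop_short, &min_len));
        printf("small consumed:  %d\n", dev.rx_handler(&small, dev.rx_handler_data) == NULL);
        printf("big passed on:   %d\n", dev.rx_handler(&big, dev.rx_handler_data) != NULL);
        rx_handler_unregister(&dev);
        return 0;
    }
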
@@ -2438,17 +2838,28 @@ int netif_receive_skb(struct sk_buff *skb)
2438 if (!skb->skb_iif) 2838 if (!skb->skb_iif)
2439 skb->skb_iif = skb->dev->ifindex; 2839 skb->skb_iif = skb->dev->ifindex;
2440 2840
2841 /*
2842 * bonding note: skbs received on inactive slaves should only
2843 * be delivered to pkt handlers that are exact matches. Also
2844 * the deliver_no_wcard flag will be set. If packet handlers
2845 * are sensitive to duplicate packets these skbs will need to
2846 * be dropped at the handler. The vlan accel path may have
2847 * already set the deliver_no_wcard flag.
2848 */
2441 null_or_orig = NULL; 2849 null_or_orig = NULL;
2442 orig_dev = skb->dev; 2850 orig_dev = skb->dev;
2443 if (orig_dev->master) { 2851 master = ACCESS_ONCE(orig_dev->master);
2444 if (skb_bond_should_drop(skb)) 2852 if (skb->deliver_no_wcard)
2853 null_or_orig = orig_dev;
2854 else if (master) {
2855 if (skb_bond_should_drop(skb, master)) {
2856 skb->deliver_no_wcard = 1;
2445 null_or_orig = orig_dev; /* deliver only exact match */ 2857 null_or_orig = orig_dev; /* deliver only exact match */
2446 else 2858 } else
2447 skb->dev = orig_dev->master; 2859 skb->dev = master;
2448 } 2860 }
2449 2861
2450 __get_cpu_var(netdev_rx_stat).total++; 2862 __this_cpu_inc(softnet_data.processed);
2451
2452 skb_reset_network_header(skb); 2863 skb_reset_network_header(skb);
2453 skb_reset_transport_header(skb); 2864 skb_reset_transport_header(skb);
2454 skb->mac_len = skb->network_header - skb->mac_header; 2865 skb->mac_len = skb->network_header - skb->mac_header;
@@ -2480,19 +2891,36 @@ int netif_receive_skb(struct sk_buff *skb)
2480ncls: 2891ncls:
2481#endif 2892#endif
2482 2893
2483 skb = handle_bridge(skb, &pt_prev, &ret, orig_dev); 2894 /* Handle special case of bridge or macvlan */
2484 if (!skb) 2895 rx_handler = rcu_dereference(skb->dev->rx_handler);
2485 goto out; 2896 if (rx_handler) {
2486 skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev); 2897 if (pt_prev) {
2487 if (!skb) 2898 ret = deliver_skb(skb, pt_prev, orig_dev);
2488 goto out; 2899 pt_prev = NULL;
2900 }
2901 skb = rx_handler(skb);
2902 if (!skb)
2903 goto out;
2904 }
2905
2906 /*
2907 * Make sure frames received on VLAN interfaces stacked on
2908 * bonding interfaces still make their way to any base bonding
2909 * device that may have registered for a specific ptype. The
2910 * handler may have to adjust skb->dev and orig_dev.
2911 */
2912 orig_or_bond = orig_dev;
2913 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2914 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
2915 orig_or_bond = vlan_dev_real_dev(skb->dev);
2916 }
2489 2917
2490 type = skb->protocol; 2918 type = skb->protocol;
2491 list_for_each_entry_rcu(ptype, 2919 list_for_each_entry_rcu(ptype,
2492 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { 2920 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2493 if (ptype->type == type && 2921 if (ptype->type == type && (ptype->dev == null_or_orig ||
2494 (ptype->dev == null_or_orig || ptype->dev == skb->dev || 2922 ptype->dev == skb->dev || ptype->dev == orig_dev ||
2495 ptype->dev == orig_dev)) { 2923 ptype->dev == orig_or_bond)) {
2496 if (pt_prev) 2924 if (pt_prev)
2497 ret = deliver_skb(skb, pt_prev, orig_dev); 2925 ret = deliver_skb(skb, pt_prev, orig_dev);
2498 pt_prev = ptype; 2926 pt_prev = ptype;
@@ -2513,20 +2941,81 @@ out:
2513 rcu_read_unlock(); 2941 rcu_read_unlock();
2514 return ret; 2942 return ret;
2515} 2943}
2944
2945/**
2946 * netif_receive_skb - process receive buffer from network
2947 * @skb: buffer to process
2948 *
2949 * netif_receive_skb() is the main receive data processing function.
2950 * It always succeeds. The buffer may be dropped during processing
2951 * for congestion control or by the protocol layers.
2952 *
2953 * This function may only be called from softirq context and interrupts
2954 * should be enabled.
2955 *
2956 * Return values (usually ignored):
2957 * NET_RX_SUCCESS: no congestion
2958 * NET_RX_DROP: packet was dropped
2959 */
2960int netif_receive_skb(struct sk_buff *skb)
2961{
2962 if (netdev_tstamp_prequeue)
2963 net_timestamp_check(skb);
2964
2965 if (skb_defer_rx_timestamp(skb))
2966 return NET_RX_SUCCESS;
2967
2968#ifdef CONFIG_RPS
2969 {
2970 struct rps_dev_flow voidflow, *rflow = &voidflow;
2971 int cpu, ret;
2972
2973 rcu_read_lock();
2974
2975 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2976
2977 if (cpu >= 0) {
2978 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2979 rcu_read_unlock();
2980 } else {
2981 rcu_read_unlock();
2982 ret = __netif_receive_skb(skb);
2983 }
2984
2985 return ret;
2986 }
2987#else
2988 return __netif_receive_skb(skb);
2989#endif
2990}
2516EXPORT_SYMBOL(netif_receive_skb); 2991EXPORT_SYMBOL(netif_receive_skb);
2517 2992
2518/* Network device is going away, flush any packets still pending */ 2993/* Network device is going away, flush any packets still pending
2994 * Called with irqs disabled.
2995 */
2519static void flush_backlog(void *arg) 2996static void flush_backlog(void *arg)
2520{ 2997{
2521 struct net_device *dev = arg; 2998 struct net_device *dev = arg;
2522 struct softnet_data *queue = &__get_cpu_var(softnet_data); 2999 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2523 struct sk_buff *skb, *tmp; 3000 struct sk_buff *skb, *tmp;
2524 3001
2525 skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp) 3002 rps_lock(sd);
3003 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3004 if (skb->dev == dev) {
3005 __skb_unlink(skb, &sd->input_pkt_queue);
3006 kfree_skb(skb);
3007 input_queue_head_incr(sd);
3008 }
3009 }
3010 rps_unlock(sd);
3011
3012 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
2526 if (skb->dev == dev) { 3013 if (skb->dev == dev) {
2527 __skb_unlink(skb, &queue->input_pkt_queue); 3014 __skb_unlink(skb, &sd->process_queue);
2528 kfree_skb(skb); 3015 kfree_skb(skb);
3016 input_queue_head_incr(sd);
2529 } 3017 }
3018 }
2530} 3019}
2531 3020
2532static int napi_gro_complete(struct sk_buff *skb) 3021static int napi_gro_complete(struct sk_buff *skb)
@@ -2561,7 +3050,7 @@ out:
2561 return netif_receive_skb(skb); 3050 return netif_receive_skb(skb);
2562} 3051}
2563 3052
2564void napi_gro_flush(struct napi_struct *napi) 3053static void napi_gro_flush(struct napi_struct *napi)
2565{ 3054{
2566 struct sk_buff *skb, *next; 3055 struct sk_buff *skb, *next;
2567 3056
@@ -2574,7 +3063,6 @@ void napi_gro_flush(struct napi_struct *napi)
2574 napi->gro_count = 0; 3063 napi->gro_count = 0;
2575 napi->gro_list = NULL; 3064 napi->gro_list = NULL;
2576} 3065}
2577EXPORT_SYMBOL(napi_gro_flush);
2578 3066
2579enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 3067enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2580{ 3068{
@@ -2586,7 +3074,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2586 int mac_len; 3074 int mac_len;
2587 enum gro_result ret; 3075 enum gro_result ret;
2588 3076
2589 if (!(skb->dev->features & NETIF_F_GRO)) 3077 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
2590 goto normal; 3078 goto normal;
2591 3079
2592 if (skb_is_gso(skb) || skb_has_frags(skb)) 3080 if (skb_is_gso(skb) || skb_has_frags(skb))
@@ -2655,7 +3143,7 @@ pull:
2655 put_page(skb_shinfo(skb)->frags[0].page); 3143 put_page(skb_shinfo(skb)->frags[0].page);
2656 memmove(skb_shinfo(skb)->frags, 3144 memmove(skb_shinfo(skb)->frags,
2657 skb_shinfo(skb)->frags + 1, 3145 skb_shinfo(skb)->frags + 1,
2658 --skb_shinfo(skb)->nr_frags); 3146 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
2659 } 3147 }
2660 } 3148 }
2661 3149
@@ -2673,9 +3161,6 @@ __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2673{ 3161{
2674 struct sk_buff *p; 3162 struct sk_buff *p;
2675 3163
2676 if (netpoll_rx_on(skb))
2677 return GRO_NORMAL;
2678
2679 for (p = napi->gro_list; p; p = p->next) { 3164 for (p = napi->gro_list; p; p = p->next) {
2680 NAPI_GRO_CB(p)->same_flow = 3165 NAPI_GRO_CB(p)->same_flow =
2681 (p->dev == skb->dev) && 3166 (p->dev == skb->dev) &&
@@ -2761,7 +3246,7 @@ gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
2761 switch (ret) { 3246 switch (ret) {
2762 case GRO_NORMAL: 3247 case GRO_NORMAL:
2763 case GRO_HELD: 3248 case GRO_HELD:
2764 skb->protocol = eth_type_trans(skb, napi->dev); 3249 skb->protocol = eth_type_trans(skb, skb->dev);
2765 3250
2766 if (ret == GRO_HELD) 3251 if (ret == GRO_HELD)
2767 skb_gro_pull(skb, -ETH_HLEN); 3252 skb_gro_pull(skb, -ETH_HLEN);
@@ -2830,27 +3315,87 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)
2830} 3315}
2831EXPORT_SYMBOL(napi_gro_frags); 3316EXPORT_SYMBOL(napi_gro_frags);
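[Editor's note] The frags variant above is meant to be fed by drivers that receive directly into pages. A hypothetical RX helper; the my_* naming and the exact skb accounting are illustrative only:

static void my_rx_page(struct napi_struct *napi, struct page *page,
		       unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (!skb) {
		put_page(page);			/* drop on allocation failure */
		return;
	}
	skb_fill_page_desc(skb, 0, page, 0, len);
	skb->len += len;
	skb->data_len += len;
	skb->truesize += len;

	napi_gro_frags(napi);			/* GRO decides held/normal/merged */
}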
2832 3317
3318/*
3319 * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
3320 * Note: called with local irq disabled, but exits with local irq enabled.
3321 */
3322static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3323{
3324#ifdef CONFIG_RPS
3325 struct softnet_data *remsd = sd->rps_ipi_list;
3326
3327 if (remsd) {
3328 sd->rps_ipi_list = NULL;
3329
3330 local_irq_enable();
3331
3332 /* Send pending IPIs to kick RPS processing on remote CPUs. */
3333 while (remsd) {
3334 struct softnet_data *next = remsd->rps_ipi_next;
3335
3336 if (cpu_online(remsd->cpu))
3337 __smp_call_function_single(remsd->cpu,
3338 &remsd->csd, 0);
3339 remsd = next;
3340 }
3341 } else
3342#endif
3343 local_irq_enable();
3344}
3345
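[Editor's note] On the receiving end of those IPIs, the csd callback (wired up as rps_trigger_softirq in net_dev_init further down) only needs to schedule that CPU's backlog NAPI so NET_RX_SOFTIRQ runs there. A rough sketch using the field and helper names this patch introduces; an approximation, not the exact body:

static void rps_trigger_softirq_sketch(void *data)
{
	struct softnet_data *sd = data;

	____napi_schedule(sd, &sd->backlog);	/* raises NET_RX_SOFTIRQ locally */
	sd->received_rps++;			/* reported via softnet_seq_show() */
}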
2833static int process_backlog(struct napi_struct *napi, int quota) 3346static int process_backlog(struct napi_struct *napi, int quota)
2834{ 3347{
2835 int work = 0; 3348 int work = 0;
2836 struct softnet_data *queue = &__get_cpu_var(softnet_data); 3349 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
2837 unsigned long start_time = jiffies;
2838 3350
3351#ifdef CONFIG_RPS
3352 /* Check if we have pending IPIs; it is better to send them now,
3353 * rather than waiting for net_rx_action() to end.
3354 */
3355 if (sd->rps_ipi_list) {
3356 local_irq_disable();
3357 net_rps_action_and_irq_enable(sd);
3358 }
3359#endif
2839 napi->weight = weight_p; 3360 napi->weight = weight_p;
2840 do { 3361 local_irq_disable();
3362 while (work < quota) {
2841 struct sk_buff *skb; 3363 struct sk_buff *skb;
3364 unsigned int qlen;
2842 3365
2843 local_irq_disable(); 3366 while ((skb = __skb_dequeue(&sd->process_queue))) {
2844 skb = __skb_dequeue(&queue->input_pkt_queue);
2845 if (!skb) {
2846 __napi_complete(napi);
2847 local_irq_enable(); 3367 local_irq_enable();
2848 break; 3368 __netif_receive_skb(skb);
3369 local_irq_disable();
3370 input_queue_head_incr(sd);
3371 if (++work >= quota) {
3372 local_irq_enable();
3373 return work;
3374 }
2849 } 3375 }
2850 local_irq_enable();
2851 3376
2852 netif_receive_skb(skb); 3377 rps_lock(sd);
2853 } while (++work < quota && jiffies == start_time); 3378 qlen = skb_queue_len(&sd->input_pkt_queue);
3379 if (qlen)
3380 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3381 &sd->process_queue);
3382
3383 if (qlen < quota - work) {
3384 /*
3385 * Inline a custom version of __napi_complete().
3386 * Only the current CPU owns and manipulates this NAPI,
3387 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3388 * We can use a plain write instead of clear_bit(),
3389 * and we don't need an smp_mb() memory barrier.
3390 */
3391 list_del(&napi->poll_list);
3392 napi->state = 0;
3393
3394 quota = work + qlen;
3395 }
3396 rps_unlock(sd);
3397 }
3398 local_irq_enable();
2854 3399
2855 return work; 3400 return work;
2856} 3401}
@@ -2866,8 +3411,7 @@ void __napi_schedule(struct napi_struct *n)
2866 unsigned long flags; 3411 unsigned long flags;
2867 3412
2868 local_irq_save(flags); 3413 local_irq_save(flags);
2869 list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list); 3414 ____napi_schedule(&__get_cpu_var(softnet_data), n);
2870 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2871 local_irq_restore(flags); 3415 local_irq_restore(flags);
2872} 3416}
2873EXPORT_SYMBOL(__napi_schedule); 3417EXPORT_SYMBOL(__napi_schedule);
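[Editor's note] Drivers normally reach __napi_schedule() through napi_schedule() from their interrupt handler. A hypothetical handler, with my_* names standing in for real hardware helpers:

static irqreturn_t my_rx_interrupt(int irq, void *dev_id)
{
	struct my_priv *priv = dev_id;

	my_hw_disable_rx_irq(priv);	/* hypothetical: mask further RX interrupts */
	napi_schedule(&priv->napi);	/* ends up in __napi_schedule() above */
	return IRQ_HANDLED;
}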
@@ -2938,17 +3482,16 @@ void netif_napi_del(struct napi_struct *napi)
2938} 3482}
2939EXPORT_SYMBOL(netif_napi_del); 3483EXPORT_SYMBOL(netif_napi_del);
2940 3484
2941
2942static void net_rx_action(struct softirq_action *h) 3485static void net_rx_action(struct softirq_action *h)
2943{ 3486{
2944 struct list_head *list = &__get_cpu_var(softnet_data).poll_list; 3487 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2945 unsigned long time_limit = jiffies + 2; 3488 unsigned long time_limit = jiffies + 2;
2946 int budget = netdev_budget; 3489 int budget = netdev_budget;
2947 void *have; 3490 void *have;
2948 3491
2949 local_irq_disable(); 3492 local_irq_disable();
2950 3493
2951 while (!list_empty(list)) { 3494 while (!list_empty(&sd->poll_list)) {
2952 struct napi_struct *n; 3495 struct napi_struct *n;
2953 int work, weight; 3496 int work, weight;
2954 3497
@@ -2966,7 +3509,7 @@ static void net_rx_action(struct softirq_action *h)
2966 * entries to the tail of this list, and only ->poll() 3509 * entries to the tail of this list, and only ->poll()
2967 * calls can remove this head entry from the list. 3510 * calls can remove this head entry from the list.
2968 */ 3511 */
2969 n = list_entry(list->next, struct napi_struct, poll_list); 3512 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
2970 3513
2971 have = netpoll_poll_lock(n); 3514 have = netpoll_poll_lock(n);
2972 3515
@@ -3001,13 +3544,13 @@ static void net_rx_action(struct softirq_action *h)
3001 napi_complete(n); 3544 napi_complete(n);
3002 local_irq_disable(); 3545 local_irq_disable();
3003 } else 3546 } else
3004 list_move_tail(&n->poll_list, list); 3547 list_move_tail(&n->poll_list, &sd->poll_list);
3005 } 3548 }
3006 3549
3007 netpoll_poll_unlock(have); 3550 netpoll_poll_unlock(have);
3008 } 3551 }
3009out: 3552out:
3010 local_irq_enable(); 3553 net_rps_action_and_irq_enable(sd);
3011 3554
3012#ifdef CONFIG_NET_DMA 3555#ifdef CONFIG_NET_DMA
3013 /* 3556 /*
@@ -3020,7 +3563,7 @@ out:
3020 return; 3563 return;
3021 3564
3022softnet_break: 3565softnet_break:
3023 __get_cpu_var(netdev_rx_stat).time_squeeze++; 3566 sd->time_squeeze++;
3024 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3567 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3025 goto out; 3568 goto out;
3026} 3569}
@@ -3183,10 +3726,11 @@ void dev_seq_stop(struct seq_file *seq, void *v)
3183 3726
3184static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) 3727static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3185{ 3728{
3186 const struct net_device_stats *stats = dev_get_stats(dev); 3729 struct rtnl_link_stats64 temp;
3730 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
3187 3731
3188 seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " 3732 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3189 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", 3733 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3190 dev->name, stats->rx_bytes, stats->rx_packets, 3734 dev->name, stats->rx_bytes, stats->rx_packets,
3191 stats->rx_errors, 3735 stats->rx_errors,
3192 stats->rx_dropped + stats->rx_missed_errors, 3736 stats->rx_dropped + stats->rx_missed_errors,
@@ -3221,17 +3765,17 @@ static int dev_seq_show(struct seq_file *seq, void *v)
3221 return 0; 3765 return 0;
3222} 3766}
3223 3767
3224static struct netif_rx_stats *softnet_get_online(loff_t *pos) 3768static struct softnet_data *softnet_get_online(loff_t *pos)
3225{ 3769{
3226 struct netif_rx_stats *rc = NULL; 3770 struct softnet_data *sd = NULL;
3227 3771
3228 while (*pos < nr_cpu_ids) 3772 while (*pos < nr_cpu_ids)
3229 if (cpu_online(*pos)) { 3773 if (cpu_online(*pos)) {
3230 rc = &per_cpu(netdev_rx_stat, *pos); 3774 sd = &per_cpu(softnet_data, *pos);
3231 break; 3775 break;
3232 } else 3776 } else
3233 ++*pos; 3777 ++*pos;
3234 return rc; 3778 return sd;
3235} 3779}
3236 3780
3237static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) 3781static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
@@ -3251,12 +3795,12 @@ static void softnet_seq_stop(struct seq_file *seq, void *v)
3251 3795
3252static int softnet_seq_show(struct seq_file *seq, void *v) 3796static int softnet_seq_show(struct seq_file *seq, void *v)
3253{ 3797{
3254 struct netif_rx_stats *s = v; 3798 struct softnet_data *sd = v;
3255 3799
3256 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", 3800 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3257 s->total, s->dropped, s->time_squeeze, 0, 3801 sd->processed, sd->dropped, sd->time_squeeze, 0,
3258 0, 0, 0, 0, /* was fastroute */ 3802 0, 0, 0, 0, /* was fastroute */
3259 s->cpu_collision); 3803 sd->cpu_collision, sd->received_rps);
3260 return 0; 3804 return 0;
3261} 3805}
3262 3806
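[Editor's note] The format above is what /proc/net/softnet_stat exposes: one row per online CPU, ten hex columns in the order processed, dropped, time_squeeze, five unused fields, cpu_collision, received_rps. A small user-space reader, written against that assumption:

#include <stdio.h>

int main(void)
{
	unsigned int processed, dropped, squeezed, unused[5], collision, rps;
	char line[256];
	FILE *f = fopen("/proc/net/softnet_stat", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "%x %x %x %x %x %x %x %x %x %x",
			   &processed, &dropped, &squeezed,
			   &unused[0], &unused[1], &unused[2], &unused[3],
			   &unused[4], &collision, &rps) == 10)
			printf("processed=%u dropped=%u squeezed=%u rps=%u\n",
			       processed, dropped, squeezed, rps);
	}
	fclose(f);
	return 0;
}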
@@ -3479,11 +4023,10 @@ int netdev_set_master(struct net_device *slave, struct net_device *master)
3479 4023
3480 slave->master = master; 4024 slave->master = master;
3481 4025
3482 synchronize_net(); 4026 if (old) {
3483 4027 synchronize_net();
3484 if (old)
3485 dev_put(old); 4028 dev_put(old);
3486 4029 }
3487 if (master) 4030 if (master)
3488 slave->flags |= IFF_SLAVE; 4031 slave->flags |= IFF_SLAVE;
3489 else 4032 else
@@ -3640,10 +4183,10 @@ void __dev_set_rx_mode(struct net_device *dev)
3640 /* Unicast addresses changes may only happen under the rtnl, 4183 /* Unicast addresses changes may only happen under the rtnl,
3641 * therefore calling __dev_set_promiscuity here is safe. 4184 * therefore calling __dev_set_promiscuity here is safe.
3642 */ 4185 */
3643 if (dev->uc.count > 0 && !dev->uc_promisc) { 4186 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
3644 __dev_set_promiscuity(dev, 1); 4187 __dev_set_promiscuity(dev, 1);
3645 dev->uc_promisc = 1; 4188 dev->uc_promisc = 1;
3646 } else if (dev->uc.count == 0 && dev->uc_promisc) { 4189 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
3647 __dev_set_promiscuity(dev, -1); 4190 __dev_set_promiscuity(dev, -1);
3648 dev->uc_promisc = 0; 4191 dev->uc_promisc = 0;
3649 } 4192 }
@@ -3660,562 +4203,6 @@ void dev_set_rx_mode(struct net_device *dev)
3660 netif_addr_unlock_bh(dev); 4203 netif_addr_unlock_bh(dev);
3661} 4204}
3662 4205
3663/* hw addresses list handling functions */
3664
3665static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
3666 int addr_len, unsigned char addr_type)
3667{
3668 struct netdev_hw_addr *ha;
3669 int alloc_size;
3670
3671 if (addr_len > MAX_ADDR_LEN)
3672 return -EINVAL;
3673
3674 list_for_each_entry(ha, &list->list, list) {
3675 if (!memcmp(ha->addr, addr, addr_len) &&
3676 ha->type == addr_type) {
3677 ha->refcount++;
3678 return 0;
3679 }
3680 }
3681
3682
3683 alloc_size = sizeof(*ha);
3684 if (alloc_size < L1_CACHE_BYTES)
3685 alloc_size = L1_CACHE_BYTES;
3686 ha = kmalloc(alloc_size, GFP_ATOMIC);
3687 if (!ha)
3688 return -ENOMEM;
3689 memcpy(ha->addr, addr, addr_len);
3690 ha->type = addr_type;
3691 ha->refcount = 1;
3692 ha->synced = false;
3693 list_add_tail_rcu(&ha->list, &list->list);
3694 list->count++;
3695 return 0;
3696}
3697
3698static void ha_rcu_free(struct rcu_head *head)
3699{
3700 struct netdev_hw_addr *ha;
3701
3702 ha = container_of(head, struct netdev_hw_addr, rcu_head);
3703 kfree(ha);
3704}
3705
3706static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr,
3707 int addr_len, unsigned char addr_type)
3708{
3709 struct netdev_hw_addr *ha;
3710
3711 list_for_each_entry(ha, &list->list, list) {
3712 if (!memcmp(ha->addr, addr, addr_len) &&
3713 (ha->type == addr_type || !addr_type)) {
3714 if (--ha->refcount)
3715 return 0;
3716 list_del_rcu(&ha->list);
3717 call_rcu(&ha->rcu_head, ha_rcu_free);
3718 list->count--;
3719 return 0;
3720 }
3721 }
3722 return -ENOENT;
3723}
3724
3725static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list,
3726 struct netdev_hw_addr_list *from_list,
3727 int addr_len,
3728 unsigned char addr_type)
3729{
3730 int err;
3731 struct netdev_hw_addr *ha, *ha2;
3732 unsigned char type;
3733
3734 list_for_each_entry(ha, &from_list->list, list) {
3735 type = addr_type ? addr_type : ha->type;
3736 err = __hw_addr_add(to_list, ha->addr, addr_len, type);
3737 if (err)
3738 goto unroll;
3739 }
3740 return 0;
3741
3742unroll:
3743 list_for_each_entry(ha2, &from_list->list, list) {
3744 if (ha2 == ha)
3745 break;
3746 type = addr_type ? addr_type : ha2->type;
3747 __hw_addr_del(to_list, ha2->addr, addr_len, type);
3748 }
3749 return err;
3750}
3751
3752static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
3753 struct netdev_hw_addr_list *from_list,
3754 int addr_len,
3755 unsigned char addr_type)
3756{
3757 struct netdev_hw_addr *ha;
3758 unsigned char type;
3759
3760 list_for_each_entry(ha, &from_list->list, list) {
3761 type = addr_type ? addr_type : ha->type;
3762 __hw_addr_del(to_list, ha->addr, addr_len, addr_type);
3763 }
3764}
3765
3766static int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
3767 struct netdev_hw_addr_list *from_list,
3768 int addr_len)
3769{
3770 int err = 0;
3771 struct netdev_hw_addr *ha, *tmp;
3772
3773 list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3774 if (!ha->synced) {
3775 err = __hw_addr_add(to_list, ha->addr,
3776 addr_len, ha->type);
3777 if (err)
3778 break;
3779 ha->synced = true;
3780 ha->refcount++;
3781 } else if (ha->refcount == 1) {
3782 __hw_addr_del(to_list, ha->addr, addr_len, ha->type);
3783 __hw_addr_del(from_list, ha->addr, addr_len, ha->type);
3784 }
3785 }
3786 return err;
3787}
3788
3789static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
3790 struct netdev_hw_addr_list *from_list,
3791 int addr_len)
3792{
3793 struct netdev_hw_addr *ha, *tmp;
3794
3795 list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3796 if (ha->synced) {
3797 __hw_addr_del(to_list, ha->addr,
3798 addr_len, ha->type);
3799 ha->synced = false;
3800 __hw_addr_del(from_list, ha->addr,
3801 addr_len, ha->type);
3802 }
3803 }
3804}
3805
3806static void __hw_addr_flush(struct netdev_hw_addr_list *list)
3807{
3808 struct netdev_hw_addr *ha, *tmp;
3809
3810 list_for_each_entry_safe(ha, tmp, &list->list, list) {
3811 list_del_rcu(&ha->list);
3812 call_rcu(&ha->rcu_head, ha_rcu_free);
3813 }
3814 list->count = 0;
3815}
3816
3817static void __hw_addr_init(struct netdev_hw_addr_list *list)
3818{
3819 INIT_LIST_HEAD(&list->list);
3820 list->count = 0;
3821}
3822
3823/* Device addresses handling functions */
3824
3825static void dev_addr_flush(struct net_device *dev)
3826{
3827 /* rtnl_mutex must be held here */
3828
3829 __hw_addr_flush(&dev->dev_addrs);
3830 dev->dev_addr = NULL;
3831}
3832
3833static int dev_addr_init(struct net_device *dev)
3834{
3835 unsigned char addr[MAX_ADDR_LEN];
3836 struct netdev_hw_addr *ha;
3837 int err;
3838
3839 /* rtnl_mutex must be held here */
3840
3841 __hw_addr_init(&dev->dev_addrs);
3842 memset(addr, 0, sizeof(addr));
3843 err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
3844 NETDEV_HW_ADDR_T_LAN);
3845 if (!err) {
3846 /*
3847 * Get the first (previously created) address from the list
3848 * and set dev_addr pointer to this location.
3849 */
3850 ha = list_first_entry(&dev->dev_addrs.list,
3851 struct netdev_hw_addr, list);
3852 dev->dev_addr = ha->addr;
3853 }
3854 return err;
3855}
3856
3857/**
3858 * dev_addr_add - Add a device address
3859 * @dev: device
3860 * @addr: address to add
3861 * @addr_type: address type
3862 *
3863 * Add a device address to the device or increase the reference count if
3864 * it already exists.
3865 *
3866 * The caller must hold the rtnl_mutex.
3867 */
3868int dev_addr_add(struct net_device *dev, unsigned char *addr,
3869 unsigned char addr_type)
3870{
3871 int err;
3872
3873 ASSERT_RTNL();
3874
3875 err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
3876 if (!err)
3877 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3878 return err;
3879}
3880EXPORT_SYMBOL(dev_addr_add);
3881
3882/**
3883 * dev_addr_del - Release a device address.
3884 * @dev: device
3885 * @addr: address to delete
3886 * @addr_type: address type
3887 *
3888 * Release reference to a device address and remove it from the device
3889 * if the reference count drops to zero.
3890 *
3891 * The caller must hold the rtnl_mutex.
3892 */
3893int dev_addr_del(struct net_device *dev, unsigned char *addr,
3894 unsigned char addr_type)
3895{
3896 int err;
3897 struct netdev_hw_addr *ha;
3898
3899 ASSERT_RTNL();
3900
3901 /*
3902 * We can not remove the first address from the list because
3903 * dev->dev_addr points to that.
3904 */
3905 ha = list_first_entry(&dev->dev_addrs.list,
3906 struct netdev_hw_addr, list);
3907 if (ha->addr == dev->dev_addr && ha->refcount == 1)
3908 return -ENOENT;
3909
3910 err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
3911 addr_type);
3912 if (!err)
3913 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3914 return err;
3915}
3916EXPORT_SYMBOL(dev_addr_del);
3917
3918/**
3919 * dev_addr_add_multiple - Add device addresses from another device
3920 * @to_dev: device to which addresses will be added
3921 * @from_dev: device from which addresses will be added
3922 * @addr_type: address type - 0 means type will be used from from_dev
3923 *
3924 * Add the device addresses of one device to another.
3925 *
3926 * The caller must hold the rtnl_mutex.
3927 */
3928int dev_addr_add_multiple(struct net_device *to_dev,
3929 struct net_device *from_dev,
3930 unsigned char addr_type)
3931{
3932 int err;
3933
3934 ASSERT_RTNL();
3935
3936 if (from_dev->addr_len != to_dev->addr_len)
3937 return -EINVAL;
3938 err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
3939 to_dev->addr_len, addr_type);
3940 if (!err)
3941 call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
3942 return err;
3943}
3944EXPORT_SYMBOL(dev_addr_add_multiple);
3945
3946/**
3947 * dev_addr_del_multiple - Delete device addresses by another device
3948 * @to_dev: device where the addresses will be deleted
3949 * @from_dev: device by which addresses the addresses will be deleted
3950 * @addr_type: address type - 0 means type will used from from_dev
3951 *
3952 * Deletes addresses in to device by the list of addresses in from device.
3953 *
3954 * The caller must hold the rtnl_mutex.
3955 */
3956int dev_addr_del_multiple(struct net_device *to_dev,
3957 struct net_device *from_dev,
3958 unsigned char addr_type)
3959{
3960 ASSERT_RTNL();
3961
3962 if (from_dev->addr_len != to_dev->addr_len)
3963 return -EINVAL;
3964 __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
3965 to_dev->addr_len, addr_type);
3966 call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
3967 return 0;
3968}
3969EXPORT_SYMBOL(dev_addr_del_multiple);
3970
3971/* multicast addresses handling functions */
3972
3973int __dev_addr_delete(struct dev_addr_list **list, int *count,
3974 void *addr, int alen, int glbl)
3975{
3976 struct dev_addr_list *da;
3977
3978 for (; (da = *list) != NULL; list = &da->next) {
3979 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3980 alen == da->da_addrlen) {
3981 if (glbl) {
3982 int old_glbl = da->da_gusers;
3983 da->da_gusers = 0;
3984 if (old_glbl == 0)
3985 break;
3986 }
3987 if (--da->da_users)
3988 return 0;
3989
3990 *list = da->next;
3991 kfree(da);
3992 (*count)--;
3993 return 0;
3994 }
3995 }
3996 return -ENOENT;
3997}
3998
3999int __dev_addr_add(struct dev_addr_list **list, int *count,
4000 void *addr, int alen, int glbl)
4001{
4002 struct dev_addr_list *da;
4003
4004 for (da = *list; da != NULL; da = da->next) {
4005 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
4006 da->da_addrlen == alen) {
4007 if (glbl) {
4008 int old_glbl = da->da_gusers;
4009 da->da_gusers = 1;
4010 if (old_glbl)
4011 return 0;
4012 }
4013 da->da_users++;
4014 return 0;
4015 }
4016 }
4017
4018 da = kzalloc(sizeof(*da), GFP_ATOMIC);
4019 if (da == NULL)
4020 return -ENOMEM;
4021 memcpy(da->da_addr, addr, alen);
4022 da->da_addrlen = alen;
4023 da->da_users = 1;
4024 da->da_gusers = glbl ? 1 : 0;
4025 da->next = *list;
4026 *list = da;
4027 (*count)++;
4028 return 0;
4029}
4030
4031/**
4032 * dev_unicast_delete - Release secondary unicast address.
4033 * @dev: device
4034 * @addr: address to delete
4035 *
4036 * Release reference to a secondary unicast address and remove it
4037 * from the device if the reference count drops to zero.
4038 *
4039 * The caller must hold the rtnl_mutex.
4040 */
4041int dev_unicast_delete(struct net_device *dev, void *addr)
4042{
4043 int err;
4044
4045 ASSERT_RTNL();
4046
4047 netif_addr_lock_bh(dev);
4048 err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
4049 NETDEV_HW_ADDR_T_UNICAST);
4050 if (!err)
4051 __dev_set_rx_mode(dev);
4052 netif_addr_unlock_bh(dev);
4053 return err;
4054}
4055EXPORT_SYMBOL(dev_unicast_delete);
4056
4057/**
4058 * dev_unicast_add - add a secondary unicast address
4059 * @dev: device
4060 * @addr: address to add
4061 *
4062 * Add a secondary unicast address to the device or increase
4063 * the reference count if it already exists.
4064 *
4065 * The caller must hold the rtnl_mutex.
4066 */
4067int dev_unicast_add(struct net_device *dev, void *addr)
4068{
4069 int err;
4070
4071 ASSERT_RTNL();
4072
4073 netif_addr_lock_bh(dev);
4074 err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
4075 NETDEV_HW_ADDR_T_UNICAST);
4076 if (!err)
4077 __dev_set_rx_mode(dev);
4078 netif_addr_unlock_bh(dev);
4079 return err;
4080}
4081EXPORT_SYMBOL(dev_unicast_add);
4082
4083int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
4084 struct dev_addr_list **from, int *from_count)
4085{
4086 struct dev_addr_list *da, *next;
4087 int err = 0;
4088
4089 da = *from;
4090 while (da != NULL) {
4091 next = da->next;
4092 if (!da->da_synced) {
4093 err = __dev_addr_add(to, to_count,
4094 da->da_addr, da->da_addrlen, 0);
4095 if (err < 0)
4096 break;
4097 da->da_synced = 1;
4098 da->da_users++;
4099 } else if (da->da_users == 1) {
4100 __dev_addr_delete(to, to_count,
4101 da->da_addr, da->da_addrlen, 0);
4102 __dev_addr_delete(from, from_count,
4103 da->da_addr, da->da_addrlen, 0);
4104 }
4105 da = next;
4106 }
4107 return err;
4108}
4109EXPORT_SYMBOL_GPL(__dev_addr_sync);
4110
4111void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
4112 struct dev_addr_list **from, int *from_count)
4113{
4114 struct dev_addr_list *da, *next;
4115
4116 da = *from;
4117 while (da != NULL) {
4118 next = da->next;
4119 if (da->da_synced) {
4120 __dev_addr_delete(to, to_count,
4121 da->da_addr, da->da_addrlen, 0);
4122 da->da_synced = 0;
4123 __dev_addr_delete(from, from_count,
4124 da->da_addr, da->da_addrlen, 0);
4125 }
4126 da = next;
4127 }
4128}
4129EXPORT_SYMBOL_GPL(__dev_addr_unsync);
4130
4131/**
4132 * dev_unicast_sync - Synchronize device's unicast list to another device
4133 * @to: destination device
4134 * @from: source device
4135 *
4136 * Add newly added addresses to the destination device and release
4137 * addresses that have no users left. The source device must be
4138 * locked by netif_tx_lock_bh.
4139 *
4140 * This function is intended to be called from the dev->set_rx_mode
4141 * function of layered software devices.
4142 */
4143int dev_unicast_sync(struct net_device *to, struct net_device *from)
4144{
4145 int err = 0;
4146
4147 if (to->addr_len != from->addr_len)
4148 return -EINVAL;
4149
4150 netif_addr_lock_bh(to);
4151 err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
4152 if (!err)
4153 __dev_set_rx_mode(to);
4154 netif_addr_unlock_bh(to);
4155 return err;
4156}
4157EXPORT_SYMBOL(dev_unicast_sync);
4158
4159/**
4160 * dev_unicast_unsync - Remove synchronized addresses from the destination device
4161 * @to: destination device
4162 * @from: source device
4163 *
4164 * Remove all addresses that were added to the destination device by
4165 * dev_unicast_sync(). This function is intended to be called from the
4166 * dev->stop function of layered software devices.
4167 */
4168void dev_unicast_unsync(struct net_device *to, struct net_device *from)
4169{
4170 if (to->addr_len != from->addr_len)
4171 return;
4172
4173 netif_addr_lock_bh(from);
4174 netif_addr_lock(to);
4175 __hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
4176 __dev_set_rx_mode(to);
4177 netif_addr_unlock(to);
4178 netif_addr_unlock_bh(from);
4179}
4180EXPORT_SYMBOL(dev_unicast_unsync);
4181
4182static void dev_unicast_flush(struct net_device *dev)
4183{
4184 netif_addr_lock_bh(dev);
4185 __hw_addr_flush(&dev->uc);
4186 netif_addr_unlock_bh(dev);
4187}
4188
4189static void dev_unicast_init(struct net_device *dev)
4190{
4191 __hw_addr_init(&dev->uc);
4192}
4193
4194
4195static void __dev_addr_discard(struct dev_addr_list **list)
4196{
4197 struct dev_addr_list *tmp;
4198
4199 while (*list != NULL) {
4200 tmp = *list;
4201 *list = tmp->next;
4202 if (tmp->da_users > tmp->da_gusers)
4203 printk("__dev_addr_discard: address leakage! "
4204 "da_users=%d\n", tmp->da_users);
4205 kfree(tmp);
4206 }
4207}
4208
4209static void dev_addr_discard(struct net_device *dev)
4210{
4211 netif_addr_lock_bh(dev);
4212
4213 __dev_addr_discard(&dev->mc_list);
4214 dev->mc_count = 0;
4215
4216 netif_addr_unlock_bh(dev);
4217}
4218
4219/** 4206/**
4220 * dev_get_flags - get flags reported to userspace 4207 * dev_get_flags - get flags reported to userspace
4221 * @dev: device 4208 * @dev: device
@@ -4247,18 +4234,10 @@ unsigned dev_get_flags(const struct net_device *dev)
4247} 4234}
4248EXPORT_SYMBOL(dev_get_flags); 4235EXPORT_SYMBOL(dev_get_flags);
4249 4236
4250/** 4237int __dev_change_flags(struct net_device *dev, unsigned int flags)
4251 * dev_change_flags - change device settings
4252 * @dev: device
4253 * @flags: device state flags
4254 *
4255 * Change settings on device based state flags. The flags are
4256 * in the userspace exported format.
4257 */
4258int dev_change_flags(struct net_device *dev, unsigned flags)
4259{ 4238{
4260 int ret, changes;
4261 int old_flags = dev->flags; 4239 int old_flags = dev->flags;
4240 int ret;
4262 4241
4263 ASSERT_RTNL(); 4242 ASSERT_RTNL();
4264 4243
@@ -4289,17 +4268,12 @@ int dev_change_flags(struct net_device *dev, unsigned flags)
4289 4268
4290 ret = 0; 4269 ret = 0;
4291 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ 4270 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4292 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); 4271 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4293 4272
4294 if (!ret) 4273 if (!ret)
4295 dev_set_rx_mode(dev); 4274 dev_set_rx_mode(dev);
4296 } 4275 }
4297 4276
4298 if (dev->flags & IFF_UP &&
4299 ((old_flags ^ dev->flags) & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
4300 IFF_VOLATILE)))
4301 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4302
4303 if ((flags ^ dev->gflags) & IFF_PROMISC) { 4277 if ((flags ^ dev->gflags) & IFF_PROMISC) {
4304 int inc = (flags & IFF_PROMISC) ? 1 : -1; 4278 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4305 4279
@@ -4318,11 +4292,47 @@ int dev_change_flags(struct net_device *dev, unsigned flags)
4318 dev_set_allmulti(dev, inc); 4292 dev_set_allmulti(dev, inc);
4319 } 4293 }
4320 4294
4321 /* Exclude state transition flags, already notified */ 4295 return ret;
4322 changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING); 4296}
4297
4298void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4299{
4300 unsigned int changes = dev->flags ^ old_flags;
4301
4302 if (changes & IFF_UP) {
4303 if (dev->flags & IFF_UP)
4304 call_netdevice_notifiers(NETDEV_UP, dev);
4305 else
4306 call_netdevice_notifiers(NETDEV_DOWN, dev);
4307 }
4308
4309 if (dev->flags & IFF_UP &&
4310 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4311 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4312}
4313
4314/**
4315 * dev_change_flags - change device settings
4316 * @dev: device
4317 * @flags: device state flags
4318 *
4319 * Change settings on device based state flags. The flags are
4320 * in the userspace exported format.
4321 */
4322int dev_change_flags(struct net_device *dev, unsigned flags)
4323{
4324 int ret, changes;
4325 int old_flags = dev->flags;
4326
4327 ret = __dev_change_flags(dev, flags);
4328 if (ret < 0)
4329 return ret;
4330
4331 changes = old_flags ^ dev->flags;
4323 if (changes) 4332 if (changes)
4324 rtmsg_ifinfo(RTM_NEWLINK, dev, changes); 4333 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4325 4334
4335 __dev_notify_flags(dev, old_flags);
4326 return ret; 4336 return ret;
4327} 4337}
4328EXPORT_SYMBOL(dev_change_flags); 4338EXPORT_SYMBOL(dev_change_flags);
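[Editor's note] The split above lets a caller change the flags and defer the notifier traffic; dev_change_flags() is just the two steps plus rtmsg_ifinfo(). A sketch of the intended two-step pattern for a hypothetical caller that wants the notifications but not the netlink message:

static int example_change_flags(struct net_device *dev, unsigned int flags)
{
	unsigned int old_flags = dev->flags;
	int err;

	err = __dev_change_flags(dev, flags);	/* may open/close the device */
	if (err < 0)
		return err;

	__dev_notify_flags(dev, old_flags);	/* NETDEV_UP/DOWN/CHANGE as needed */
	return 0;
}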
@@ -4503,8 +4513,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4503 return -EINVAL; 4513 return -EINVAL;
4504 if (!netif_device_present(dev)) 4514 if (!netif_device_present(dev))
4505 return -ENODEV; 4515 return -ENODEV;
4506 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data, 4516 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4507 dev->addr_len, 1);
4508 4517
4509 case SIOCDELMULTI: 4518 case SIOCDELMULTI:
4510 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || 4519 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
@@ -4512,8 +4521,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4512 return -EINVAL; 4521 return -EINVAL;
4513 if (!netif_device_present(dev)) 4522 if (!netif_device_present(dev))
4514 return -ENODEV; 4523 return -ENODEV;
4515 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, 4524 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4516 dev->addr_len, 1);
4517 4525
4518 case SIOCSIFTXQLEN: 4526 case SIOCSIFTXQLEN:
4519 if (ifr->ifr_qlen < 0) 4527 if (ifr->ifr_qlen < 0)
@@ -4813,11 +4821,15 @@ static void rollback_registered_many(struct list_head *head)
4813 */ 4821 */
4814 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 4822 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4815 4823
4824 if (!dev->rtnl_link_ops ||
4825 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4826 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4827
4816 /* 4828 /*
4817 * Flush the unicast and multicast chains 4829 * Flush the unicast and multicast chains
4818 */ 4830 */
4819 dev_unicast_flush(dev); 4831 dev_uc_flush(dev);
4820 dev_addr_discard(dev); 4832 dev_mc_flush(dev);
4821 4833
4822 if (dev->netdev_ops->ndo_uninit) 4834 if (dev->netdev_ops->ndo_uninit)
4823 dev->netdev_ops->ndo_uninit(dev); 4835 dev->netdev_ops->ndo_uninit(dev);
@@ -4830,10 +4842,10 @@ static void rollback_registered_many(struct list_head *head)
4830 } 4842 }
4831 4843
4832 /* Process any work delayed until the end of the batch */ 4844 /* Process any work delayed until the end of the batch */
4833 dev = list_entry(head->next, struct net_device, unreg_list); 4845 dev = list_first_entry(head, struct net_device, unreg_list);
4834 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); 4846 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4835 4847
4836 synchronize_net(); 4848 rcu_barrier();
4837 4849
4838 list_for_each_entry(dev, head, unreg_list) 4850 list_for_each_entry(dev, head, unreg_list)
4839 dev_put(dev); 4851 dev_put(dev);
@@ -4966,6 +4978,24 @@ int register_netdevice(struct net_device *dev)
4966 4978
4967 dev->iflink = -1; 4979 dev->iflink = -1;
4968 4980
4981#ifdef CONFIG_RPS
4982 if (!dev->num_rx_queues) {
4983 /*
4984 * Allocate a single RX queue if driver never called
4985 * alloc_netdev_mq
4986 */
4987
4988 dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
4989 if (!dev->_rx) {
4990 ret = -ENOMEM;
4991 goto out;
4992 }
4993
4994 dev->_rx->first = dev->_rx;
4995 atomic_set(&dev->_rx->count, 1);
4996 dev->num_rx_queues = 1;
4997 }
4998#endif
4969 /* Init, if this function is available */ 4999 /* Init, if this function is available */
4970 if (dev->netdev_ops->ndo_init) { 5000 if (dev->netdev_ops->ndo_init) {
4971 ret = dev->netdev_ops->ndo_init(dev); 5001 ret = dev->netdev_ops->ndo_init(dev);
@@ -4976,7 +5006,7 @@ int register_netdevice(struct net_device *dev)
4976 } 5006 }
4977 } 5007 }
4978 5008
4979 ret = dev_get_valid_name(net, dev->name, dev->name, 0); 5009 ret = dev_get_valid_name(dev, dev->name, 0);
4980 if (ret) 5010 if (ret)
4981 goto err_uninit; 5011 goto err_uninit;
4982 5012
@@ -5005,8 +5035,6 @@ int register_netdevice(struct net_device *dev)
5005 if (dev->features & NETIF_F_SG) 5035 if (dev->features & NETIF_F_SG)
5006 dev->features |= NETIF_F_GSO; 5036 dev->features |= NETIF_F_GSO;
5007 5037
5008 netdev_initialize_kobject(dev);
5009
5010 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); 5038 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5011 ret = notifier_to_errno(ret); 5039 ret = notifier_to_errno(ret);
5012 if (ret) 5040 if (ret)
@@ -5039,7 +5067,9 @@ int register_netdevice(struct net_device *dev)
5039 * Prevent userspace races by waiting until the network 5067 * Prevent userspace races by waiting until the network
5040 * device is fully setup before sending notifications. 5068 * device is fully setup before sending notifications.
5041 */ 5069 */
5042 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); 5070 if (!dev->rtnl_link_ops ||
5071 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5072 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5043 5073
5044out: 5074out:
5045 return ret; 5075 return ret;
@@ -5216,7 +5246,7 @@ void netdev_run_todo(void)
5216 5246
5217 while (!list_empty(&list)) { 5247 while (!list_empty(&list)) {
5218 struct net_device *dev 5248 struct net_device *dev
5219 = list_entry(list.next, struct net_device, todo_list); 5249 = list_first_entry(&list, struct net_device, todo_list);
5220 list_del(&dev->todo_list); 5250 list_del(&dev->todo_list);
5221 5251
5222 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { 5252 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
@@ -5249,20 +5279,22 @@ void netdev_run_todo(void)
5249/** 5279/**
5250 * dev_txq_stats_fold - fold tx_queues stats 5280 * dev_txq_stats_fold - fold tx_queues stats
5251 * @dev: device to get statistics from 5281 * @dev: device to get statistics from
5252 * @stats: struct net_device_stats to hold results 5282 * @stats: struct rtnl_link_stats64 to hold results
5253 */ 5283 */
5254void dev_txq_stats_fold(const struct net_device *dev, 5284void dev_txq_stats_fold(const struct net_device *dev,
5255 struct net_device_stats *stats) 5285 struct rtnl_link_stats64 *stats)
5256{ 5286{
5257 unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0; 5287 u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5258 unsigned int i; 5288 unsigned int i;
5259 struct netdev_queue *txq; 5289 struct netdev_queue *txq;
5260 5290
5261 for (i = 0; i < dev->num_tx_queues; i++) { 5291 for (i = 0; i < dev->num_tx_queues; i++) {
5262 txq = netdev_get_tx_queue(dev, i); 5292 txq = netdev_get_tx_queue(dev, i);
5293 spin_lock_bh(&txq->_xmit_lock);
5263 tx_bytes += txq->tx_bytes; 5294 tx_bytes += txq->tx_bytes;
5264 tx_packets += txq->tx_packets; 5295 tx_packets += txq->tx_packets;
5265 tx_dropped += txq->tx_dropped; 5296 tx_dropped += txq->tx_dropped;
5297 spin_unlock_bh(&txq->_xmit_lock);
5266 } 5298 }
5267 if (tx_bytes || tx_packets || tx_dropped) { 5299 if (tx_bytes || tx_packets || tx_dropped) {
5268 stats->tx_bytes = tx_bytes; 5300 stats->tx_bytes = tx_bytes;
@@ -5272,23 +5304,53 @@ void dev_txq_stats_fold(const struct net_device *dev,
5272} 5304}
5273EXPORT_SYMBOL(dev_txq_stats_fold); 5305EXPORT_SYMBOL(dev_txq_stats_fold);
5274 5306
5307/* Convert net_device_stats to rtnl_link_stats64. They have the same
5308 * fields in the same order, with only the type differing.
5309 */
5310static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5311 const struct net_device_stats *netdev_stats)
5312{
5313#if BITS_PER_LONG == 64
5314 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5315 memcpy(stats64, netdev_stats, sizeof(*stats64));
5316#else
5317 size_t i, n = sizeof(*stats64) / sizeof(u64);
5318 const unsigned long *src = (const unsigned long *)netdev_stats;
5319 u64 *dst = (u64 *)stats64;
5320
5321 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5322 sizeof(*stats64) / sizeof(u64));
5323 for (i = 0; i < n; i++)
5324 dst[i] = src[i];
5325#endif
5326}
5327
5275/** 5328/**
5276 * dev_get_stats - get network device statistics 5329 * dev_get_stats - get network device statistics
5277 * @dev: device to get statistics from 5330 * @dev: device to get statistics from
5331 * @storage: place to store stats
5278 * 5332 *
5279 * Get network statistics from device. The device driver may provide 5333 * Get network statistics from device. Return @storage.
5280 * its own method by setting dev->netdev_ops->get_stats; otherwise 5334 * The device driver may provide its own method by setting
5281 * the internal statistics structure is used. 5335 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5336 * otherwise the internal statistics structure is used.
5282 */ 5337 */
5283const struct net_device_stats *dev_get_stats(struct net_device *dev) 5338struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5339 struct rtnl_link_stats64 *storage)
5284{ 5340{
5285 const struct net_device_ops *ops = dev->netdev_ops; 5341 const struct net_device_ops *ops = dev->netdev_ops;
5286 5342
5287 if (ops->ndo_get_stats) 5343 if (ops->ndo_get_stats64) {
5288 return ops->ndo_get_stats(dev); 5344 memset(storage, 0, sizeof(*storage));
5289 5345 return ops->ndo_get_stats64(dev, storage);
5290 dev_txq_stats_fold(dev, &dev->stats); 5346 }
5291 return &dev->stats; 5347 if (ops->ndo_get_stats) {
5348 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5349 return storage;
5350 }
5351 netdev_stats_to_stats64(storage, &dev->stats);
5352 dev_txq_stats_fold(dev, storage);
5353 return storage;
5292} 5354}
5293EXPORT_SYMBOL(dev_get_stats); 5355EXPORT_SYMBOL(dev_get_stats);
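[Editor's note] On the driver side, dev_get_stats() now prefers ndo_get_stats64 and only falls back to the legacy ndo_get_stats via the element-wise conversion above. A minimal, hypothetical ndo_get_stats64 implementation (my_priv and its counters are made up):

static struct rtnl_link_stats64 *
my_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
{
	struct my_priv *priv = netdev_priv(dev);

	/* storage was zeroed by dev_get_stats() before this call */
	storage->rx_packets = priv->rx_packets;
	storage->rx_bytes   = priv->rx_bytes;
	storage->tx_packets = priv->tx_packets;
	storage->tx_bytes   = priv->tx_bytes;
	return storage;		/* dev_get_stats() hands this back to its caller */
}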
5294 5356
@@ -5324,6 +5386,10 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5324 struct net_device *dev; 5386 struct net_device *dev;
5325 size_t alloc_size; 5387 size_t alloc_size;
5326 struct net_device *p; 5388 struct net_device *p;
5389#ifdef CONFIG_RPS
5390 struct netdev_rx_queue *rx;
5391 int i;
5392#endif
5327 5393
5328 BUG_ON(strlen(name) >= sizeof(dev->name)); 5394 BUG_ON(strlen(name) >= sizeof(dev->name));
5329 5395
@@ -5349,13 +5415,32 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5349 goto free_p; 5415 goto free_p;
5350 } 5416 }
5351 5417
5418#ifdef CONFIG_RPS
5419 rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5420 if (!rx) {
5421 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5422 "rx queues.\n");
5423 goto free_tx;
5424 }
5425
5426 atomic_set(&rx->count, queue_count);
5427
5428 /*
5429 * Set a pointer to first element in the array which holds the
5430 * reference count.
5431 */
5432 for (i = 0; i < queue_count; i++)
5433 rx[i].first = rx;
5434#endif
5435
5352 dev = PTR_ALIGN(p, NETDEV_ALIGN); 5436 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5353 dev->padded = (char *)dev - (char *)p; 5437 dev->padded = (char *)dev - (char *)p;
5354 5438
5355 if (dev_addr_init(dev)) 5439 if (dev_addr_init(dev))
5356 goto free_tx; 5440 goto free_rx;
5357 5441
5358 dev_unicast_init(dev); 5442 dev_mc_init(dev);
5443 dev_uc_init(dev);
5359 5444
5360 dev_net_set(dev, &init_net); 5445 dev_net_set(dev, &init_net);
5361 5446
@@ -5363,10 +5448,17 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5363 dev->num_tx_queues = queue_count; 5448 dev->num_tx_queues = queue_count;
5364 dev->real_num_tx_queues = queue_count; 5449 dev->real_num_tx_queues = queue_count;
5365 5450
5451#ifdef CONFIG_RPS
5452 dev->_rx = rx;
5453 dev->num_rx_queues = queue_count;
5454#endif
5455
5366 dev->gso_max_size = GSO_MAX_SIZE; 5456 dev->gso_max_size = GSO_MAX_SIZE;
5367 5457
5368 netdev_init_queues(dev); 5458 netdev_init_queues(dev);
5369 5459
5460 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5461 dev->ethtool_ntuple_list.count = 0;
5370 INIT_LIST_HEAD(&dev->napi_list); 5462 INIT_LIST_HEAD(&dev->napi_list);
5371 INIT_LIST_HEAD(&dev->unreg_list); 5463 INIT_LIST_HEAD(&dev->unreg_list);
5372 INIT_LIST_HEAD(&dev->link_watch_list); 5464 INIT_LIST_HEAD(&dev->link_watch_list);
@@ -5375,9 +5467,12 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5375 strcpy(dev->name, name); 5467 strcpy(dev->name, name);
5376 return dev; 5468 return dev;
5377 5469
5470free_rx:
5471#ifdef CONFIG_RPS
5472 kfree(rx);
5378free_tx: 5473free_tx:
5474#endif
5379 kfree(tx); 5475 kfree(tx);
5380
5381free_p: 5476free_p:
5382 kfree(p); 5477 kfree(p);
5383 return NULL; 5478 return NULL;
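[Editor's note] From a driver's point of view nothing changes in the allocation call; with this patch the same queue_count also sizes the RPS RX queue array. A hypothetical probe-time allocation (struct my_priv is made up):

static struct net_device *my_alloc_netdev(unsigned int nqueues)
{
	struct net_device *dev;

	dev = alloc_etherdev_mq(sizeof(struct my_priv), nqueues);
	if (!dev)
		return NULL;
	/* dev->num_tx_queues == nqueues, and (with CONFIG_RPS)
	 * dev->num_rx_queues == nqueues as well */
	return dev;
}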
@@ -5403,6 +5498,9 @@ void free_netdev(struct net_device *dev)
5403 /* Flush device addresses */ 5498 /* Flush device addresses */
5404 dev_addr_flush(dev); 5499 dev_addr_flush(dev);
5405 5500
5501 /* Clear ethtool n-tuple list */
5502 ethtool_ntuple_flush(dev);
5503
5406 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) 5504 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5407 netif_napi_del(p); 5505 netif_napi_del(p);
5408 5506
@@ -5520,15 +5618,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
5520 if (dev->features & NETIF_F_NETNS_LOCAL) 5618 if (dev->features & NETIF_F_NETNS_LOCAL)
5521 goto out; 5619 goto out;
5522 5620
5523#ifdef CONFIG_SYSFS
5524 /* Don't allow real devices to be moved when sysfs
5525 * is enabled.
5526 */
5527 err = -EINVAL;
5528 if (dev->dev.parent)
5529 goto out;
5530#endif
5531
5532 /* Ensure the device has been registered */ 5621 /* Ensure the device has been registered */
5533 err = -EINVAL; 5622 err = -EINVAL;
5534 if (dev->reg_state != NETREG_REGISTERED) 5623 if (dev->reg_state != NETREG_REGISTERED)
@@ -5547,7 +5636,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
5547 /* We get here if we can't use the current device name */ 5636 /* We get here if we can't use the current device name */
5548 if (!pat) 5637 if (!pat)
5549 goto out; 5638 goto out;
5550 if (dev_get_valid_name(net, pat, dev->name, 1)) 5639 if (dev_get_valid_name(dev, pat, 1))
5551 goto out; 5640 goto out;
5552 } 5641 }
5553 5642
@@ -5576,10 +5665,8 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
5576 /* 5665 /*
5577 * Flush the unicast and multicast chains 5666 * Flush the unicast and multicast chains
5578 */ 5667 */
5579 dev_unicast_flush(dev); 5668 dev_uc_flush(dev);
5580 dev_addr_discard(dev); 5669 dev_mc_flush(dev);
5581
5582 netdev_unregister_kobject(dev);
5583 5670
5584 /* Actually switch the network namespace */ 5671 /* Actually switch the network namespace */
5585 dev_net_set(dev, net); 5672 dev_net_set(dev, net);
@@ -5593,7 +5680,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
5593 } 5680 }
5594 5681
5595 /* Fixup kobjects */ 5682 /* Fixup kobjects */
5596 err = netdev_register_kobject(dev); 5683 err = device_rename(&dev->dev, dev->name);
5597 WARN_ON(err); 5684 WARN_ON(err);
5598 5685
5599 /* Add the device back in the hashes */ 5686 /* Add the device back in the hashes */
@@ -5620,7 +5707,6 @@ static int dev_cpu_callback(struct notifier_block *nfb,
5620 void *ocpu) 5707 void *ocpu)
5621{ 5708{
5622 struct sk_buff **list_skb; 5709 struct sk_buff **list_skb;
5623 struct Qdisc **list_net;
5624 struct sk_buff *skb; 5710 struct sk_buff *skb;
5625 unsigned int cpu, oldcpu = (unsigned long)ocpu; 5711 unsigned int cpu, oldcpu = (unsigned long)ocpu;
5626 struct softnet_data *sd, *oldsd; 5712 struct softnet_data *sd, *oldsd;
@@ -5641,20 +5727,26 @@ static int dev_cpu_callback(struct notifier_block *nfb,
5641 *list_skb = oldsd->completion_queue; 5727 *list_skb = oldsd->completion_queue;
5642 oldsd->completion_queue = NULL; 5728 oldsd->completion_queue = NULL;
5643 5729
5644 /* Find end of our output_queue. */
5645 list_net = &sd->output_queue;
5646 while (*list_net)
5647 list_net = &(*list_net)->next_sched;
5648 /* Append output queue from offline CPU. */ 5730 /* Append output queue from offline CPU. */
5649 *list_net = oldsd->output_queue; 5731 if (oldsd->output_queue) {
5650 oldsd->output_queue = NULL; 5732 *sd->output_queue_tailp = oldsd->output_queue;
5733 sd->output_queue_tailp = oldsd->output_queue_tailp;
5734 oldsd->output_queue = NULL;
5735 oldsd->output_queue_tailp = &oldsd->output_queue;
5736 }
5651 5737
5652 raise_softirq_irqoff(NET_TX_SOFTIRQ); 5738 raise_softirq_irqoff(NET_TX_SOFTIRQ);
5653 local_irq_enable(); 5739 local_irq_enable();
5654 5740
5655 /* Process offline CPU's input_pkt_queue */ 5741 /* Process offline CPU's input_pkt_queue */
5656 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) 5742 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5743 netif_rx(skb);
5744 input_queue_head_incr(oldsd);
5745 }
5746 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
5657 netif_rx(skb); 5747 netif_rx(skb);
5748 input_queue_head_incr(oldsd);
5749 }
5658 5750
5659 return NOTIFY_OK; 5751 return NOTIFY_OK;
5660} 5752}
@@ -5763,6 +5855,68 @@ char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5763 return buffer; 5855 return buffer;
5764} 5856}
5765 5857
5858static int __netdev_printk(const char *level, const struct net_device *dev,
5859 struct va_format *vaf)
5860{
5861 int r;
5862
5863 if (dev && dev->dev.parent)
5864 r = dev_printk(level, dev->dev.parent, "%s: %pV",
5865 netdev_name(dev), vaf);
5866 else if (dev)
5867 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
5868 else
5869 r = printk("%s(NULL net_device): %pV", level, vaf);
5870
5871 return r;
5872}
5873
5874int netdev_printk(const char *level, const struct net_device *dev,
5875 const char *format, ...)
5876{
5877 struct va_format vaf;
5878 va_list args;
5879 int r;
5880
5881 va_start(args, format);
5882
5883 vaf.fmt = format;
5884 vaf.va = &args;
5885
5886 r = __netdev_printk(level, dev, &vaf);
5887 va_end(args);
5888
5889 return r;
5890}
5891EXPORT_SYMBOL(netdev_printk);
5892
5893#define define_netdev_printk_level(func, level) \
5894int func(const struct net_device *dev, const char *fmt, ...) \
5895{ \
5896 int r; \
5897 struct va_format vaf; \
5898 va_list args; \
5899 \
5900 va_start(args, fmt); \
5901 \
5902 vaf.fmt = fmt; \
5903 vaf.va = &args; \
5904 \
5905 r = __netdev_printk(level, dev, &vaf); \
5906 va_end(args); \
5907 \
5908 return r; \
5909} \
5910EXPORT_SYMBOL(func);
5911
5912define_netdev_printk_level(netdev_emerg, KERN_EMERG);
5913define_netdev_printk_level(netdev_alert, KERN_ALERT);
5914define_netdev_printk_level(netdev_crit, KERN_CRIT);
5915define_netdev_printk_level(netdev_err, KERN_ERR);
5916define_netdev_printk_level(netdev_warn, KERN_WARNING);
5917define_netdev_printk_level(netdev_notice, KERN_NOTICE);
5918define_netdev_printk_level(netdev_info, KERN_INFO);
5919
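[Editor's note] Typical usage of the new helpers in driver code; my_open()/my_hw_ready() are hypothetical, the point is only the message prefixing done by netdev_err()/netdev_info():

static int my_open(struct net_device *dev)
{
	if (!my_hw_ready(dev)) {		/* hypothetical hardware check */
		netdev_err(dev, "hardware not ready, aborting open\n");
		return -EIO;
	}
	netdev_info(dev, "link up\n");
	return 0;
}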
5766static void __net_exit netdev_exit(struct net *net) 5920static void __net_exit netdev_exit(struct net *net)
5767{ 5921{
5768 kfree(net->dev_name_head); 5922 kfree(net->dev_name_head);
@@ -5870,17 +6024,26 @@ static int __init net_dev_init(void)
5870 */ 6024 */
5871 6025
5872 for_each_possible_cpu(i) { 6026 for_each_possible_cpu(i) {
5873 struct softnet_data *queue; 6027 struct softnet_data *sd = &per_cpu(softnet_data, i);
5874 6028
5875 queue = &per_cpu(softnet_data, i); 6029 memset(sd, 0, sizeof(*sd));
5876 skb_queue_head_init(&queue->input_pkt_queue); 6030 skb_queue_head_init(&sd->input_pkt_queue);
5877 queue->completion_queue = NULL; 6031 skb_queue_head_init(&sd->process_queue);
5878 INIT_LIST_HEAD(&queue->poll_list); 6032 sd->completion_queue = NULL;
6033 INIT_LIST_HEAD(&sd->poll_list);
6034 sd->output_queue = NULL;
6035 sd->output_queue_tailp = &sd->output_queue;
6036#ifdef CONFIG_RPS
6037 sd->csd.func = rps_trigger_softirq;
6038 sd->csd.info = sd;
6039 sd->csd.flags = 0;
6040 sd->cpu = i;
6041#endif
5879 6042
5880 queue->backlog.poll = process_backlog; 6043 sd->backlog.poll = process_backlog;
5881 queue->backlog.weight = weight_p; 6044 sd->backlog.weight = weight_p;
5882 queue->backlog.gro_list = NULL; 6045 sd->backlog.gro_list = NULL;
5883 queue->backlog.gro_count = 0; 6046 sd->backlog.gro_count = 0;
5884 } 6047 }
5885 6048
5886 dev_boot_phase = 0; 6049 dev_boot_phase = 0;
@@ -5915,7 +6078,7 @@ subsys_initcall(net_dev_init);
5915 6078
5916static int __init initialize_hashrnd(void) 6079static int __init initialize_hashrnd(void)
5917{ 6080{
5918 get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd)); 6081 get_random_bytes(&hashrnd, sizeof(hashrnd));
5919 return 0; 6082 return 0;
5920} 6083}
5921 6084