path: root/net/core/dev.c
Diffstat (limited to 'net/core/dev.c')
-rw-r--r--   net/core/dev.c   1758
1 file changed, 1096 insertions(+), 662 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index 660dd41aaaa6..9c58c1ec41a9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -128,7 +128,11 @@
128#include <linux/jhash.h> 128#include <linux/jhash.h>
129#include <linux/random.h> 129#include <linux/random.h>
130#include <trace/events/napi.h> 130#include <trace/events/napi.h>
131#include <trace/events/net.h>
132#include <trace/events/skb.h>
131#include <linux/pci.h> 133#include <linux/pci.h>
134#include <linux/inetdevice.h>
135#include <linux/cpu_rmap.h>
132 136
133#include "net-sysfs.h" 137#include "net-sysfs.h"
134 138
@@ -371,6 +375,14 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
371 * --ANK (980803) 375 * --ANK (980803)
372 */ 376 */
373 377
378static inline struct list_head *ptype_head(const struct packet_type *pt)
379{
380 if (pt->type == htons(ETH_P_ALL))
381 return &ptype_all;
382 else
383 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
384}
385
374/** 386/**
375 * dev_add_pack - add packet handler 387 * dev_add_pack - add packet handler
376 * @pt: packet type declaration 388 * @pt: packet type declaration
@@ -386,16 +398,11 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
386 398
387void dev_add_pack(struct packet_type *pt) 399void dev_add_pack(struct packet_type *pt)
388{ 400{
389 int hash; 401 struct list_head *head = ptype_head(pt);
390 402
391 spin_lock_bh(&ptype_lock); 403 spin_lock(&ptype_lock);
392 if (pt->type == htons(ETH_P_ALL)) 404 list_add_rcu(&pt->list, head);
393 list_add_rcu(&pt->list, &ptype_all); 405 spin_unlock(&ptype_lock);
394 else {
395 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
396 list_add_rcu(&pt->list, &ptype_base[hash]);
397 }
398 spin_unlock_bh(&ptype_lock);
399} 406}
400EXPORT_SYMBOL(dev_add_pack); 407EXPORT_SYMBOL(dev_add_pack);
401 408
@@ -414,15 +421,10 @@ EXPORT_SYMBOL(dev_add_pack);
414 */ 421 */
415void __dev_remove_pack(struct packet_type *pt) 422void __dev_remove_pack(struct packet_type *pt)
416{ 423{
417 struct list_head *head; 424 struct list_head *head = ptype_head(pt);
418 struct packet_type *pt1; 425 struct packet_type *pt1;
419 426
420 spin_lock_bh(&ptype_lock); 427 spin_lock(&ptype_lock);
421
422 if (pt->type == htons(ETH_P_ALL))
423 head = &ptype_all;
424 else
425 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
426 428
427 list_for_each_entry(pt1, head, list) { 429 list_for_each_entry(pt1, head, list) {
428 if (pt == pt1) { 430 if (pt == pt1) {
@@ -433,7 +435,7 @@ void __dev_remove_pack(struct packet_type *pt)
433 435
434 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); 436 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
435out: 437out:
436 spin_unlock_bh(&ptype_lock); 438 spin_unlock(&ptype_lock);
437} 439}
438EXPORT_SYMBOL(__dev_remove_pack); 440EXPORT_SYMBOL(__dev_remove_pack);
439 441
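For context, dev_add_pack() and dev_remove_pack() are the entry points this refactor routes through ptype_head(). A minimal sketch of a protocol tap using them — illustrative only, not part of this patch:

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/if_ether.h>

static int tap_rcv(struct sk_buff *skb, struct net_device *dev,
		   struct packet_type *pt, struct net_device *orig_dev)
{
	kfree_skb(skb);			/* each handler owns its skb reference */
	return 0;
}

static struct packet_type tap_pt __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),	/* ptype_head() puts this on ptype_all */
	.func = tap_rcv,
};

static int __init tap_init(void)
{
	dev_add_pack(&tap_pt);		/* takes ptype_lock, list_add_rcu() */
	return 0;
}

static void __exit tap_exit(void)
{
	dev_remove_pack(&tap_pt);	/* __dev_remove_pack() + synchronize_net() */
}
module_init(tap_init);
module_exit(tap_exit);
MODULE_LICENSE("GPL");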
@@ -742,34 +744,32 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex)
742EXPORT_SYMBOL(dev_get_by_index); 744EXPORT_SYMBOL(dev_get_by_index);
743 745
744/** 746/**
745 * dev_getbyhwaddr - find a device by its hardware address 747 * dev_getbyhwaddr_rcu - find a device by its hardware address
746 * @net: the applicable net namespace 748 * @net: the applicable net namespace
747 * @type: media type of device 749 * @type: media type of device
748 * @ha: hardware address 750 * @ha: hardware address
749 * 751 *
750 * Search for an interface by MAC address. Returns NULL if the device 752 * Search for an interface by MAC address. Returns NULL if the device
751 * is not found or a pointer to the device. The caller must hold the 753 * is not found or a pointer to the device.
752 * rtnl semaphore. The returned device has not had its ref count increased 754 * The caller must hold RCU or RTNL.
755 * The returned device has not had its ref count increased
753 * and the caller must therefore be careful about locking 756 * and the caller must therefore be careful about locking
754 * 757 *
755 * BUGS:
756 * If the API was consistent this would be __dev_get_by_hwaddr
757 */ 758 */
758 759
759struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha) 760struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
761 const char *ha)
760{ 762{
761 struct net_device *dev; 763 struct net_device *dev;
762 764
763 ASSERT_RTNL(); 765 for_each_netdev_rcu(net, dev)
764
765 for_each_netdev(net, dev)
766 if (dev->type == type && 766 if (dev->type == type &&
767 !memcmp(dev->dev_addr, ha, dev->addr_len)) 767 !memcmp(dev->dev_addr, ha, dev->addr_len))
768 return dev; 768 return dev;
769 769
770 return NULL; 770 return NULL;
771} 771}
772EXPORT_SYMBOL(dev_getbyhwaddr); 772EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
773 773
774struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) 774struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
775{ 775{
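Usage sketch for the renamed helper: dev_getbyhwaddr_rcu() only requires the RCU read lock (RTNL also suffices), and since it takes no reference the device must not be used outside the read-side section without dev_hold(). Illustrative, not from this patch:

#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/rcupdate.h>

static bool mac_is_local(struct net *net, const char *mac)
{
	struct net_device *dev;
	bool found;

	rcu_read_lock();
	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
	found = dev != NULL;	/* dev is only valid inside this section */
	rcu_read_unlock();
	return found;
}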
@@ -948,7 +948,7 @@ int dev_alloc_name(struct net_device *dev, const char *name)
948} 948}
949EXPORT_SYMBOL(dev_alloc_name); 949EXPORT_SYMBOL(dev_alloc_name);
950 950
951static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt) 951static int dev_get_valid_name(struct net_device *dev, const char *name)
952{ 952{
953 struct net *net; 953 struct net *net;
954 954
@@ -958,7 +958,7 @@ static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt
958 if (!dev_valid_name(name)) 958 if (!dev_valid_name(name))
959 return -EINVAL; 959 return -EINVAL;
960 960
961 if (fmt && strchr(name, '%')) 961 if (strchr(name, '%'))
962 return dev_alloc_name(dev, name); 962 return dev_alloc_name(dev, name);
963 else if (__dev_get_by_name(net, name)) 963 else if (__dev_get_by_name(net, name))
964 return -EEXIST; 964 return -EEXIST;
@@ -995,7 +995,7 @@ int dev_change_name(struct net_device *dev, const char *newname)
995 995
996 memcpy(oldname, dev->name, IFNAMSIZ); 996 memcpy(oldname, dev->name, IFNAMSIZ);
997 997
998 err = dev_get_valid_name(dev, newname, 1); 998 err = dev_get_valid_name(dev, newname);
999 if (err < 0) 999 if (err < 0)
1000 return err; 1000 return err;
1001 1001
@@ -1007,7 +1007,7 @@ rollback:
1007 } 1007 }
1008 1008
1009 write_lock_bh(&dev_base_lock); 1009 write_lock_bh(&dev_base_lock);
1010 hlist_del(&dev->name_hlist); 1010 hlist_del_rcu(&dev->name_hlist);
1011 write_unlock_bh(&dev_base_lock); 1011 write_unlock_bh(&dev_base_lock);
1012 1012
1013 synchronize_rcu(); 1013 synchronize_rcu();
@@ -1115,13 +1115,21 @@ EXPORT_SYMBOL(netdev_bonding_change);
1115void dev_load(struct net *net, const char *name) 1115void dev_load(struct net *net, const char *name)
1116{ 1116{
1117 struct net_device *dev; 1117 struct net_device *dev;
1118 int no_module;
1118 1119
1119 rcu_read_lock(); 1120 rcu_read_lock();
1120 dev = dev_get_by_name_rcu(net, name); 1121 dev = dev_get_by_name_rcu(net, name);
1121 rcu_read_unlock(); 1122 rcu_read_unlock();
1122 1123
1123 if (!dev && capable(CAP_NET_ADMIN)) 1124 no_module = !dev;
1124 request_module("%s", name); 1125 if (no_module && capable(CAP_NET_ADMIN))
1126 no_module = request_module("netdev-%s", name);
1127 if (no_module && capable(CAP_SYS_MODULE)) {
1128 if (!request_module("%s", name))
1129 pr_err("Loading kernel module for a network device "
1130"with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s "
1131"instead\n", name);
1132 }
1125} 1133}
1126EXPORT_SYMBOL(dev_load); 1134EXPORT_SYMBOL(dev_load);
1127 1135
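Because the privileged fallback now warns, a driver that wants auto-loading by interface name should provide the capability-safe alias the first request_module() looks for — typically via the MODULE_ALIAS_NETDEV() helper in linux/netdevice.h, which expands to MODULE_ALIAS("netdev-" name). Hedged example with an illustrative interface name:

MODULE_ALIAS_NETDEV("myif0");	/* matched by request_module("netdev-%s", "myif0") */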
@@ -1132,9 +1140,6 @@ static int __dev_open(struct net_device *dev)
1132 1140
1133 ASSERT_RTNL(); 1141 ASSERT_RTNL();
1134 1142
1135 /*
1136 * Is it even present?
1137 */
1138 if (!netif_device_present(dev)) 1143 if (!netif_device_present(dev))
1139 return -ENODEV; 1144 return -ENODEV;
1140 1145
@@ -1143,9 +1148,6 @@ static int __dev_open(struct net_device *dev)
1143 if (ret) 1148 if (ret)
1144 return ret; 1149 return ret;
1145 1150
1146 /*
1147 * Call device private open method
1148 */
1149 set_bit(__LINK_STATE_START, &dev->state); 1151 set_bit(__LINK_STATE_START, &dev->state);
1150 1152
1151 if (ops->ndo_validate_addr) 1153 if (ops->ndo_validate_addr)
@@ -1154,31 +1156,12 @@ static int __dev_open(struct net_device *dev)
1154 if (!ret && ops->ndo_open) 1156 if (!ret && ops->ndo_open)
1155 ret = ops->ndo_open(dev); 1157 ret = ops->ndo_open(dev);
1156 1158
1157 /*
1158 * If it went open OK then:
1159 */
1160
1161 if (ret) 1159 if (ret)
1162 clear_bit(__LINK_STATE_START, &dev->state); 1160 clear_bit(__LINK_STATE_START, &dev->state);
1163 else { 1161 else {
1164 /*
1165 * Set the flags.
1166 */
1167 dev->flags |= IFF_UP; 1162 dev->flags |= IFF_UP;
1168
1169 /*
1170 * Enable NET_DMA
1171 */
1172 net_dmaengine_get(); 1163 net_dmaengine_get();
1173
1174 /*
1175 * Initialize multicasting status
1176 */
1177 dev_set_rx_mode(dev); 1164 dev_set_rx_mode(dev);
1178
1179 /*
1180 * Wakeup transmit queue engine
1181 */
1182 dev_activate(dev); 1165 dev_activate(dev);
1183 } 1166 }
1184 1167
@@ -1201,22 +1184,13 @@ int dev_open(struct net_device *dev)
1201{ 1184{
1202 int ret; 1185 int ret;
1203 1186
1204 /*
1205 * Is it already up?
1206 */
1207 if (dev->flags & IFF_UP) 1187 if (dev->flags & IFF_UP)
1208 return 0; 1188 return 0;
1209 1189
1210 /*
1211 * Open device
1212 */
1213 ret = __dev_open(dev); 1190 ret = __dev_open(dev);
1214 if (ret < 0) 1191 if (ret < 0)
1215 return ret; 1192 return ret;
1216 1193
1217 /*
1218 * ... and announce new interface.
1219 */
1220 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); 1194 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1221 call_netdevice_notifiers(NETDEV_UP, dev); 1195 call_netdevice_notifiers(NETDEV_UP, dev);
1222 1196
@@ -1224,52 +1198,78 @@ int dev_open(struct net_device *dev)
1224} 1198}
1225EXPORT_SYMBOL(dev_open); 1199EXPORT_SYMBOL(dev_open);
1226 1200
1227static int __dev_close(struct net_device *dev) 1201static int __dev_close_many(struct list_head *head)
1228{ 1202{
1229 const struct net_device_ops *ops = dev->netdev_ops; 1203 struct net_device *dev;
1230 1204
1231 ASSERT_RTNL(); 1205 ASSERT_RTNL();
1232 might_sleep(); 1206 might_sleep();
1233 1207
1234 /* 1208 list_for_each_entry(dev, head, unreg_list) {
1235 * Tell people we are going down, so that they can 1209 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1236 * prepare to death, when device is still operating.
1237 */
1238 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1239 1210
1240 clear_bit(__LINK_STATE_START, &dev->state); 1211 clear_bit(__LINK_STATE_START, &dev->state);
1241 1212
1242 /* Synchronize to scheduled poll. We cannot touch poll list, 1213 /* Synchronize to scheduled poll. We cannot touch poll list, it
1243 * it can be even on different cpu. So just clear netif_running(). 1214 * can be even on different cpu. So just clear netif_running().
1244 * 1215 *
1245 * dev->stop() will invoke napi_disable() on all of it's 1216 * dev->stop() will invoke napi_disable() on all of it's
1246 * napi_struct instances on this device. 1217 * napi_struct instances on this device.
1247 */ 1218 */
1248 smp_mb__after_clear_bit(); /* Commit netif_running(). */ 1219 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1220 }
1249 1221
1250 dev_deactivate(dev); 1222 dev_deactivate_many(head);
1251 1223
1252 /* 1224 list_for_each_entry(dev, head, unreg_list) {
1253 * Call the device specific close. This cannot fail. 1225 const struct net_device_ops *ops = dev->netdev_ops;
1254 * Only if device is UP
1255 *
1256 * We allow it to be called even after a DETACH hot-plug
1257 * event.
1258 */
1259 if (ops->ndo_stop)
1260 ops->ndo_stop(dev);
1261 1226
1262 /* 1227 /*
1263 * Device is now down. 1228 * Call the device specific close. This cannot fail.
1264 */ 1229 * Only if device is UP
1230 *
1231 * We allow it to be called even after a DETACH hot-plug
1232 * event.
1233 */
1234 if (ops->ndo_stop)
1235 ops->ndo_stop(dev);
1265 1236
1266 dev->flags &= ~IFF_UP; 1237 dev->flags &= ~IFF_UP;
1238 net_dmaengine_put();
1239 }
1267 1240
1268 /* 1241 return 0;
1269 * Shutdown NET_DMA 1242}
1270 */ 1243
1271 net_dmaengine_put(); 1244static int __dev_close(struct net_device *dev)
1245{
1246 int retval;
1247 LIST_HEAD(single);
1272 1248
1249 list_add(&dev->unreg_list, &single);
1250 retval = __dev_close_many(&single);
1251 list_del(&single);
1252 return retval;
1253}
1254
1255static int dev_close_many(struct list_head *head)
1256{
1257 struct net_device *dev, *tmp;
1258 LIST_HEAD(tmp_list);
1259
1260 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1261 if (!(dev->flags & IFF_UP))
1262 list_move(&dev->unreg_list, &tmp_list);
1263
1264 __dev_close_many(head);
1265
1266 list_for_each_entry(dev, head, unreg_list) {
1267 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1268 call_netdevice_notifiers(NETDEV_DOWN, dev);
1269 }
1270
1271 /* rollback_registered_many needs the complete original list */
1272 list_splice(&tmp_list, head);
1273 return 0; 1273 return 0;
1274} 1274}
1275 1275
@@ -1284,17 +1284,13 @@ static int __dev_close(struct net_device *dev)
1284 */ 1284 */
1285int dev_close(struct net_device *dev) 1285int dev_close(struct net_device *dev)
1286{ 1286{
1287 if (!(dev->flags & IFF_UP)) 1287 if (dev->flags & IFF_UP) {
1288 return 0; 1288 LIST_HEAD(single);
1289
1290 __dev_close(dev);
1291
1292 /*
1293 * Tell people we are down
1294 */
1295 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1296 call_netdevice_notifiers(NETDEV_DOWN, dev);
1297 1289
1290 list_add(&dev->unreg_list, &single);
1291 dev_close_many(&single);
1292 list_del(&single);
1293 }
1298 return 0; 1294 return 0;
1299} 1295}
1300EXPORT_SYMBOL(dev_close); 1296EXPORT_SYMBOL(dev_close);
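For ordinary callers nothing changes: dev_close() still shuts down one device under RTNL and returns 0 whether or not it was up. The __dev_close_many()/dev_close_many() path exists so batched teardown (see the rollback_registered_many() comment above) can bring a whole list of devices down in one pass, with a single dev_deactivate_many() call instead of per-device work. Caller sketch, unchanged by this patch:

	rtnl_lock();
	dev_close(dev);		/* NETDEV_GOING_DOWN/NETDEV_DOWN fire only if it was up */
	rtnl_unlock();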
@@ -1310,26 +1306,32 @@ EXPORT_SYMBOL(dev_close);
1310 */ 1306 */
1311void dev_disable_lro(struct net_device *dev) 1307void dev_disable_lro(struct net_device *dev)
1312{ 1308{
1313 if (dev->ethtool_ops && dev->ethtool_ops->get_flags && 1309 u32 flags;
1314 dev->ethtool_ops->set_flags) { 1310
1315 u32 flags = dev->ethtool_ops->get_flags(dev); 1311 /*
1316 if (flags & ETH_FLAG_LRO) { 1312 * If we're trying to disable lro on a vlan device
1317 flags &= ~ETH_FLAG_LRO; 1313 * use the underlying physical device instead
1318 dev->ethtool_ops->set_flags(dev, flags); 1314 */
1319 } 1315 if (is_vlan_dev(dev))
1320 } 1316 dev = vlan_dev_real_dev(dev);
1321 WARN_ON(dev->features & NETIF_F_LRO); 1317
1318 if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1319 flags = dev->ethtool_ops->get_flags(dev);
1320 else
1321 flags = ethtool_op_get_flags(dev);
1322
1323 if (!(flags & ETH_FLAG_LRO))
1324 return;
1325
1326 __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1327 if (unlikely(dev->features & NETIF_F_LRO))
1328 netdev_WARN(dev, "failed to disable LRO!\n");
1322} 1329}
1323EXPORT_SYMBOL(dev_disable_lro); 1330EXPORT_SYMBOL(dev_disable_lro);
1324 1331
1325 1332
1326static int dev_boot_phase = 1; 1333static int dev_boot_phase = 1;
1327 1334
1328/*
1329 * Device change register/unregister. These are not inline or static
1330 * as we export them to the world.
1331 */
1332
1333/** 1335/**
1334 * register_netdevice_notifier - register a network notifier block 1336 * register_netdevice_notifier - register a network notifier block
1335 * @nb: notifier 1337 * @nb: notifier
@@ -1431,6 +1433,7 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1431 ASSERT_RTNL(); 1433 ASSERT_RTNL();
1432 return raw_notifier_call_chain(&netdev_chain, val, dev); 1434 return raw_notifier_call_chain(&netdev_chain, val, dev);
1433} 1435}
1436EXPORT_SYMBOL(call_netdevice_notifiers);
1434 1437
1435/* When > 0 there are consumers of rx skb time stamps */ 1438/* When > 0 there are consumers of rx skb time stamps */
1436static atomic_t netstamp_needed = ATOMIC_INIT(0); 1439static atomic_t netstamp_needed = ATOMIC_INIT(0);
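Exporting call_netdevice_notifiers() lets modules raise events on the same chain that register_netdevice_notifier() users listen on. A hedged sketch of the listening side (names illustrative; in this kernel the notifier payload is the struct net_device pointer itself):

static int my_netdev_event(struct notifier_block *nb,
			   unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	switch (event) {
	case NETDEV_UP:
	case NETDEV_DOWN:
		pr_info("%s: event %lu\n", dev->name, event);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block my_netdev_nb = {
	.notifier_call = my_netdev_event,
};

/* register_netdevice_notifier(&my_netdev_nb) replays NETDEV_REGISTER and
 * NETDEV_UP for already-present devices, then delivers future events. */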
@@ -1461,6 +1464,27 @@ static inline void net_timestamp_check(struct sk_buff *skb)
1461 __net_timestamp(skb); 1464 __net_timestamp(skb);
1462} 1465}
1463 1466
1467static inline bool is_skb_forwardable(struct net_device *dev,
1468 struct sk_buff *skb)
1469{
1470 unsigned int len;
1471
1472 if (!(dev->flags & IFF_UP))
1473 return false;
1474
1475 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1476 if (skb->len <= len)
1477 return true;
1478
1479 /* if TSO is enabled, we don't care about the length as the packet
1480 * could be forwarded without being segmented before
1481 */
1482 if (skb_is_gso(skb))
1483 return true;
1484
1485 return false;
1486}
1487
1464/** 1488/**
1465 * dev_forward_skb - loopback an skb to another netif 1489 * dev_forward_skb - loopback an skb to another netif
1466 * 1490 *
@@ -1484,8 +1508,8 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1484 skb_orphan(skb); 1508 skb_orphan(skb);
1485 nf_reset(skb); 1509 nf_reset(skb);
1486 1510
1487 if (!(dev->flags & IFF_UP) || 1511 if (unlikely(!is_skb_forwardable(dev, skb))) {
1488 (skb->len > (dev->mtu + dev->hard_header_len))) { 1512 atomic_long_inc(&dev->rx_dropped);
1489 kfree_skb(skb); 1513 kfree_skb(skb);
1490 return NET_RX_DROP; 1514 return NET_RX_DROP;
1491 } 1515 }
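Worked example of the new forwarding check: for a plain Ethernet device (mtu 1500, hard_header_len 14) the limit is 1500 + 14 + VLAN_HLEN = 1518 bytes, so a maximum-size single-tagged frame still passes; anything longer is only accepted when skb_is_gso() is true, since it will be segmented before reaching the wire. Oversize or down-interface drops now also increment the device's rx_dropped counter instead of vanishing silently.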
@@ -1497,6 +1521,14 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1497} 1521}
1498EXPORT_SYMBOL_GPL(dev_forward_skb); 1522EXPORT_SYMBOL_GPL(dev_forward_skb);
1499 1523
1524static inline int deliver_skb(struct sk_buff *skb,
1525 struct packet_type *pt_prev,
1526 struct net_device *orig_dev)
1527{
1528 atomic_inc(&skb->users);
1529 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1530}
1531
1500/* 1532/*
1501 * Support routine. Sends outgoing frames to any network 1533 * Support routine. Sends outgoing frames to any network
1502 * taps currently in use. 1534 * taps currently in use.
@@ -1505,13 +1537,8 @@ EXPORT_SYMBOL_GPL(dev_forward_skb);
1505static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) 1537static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1506{ 1538{
1507 struct packet_type *ptype; 1539 struct packet_type *ptype;
1508 1540 struct sk_buff *skb2 = NULL;
1509#ifdef CONFIG_NET_CLS_ACT 1541 struct packet_type *pt_prev = NULL;
1510 if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1511 net_timestamp_set(skb);
1512#else
1513 net_timestamp_set(skb);
1514#endif
1515 1542
1516 rcu_read_lock(); 1543 rcu_read_lock();
1517 list_for_each_entry_rcu(ptype, &ptype_all, list) { 1544 list_for_each_entry_rcu(ptype, &ptype_all, list) {
@@ -1521,10 +1548,18 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1521 if ((ptype->dev == dev || !ptype->dev) && 1548 if ((ptype->dev == dev || !ptype->dev) &&
1522 (ptype->af_packet_priv == NULL || 1549 (ptype->af_packet_priv == NULL ||
1523 (struct sock *)ptype->af_packet_priv != skb->sk)) { 1550 (struct sock *)ptype->af_packet_priv != skb->sk)) {
1524 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1551 if (pt_prev) {
1552 deliver_skb(skb2, pt_prev, skb->dev);
1553 pt_prev = ptype;
1554 continue;
1555 }
1556
1557 skb2 = skb_clone(skb, GFP_ATOMIC);
1525 if (!skb2) 1558 if (!skb2)
1526 break; 1559 break;
1527 1560
1561 net_timestamp_set(skb2);
1562
1528 /* skb->nh should be correctly 1563 /* skb->nh should be correctly
1529 set by sender, so that the second statement is 1564 set by sender, so that the second statement is
1530 just protection against buggy protocols. 1565 just protection against buggy protocols.
@@ -1543,31 +1578,121 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1543 1578
1544 skb2->transport_header = skb2->network_header; 1579 skb2->transport_header = skb2->network_header;
1545 skb2->pkt_type = PACKET_OUTGOING; 1580 skb2->pkt_type = PACKET_OUTGOING;
1546 ptype->func(skb2, skb->dev, ptype, skb->dev); 1581 pt_prev = ptype;
1547 } 1582 }
1548 } 1583 }
1584 if (pt_prev)
1585 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1549 rcu_read_unlock(); 1586 rcu_read_unlock();
1550} 1587}
1551 1588
1589/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1590 * @dev: Network device
1591 * @txq: number of queues available
1592 *
1593 * If real_num_tx_queues is changed the tc mappings may no longer be
1594 * valid. To resolve this verify the tc mapping remains valid and if
1595 * not NULL the mapping. With no priorities mapping to this
1596 * offset/count pair it will no longer be used. In the worst case TC0
1597 * is invalid nothing can be done so disable priority mappings. If is
1598 * expected that drivers will fix this mapping if they can before
1599 * calling netif_set_real_num_tx_queues.
1600 */
1601static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1602{
1603 int i;
1604 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1605
1606 /* If TC0 is invalidated disable TC mapping */
1607 if (tc->offset + tc->count > txq) {
1608 pr_warning("Number of in use tx queues changed "
1609 "invalidating tc mappings. Priority "
1610 "traffic classification disabled!\n");
1611 dev->num_tc = 0;
1612 return;
1613 }
1614
1615 /* Invalidated prio to tc mappings set to TC0 */
1616 for (i = 1; i < TC_BITMASK + 1; i++) {
1617 int q = netdev_get_prio_tc_map(dev, i);
1618
1619 tc = &dev->tc_to_txq[q];
1620 if (tc->offset + tc->count > txq) {
1621 pr_warning("Number of in use tx queues "
1622 "changed. Priority %i to tc "
1623 "mapping %i is no longer valid "
1624 "setting map to 0\n",
1625 i, q);
1626 netdev_set_prio_tc_map(dev, i, 0);
1627 }
1628 }
1629}
1630
1552/* 1631/*
1553 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues 1632 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1554 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. 1633 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1555 */ 1634 */
1556void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) 1635int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1557{ 1636{
1558 unsigned int real_num = dev->real_num_tx_queues; 1637 int rc;
1638
1639 if (txq < 1 || txq > dev->num_tx_queues)
1640 return -EINVAL;
1641
1642 if (dev->reg_state == NETREG_REGISTERED ||
1643 dev->reg_state == NETREG_UNREGISTERING) {
1644 ASSERT_RTNL();
1559 1645
1560 if (unlikely(txq > dev->num_tx_queues)) 1646 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1561 ; 1647 txq);
1562 else if (txq > real_num) 1648 if (rc)
1563 dev->real_num_tx_queues = txq; 1649 return rc;
1564 else if (txq < real_num) { 1650
1565 dev->real_num_tx_queues = txq; 1651 if (dev->num_tc)
1566 qdisc_reset_all_tx_gt(dev, txq); 1652 netif_setup_tc(dev, txq);
1653
1654 if (txq < dev->real_num_tx_queues)
1655 qdisc_reset_all_tx_gt(dev, txq);
1567 } 1656 }
1657
1658 dev->real_num_tx_queues = txq;
1659 return 0;
1568} 1660}
1569EXPORT_SYMBOL(netif_set_real_num_tx_queues); 1661EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1570 1662
1663#ifdef CONFIG_RPS
1664/**
1665 * netif_set_real_num_rx_queues - set actual number of RX queues used
1666 * @dev: Network device
1667 * @rxq: Actual number of RX queues
1668 *
1669 * This must be called either with the rtnl_lock held or before
1670 * registration of the net device. Returns 0 on success, or a
1671 * negative error code. If called before registration, it always
1672 * succeeds.
1673 */
1674int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1675{
1676 int rc;
1677
1678 if (rxq < 1 || rxq > dev->num_rx_queues)
1679 return -EINVAL;
1680
1681 if (dev->reg_state == NETREG_REGISTERED) {
1682 ASSERT_RTNL();
1683
1684 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1685 rxq);
1686 if (rc)
1687 return rc;
1688 }
1689
1690 dev->real_num_rx_queues = rxq;
1691 return 0;
1692}
1693EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1694#endif
1695
1571static inline void __netif_reschedule(struct Qdisc *q) 1696static inline void __netif_reschedule(struct Qdisc *q)
1572{ 1697{
1573 struct softnet_data *sd; 1698 struct softnet_data *sd;
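Driver-side view of the setters above — netif_set_real_num_tx_queues() can now fail, and an RX counterpart exists for RPS. A hedged probe-time sketch, assuming a driver that allocates the maximum and trims to what the hardware enabled (struct my_priv, hw_tx_queues/hw_rx_queues and the error label are illustrative):

#define MY_MAX_QUEUES 8

	dev = alloc_etherdev_mq(sizeof(struct my_priv), MY_MAX_QUEUES);
	if (!dev)
		return -ENOMEM;

	/* Before register_netdev() these only range-check and cannot
	 * allocate; after registration they must run under rtnl_lock()
	 * and may fail while updating the per-queue kobjects. */
	err = netif_set_real_num_tx_queues(dev, hw_tx_queues);
	if (!err)
		err = netif_set_real_num_rx_queues(dev, hw_rx_queues);
	if (err)
		goto err_free_netdev;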
@@ -1646,32 +1771,6 @@ void netif_device_attach(struct net_device *dev)
1646} 1771}
1647EXPORT_SYMBOL(netif_device_attach); 1772EXPORT_SYMBOL(netif_device_attach);
1648 1773
1649static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1650{
1651 return ((features & NETIF_F_GEN_CSUM) ||
1652 ((features & NETIF_F_IP_CSUM) &&
1653 protocol == htons(ETH_P_IP)) ||
1654 ((features & NETIF_F_IPV6_CSUM) &&
1655 protocol == htons(ETH_P_IPV6)) ||
1656 ((features & NETIF_F_FCOE_CRC) &&
1657 protocol == htons(ETH_P_FCOE)));
1658}
1659
1660static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1661{
1662 if (can_checksum_protocol(dev->features, skb->protocol))
1663 return true;
1664
1665 if (skb->protocol == htons(ETH_P_8021Q)) {
1666 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1667 if (can_checksum_protocol(dev->features & dev->vlan_features,
1668 veh->h_vlan_encapsulated_proto))
1669 return true;
1670 }
1671
1672 return false;
1673}
1674
1675/** 1774/**
1676 * skb_dev_set -- assign a new device to a buffer 1775 * skb_dev_set -- assign a new device to a buffer
1677 * @skb: buffer for the new device 1776 * @skb: buffer for the new device
@@ -1719,7 +1818,7 @@ int skb_checksum_help(struct sk_buff *skb)
1719 goto out_set_summed; 1818 goto out_set_summed;
1720 } 1819 }
1721 1820
1722 offset = skb->csum_start - skb_headroom(skb); 1821 offset = skb_checksum_start_offset(skb);
1723 BUG_ON(offset >= skb_headlen(skb)); 1822 BUG_ON(offset >= skb_headlen(skb));
1724 csum = skb_checksum(skb, offset, skb->len - offset, 0); 1823 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1725 1824
@@ -1751,13 +1850,25 @@ EXPORT_SYMBOL(skb_checksum_help);
1751 * It may return NULL if the skb requires no segmentation. This is 1850 * It may return NULL if the skb requires no segmentation. This is
1752 * only possible when GSO is used for verifying header integrity. 1851 * only possible when GSO is used for verifying header integrity.
1753 */ 1852 */
1754struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) 1853struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1755{ 1854{
1756 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); 1855 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1757 struct packet_type *ptype; 1856 struct packet_type *ptype;
1758 __be16 type = skb->protocol; 1857 __be16 type = skb->protocol;
1858 int vlan_depth = ETH_HLEN;
1759 int err; 1859 int err;
1760 1860
1861 while (type == htons(ETH_P_8021Q)) {
1862 struct vlan_hdr *vh;
1863
1864 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1865 return ERR_PTR(-EINVAL);
1866
1867 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1868 type = vh->h_vlan_encapsulated_proto;
1869 vlan_depth += VLAN_HLEN;
1870 }
1871
1761 skb_reset_mac_header(skb); 1872 skb_reset_mac_header(skb);
1762 skb->mac_len = skb->network_header - skb->mac_header; 1873 skb->mac_len = skb->network_header - skb->mac_header;
1763 __skb_pull(skb, skb->mac_len); 1874 __skb_pull(skb, skb->mac_len);
@@ -1769,8 +1880,7 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1769 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo) 1880 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1770 dev->ethtool_ops->get_drvinfo(dev, &info); 1881 dev->ethtool_ops->get_drvinfo(dev, &info);
1771 1882
1772 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d " 1883 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1773 "ip_summed=%d",
1774 info.driver, dev ? dev->features : 0L, 1884 info.driver, dev ? dev->features : 0L,
1775 skb->sk ? skb->sk->sk_route_caps : 0L, 1885 skb->sk ? skb->sk->sk_route_caps : 0L,
1776 skb->len, skb->data_len, skb->ip_summed); 1886 skb->len, skb->data_len, skb->ip_summed);
@@ -1873,16 +1983,14 @@ static void dev_gso_skb_destructor(struct sk_buff *skb)
1873/** 1983/**
1874 * dev_gso_segment - Perform emulated hardware segmentation on skb. 1984 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1875 * @skb: buffer to segment 1985 * @skb: buffer to segment
1986 * @features: device features as applicable to this skb
1876 * 1987 *
1877 * This function segments the given skb and stores the list of segments 1988 * This function segments the given skb and stores the list of segments
1878 * in skb->next. 1989 * in skb->next.
1879 */ 1990 */
1880static int dev_gso_segment(struct sk_buff *skb) 1991static int dev_gso_segment(struct sk_buff *skb, int features)
1881{ 1992{
1882 struct net_device *dev = skb->dev;
1883 struct sk_buff *segs; 1993 struct sk_buff *segs;
1884 int features = dev->features & ~(illegal_highdma(dev, skb) ?
1885 NETIF_F_SG : 0);
1886 1994
1887 segs = skb_gso_segment(skb, features); 1995 segs = skb_gso_segment(skb, features);
1888 1996
@@ -1902,14 +2010,14 @@ static int dev_gso_segment(struct sk_buff *skb)
1902 2010
1903/* 2011/*
1904 * Try to orphan skb early, right before transmission by the device. 2012 * Try to orphan skb early, right before transmission by the device.
1905 * We cannot orphan skb if tx timestamp is requested, since 2013 * We cannot orphan skb if tx timestamp is requested or the sk-reference
1906 * drivers need to call skb_tstamp_tx() to send the timestamp. 2014 * is needed on driver level for other reasons, e.g. see net/can/raw.c
1907 */ 2015 */
1908static inline void skb_orphan_try(struct sk_buff *skb) 2016static inline void skb_orphan_try(struct sk_buff *skb)
1909{ 2017{
1910 struct sock *sk = skb->sk; 2018 struct sock *sk = skb->sk;
1911 2019
1912 if (sk && !skb_tx(skb)->flags) { 2020 if (sk && !skb_shinfo(skb)->tx_flags) {
1913 /* skb_tx_hash() wont be able to get sk. 2021 /* skb_tx_hash() wont be able to get sk.
1914 * We copy sk_hash into skb->rxhash 2022 * We copy sk_hash into skb->rxhash
1915 */ 2023 */
@@ -1919,6 +2027,53 @@ static inline void skb_orphan_try(struct sk_buff *skb)
1919 } 2027 }
1920} 2028}
1921 2029
2030static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2031{
2032 return ((features & NETIF_F_GEN_CSUM) ||
2033 ((features & NETIF_F_V4_CSUM) &&
2034 protocol == htons(ETH_P_IP)) ||
2035 ((features & NETIF_F_V6_CSUM) &&
2036 protocol == htons(ETH_P_IPV6)) ||
2037 ((features & NETIF_F_FCOE_CRC) &&
2038 protocol == htons(ETH_P_FCOE)));
2039}
2040
2041static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2042{
2043 if (!can_checksum_protocol(features, protocol)) {
2044 features &= ~NETIF_F_ALL_CSUM;
2045 features &= ~NETIF_F_SG;
2046 } else if (illegal_highdma(skb->dev, skb)) {
2047 features &= ~NETIF_F_SG;
2048 }
2049
2050 return features;
2051}
2052
2053u32 netif_skb_features(struct sk_buff *skb)
2054{
2055 __be16 protocol = skb->protocol;
2056 u32 features = skb->dev->features;
2057
2058 if (protocol == htons(ETH_P_8021Q)) {
2059 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2060 protocol = veh->h_vlan_encapsulated_proto;
2061 } else if (!vlan_tx_tag_present(skb)) {
2062 return harmonize_features(skb, protocol, features);
2063 }
2064
2065 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2066
2067 if (protocol != htons(ETH_P_8021Q)) {
2068 return harmonize_features(skb, protocol, features);
2069 } else {
2070 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2071 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2072 return harmonize_features(skb, protocol, features);
2073 }
2074}
2075EXPORT_SYMBOL(netif_skb_features);
2076
1922/* 2077/*
1923 * Returns true if either: 2078 * Returns true if either:
1924 * 1. skb has frag_list and the device doesn't support FRAGLIST, or 2079 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
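A practical note on netif_skb_features(): when the skb carries a VLAN tag and the device lacks NETIF_F_HW_VLAN_TX (or the tag sits in the packet itself), the usable set is capped by dev->vlan_features, so offloads only survive VLAN traffic if the driver mirrors them there. Common idiom, with an illustrative feature set:

	dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
	dev->vlan_features = dev->features;	/* keep offloads for tagged frames */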
@@ -1927,12 +2082,13 @@ static inline void skb_orphan_try(struct sk_buff *skb)
1927 * support DMA from it. 2082 * support DMA from it.
1928 */ 2083 */
1929static inline int skb_needs_linearize(struct sk_buff *skb, 2084static inline int skb_needs_linearize(struct sk_buff *skb,
1930 struct net_device *dev) 2085 int features)
1931{ 2086{
1932 return skb_is_nonlinear(skb) && 2087 return skb_is_nonlinear(skb) &&
1933 ((skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) || 2088 ((skb_has_frag_list(skb) &&
1934 (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) || 2089 !(features & NETIF_F_FRAGLIST)) ||
1935 illegal_highdma(dev, skb)))); 2090 (skb_shinfo(skb)->nr_frags &&
2091 !(features & NETIF_F_SG)));
1936} 2092}
1937 2093
1938int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, 2094int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
@@ -1940,27 +2096,41 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1940{ 2096{
1941 const struct net_device_ops *ops = dev->netdev_ops; 2097 const struct net_device_ops *ops = dev->netdev_ops;
1942 int rc = NETDEV_TX_OK; 2098 int rc = NETDEV_TX_OK;
2099 unsigned int skb_len;
1943 2100
1944 if (likely(!skb->next)) { 2101 if (likely(!skb->next)) {
1945 if (!list_empty(&ptype_all)) 2102 u32 features;
1946 dev_queue_xmit_nit(skb, dev);
1947 2103
1948 /* 2104 /*
1949 * If device doesnt need skb->dst, release it right now while 2105 * If device doesn't need skb->dst, release it right now while
1950 * its hot in this cpu cache 2106 * its hot in this cpu cache
1951 */ 2107 */
1952 if (dev->priv_flags & IFF_XMIT_DST_RELEASE) 2108 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1953 skb_dst_drop(skb); 2109 skb_dst_drop(skb);
1954 2110
2111 if (!list_empty(&ptype_all))
2112 dev_queue_xmit_nit(skb, dev);
2113
1955 skb_orphan_try(skb); 2114 skb_orphan_try(skb);
1956 2115
1957 if (netif_needs_gso(dev, skb)) { 2116 features = netif_skb_features(skb);
1958 if (unlikely(dev_gso_segment(skb))) 2117
2118 if (vlan_tx_tag_present(skb) &&
2119 !(features & NETIF_F_HW_VLAN_TX)) {
2120 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2121 if (unlikely(!skb))
2122 goto out;
2123
2124 skb->vlan_tci = 0;
2125 }
2126
2127 if (netif_needs_gso(skb, features)) {
2128 if (unlikely(dev_gso_segment(skb, features)))
1959 goto out_kfree_skb; 2129 goto out_kfree_skb;
1960 if (skb->next) 2130 if (skb->next)
1961 goto gso; 2131 goto gso;
1962 } else { 2132 } else {
1963 if (skb_needs_linearize(skb, dev) && 2133 if (skb_needs_linearize(skb, features) &&
1964 __skb_linearize(skb)) 2134 __skb_linearize(skb))
1965 goto out_kfree_skb; 2135 goto out_kfree_skb;
1966 2136
@@ -1969,15 +2139,17 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1969 * checksumming here. 2139 * checksumming here.
1970 */ 2140 */
1971 if (skb->ip_summed == CHECKSUM_PARTIAL) { 2141 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1972 skb_set_transport_header(skb, skb->csum_start - 2142 skb_set_transport_header(skb,
1973 skb_headroom(skb)); 2143 skb_checksum_start_offset(skb));
1974 if (!dev_can_checksum(dev, skb) && 2144 if (!(features & NETIF_F_ALL_CSUM) &&
1975 skb_checksum_help(skb)) 2145 skb_checksum_help(skb))
1976 goto out_kfree_skb; 2146 goto out_kfree_skb;
1977 } 2147 }
1978 } 2148 }
1979 2149
2150 skb_len = skb->len;
1980 rc = ops->ndo_start_xmit(skb, dev); 2151 rc = ops->ndo_start_xmit(skb, dev);
2152 trace_net_dev_xmit(skb, rc, dev, skb_len);
1981 if (rc == NETDEV_TX_OK) 2153 if (rc == NETDEV_TX_OK)
1982 txq_trans_update(txq); 2154 txq_trans_update(txq);
1983 return rc; 2155 return rc;
@@ -1991,13 +2163,15 @@ gso:
1991 nskb->next = NULL; 2163 nskb->next = NULL;
1992 2164
1993 /* 2165 /*
1994 * If device doesnt need nskb->dst, release it right now while 2166 * If device doesn't need nskb->dst, release it right now while
1995 * its hot in this cpu cache 2167 * its hot in this cpu cache
1996 */ 2168 */
1997 if (dev->priv_flags & IFF_XMIT_DST_RELEASE) 2169 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1998 skb_dst_drop(nskb); 2170 skb_dst_drop(nskb);
1999 2171
2172 skb_len = nskb->len;
2000 rc = ops->ndo_start_xmit(nskb, dev); 2173 rc = ops->ndo_start_xmit(nskb, dev);
2174 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2001 if (unlikely(rc != NETDEV_TX_OK)) { 2175 if (unlikely(rc != NETDEV_TX_OK)) {
2002 if (rc & ~NETDEV_TX_MASK) 2176 if (rc & ~NETDEV_TX_MASK)
2003 goto out_kfree_gso_skb; 2177 goto out_kfree_gso_skb;
@@ -2015,31 +2189,45 @@ out_kfree_gso_skb:
2015 skb->destructor = DEV_GSO_CB(skb)->destructor; 2189 skb->destructor = DEV_GSO_CB(skb)->destructor;
2016out_kfree_skb: 2190out_kfree_skb:
2017 kfree_skb(skb); 2191 kfree_skb(skb);
2192out:
2018 return rc; 2193 return rc;
2019} 2194}
2020 2195
2021static u32 hashrnd __read_mostly; 2196static u32 hashrnd __read_mostly;
2022 2197
2023u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) 2198/*
2199 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2200 * to be used as a distribution range.
2201 */
2202u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2203 unsigned int num_tx_queues)
2024{ 2204{
2025 u32 hash; 2205 u32 hash;
2206 u16 qoffset = 0;
2207 u16 qcount = num_tx_queues;
2026 2208
2027 if (skb_rx_queue_recorded(skb)) { 2209 if (skb_rx_queue_recorded(skb)) {
2028 hash = skb_get_rx_queue(skb); 2210 hash = skb_get_rx_queue(skb);
2029 while (unlikely(hash >= dev->real_num_tx_queues)) 2211 while (unlikely(hash >= num_tx_queues))
2030 hash -= dev->real_num_tx_queues; 2212 hash -= num_tx_queues;
2031 return hash; 2213 return hash;
2032 } 2214 }
2033 2215
2216 if (dev->num_tc) {
2217 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2218 qoffset = dev->tc_to_txq[tc].offset;
2219 qcount = dev->tc_to_txq[tc].count;
2220 }
2221
2034 if (skb->sk && skb->sk->sk_hash) 2222 if (skb->sk && skb->sk->sk_hash)
2035 hash = skb->sk->sk_hash; 2223 hash = skb->sk->sk_hash;
2036 else 2224 else
2037 hash = (__force u16) skb->protocol ^ skb->rxhash; 2225 hash = (__force u16) skb->protocol ^ skb->rxhash;
2038 hash = jhash_1word(hash, hashrnd); 2226 hash = jhash_1word(hash, hashrnd);
2039 2227
2040 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); 2228 return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2041} 2229}
2042EXPORT_SYMBOL(skb_tx_hash); 2230EXPORT_SYMBOL(__skb_tx_hash);
2043 2231
2044static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) 2232static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2045{ 2233{
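The scaling in __skb_tx_hash() is a fixed-point multiply: ((u64)hash * qcount) >> 32 maps the 32-bit hash uniformly onto [0, qcount). For example, with qcount = 8 a hash of 0xC0000000 (three quarters of the 32-bit range) gives relative queue 6, and qoffset then shifts that into the queue block belonging to the skb's traffic class.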
@@ -2054,26 +2242,70 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2054 return queue_index; 2242 return queue_index;
2055} 2243}
2056 2244
2245static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2246{
2247#ifdef CONFIG_XPS
2248 struct xps_dev_maps *dev_maps;
2249 struct xps_map *map;
2250 int queue_index = -1;
2251
2252 rcu_read_lock();
2253 dev_maps = rcu_dereference(dev->xps_maps);
2254 if (dev_maps) {
2255 map = rcu_dereference(
2256 dev_maps->cpu_map[raw_smp_processor_id()]);
2257 if (map) {
2258 if (map->len == 1)
2259 queue_index = map->queues[0];
2260 else {
2261 u32 hash;
2262 if (skb->sk && skb->sk->sk_hash)
2263 hash = skb->sk->sk_hash;
2264 else
2265 hash = (__force u16) skb->protocol ^
2266 skb->rxhash;
2267 hash = jhash_1word(hash, hashrnd);
2268 queue_index = map->queues[
2269 ((u64)hash * map->len) >> 32];
2270 }
2271 if (unlikely(queue_index >= dev->real_num_tx_queues))
2272 queue_index = -1;
2273 }
2274 }
2275 rcu_read_unlock();
2276
2277 return queue_index;
2278#else
2279 return -1;
2280#endif
2281}
2282
2057static struct netdev_queue *dev_pick_tx(struct net_device *dev, 2283static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2058 struct sk_buff *skb) 2284 struct sk_buff *skb)
2059{ 2285{
2060 int queue_index; 2286 int queue_index;
2061 const struct net_device_ops *ops = dev->netdev_ops; 2287 const struct net_device_ops *ops = dev->netdev_ops;
2062 2288
2063 if (ops->ndo_select_queue) { 2289 if (dev->real_num_tx_queues == 1)
2290 queue_index = 0;
2291 else if (ops->ndo_select_queue) {
2064 queue_index = ops->ndo_select_queue(dev, skb); 2292 queue_index = ops->ndo_select_queue(dev, skb);
2065 queue_index = dev_cap_txqueue(dev, queue_index); 2293 queue_index = dev_cap_txqueue(dev, queue_index);
2066 } else { 2294 } else {
2067 struct sock *sk = skb->sk; 2295 struct sock *sk = skb->sk;
2068 queue_index = sk_tx_queue_get(sk); 2296 queue_index = sk_tx_queue_get(sk);
2069 if (queue_index < 0) {
2070 2297
2071 queue_index = 0; 2298 if (queue_index < 0 || skb->ooo_okay ||
2072 if (dev->real_num_tx_queues > 1) 2299 queue_index >= dev->real_num_tx_queues) {
2300 int old_index = queue_index;
2301
2302 queue_index = get_xps_queue(dev, skb);
2303 if (queue_index < 0)
2073 queue_index = skb_tx_hash(dev, skb); 2304 queue_index = skb_tx_hash(dev, skb);
2074 2305
2075 if (sk) { 2306 if (queue_index != old_index && sk) {
2076 struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1); 2307 struct dst_entry *dst =
2308 rcu_dereference_check(sk->sk_dst_cache, 1);
2077 2309
2078 if (dst && skb_dst(skb) == dst) 2310 if (dst && skb_dst(skb) == dst)
2079 sk_tx_queue_set(sk, queue_index); 2311 sk_tx_queue_set(sk, queue_index);
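The XPS lookup above is driven entirely by per-queue CPU masks set from user space: assuming the standard sysfs layout, writing a CPU bitmap to /sys/class/net/<dev>/queues/tx-<n>/xps_cpus populates dev->xps_maps so that senders running on those CPUs prefer queue <n>; when no map entry matches, dev_pick_tx() falls back to skb_tx_hash() as before.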
@@ -2090,15 +2322,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2090 struct netdev_queue *txq) 2322 struct netdev_queue *txq)
2091{ 2323{
2092 spinlock_t *root_lock = qdisc_lock(q); 2324 spinlock_t *root_lock = qdisc_lock(q);
2093 bool contended = qdisc_is_running(q); 2325 bool contended;
2094 int rc; 2326 int rc;
2095 2327
2328 qdisc_skb_cb(skb)->pkt_len = skb->len;
2329 qdisc_calculate_pkt_len(skb, q);
2096 /* 2330 /*
2097 * Heuristic to force contended enqueues to serialize on a 2331 * Heuristic to force contended enqueues to serialize on a
2098 * separate lock before trying to get qdisc main lock. 2332 * separate lock before trying to get qdisc main lock.
2099 * This permits __QDISC_STATE_RUNNING owner to get the lock more often 2333 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2100 * and dequeue packets faster. 2334 * and dequeue packets faster.
2101 */ 2335 */
2336 contended = qdisc_is_running(q);
2102 if (unlikely(contended)) 2337 if (unlikely(contended))
2103 spin_lock(&q->busylock); 2338 spin_lock(&q->busylock);
2104 2339
@@ -2115,7 +2350,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2115 */ 2350 */
2116 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) 2351 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2117 skb_dst_force(skb); 2352 skb_dst_force(skb);
2118 __qdisc_update_bstats(q, skb->len); 2353
2354 qdisc_bstats_update(q, skb);
2355
2119 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { 2356 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2120 if (unlikely(contended)) { 2357 if (unlikely(contended)) {
2121 spin_unlock(&q->busylock); 2358 spin_unlock(&q->busylock);
@@ -2128,7 +2365,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2128 rc = NET_XMIT_SUCCESS; 2365 rc = NET_XMIT_SUCCESS;
2129 } else { 2366 } else {
2130 skb_dst_force(skb); 2367 skb_dst_force(skb);
2131 rc = qdisc_enqueue_root(skb, q); 2368 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2132 if (qdisc_run_begin(q)) { 2369 if (qdisc_run_begin(q)) {
2133 if (unlikely(contended)) { 2370 if (unlikely(contended)) {
2134 spin_unlock(&q->busylock); 2371 spin_unlock(&q->busylock);
@@ -2143,6 +2380,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2143 return rc; 2380 return rc;
2144} 2381}
2145 2382
2383static DEFINE_PER_CPU(int, xmit_recursion);
2384#define RECURSION_LIMIT 10
2385
2146/** 2386/**
2147 * dev_queue_xmit - transmit a buffer 2387 * dev_queue_xmit - transmit a buffer
2148 * @skb: buffer to transmit 2388 * @skb: buffer to transmit
@@ -2186,6 +2426,7 @@ int dev_queue_xmit(struct sk_buff *skb)
2186#ifdef CONFIG_NET_CLS_ACT 2426#ifdef CONFIG_NET_CLS_ACT
2187 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); 2427 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2188#endif 2428#endif
2429 trace_net_dev_queue(skb);
2189 if (q->enqueue) { 2430 if (q->enqueue) {
2190 rc = __dev_xmit_skb(skb, q, dev, txq); 2431 rc = __dev_xmit_skb(skb, q, dev, txq);
2191 goto out; 2432 goto out;
@@ -2208,10 +2449,15 @@ int dev_queue_xmit(struct sk_buff *skb)
2208 2449
2209 if (txq->xmit_lock_owner != cpu) { 2450 if (txq->xmit_lock_owner != cpu) {
2210 2451
2452 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2453 goto recursion_alert;
2454
2211 HARD_TX_LOCK(dev, txq, cpu); 2455 HARD_TX_LOCK(dev, txq, cpu);
2212 2456
2213 if (!netif_tx_queue_stopped(txq)) { 2457 if (!netif_tx_queue_stopped(txq)) {
2458 __this_cpu_inc(xmit_recursion);
2214 rc = dev_hard_start_xmit(skb, dev, txq); 2459 rc = dev_hard_start_xmit(skb, dev, txq);
2460 __this_cpu_dec(xmit_recursion);
2215 if (dev_xmit_complete(rc)) { 2461 if (dev_xmit_complete(rc)) {
2216 HARD_TX_UNLOCK(dev, txq); 2462 HARD_TX_UNLOCK(dev, txq);
2217 goto out; 2463 goto out;
@@ -2223,7 +2469,9 @@ int dev_queue_xmit(struct sk_buff *skb)
2223 "queue packet!\n", dev->name); 2469 "queue packet!\n", dev->name);
2224 } else { 2470 } else {
2225 /* Recursion is detected! It is possible, 2471 /* Recursion is detected! It is possible,
2226 * unfortunately */ 2472 * unfortunately
2473 */
2474recursion_alert:
2227 if (net_ratelimit()) 2475 if (net_ratelimit())
2228 printk(KERN_CRIT "Dead loop on virtual device " 2476 printk(KERN_CRIT "Dead loop on virtual device "
2229 "%s, fix it urgently!\n", dev->name); 2477 "%s, fix it urgently!\n", dev->name);
@@ -2259,69 +2507,44 @@ static inline void ____napi_schedule(struct softnet_data *sd,
2259 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 2507 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2260} 2508}
2261 2509
2262#ifdef CONFIG_RPS
2263
2264/* One global table that all flow-based protocols share. */
2265struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
2266EXPORT_SYMBOL(rps_sock_flow_table);
2267
2268/* 2510/*
2269 * get_rps_cpu is called from netif_receive_skb and returns the target 2511 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2270 * CPU from the RPS map of the receiving queue for a given skb. 2512 * and src/dst port numbers. Returns a non-zero hash number on success
2271 * rcu_read_lock must be held on entry. 2513 * and 0 on failure.
2272 */ 2514 */
2273static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, 2515__u32 __skb_get_rxhash(struct sk_buff *skb)
2274 struct rps_dev_flow **rflowp)
2275{ 2516{
2276 struct ipv6hdr *ip6; 2517 int nhoff, hash = 0, poff;
2277 struct iphdr *ip; 2518 const struct ipv6hdr *ip6;
2278 struct netdev_rx_queue *rxqueue; 2519 const struct iphdr *ip;
2279 struct rps_map *map;
2280 struct rps_dev_flow_table *flow_table;
2281 struct rps_sock_flow_table *sock_flow_table;
2282 int cpu = -1;
2283 u8 ip_proto; 2520 u8 ip_proto;
2284 u16 tcpu;
2285 u32 addr1, addr2, ihl; 2521 u32 addr1, addr2, ihl;
2286 union { 2522 union {
2287 u32 v32; 2523 u32 v32;
2288 u16 v16[2]; 2524 u16 v16[2];
2289 } ports; 2525 } ports;
2290 2526
2291 if (skb_rx_queue_recorded(skb)) { 2527 nhoff = skb_network_offset(skb);
2292 u16 index = skb_get_rx_queue(skb);
2293 if (unlikely(index >= dev->num_rx_queues)) {
2294 WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
2295 "on queue %u, but number of RX queues is %u\n",
2296 dev->name, index, dev->num_rx_queues);
2297 goto done;
2298 }
2299 rxqueue = dev->_rx + index;
2300 } else
2301 rxqueue = dev->_rx;
2302
2303 if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
2304 goto done;
2305
2306 if (skb->rxhash)
2307 goto got_hash; /* Skip hash computation on packet header */
2308 2528
2309 switch (skb->protocol) { 2529 switch (skb->protocol) {
2310 case __constant_htons(ETH_P_IP): 2530 case __constant_htons(ETH_P_IP):
2311 if (!pskb_may_pull(skb, sizeof(*ip))) 2531 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2312 goto done; 2532 goto done;
2313 2533
2314 ip = (struct iphdr *) skb->data; 2534 ip = (const struct iphdr *) (skb->data + nhoff);
2315 ip_proto = ip->protocol; 2535 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2536 ip_proto = 0;
2537 else
2538 ip_proto = ip->protocol;
2316 addr1 = (__force u32) ip->saddr; 2539 addr1 = (__force u32) ip->saddr;
2317 addr2 = (__force u32) ip->daddr; 2540 addr2 = (__force u32) ip->daddr;
2318 ihl = ip->ihl; 2541 ihl = ip->ihl;
2319 break; 2542 break;
2320 case __constant_htons(ETH_P_IPV6): 2543 case __constant_htons(ETH_P_IPV6):
2321 if (!pskb_may_pull(skb, sizeof(*ip6))) 2544 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2322 goto done; 2545 goto done;
2323 2546
2324 ip6 = (struct ipv6hdr *) skb->data; 2547 ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2325 ip_proto = ip6->nexthdr; 2548 ip_proto = ip6->nexthdr;
2326 addr1 = (__force u32) ip6->saddr.s6_addr32[3]; 2549 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2327 addr2 = (__force u32) ip6->daddr.s6_addr32[3]; 2550 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
@@ -2330,33 +2553,130 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2330 default: 2553 default:
2331 goto done; 2554 goto done;
2332 } 2555 }
2333 switch (ip_proto) { 2556
2334 case IPPROTO_TCP: 2557 ports.v32 = 0;
2335 case IPPROTO_UDP: 2558 poff = proto_ports_offset(ip_proto);
2336 case IPPROTO_DCCP: 2559 if (poff >= 0) {
2337 case IPPROTO_ESP: 2560 nhoff += ihl * 4 + poff;
2338 case IPPROTO_AH: 2561 if (pskb_may_pull(skb, nhoff + 4)) {
2339 case IPPROTO_SCTP: 2562 ports.v32 = * (__force u32 *) (skb->data + nhoff);
2340 case IPPROTO_UDPLITE:
2341 if (pskb_may_pull(skb, (ihl * 4) + 4)) {
2342 ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
2343 if (ports.v16[1] < ports.v16[0]) 2563 if (ports.v16[1] < ports.v16[0])
2344 swap(ports.v16[0], ports.v16[1]); 2564 swap(ports.v16[0], ports.v16[1]);
2345 break;
2346 } 2565 }
2347 default:
2348 ports.v32 = 0;
2349 break;
2350 } 2566 }
2351 2567
2352 /* get a consistent hash (same value on both flow directions) */ 2568 /* get a consistent hash (same value on both flow directions) */
2353 if (addr2 < addr1) 2569 if (addr2 < addr1)
2354 swap(addr1, addr2); 2570 swap(addr1, addr2);
2355 skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2356 if (!skb->rxhash)
2357 skb->rxhash = 1;
2358 2571
2359got_hash: 2572 hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2573 if (!hash)
2574 hash = 1;
2575
2576done:
2577 return hash;
2578}
2579EXPORT_SYMBOL(__skb_get_rxhash);
2580
2581#ifdef CONFIG_RPS
2582
2583/* One global table that all flow-based protocols share. */
2584struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2585EXPORT_SYMBOL(rps_sock_flow_table);
2586
2587static struct rps_dev_flow *
2588set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2589 struct rps_dev_flow *rflow, u16 next_cpu)
2590{
2591 u16 tcpu;
2592
2593 tcpu = rflow->cpu = next_cpu;
2594 if (tcpu != RPS_NO_CPU) {
2595#ifdef CONFIG_RFS_ACCEL
2596 struct netdev_rx_queue *rxqueue;
2597 struct rps_dev_flow_table *flow_table;
2598 struct rps_dev_flow *old_rflow;
2599 u32 flow_id;
2600 u16 rxq_index;
2601 int rc;
2602
2603 /* Should we steer this flow to a different hardware queue? */
2604 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2605 !(dev->features & NETIF_F_NTUPLE))
2606 goto out;
2607 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2608 if (rxq_index == skb_get_rx_queue(skb))
2609 goto out;
2610
2611 rxqueue = dev->_rx + rxq_index;
2612 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2613 if (!flow_table)
2614 goto out;
2615 flow_id = skb->rxhash & flow_table->mask;
2616 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2617 rxq_index, flow_id);
2618 if (rc < 0)
2619 goto out;
2620 old_rflow = rflow;
2621 rflow = &flow_table->flows[flow_id];
2622 rflow->cpu = next_cpu;
2623 rflow->filter = rc;
2624 if (old_rflow->filter == rflow->filter)
2625 old_rflow->filter = RPS_NO_FILTER;
2626 out:
2627#endif
2628 rflow->last_qtail =
2629 per_cpu(softnet_data, tcpu).input_queue_head;
2630 }
2631
2632 return rflow;
2633}
2634
2635/*
2636 * get_rps_cpu is called from netif_receive_skb and returns the target
2637 * CPU from the RPS map of the receiving queue for a given skb.
2638 * rcu_read_lock must be held on entry.
2639 */
2640static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2641 struct rps_dev_flow **rflowp)
2642{
2643 struct netdev_rx_queue *rxqueue;
2644 struct rps_map *map;
2645 struct rps_dev_flow_table *flow_table;
2646 struct rps_sock_flow_table *sock_flow_table;
2647 int cpu = -1;
2648 u16 tcpu;
2649
2650 if (skb_rx_queue_recorded(skb)) {
2651 u16 index = skb_get_rx_queue(skb);
2652 if (unlikely(index >= dev->real_num_rx_queues)) {
2653 WARN_ONCE(dev->real_num_rx_queues > 1,
2654 "%s received packet on queue %u, but number "
2655 "of RX queues is %u\n",
2656 dev->name, index, dev->real_num_rx_queues);
2657 goto done;
2658 }
2659 rxqueue = dev->_rx + index;
2660 } else
2661 rxqueue = dev->_rx;
2662
2663 map = rcu_dereference(rxqueue->rps_map);
2664 if (map) {
2665 if (map->len == 1 &&
2666 !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2667 tcpu = map->cpus[0];
2668 if (cpu_online(tcpu))
2669 cpu = tcpu;
2670 goto done;
2671 }
2672 } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2673 goto done;
2674 }
2675
2676 skb_reset_network_header(skb);
2677 if (!skb_get_rxhash(skb))
2678 goto done;
2679
2360 flow_table = rcu_dereference(rxqueue->rps_flow_table); 2680 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2361 sock_flow_table = rcu_dereference(rps_sock_flow_table); 2681 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2362 if (flow_table && sock_flow_table) { 2682 if (flow_table && sock_flow_table) {
@@ -2383,12 +2703,9 @@ got_hash:
2383 if (unlikely(tcpu != next_cpu) && 2703 if (unlikely(tcpu != next_cpu) &&
2384 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || 2704 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2385 ((int)(per_cpu(softnet_data, tcpu).input_queue_head - 2705 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2386 rflow->last_qtail)) >= 0)) { 2706 rflow->last_qtail)) >= 0))
2387 tcpu = rflow->cpu = next_cpu; 2707 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2388 if (tcpu != RPS_NO_CPU) 2708
2389 rflow->last_qtail = per_cpu(softnet_data,
2390 tcpu).input_queue_head;
2391 }
2392 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { 2709 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2393 *rflowp = rflow; 2710 *rflowp = rflow;
2394 cpu = tcpu; 2711 cpu = tcpu;
@@ -2396,7 +2713,6 @@ got_hash:
2396 } 2713 }
2397 } 2714 }
2398 2715
2399 map = rcu_dereference(rxqueue->rps_map);
2400 if (map) { 2716 if (map) {
2401 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; 2717 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2402 2718
@@ -2410,6 +2726,46 @@ done:
2410 return cpu; 2726 return cpu;
2411} 2727}
2412 2728
2729#ifdef CONFIG_RFS_ACCEL
2730
2731/**
2732 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2733 * @dev: Device on which the filter was set
2734 * @rxq_index: RX queue index
2735 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2736 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2737 *
2738 * Drivers that implement ndo_rx_flow_steer() should periodically call
2739 * this function for each installed filter and remove the filters for
2740 * which it returns %true.
2741 */
2742bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2743 u32 flow_id, u16 filter_id)
2744{
2745 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2746 struct rps_dev_flow_table *flow_table;
2747 struct rps_dev_flow *rflow;
2748 bool expire = true;
2749 int cpu;
2750
2751 rcu_read_lock();
2752 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2753 if (flow_table && flow_id <= flow_table->mask) {
2754 rflow = &flow_table->flows[flow_id];
2755 cpu = ACCESS_ONCE(rflow->cpu);
2756 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2757 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2758 rflow->last_qtail) <
2759 (int)(10 * flow_table->mask)))
2760 expire = false;
2761 }
2762 rcu_read_unlock();
2763 return expire;
2764}
2765EXPORT_SYMBOL(rps_may_expire_flow);
2766
2767#endif /* CONFIG_RFS_ACCEL */
2768
2413/* Called from hardirq (IPI) context */ 2769/* Called from hardirq (IPI) context */
2414static void rps_trigger_softirq(void *data) 2770static void rps_trigger_softirq(void *data)
2415{ 2771{
@@ -2482,6 +2838,7 @@ enqueue:
2482 2838
2483 local_irq_restore(flags); 2839 local_irq_restore(flags);
2484 2840
2841 atomic_long_inc(&skb->dev->rx_dropped);
2485 kfree_skb(skb); 2842 kfree_skb(skb);
2486 return NET_RX_DROP; 2843 return NET_RX_DROP;
2487} 2844}
@@ -2512,6 +2869,7 @@ int netif_rx(struct sk_buff *skb)
2512 if (netdev_tstamp_prequeue) 2869 if (netdev_tstamp_prequeue)
2513 net_timestamp_check(skb); 2870 net_timestamp_check(skb);
2514 2871
2872 trace_netif_rx(skb);
2515#ifdef CONFIG_RPS 2873#ifdef CONFIG_RPS
2516 { 2874 {
2517 struct rps_dev_flow voidflow, *rflow = &voidflow; 2875 struct rps_dev_flow voidflow, *rflow = &voidflow;
@@ -2571,6 +2929,7 @@ static void net_tx_action(struct softirq_action *h)
2571 clist = clist->next; 2929 clist = clist->next;
2572 2930
2573 WARN_ON(atomic_read(&skb->users)); 2931 WARN_ON(atomic_read(&skb->users));
2932 trace_kfree_skb(skb, net_tx_action);
2574 __kfree_skb(skb); 2933 __kfree_skb(skb);
2575 } 2934 }
2576 } 2935 }
@@ -2611,14 +2970,6 @@ static void net_tx_action(struct softirq_action *h)
2611 } 2970 }
2612} 2971}
2613 2972
2614static inline int deliver_skb(struct sk_buff *skb,
2615 struct packet_type *pt_prev,
2616 struct net_device *orig_dev)
2617{
2618 atomic_inc(&skb->users);
2619 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2620}
2621
2622#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ 2973#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2623 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) 2974 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2624/* This hook is defined here for ATM LANE */ 2975/* This hook is defined here for ATM LANE */
@@ -2632,15 +2983,14 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2632 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions 2983 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2633 * a compare and 2 stores extra right now if we dont have it on 2984 * a compare and 2 stores extra right now if we dont have it on
2634 * but have CONFIG_NET_CLS_ACT 2985 * but have CONFIG_NET_CLS_ACT
2635 * NOTE: This doesnt stop any functionality; if you dont have 2986 * NOTE: This doesn't stop any functionality; if you dont have
2636 * the ingress scheduler, you just cant add policies on ingress. 2987 * the ingress scheduler, you just can't add policies on ingress.
2637 * 2988 *
2638 */ 2989 */
2639static int ing_filter(struct sk_buff *skb) 2990static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2640{ 2991{
2641 struct net_device *dev = skb->dev; 2992 struct net_device *dev = skb->dev;
2642 u32 ttl = G_TC_RTTL(skb->tc_verd); 2993 u32 ttl = G_TC_RTTL(skb->tc_verd);
2643 struct netdev_queue *rxq;
2644 int result = TC_ACT_OK; 2994 int result = TC_ACT_OK;
2645 struct Qdisc *q; 2995 struct Qdisc *q;
2646 2996
@@ -2654,8 +3004,6 @@ static int ing_filter(struct sk_buff *skb)
2654 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); 3004 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2655 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); 3005 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2656 3006
2657 rxq = &dev->rx_queue;
2658
2659 q = rxq->qdisc; 3007 q = rxq->qdisc;
2660 if (q != &noop_qdisc) { 3008 if (q != &noop_qdisc) {
2661 spin_lock(qdisc_lock(q)); 3009 spin_lock(qdisc_lock(q));
@@ -2671,7 +3019,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2671 struct packet_type **pt_prev, 3019 struct packet_type **pt_prev,
2672 int *ret, struct net_device *orig_dev) 3020 int *ret, struct net_device *orig_dev)
2673{ 3021{
2674 if (skb->dev->rx_queue.qdisc == &noop_qdisc) 3022 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3023
3024 if (!rxq || rxq->qdisc == &noop_qdisc)
2675 goto out; 3025 goto out;
2676 3026
2677 if (*pt_prev) { 3027 if (*pt_prev) {
@@ -2679,7 +3029,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2679 *pt_prev = NULL; 3029 *pt_prev = NULL;
2680 } 3030 }
2681 3031
2682 switch (ing_filter(skb)) { 3032 switch (ing_filter(skb, rxq)) {
2683 case TC_ACT_SHOT: 3033 case TC_ACT_SHOT:
2684 case TC_ACT_STOLEN: 3034 case TC_ACT_STOLEN:
2685 kfree_skb(skb); 3035 kfree_skb(skb);
@@ -2692,33 +3042,6 @@ out:
2692} 3042}
2693#endif 3043#endif
2694 3044
2695/*
2696 * netif_nit_deliver - deliver received packets to network taps
2697 * @skb: buffer
2698 *
2699 * This function is used to deliver incoming packets to network
2700 * taps. It should be used when the normal netif_receive_skb path
2701 * is bypassed, for example because of VLAN acceleration.
2702 */
2703void netif_nit_deliver(struct sk_buff *skb)
2704{
2705 struct packet_type *ptype;
2706
2707 if (list_empty(&ptype_all))
2708 return;
2709
2710 skb_reset_network_header(skb);
2711 skb_reset_transport_header(skb);
2712 skb->mac_len = skb->network_header - skb->mac_header;
2713
2714 rcu_read_lock();
2715 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2716 if (!ptype->dev || ptype->dev == skb->dev)
2717 deliver_skb(skb, ptype, skb->dev);
2718 }
2719 rcu_read_unlock();
2720}
2721
2722/** 3045/**
2723 * netdev_rx_handler_register - register receive handler 3046 * netdev_rx_handler_register - register receive handler
2724 * @dev: device to register a handler for 3047 * @dev: device to register a handler for
@@ -2730,6 +3053,8 @@ void netif_nit_deliver(struct sk_buff *skb)
2730 * on a failure. 3053 * on a failure.
2731 * 3054 *
2732 * The caller must hold the rtnl_mutex. 3055 * The caller must hold the rtnl_mutex.
3056 *
3057 * For a general description of rx_handler, see enum rx_handler_result.
2733 */ 3058 */
2734int netdev_rx_handler_register(struct net_device *dev, 3059int netdev_rx_handler_register(struct net_device *dev,
2735 rx_handler_func_t *rx_handler, 3060 rx_handler_func_t *rx_handler,
@@ -2764,72 +3089,20 @@ void netdev_rx_handler_unregister(struct net_device *dev)
2764} 3089}
2765EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); 3090EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2766 3091
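A minimal sketch of an rx_handler built around the result codes consumed by the switch in __netif_receive_skb() further down; the handler and its policy are hypothetical, and the rx_handler_result_t typedef that accompanies enum rx_handler_result is assumed. It would be registered under RTNL via netdev_rx_handler_register(port_dev, my_rx_handler, NULL), the trailing private-data argument being an assumption here.

#include <linux/etherdevice.h>
#include <linux/netdevice.h>

/* Hypothetical handler: force exact-match delivery for multicast frames
 * and let everything else take the normal path.  RX_HANDLER_CONSUMED and
 * RX_HANDLER_ANOTHER (not used here) cover the claim/requeue cases. */
static rx_handler_result_t my_rx_handler(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;

        if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
                return RX_HANDLER_EXACT;        /* deliver to exact-match ptypes only */

        return RX_HANDLER_PASS;                 /* continue normal delivery */
}
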
2767static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2768 struct net_device *master)
2769{
2770 if (skb->pkt_type == PACKET_HOST) {
2771 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2772
2773 memcpy(dest, master->dev_addr, ETH_ALEN);
2774 }
2775}
2776
2777/* On bonding slaves other than the currently active slave, suppress
2778 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2779 * ARP on active-backup slaves with arp_validate enabled.
2780 */
2781int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2782{
2783 struct net_device *dev = skb->dev;
2784
2785 if (master->priv_flags & IFF_MASTER_ARPMON)
2786 dev->last_rx = jiffies;
2787
2788 if ((master->priv_flags & IFF_MASTER_ALB) &&
2789 (master->priv_flags & IFF_BRIDGE_PORT)) {
2790 /* Do address unmangle. The local destination address
2791 * will be always the one master has. Provides the right
2792 * functionality in a bridge.
2793 */
2794 skb_bond_set_mac_by_master(skb, master);
2795 }
2796
2797 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2798 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2799 skb->protocol == __cpu_to_be16(ETH_P_ARP))
2800 return 0;
2801
2802 if (master->priv_flags & IFF_MASTER_ALB) {
2803 if (skb->pkt_type != PACKET_BROADCAST &&
2804 skb->pkt_type != PACKET_MULTICAST)
2805 return 0;
2806 }
2807 if (master->priv_flags & IFF_MASTER_8023AD &&
2808 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2809 return 0;
2810
2811 return 1;
2812 }
2813 return 0;
2814}
2815EXPORT_SYMBOL(__skb_bond_should_drop);
2816
2817static int __netif_receive_skb(struct sk_buff *skb) 3092static int __netif_receive_skb(struct sk_buff *skb)
2818{ 3093{
2819 struct packet_type *ptype, *pt_prev; 3094 struct packet_type *ptype, *pt_prev;
2820 rx_handler_func_t *rx_handler; 3095 rx_handler_func_t *rx_handler;
2821 struct net_device *orig_dev; 3096 struct net_device *orig_dev;
2822 struct net_device *master; 3097 struct net_device *null_or_dev;
2823 struct net_device *null_or_orig; 3098 bool deliver_exact = false;
2824 struct net_device *orig_or_bond;
2825 int ret = NET_RX_DROP; 3099 int ret = NET_RX_DROP;
2826 __be16 type; 3100 __be16 type;
2827 3101
2828 if (!netdev_tstamp_prequeue) 3102 if (!netdev_tstamp_prequeue)
2829 net_timestamp_check(skb); 3103 net_timestamp_check(skb);
2830 3104
2831 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) 3105 trace_netif_receive_skb(skb);
2832 return NET_RX_SUCCESS;
2833 3106
2834 /* if we've gotten here through NAPI, check netpoll */ 3107 /* if we've gotten here through NAPI, check netpoll */
2835 if (netpoll_receive_skb(skb)) 3108 if (netpoll_receive_skb(skb))
@@ -2837,37 +3110,26 @@ static int __netif_receive_skb(struct sk_buff *skb)
2837 3110
2838 if (!skb->skb_iif) 3111 if (!skb->skb_iif)
2839 skb->skb_iif = skb->dev->ifindex; 3112 skb->skb_iif = skb->dev->ifindex;
2840
2841 /*
2842 * bonding note: skbs received on inactive slaves should only
2843 * be delivered to pkt handlers that are exact matches. Also
2844 * the deliver_no_wcard flag will be set. If packet handlers
2845 * are sensitive to duplicate packets these skbs will need to
2846 * be dropped at the handler. The vlan accel path may have
2847 * already set the deliver_no_wcard flag.
2848 */
2849 null_or_orig = NULL;
2850 orig_dev = skb->dev; 3113 orig_dev = skb->dev;
2851 master = ACCESS_ONCE(orig_dev->master);
2852 if (skb->deliver_no_wcard)
2853 null_or_orig = orig_dev;
2854 else if (master) {
2855 if (skb_bond_should_drop(skb, master)) {
2856 skb->deliver_no_wcard = 1;
2857 null_or_orig = orig_dev; /* deliver only exact match */
2858 } else
2859 skb->dev = master;
2860 }
2861 3114
2862 __this_cpu_inc(softnet_data.processed);
2863 skb_reset_network_header(skb); 3115 skb_reset_network_header(skb);
2864 skb_reset_transport_header(skb); 3116 skb_reset_transport_header(skb);
2865 skb->mac_len = skb->network_header - skb->mac_header; 3117 skb_reset_mac_len(skb);
2866 3118
2867 pt_prev = NULL; 3119 pt_prev = NULL;
2868 3120
2869 rcu_read_lock(); 3121 rcu_read_lock();
2870 3122
3123another_round:
3124
3125 __this_cpu_inc(softnet_data.processed);
3126
3127 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3128 skb = vlan_untag(skb);
3129 if (unlikely(!skb))
3130 goto out;
3131 }
3132
2871#ifdef CONFIG_NET_CLS_ACT 3133#ifdef CONFIG_NET_CLS_ACT
2872 if (skb->tc_verd & TC_NCLS) { 3134 if (skb->tc_verd & TC_NCLS) {
2873 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); 3135 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
@@ -2876,8 +3138,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
2876#endif 3138#endif
2877 3139
2878 list_for_each_entry_rcu(ptype, &ptype_all, list) { 3140 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2879 if (ptype->dev == null_or_orig || ptype->dev == skb->dev || 3141 if (!ptype->dev || ptype->dev == skb->dev) {
2880 ptype->dev == orig_dev) {
2881 if (pt_prev) 3142 if (pt_prev)
2882 ret = deliver_skb(skb, pt_prev, orig_dev); 3143 ret = deliver_skb(skb, pt_prev, orig_dev);
2883 pt_prev = ptype; 3144 pt_prev = ptype;
@@ -2891,36 +3152,47 @@ static int __netif_receive_skb(struct sk_buff *skb)
2891ncls: 3152ncls:
2892#endif 3153#endif
2893 3154
2894 /* Handle special case of bridge or macvlan */
2895 rx_handler = rcu_dereference(skb->dev->rx_handler); 3155 rx_handler = rcu_dereference(skb->dev->rx_handler);
2896 if (rx_handler) { 3156 if (rx_handler) {
2897 if (pt_prev) { 3157 if (pt_prev) {
2898 ret = deliver_skb(skb, pt_prev, orig_dev); 3158 ret = deliver_skb(skb, pt_prev, orig_dev);
2899 pt_prev = NULL; 3159 pt_prev = NULL;
2900 } 3160 }
2901 skb = rx_handler(skb); 3161 switch (rx_handler(&skb)) {
2902 if (!skb) 3162 case RX_HANDLER_CONSUMED:
2903 goto out; 3163 goto out;
3164 case RX_HANDLER_ANOTHER:
3165 goto another_round;
3166 case RX_HANDLER_EXACT:
3167 deliver_exact = true;
3168 case RX_HANDLER_PASS:
3169 break;
3170 default:
3171 BUG();
3172 }
2904 } 3173 }
2905 3174
2906 /* 3175 if (vlan_tx_tag_present(skb)) {
2907 * Make sure frames received on VLAN interfaces stacked on 3176 if (pt_prev) {
2908 * bonding interfaces still make their way to any base bonding 3177 ret = deliver_skb(skb, pt_prev, orig_dev);
2909 * device that may have registered for a specific ptype. The 3178 pt_prev = NULL;
2910 * handler may have to adjust skb->dev and orig_dev. 3179 }
2911 */ 3180 if (vlan_do_receive(&skb)) {
2912 orig_or_bond = orig_dev; 3181 ret = __netif_receive_skb(skb);
2913 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) && 3182 goto out;
2914 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) { 3183 } else if (unlikely(!skb))
2915 orig_or_bond = vlan_dev_real_dev(skb->dev); 3184 goto out;
2916 } 3185 }
2917 3186
3187 /* deliver only exact match when indicated */
3188 null_or_dev = deliver_exact ? skb->dev : NULL;
3189
2918 type = skb->protocol; 3190 type = skb->protocol;
2919 list_for_each_entry_rcu(ptype, 3191 list_for_each_entry_rcu(ptype,
2920 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { 3192 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2921 if (ptype->type == type && (ptype->dev == null_or_orig || 3193 if (ptype->type == type &&
2922 ptype->dev == skb->dev || ptype->dev == orig_dev || 3194 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
2923 ptype->dev == orig_or_bond)) { 3195 ptype->dev == orig_dev)) {
2924 if (pt_prev) 3196 if (pt_prev)
2925 ret = deliver_skb(skb, pt_prev, orig_dev); 3197 ret = deliver_skb(skb, pt_prev, orig_dev);
2926 pt_prev = ptype; 3198 pt_prev = ptype;
@@ -2930,6 +3202,7 @@ ncls:
2930 if (pt_prev) { 3202 if (pt_prev) {
2931 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 3203 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2932 } else { 3204 } else {
3205 atomic_long_inc(&skb->dev->rx_dropped);
2933 kfree_skb(skb); 3206 kfree_skb(skb);
2934 /* Jamal, now you will not able to escape explaining 3207 /* Jamal, now you will not able to escape explaining
2935 * me how you were going to use this. :-) 3208 * me how you were going to use this. :-)
@@ -3050,7 +3323,7 @@ out:
3050 return netif_receive_skb(skb); 3323 return netif_receive_skb(skb);
3051} 3324}
3052 3325
3053static void napi_gro_flush(struct napi_struct *napi) 3326inline void napi_gro_flush(struct napi_struct *napi)
3054{ 3327{
3055 struct sk_buff *skb, *next; 3328 struct sk_buff *skb, *next;
3056 3329
@@ -3063,6 +3336,7 @@ static void napi_gro_flush(struct napi_struct *napi)
3063 napi->gro_count = 0; 3336 napi->gro_count = 0;
3064 napi->gro_list = NULL; 3337 napi->gro_list = NULL;
3065} 3338}
3339EXPORT_SYMBOL(napi_gro_flush);
3066 3340
3067enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 3341enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3068{ 3342{
@@ -3077,7 +3351,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3077 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) 3351 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3078 goto normal; 3352 goto normal;
3079 3353
3080 if (skb_is_gso(skb) || skb_has_frags(skb)) 3354 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3081 goto normal; 3355 goto normal;
3082 3356
3083 rcu_read_lock(); 3357 rcu_read_lock();
@@ -3156,16 +3430,19 @@ normal:
3156} 3430}
3157EXPORT_SYMBOL(dev_gro_receive); 3431EXPORT_SYMBOL(dev_gro_receive);
3158 3432
3159static gro_result_t 3433static inline gro_result_t
3160__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 3434__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3161{ 3435{
3162 struct sk_buff *p; 3436 struct sk_buff *p;
3163 3437
3164 for (p = napi->gro_list; p; p = p->next) { 3438 for (p = napi->gro_list; p; p = p->next) {
3165 NAPI_GRO_CB(p)->same_flow = 3439 unsigned long diffs;
3166 (p->dev == skb->dev) && 3440
3167 !compare_ether_header(skb_mac_header(p), 3441 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3442 diffs |= p->vlan_tci ^ skb->vlan_tci;
3443 diffs |= compare_ether_header(skb_mac_header(p),
3168 skb_gro_mac_header(skb)); 3444 skb_gro_mac_header(skb));
3445 NAPI_GRO_CB(p)->same_flow = !diffs;
3169 NAPI_GRO_CB(p)->flush = 0; 3446 NAPI_GRO_CB(p)->flush = 0;
3170 } 3447 }
3171 3448
@@ -3218,14 +3495,16 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3218} 3495}
3219EXPORT_SYMBOL(napi_gro_receive); 3496EXPORT_SYMBOL(napi_gro_receive);
3220 3497
3221void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) 3498static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3222{ 3499{
3223 __skb_pull(skb, skb_headlen(skb)); 3500 __skb_pull(skb, skb_headlen(skb));
3224 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); 3501 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3502 skb->vlan_tci = 0;
3503 skb->dev = napi->dev;
3504 skb->skb_iif = 0;
3225 3505
3226 napi->skb = skb; 3506 napi->skb = skb;
3227} 3507}
3228EXPORT_SYMBOL(napi_reuse_skb);
3229 3508
3230struct sk_buff *napi_get_frags(struct napi_struct *napi) 3509struct sk_buff *napi_get_frags(struct napi_struct *napi)
3231{ 3510{
@@ -3519,7 +3798,7 @@ static void net_rx_action(struct softirq_action *h)
3519 * with netpoll's poll_napi(). Only the entity which 3798 * with netpoll's poll_napi(). Only the entity which
3520 * obtains the lock and sees NAPI_STATE_SCHED set will 3799 * obtains the lock and sees NAPI_STATE_SCHED set will
3521 * actually make the ->poll() call. Therefore we avoid 3800 * actually make the ->poll() call. Therefore we avoid
3522 * accidently calling ->poll() when NAPI is not scheduled. 3801 * accidentally calling ->poll() when NAPI is not scheduled.
3523 */ 3802 */
3524 work = 0; 3803 work = 0;
3525 if (test_bit(NAPI_STATE_SCHED, &n->state)) { 3804 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
@@ -3710,12 +3989,15 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3710 3989
3711void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3990void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3712{ 3991{
3713 struct net_device *dev = (v == SEQ_START_TOKEN) ? 3992 struct net_device *dev = v;
3714 first_net_device(seq_file_net(seq)) : 3993
3715 next_net_device((struct net_device *)v); 3994 if (v == SEQ_START_TOKEN)
3995 dev = first_net_device_rcu(seq_file_net(seq));
3996 else
3997 dev = next_net_device_rcu(dev);
3716 3998
3717 ++*pos; 3999 ++*pos;
3718 return rcu_dereference(dev); 4000 return dev;
3719} 4001}
3720 4002
3721void dev_seq_stop(struct seq_file *seq, void *v) 4003void dev_seq_stop(struct seq_file *seq, void *v)
@@ -3999,15 +4281,14 @@ static int __init dev_proc_init(void)
3999 4281
4000 4282
4001/** 4283/**
4002 * netdev_set_master - set up master/slave pair 4284 * netdev_set_master - set up master pointer
4003 * @slave: slave device 4285 * @slave: slave device
4004 * @master: new master device 4286 * @master: new master device
4005 * 4287 *
4006 * Changes the master device of the slave. Pass %NULL to break the 4288 * Changes the master device of the slave. Pass %NULL to break the
4007 * bonding. The caller must hold the RTNL semaphore. On a failure 4289 * bonding. The caller must hold the RTNL semaphore. On a failure
4008 * a negative errno code is returned. On success the reference counts 4290 * a negative errno code is returned. On success the reference counts
4009 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the 4291 * are adjusted and the function returns zero.
4010 * function returns zero.
4011 */ 4292 */
4012int netdev_set_master(struct net_device *slave, struct net_device *master) 4293int netdev_set_master(struct net_device *slave, struct net_device *master)
4013{ 4294{
@@ -4023,10 +4304,31 @@ int netdev_set_master(struct net_device *slave, struct net_device *master)
4023 4304
4024 slave->master = master; 4305 slave->master = master;
4025 4306
4026 if (old) { 4307 if (old)
4027 synchronize_net();
4028 dev_put(old); 4308 dev_put(old);
4029 } 4309 return 0;
4310}
4311EXPORT_SYMBOL(netdev_set_master);
4312
4313/**
4314 * netdev_set_bond_master - set up bonding master/slave pair
4315 * @slave: slave device
4316 * @master: new master device
4317 *
4318 * Changes the master device of the slave. Pass %NULL to break the
4319 * bonding. The caller must hold the RTNL semaphore. On a failure
4320 * a negative errno code is returned. On success %RTM_NEWLINK is sent
4321 * to the routing socket and the function returns zero.
4322 */
4323int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4324{
4325 int err;
4326
4327 ASSERT_RTNL();
4328
4329 err = netdev_set_master(slave, master);
4330 if (err)
4331 return err;
4030 if (master) 4332 if (master)
4031 slave->flags |= IFF_SLAVE; 4333 slave->flags |= IFF_SLAVE;
4032 else 4334 else
@@ -4035,7 +4337,7 @@ int netdev_set_master(struct net_device *slave, struct net_device *master)
4035 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); 4337 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4036 return 0; 4338 return 0;
4037} 4339}
4038EXPORT_SYMBOL(netdev_set_master); 4340EXPORT_SYMBOL(netdev_set_bond_master);
4039 4341
4040static void dev_change_rx_flags(struct net_device *dev, int flags) 4342static void dev_change_rx_flags(struct net_device *dev, int flags)
4041{ 4343{
@@ -4204,6 +4506,30 @@ void dev_set_rx_mode(struct net_device *dev)
4204} 4506}
4205 4507
4206/** 4508/**
4509 * dev_ethtool_get_settings - call device's ethtool_ops::get_settings()
4510 * @dev: device
4511 * @cmd: memory area for ethtool_ops::get_settings() result
4512 *
4513 * The cmd arg is initialized properly (cleared and
4514 * ethtool_cmd::cmd field set to ETHTOOL_GSET).
4515 *
4516 * Return device's ethtool_ops::get_settings() result value or
4517 * -EOPNOTSUPP when device doesn't expose
4518 * ethtool_ops::get_settings() operation.
4519 */
4520int dev_ethtool_get_settings(struct net_device *dev,
4521 struct ethtool_cmd *cmd)
4522{
4523 if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
4524 return -EOPNOTSUPP;
4525
4526 memset(cmd, 0, sizeof(struct ethtool_cmd));
4527 cmd->cmd = ETHTOOL_GSET;
4528 return dev->ethtool_ops->get_settings(dev, cmd);
4529}
4530EXPORT_SYMBOL(dev_ethtool_get_settings);
4531
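A sketch of a caller using the new wrapper; the reporting helper is invented, and ethtool_cmd_speed() is assumed to be available for decoding the returned speed fields.

#include <linux/ethtool.h>
#include <linux/netdevice.h>

static int my_report_speed(struct net_device *dev)
{
        struct ethtool_cmd cmd;
        int err;

        err = dev_ethtool_get_settings(dev, &cmd);  /* clears cmd, issues ETHTOOL_GSET */
        if (err < 0)
                return err;                         /* -EOPNOTSUPP without get_settings */

        netdev_info(dev, "link speed %u Mb/s\n", ethtool_cmd_speed(&cmd));
        return 0;
}
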
4532/**
4207 * dev_get_flags - get flags reported to userspace 4533 * dev_get_flags - get flags reported to userspace
4208 * @dev: device 4534 * @dev: device
4209 * 4535 *
@@ -4372,6 +4698,17 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
4372EXPORT_SYMBOL(dev_set_mtu); 4698EXPORT_SYMBOL(dev_set_mtu);
4373 4699
4374/** 4700/**
4701 * dev_set_group - Change group this device belongs to
4702 * @dev: device
4703 * @new_group: group this device should belong to
4704 */
4705void dev_set_group(struct net_device *dev, int new_group)
4706{
4707 dev->group = new_group;
4708}
4709EXPORT_SYMBOL(dev_set_group);
4710
4711/**
4375 * dev_set_mac_address - Change Media Access Control Address 4712 * dev_set_mac_address - Change Media Access Control Address
4376 * @dev: device 4713 * @dev: device
4377 * @sa: new address 4714 * @sa: new address
@@ -4456,7 +4793,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
4456 * is never reached 4793 * is never reached
4457 */ 4794 */
4458 WARN_ON(1); 4795 WARN_ON(1);
4459 err = -EINVAL; 4796 err = -ENOTTY;
4460 break; 4797 break;
4461 4798
4462 } 4799 }
@@ -4724,7 +5061,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4724 /* Set the per device memory buffer space. 5061 /* Set the per device memory buffer space.
4725 * Not applicable in our case */ 5062 * Not applicable in our case */
4726 case SIOCSIFLINK: 5063 case SIOCSIFLINK:
4727 return -EINVAL; 5064 return -ENOTTY;
4728 5065
4729 /* 5066 /*
4730 * Unknown or private ioctl. 5067 * Unknown or private ioctl.
@@ -4745,7 +5082,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4745 /* Take care of Wireless Extensions */ 5082 /* Take care of Wireless Extensions */
4746 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) 5083 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4747 return wext_handle_ioctl(net, &ifr, cmd, arg); 5084 return wext_handle_ioctl(net, &ifr, cmd, arg);
4748 return -EINVAL; 5085 return -ENOTTY;
4749 } 5086 }
4750} 5087}
4751 5088
@@ -4797,12 +5134,14 @@ static void rollback_registered_many(struct list_head *head)
4797 list_del(&dev->unreg_list); 5134 list_del(&dev->unreg_list);
4798 continue; 5135 continue;
4799 } 5136 }
4800 5137 dev->dismantle = true;
4801 BUG_ON(dev->reg_state != NETREG_REGISTERED); 5138 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5139 }
4802 5140
4803 /* If device is running, close it first. */ 5141 /* If device is running, close it first. */
4804 dev_close(dev); 5142 dev_close_many(head);
4805 5143
5144 list_for_each_entry(dev, head, unreg_list) {
4806 /* And unlink it from device chain. */ 5145 /* And unlink it from device chain. */
4807 unlist_netdevice(dev); 5146 unlist_netdevice(dev);
4808 5147
@@ -4857,55 +5196,62 @@ static void rollback_registered(struct net_device *dev)
4857 5196
4858 list_add(&dev->unreg_list, &single); 5197 list_add(&dev->unreg_list, &single);
4859 rollback_registered_many(&single); 5198 rollback_registered_many(&single);
5199 list_del(&single);
4860} 5200}
4861 5201
4862static void __netdev_init_queue_locks_one(struct net_device *dev, 5202u32 netdev_fix_features(struct net_device *dev, u32 features)
4863 struct netdev_queue *dev_queue,
4864 void *_unused)
4865{ 5203{
4866 spin_lock_init(&dev_queue->_xmit_lock); 5204 /* Fix illegal checksum combinations */
4867 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type); 5205 if ((features & NETIF_F_HW_CSUM) &&
4868 dev_queue->xmit_lock_owner = -1; 5206 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4869} 5207 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5208 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5209 }
4870 5210
4871static void netdev_init_queue_locks(struct net_device *dev) 5211 if ((features & NETIF_F_NO_CSUM) &&
4872{ 5212 (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4873 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); 5213 netdev_warn(dev, "mixed no checksumming and other settings.\n");
4874 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL); 5214 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4875} 5215 }
4876 5216
4877unsigned long netdev_fix_features(unsigned long features, const char *name)
4878{
4879 /* Fix illegal SG+CSUM combinations. */ 5217 /* Fix illegal SG+CSUM combinations. */
4880 if ((features & NETIF_F_SG) && 5218 if ((features & NETIF_F_SG) &&
4881 !(features & NETIF_F_ALL_CSUM)) { 5219 !(features & NETIF_F_ALL_CSUM)) {
4882 if (name) 5220 netdev_dbg(dev,
4883 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no " 5221 "Dropping NETIF_F_SG since no checksum feature.\n");
4884 "checksum feature.\n", name);
4885 features &= ~NETIF_F_SG; 5222 features &= ~NETIF_F_SG;
4886 } 5223 }
4887 5224
4888 /* TSO requires that SG is present as well. */ 5225 /* TSO requires that SG is present as well. */
4889 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) { 5226 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
4890 if (name) 5227 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
4891 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no " 5228 features &= ~NETIF_F_ALL_TSO;
4892 "SG feature.\n", name);
4893 features &= ~NETIF_F_TSO;
4894 } 5229 }
4895 5230
5231 /* TSO ECN requires that TSO is present as well. */
5232 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5233 features &= ~NETIF_F_TSO_ECN;
5234
5235 /* Software GSO depends on SG. */
5236 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5237 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5238 features &= ~NETIF_F_GSO;
5239 }
5240
5241 /* UFO needs SG and checksumming */
4896 if (features & NETIF_F_UFO) { 5242 if (features & NETIF_F_UFO) {
4897 if (!(features & NETIF_F_GEN_CSUM)) { 5243 /* maybe split UFO into V4 and V6? */
4898 if (name) 5244 if (!((features & NETIF_F_GEN_CSUM) ||
4899 printk(KERN_ERR "%s: Dropping NETIF_F_UFO " 5245 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
4900 "since no NETIF_F_HW_CSUM feature.\n", 5246 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4901 name); 5247 netdev_dbg(dev,
5248 "Dropping NETIF_F_UFO since no checksum offload features.\n");
4902 features &= ~NETIF_F_UFO; 5249 features &= ~NETIF_F_UFO;
4903 } 5250 }
4904 5251
4905 if (!(features & NETIF_F_SG)) { 5252 if (!(features & NETIF_F_SG)) {
4906 if (name) 5253 netdev_dbg(dev,
4907 printk(KERN_ERR "%s: Dropping NETIF_F_UFO " 5254 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
4908 "since no NETIF_F_SG feature.\n", name);
4909 features &= ~NETIF_F_UFO; 5255 features &= ~NETIF_F_UFO;
4910 } 5256 }
4911 } 5257 }
@@ -4914,6 +5260,75 @@ unsigned long netdev_fix_features(unsigned long features, const char *name)
4914} 5260}
4915EXPORT_SYMBOL(netdev_fix_features); 5261EXPORT_SYMBOL(netdev_fix_features);
4916 5262
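A worked illustration of the dependency pruning in netdev_fix_features() above: requesting scatter-gather and TSO with no checksum offload bit set loses both, since SG requires a checksum feature and TSO requires SG. The wrapper below is only a sketch.

#include <linux/netdevice.h>

static u32 my_check_features(struct net_device *dev)
{
        u32 wanted = NETIF_F_SG | NETIF_F_TSO;  /* no NETIF_F_*_CSUM bit */
        u32 fixed;

        fixed = netdev_fix_features(dev, wanted);
        /* SG is dropped for lack of a checksum feature, then the TSO
         * bits are dropped for lack of SG, so fixed ends up 0 here. */
        return fixed;
}
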
5263int __netdev_update_features(struct net_device *dev)
5264{
5265 u32 features;
5266 int err = 0;
5267
5268 ASSERT_RTNL();
5269
5270 features = netdev_get_wanted_features(dev);
5271
5272 if (dev->netdev_ops->ndo_fix_features)
5273 features = dev->netdev_ops->ndo_fix_features(dev, features);
5274
5275 /* driver might be less strict about feature dependencies */
5276 features = netdev_fix_features(dev, features);
5277
5278 if (dev->features == features)
5279 return 0;
5280
5281 netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5282 dev->features, features);
5283
5284 if (dev->netdev_ops->ndo_set_features)
5285 err = dev->netdev_ops->ndo_set_features(dev, features);
5286
5287 if (unlikely(err < 0)) {
5288 netdev_err(dev,
5289 "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5290 err, features, dev->features);
5291 return -1;
5292 }
5293
5294 if (!err)
5295 dev->features = features;
5296
5297 return 1;
5298}
5299
5300/**
5301 * netdev_update_features - recalculate device features
5302 * @dev: the device to check
5303 *
5304 * Recalculate dev->features set and send notifications if it
5305 * has changed. Should be called after driver or hardware dependent
5306 * conditions might have changed that influence the features.
5307 */
5308void netdev_update_features(struct net_device *dev)
5309{
5310 if (__netdev_update_features(dev))
5311 netdev_features_change(dev);
5312}
5313EXPORT_SYMBOL(netdev_update_features);
5314
5315/**
5316 * netdev_change_features - recalculate device features
5317 * @dev: the device to check
5318 *
5319 * Recalculate dev->features set and send notifications even
5320 * if they have not changed. Should be called instead of
5321 * netdev_update_features() if also dev->vlan_features might
5322 * have changed to allow the changes to be propagated to stacked
5323 * VLAN devices.
5324 */
5325void netdev_change_features(struct net_device *dev)
5326{
5327 __netdev_update_features(dev);
5328 netdev_features_change(dev);
5329}
5330EXPORT_SYMBOL(netdev_change_features);
5331
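The driver side of this machinery, as a hedged sketch: ndo_fix_features() masks out what the hardware cannot do in its current configuration, ndo_set_features() programs the result, and the driver calls netdev_update_features() after any change that affects either. The private structure and the jumbo-frame example are assumptions.

#include <linux/netdevice.h>

struct my_priv {
        bool jumbo_enabled;     /* e.g. hardware cannot checksum jumbo frames */
};

static u32 my_fix_features(struct net_device *dev, u32 features)
{
        struct my_priv *priv = netdev_priv(dev);

        if (priv->jumbo_enabled)
                features &= ~NETIF_F_IP_CSUM;   /* report what hw allows now */
        return features;
}

static int my_set_features(struct net_device *dev, u32 features)
{
        /* program the hardware for the offloads left in 'features' */
        return 0;
}

/* After a configuration change that affects the above, the driver calls
 * netdev_update_features(dev), which re-runs ndo_fix_features(),
 * netdev_fix_features() and ndo_set_features() and notifies on change. */
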
4917/** 5332/**
4918 * netif_stacked_transfer_operstate - transfer operstate 5333 * netif_stacked_transfer_operstate - transfer operstate
4919 * @rootdev: the root or lower level device to transfer state from 5334 * @rootdev: the root or lower level device to transfer state from
@@ -4941,6 +5356,59 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4941} 5356}
4942EXPORT_SYMBOL(netif_stacked_transfer_operstate); 5357EXPORT_SYMBOL(netif_stacked_transfer_operstate);
4943 5358
5359#ifdef CONFIG_RPS
5360static int netif_alloc_rx_queues(struct net_device *dev)
5361{
5362 unsigned int i, count = dev->num_rx_queues;
5363 struct netdev_rx_queue *rx;
5364
5365 BUG_ON(count < 1);
5366
5367 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5368 if (!rx) {
5369 pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5370 return -ENOMEM;
5371 }
5372 dev->_rx = rx;
5373
5374 for (i = 0; i < count; i++)
5375 rx[i].dev = dev;
5376 return 0;
5377}
5378#endif
5379
5380static void netdev_init_one_queue(struct net_device *dev,
5381 struct netdev_queue *queue, void *_unused)
5382{
5383 /* Initialize queue lock */
5384 spin_lock_init(&queue->_xmit_lock);
5385 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5386 queue->xmit_lock_owner = -1;
5387 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5388 queue->dev = dev;
5389}
5390
5391static int netif_alloc_netdev_queues(struct net_device *dev)
5392{
5393 unsigned int count = dev->num_tx_queues;
5394 struct netdev_queue *tx;
5395
5396 BUG_ON(count < 1);
5397
5398 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5399 if (!tx) {
5400 pr_err("netdev: Unable to allocate %u tx queues.\n",
5401 count);
5402 return -ENOMEM;
5403 }
5404 dev->_tx = tx;
5405
5406 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5407 spin_lock_init(&dev->tx_global_lock);
5408
5409 return 0;
5410}
5411
4944/** 5412/**
4945 * register_netdevice - register a network device 5413 * register_netdevice - register a network device
4946 * @dev: device to register 5414 * @dev: device to register
@@ -4974,28 +5442,13 @@ int register_netdevice(struct net_device *dev)
4974 5442
4975 spin_lock_init(&dev->addr_list_lock); 5443 spin_lock_init(&dev->addr_list_lock);
4976 netdev_set_addr_lockdep_class(dev); 5444 netdev_set_addr_lockdep_class(dev);
4977 netdev_init_queue_locks(dev);
4978 5445
4979 dev->iflink = -1; 5446 dev->iflink = -1;
4980 5447
4981#ifdef CONFIG_RPS 5448 ret = dev_get_valid_name(dev, dev->name);
4982 if (!dev->num_rx_queues) { 5449 if (ret < 0)
4983 /* 5450 goto out;
4984 * Allocate a single RX queue if driver never called
4985 * alloc_netdev_mq
4986 */
4987
4988 dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
4989 if (!dev->_rx) {
4990 ret = -ENOMEM;
4991 goto out;
4992 }
4993 5451
4994 dev->_rx->first = dev->_rx;
4995 atomic_set(&dev->_rx->count, 1);
4996 dev->num_rx_queues = 1;
4997 }
4998#endif
4999 /* Init, if this function is available */ 5452 /* Init, if this function is available */
5000 if (dev->netdev_ops->ndo_init) { 5453 if (dev->netdev_ops->ndo_init) {
5001 ret = dev->netdev_ops->ndo_init(dev); 5454 ret = dev->netdev_ops->ndo_init(dev);
@@ -5006,34 +5459,30 @@ int register_netdevice(struct net_device *dev)
5006 } 5459 }
5007 } 5460 }
5008 5461
5009 ret = dev_get_valid_name(dev, dev->name, 0);
5010 if (ret)
5011 goto err_uninit;
5012
5013 dev->ifindex = dev_new_index(net); 5462 dev->ifindex = dev_new_index(net);
5014 if (dev->iflink == -1) 5463 if (dev->iflink == -1)
5015 dev->iflink = dev->ifindex; 5464 dev->iflink = dev->ifindex;
5016 5465
5017 /* Fix illegal checksum combinations */ 5466 /* Transfer changeable features to wanted_features and enable
5018 if ((dev->features & NETIF_F_HW_CSUM) && 5467 * software offloads (GSO and GRO).
5019 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 5468 */
5020 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n", 5469 dev->hw_features |= NETIF_F_SOFT_FEATURES;
5021 dev->name); 5470 dev->features |= NETIF_F_SOFT_FEATURES;
5022 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); 5471 dev->wanted_features = dev->features & dev->hw_features;
5023 }
5024 5472
5025 if ((dev->features & NETIF_F_NO_CSUM) && 5473 /* Turn on no cache copy if HW is doing checksum */
5026 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 5474 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5027 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n", 5475 if ((dev->features & NETIF_F_ALL_CSUM) &&
5028 dev->name); 5476 !(dev->features & NETIF_F_NO_CSUM)) {
5029 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); 5477 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5478 dev->features |= NETIF_F_NOCACHE_COPY;
5030 } 5479 }
5031 5480
5032 dev->features = netdev_fix_features(dev->features, dev->name); 5481 /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5033 5482 * vlan_dev_init() will do the dev->features check, so these features
5034 /* Enable software GSO if SG is supported. */ 5483 * are enabled only if supported by underlying device.
5035 if (dev->features & NETIF_F_SG) 5484 */
5036 dev->features |= NETIF_F_GSO; 5485 dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5037 5486
5038 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); 5487 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5039 ret = notifier_to_errno(ret); 5488 ret = notifier_to_errno(ret);
@@ -5045,6 +5494,8 @@ int register_netdevice(struct net_device *dev)
5045 goto err_uninit; 5494 goto err_uninit;
5046 dev->reg_state = NETREG_REGISTERED; 5495 dev->reg_state = NETREG_REGISTERED;
5047 5496
5497 __netdev_update_features(dev);
5498
5048 /* 5499 /*
5049 * Default initial state at registry is that the 5500 * Default initial state at registry is that the
5050 * device is present. 5501 * device is present.
@@ -5105,9 +5556,6 @@ int init_dummy_netdev(struct net_device *dev)
5105 */ 5556 */
5106 dev->reg_state = NETREG_DUMMY; 5557 dev->reg_state = NETREG_DUMMY;
5107 5558
5108 /* initialize the ref count */
5109 atomic_set(&dev->refcnt, 1);
5110
5111 /* NAPI wants this */ 5559 /* NAPI wants this */
5112 INIT_LIST_HEAD(&dev->napi_list); 5560 INIT_LIST_HEAD(&dev->napi_list);
5113 5561
@@ -5115,6 +5563,11 @@ int init_dummy_netdev(struct net_device *dev)
5115 set_bit(__LINK_STATE_PRESENT, &dev->state); 5563 set_bit(__LINK_STATE_PRESENT, &dev->state);
5116 set_bit(__LINK_STATE_START, &dev->state); 5564 set_bit(__LINK_STATE_START, &dev->state);
5117 5565
5566 /* Note : We dont allocate pcpu_refcnt for dummy devices,
5567 * because users of this 'device' dont need to change
5568 * its refcount.
5569 */
5570
5118 return 0; 5571 return 0;
5119} 5572}
5120EXPORT_SYMBOL_GPL(init_dummy_netdev); 5573EXPORT_SYMBOL_GPL(init_dummy_netdev);
@@ -5138,24 +5591,22 @@ int register_netdev(struct net_device *dev)
5138 int err; 5591 int err;
5139 5592
5140 rtnl_lock(); 5593 rtnl_lock();
5141
5142 /*
5143 * If the name is a format string the caller wants us to do a
5144 * name allocation.
5145 */
5146 if (strchr(dev->name, '%')) {
5147 err = dev_alloc_name(dev, dev->name);
5148 if (err < 0)
5149 goto out;
5150 }
5151
5152 err = register_netdevice(dev); 5594 err = register_netdevice(dev);
5153out:
5154 rtnl_unlock(); 5595 rtnl_unlock();
5155 return err; 5596 return err;
5156} 5597}
5157EXPORT_SYMBOL(register_netdev); 5598EXPORT_SYMBOL(register_netdev);
5158 5599
5600int netdev_refcnt_read(const struct net_device *dev)
5601{
5602 int i, refcnt = 0;
5603
5604 for_each_possible_cpu(i)
5605 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5606 return refcnt;
5607}
5608EXPORT_SYMBOL(netdev_refcnt_read);
5609
5159/* 5610/*
5160 * netdev_wait_allrefs - wait until all references are gone. 5611 * netdev_wait_allrefs - wait until all references are gone.
5161 * 5612 *
@@ -5170,11 +5621,14 @@ EXPORT_SYMBOL(register_netdev);
5170static void netdev_wait_allrefs(struct net_device *dev) 5621static void netdev_wait_allrefs(struct net_device *dev)
5171{ 5622{
5172 unsigned long rebroadcast_time, warning_time; 5623 unsigned long rebroadcast_time, warning_time;
5624 int refcnt;
5173 5625
5174 linkwatch_forget_dev(dev); 5626 linkwatch_forget_dev(dev);
5175 5627
5176 rebroadcast_time = warning_time = jiffies; 5628 rebroadcast_time = warning_time = jiffies;
5177 while (atomic_read(&dev->refcnt) != 0) { 5629 refcnt = netdev_refcnt_read(dev);
5630
5631 while (refcnt != 0) {
5178 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { 5632 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5179 rtnl_lock(); 5633 rtnl_lock();
5180 5634
@@ -5201,11 +5655,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
5201 5655
5202 msleep(250); 5656 msleep(250);
5203 5657
5658 refcnt = netdev_refcnt_read(dev);
5659
5204 if (time_after(jiffies, warning_time + 10 * HZ)) { 5660 if (time_after(jiffies, warning_time + 10 * HZ)) {
5205 printk(KERN_EMERG "unregister_netdevice: " 5661 printk(KERN_EMERG "unregister_netdevice: "
5206 "waiting for %s to become free. Usage " 5662 "waiting for %s to become free. Usage "
5207 "count = %d\n", 5663 "count = %d\n",
5208 dev->name, atomic_read(&dev->refcnt)); 5664 dev->name, refcnt);
5209 warning_time = jiffies; 5665 warning_time = jiffies;
5210 } 5666 }
5211 } 5667 }
@@ -5263,9 +5719,9 @@ void netdev_run_todo(void)
5263 netdev_wait_allrefs(dev); 5719 netdev_wait_allrefs(dev);
5264 5720
5265 /* paranoia */ 5721 /* paranoia */
5266 BUG_ON(atomic_read(&dev->refcnt)); 5722 BUG_ON(netdev_refcnt_read(dev));
5267 WARN_ON(dev->ip_ptr); 5723 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5268 WARN_ON(dev->ip6_ptr); 5724 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5269 WARN_ON(dev->dn_ptr); 5725 WARN_ON(dev->dn_ptr);
5270 5726
5271 if (dev->destructor) 5727 if (dev->destructor)
@@ -5276,34 +5732,6 @@ void netdev_run_todo(void)
5276 } 5732 }
5277} 5733}
5278 5734
5279/**
5280 * dev_txq_stats_fold - fold tx_queues stats
5281 * @dev: device to get statistics from
5282 * @stats: struct rtnl_link_stats64 to hold results
5283 */
5284void dev_txq_stats_fold(const struct net_device *dev,
5285 struct rtnl_link_stats64 *stats)
5286{
5287 u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5288 unsigned int i;
5289 struct netdev_queue *txq;
5290
5291 for (i = 0; i < dev->num_tx_queues; i++) {
5292 txq = netdev_get_tx_queue(dev, i);
5293 spin_lock_bh(&txq->_xmit_lock);
5294 tx_bytes += txq->tx_bytes;
5295 tx_packets += txq->tx_packets;
5296 tx_dropped += txq->tx_dropped;
5297 spin_unlock_bh(&txq->_xmit_lock);
5298 }
5299 if (tx_bytes || tx_packets || tx_dropped) {
5300 stats->tx_bytes = tx_bytes;
5301 stats->tx_packets = tx_packets;
5302 stats->tx_dropped = tx_dropped;
5303 }
5304}
5305EXPORT_SYMBOL(dev_txq_stats_fold);
5306
5307/* Convert net_device_stats to rtnl_link_stats64. They have the same 5735/* Convert net_device_stats to rtnl_link_stats64. They have the same
5308 * fields in the same order, with only the type differing. 5736 * fields in the same order, with only the type differing.
5309 */ 5737 */
@@ -5342,57 +5770,71 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5342 5770
5343 if (ops->ndo_get_stats64) { 5771 if (ops->ndo_get_stats64) {
5344 memset(storage, 0, sizeof(*storage)); 5772 memset(storage, 0, sizeof(*storage));
5345 return ops->ndo_get_stats64(dev, storage); 5773 ops->ndo_get_stats64(dev, storage);
5346 } 5774 } else if (ops->ndo_get_stats) {
5347 if (ops->ndo_get_stats) {
5348 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); 5775 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5349 return storage; 5776 } else {
5777 netdev_stats_to_stats64(storage, &dev->stats);
5350 } 5778 }
5351 netdev_stats_to_stats64(storage, &dev->stats); 5779 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5352 dev_txq_stats_fold(dev, storage);
5353 return storage; 5780 return storage;
5354} 5781}
5355EXPORT_SYMBOL(dev_get_stats); 5782EXPORT_SYMBOL(dev_get_stats);
5356 5783
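A sketch of a reader using the reworked dev_get_stats(): the caller supplies the rtnl_link_stats64 buffer, and the result now also folds in the rx_dropped counter added by this patch. The dump helper is illustrative.

#include <linux/netdevice.h>

static void my_dump_stats(struct net_device *dev)
{
        struct rtnl_link_stats64 storage;
        const struct rtnl_link_stats64 *stats;

        stats = dev_get_stats(dev, &storage);
        netdev_info(dev, "rx %llu pkts, dropped %llu\n",
                    (unsigned long long)stats->rx_packets,
                    (unsigned long long)stats->rx_dropped);
}
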
5357static void netdev_init_one_queue(struct net_device *dev, 5784struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5358 struct netdev_queue *queue,
5359 void *_unused)
5360{ 5785{
5361 queue->dev = dev; 5786 struct netdev_queue *queue = dev_ingress_queue(dev);
5362}
5363 5787
5364static void netdev_init_queues(struct net_device *dev) 5788#ifdef CONFIG_NET_CLS_ACT
5365{ 5789 if (queue)
5366 netdev_init_one_queue(dev, &dev->rx_queue, NULL); 5790 return queue;
5367 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); 5791 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5368 spin_lock_init(&dev->tx_global_lock); 5792 if (!queue)
5793 return NULL;
5794 netdev_init_one_queue(dev, queue, NULL);
5795 queue->qdisc = &noop_qdisc;
5796 queue->qdisc_sleeping = &noop_qdisc;
5797 rcu_assign_pointer(dev->ingress_queue, queue);
5798#endif
5799 return queue;
5369} 5800}
5370 5801
5371/** 5802/**
5372 * alloc_netdev_mq - allocate network device 5803 * alloc_netdev_mqs - allocate network device
5373 * @sizeof_priv: size of private data to allocate space for 5804 * @sizeof_priv: size of private data to allocate space for
5374 * @name: device name format string 5805 * @name: device name format string
5375 * @setup: callback to initialize device 5806 * @setup: callback to initialize device
5376 * @queue_count: the number of subqueues to allocate 5807 * @txqs: the number of TX subqueues to allocate
5808 * @rxqs: the number of RX subqueues to allocate
5377 * 5809 *
5378 * Allocates a struct net_device with private data area for driver use 5810 * Allocates a struct net_device with private data area for driver use
5379 * and performs basic initialization. Also allocates subquue structs 5811 * and performs basic initialization. Also allocates subquue structs
5380 * for each queue on the device at the end of the netdevice. 5812 * for each queue on the device.
5381 */ 5813 */
5382struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, 5814struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5383 void (*setup)(struct net_device *), unsigned int queue_count) 5815 void (*setup)(struct net_device *),
5816 unsigned int txqs, unsigned int rxqs)
5384{ 5817{
5385 struct netdev_queue *tx;
5386 struct net_device *dev; 5818 struct net_device *dev;
5387 size_t alloc_size; 5819 size_t alloc_size;
5388 struct net_device *p; 5820 struct net_device *p;
5389#ifdef CONFIG_RPS
5390 struct netdev_rx_queue *rx;
5391 int i;
5392#endif
5393 5821
5394 BUG_ON(strlen(name) >= sizeof(dev->name)); 5822 BUG_ON(strlen(name) >= sizeof(dev->name));
5395 5823
5824 if (txqs < 1) {
5825 pr_err("alloc_netdev: Unable to allocate device "
5826 "with zero queues.\n");
5827 return NULL;
5828 }
5829
5830#ifdef CONFIG_RPS
5831 if (rxqs < 1) {
5832 pr_err("alloc_netdev: Unable to allocate device "
5833 "with zero RX queues.\n");
5834 return NULL;
5835 }
5836#endif
5837
5396 alloc_size = sizeof(struct net_device); 5838 alloc_size = sizeof(struct net_device);
5397 if (sizeof_priv) { 5839 if (sizeof_priv) {
5398 /* ensure 32-byte alignment of private area */ 5840 /* ensure 32-byte alignment of private area */
@@ -5408,55 +5850,23 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5408 return NULL; 5850 return NULL;
5409 } 5851 }
5410 5852
5411 tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5412 if (!tx) {
5413 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5414 "tx qdiscs.\n");
5415 goto free_p;
5416 }
5417
5418#ifdef CONFIG_RPS
5419 rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5420 if (!rx) {
5421 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5422 "rx queues.\n");
5423 goto free_tx;
5424 }
5425
5426 atomic_set(&rx->count, queue_count);
5427
5428 /*
5429 * Set a pointer to first element in the array which holds the
5430 * reference count.
5431 */
5432 for (i = 0; i < queue_count; i++)
5433 rx[i].first = rx;
5434#endif
5435
5436 dev = PTR_ALIGN(p, NETDEV_ALIGN); 5853 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5437 dev->padded = (char *)dev - (char *)p; 5854 dev->padded = (char *)dev - (char *)p;
5438 5855
5856 dev->pcpu_refcnt = alloc_percpu(int);
5857 if (!dev->pcpu_refcnt)
5858 goto free_p;
5859
5439 if (dev_addr_init(dev)) 5860 if (dev_addr_init(dev))
5440 goto free_rx; 5861 goto free_pcpu;
5441 5862
5442 dev_mc_init(dev); 5863 dev_mc_init(dev);
5443 dev_uc_init(dev); 5864 dev_uc_init(dev);
5444 5865
5445 dev_net_set(dev, &init_net); 5866 dev_net_set(dev, &init_net);
5446 5867
5447 dev->_tx = tx;
5448 dev->num_tx_queues = queue_count;
5449 dev->real_num_tx_queues = queue_count;
5450
5451#ifdef CONFIG_RPS
5452 dev->_rx = rx;
5453 dev->num_rx_queues = queue_count;
5454#endif
5455
5456 dev->gso_max_size = GSO_MAX_SIZE; 5868 dev->gso_max_size = GSO_MAX_SIZE;
5457 5869
5458 netdev_init_queues(dev);
5459
5460 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list); 5870 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5461 dev->ethtool_ntuple_list.count = 0; 5871 dev->ethtool_ntuple_list.count = 0;
5462 INIT_LIST_HEAD(&dev->napi_list); 5872 INIT_LIST_HEAD(&dev->napi_list);
@@ -5464,20 +5874,39 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5464 INIT_LIST_HEAD(&dev->link_watch_list); 5874 INIT_LIST_HEAD(&dev->link_watch_list);
5465 dev->priv_flags = IFF_XMIT_DST_RELEASE; 5875 dev->priv_flags = IFF_XMIT_DST_RELEASE;
5466 setup(dev); 5876 setup(dev);
5877
5878 dev->num_tx_queues = txqs;
5879 dev->real_num_tx_queues = txqs;
5880 if (netif_alloc_netdev_queues(dev))
5881 goto free_all;
5882
5883#ifdef CONFIG_RPS
5884 dev->num_rx_queues = rxqs;
5885 dev->real_num_rx_queues = rxqs;
5886 if (netif_alloc_rx_queues(dev))
5887 goto free_all;
5888#endif
5889
5467 strcpy(dev->name, name); 5890 strcpy(dev->name, name);
5891 dev->group = INIT_NETDEV_GROUP;
5468 return dev; 5892 return dev;
5469 5893
5470free_rx: 5894free_all:
5895 free_netdev(dev);
5896 return NULL;
5897
5898free_pcpu:
5899 free_percpu(dev->pcpu_refcnt);
5900 kfree(dev->_tx);
5471#ifdef CONFIG_RPS 5901#ifdef CONFIG_RPS
5472 kfree(rx); 5902 kfree(dev->_rx);
5473free_tx:
5474#endif 5903#endif
5475 kfree(tx); 5904
5476free_p: 5905free_p:
5477 kfree(p); 5906 kfree(p);
5478 return NULL; 5907 return NULL;
5479} 5908}
5480EXPORT_SYMBOL(alloc_netdev_mq); 5909EXPORT_SYMBOL(alloc_netdev_mqs);
5481 5910
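A sketch of a multiqueue driver allocating through the renamed helper with separate TX and RX queue counts; the probe wrapper, private structure and queue counts are assumptions, only the alloc_netdev_mqs() signature comes from the patch.

#include <linux/etherdevice.h>
#include <linux/netdevice.h>

struct my_eth_priv {
        int dummy;              /* illustrative private state */
};

static struct net_device *my_create(void)
{
        struct net_device *dev;

        dev = alloc_netdev_mqs(sizeof(struct my_eth_priv), "myeth%d",
                               ether_setup, 8, 4);     /* 8 TX, 4 RX queues */
        if (!dev)
                return NULL;

        if (register_netdev(dev)) {
                free_netdev(dev);
                return NULL;
        }
        return dev;
}
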
5482/** 5911/**
5483 * free_netdev - free network device 5912 * free_netdev - free network device
@@ -5494,6 +5923,11 @@ void free_netdev(struct net_device *dev)
5494 release_net(dev_net(dev)); 5923 release_net(dev_net(dev));
5495 5924
5496 kfree(dev->_tx); 5925 kfree(dev->_tx);
5926#ifdef CONFIG_RPS
5927 kfree(dev->_rx);
5928#endif
5929
5930 kfree(rcu_dereference_raw(dev->ingress_queue));
5497 5931
5498 /* Flush device addresses */ 5932 /* Flush device addresses */
5499 dev_addr_flush(dev); 5933 dev_addr_flush(dev);
@@ -5504,6 +5938,9 @@ void free_netdev(struct net_device *dev)
5504 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) 5938 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5505 netif_napi_del(p); 5939 netif_napi_del(p);
5506 5940
5941 free_percpu(dev->pcpu_refcnt);
5942 dev->pcpu_refcnt = NULL;
5943
5507 /* Compatibility with error handling in drivers */ 5944 /* Compatibility with error handling in drivers */
5508 if (dev->reg_state == NETREG_UNINITIALIZED) { 5945 if (dev->reg_state == NETREG_UNINITIALIZED) {
5509 kfree((char *)dev - dev->padded); 5946 kfree((char *)dev - dev->padded);
@@ -5527,7 +5964,10 @@ EXPORT_SYMBOL(free_netdev);
5527void synchronize_net(void) 5964void synchronize_net(void)
5528{ 5965{
5529 might_sleep(); 5966 might_sleep();
5530 synchronize_rcu(); 5967 if (rtnl_is_locked())
5968 synchronize_rcu_expedited();
5969 else
5970 synchronize_rcu();
5531} 5971}
5532EXPORT_SYMBOL(synchronize_net); 5972EXPORT_SYMBOL(synchronize_net);
5533 5973
@@ -5636,7 +6076,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
5636 /* We get here if we can't use the current device name */ 6076 /* We get here if we can't use the current device name */
5637 if (!pat) 6077 if (!pat)
5638 goto out; 6078 goto out;
5639 if (dev_get_valid_name(dev, pat, 1)) 6079 if (dev_get_valid_name(dev, pat) < 0)
5640 goto out; 6080 goto out;
5641 } 6081 }
5642 6082
@@ -5658,6 +6098,10 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
5658 6098
5659 /* Notify protocols, that we are about to destroy 6099 /* Notify protocols, that we are about to destroy
5660 this device. They should clean all the things. 6100 this device. They should clean all the things.
6101
6102 Note that dev->reg_state stays at NETREG_REGISTERED.
6103 This is wanted because this way 8021q and macvlan know
6104 the device is just moving and can keep their slaves up.
5661 */ 6105 */
5662 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 6106 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5663 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); 6107 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
@@ -5734,6 +6178,11 @@ static int dev_cpu_callback(struct notifier_block *nfb,
5734 oldsd->output_queue = NULL; 6178 oldsd->output_queue = NULL;
5735 oldsd->output_queue_tailp = &oldsd->output_queue; 6179 oldsd->output_queue_tailp = &oldsd->output_queue;
5736 } 6180 }
6181 /* Append NAPI poll list from offline CPU. */
6182 if (!list_empty(&oldsd->poll_list)) {
6183 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6184 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6185 }
5737 6186
5738 raise_softirq_irqoff(NET_TX_SOFTIRQ); 6187 raise_softirq_irqoff(NET_TX_SOFTIRQ);
5739 local_irq_enable(); 6188 local_irq_enable();
@@ -5762,32 +6211,22 @@ static int dev_cpu_callback(struct notifier_block *nfb,
5762 * @one to the master device with current feature set @all. Will not 6211 * @one to the master device with current feature set @all. Will not
5763 * enable anything that is off in @mask. Returns the new feature set. 6212 * enable anything that is off in @mask. Returns the new feature set.
5764 */ 6213 */
5765unsigned long netdev_increment_features(unsigned long all, unsigned long one, 6214u32 netdev_increment_features(u32 all, u32 one, u32 mask)
5766 unsigned long mask)
5767{ 6215{
5768 /* If device needs checksumming, downgrade to it. */ 6216 if (mask & NETIF_F_GEN_CSUM)
5769 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM)) 6217 mask |= NETIF_F_ALL_CSUM;
5770 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM); 6218 mask |= NETIF_F_VLAN_CHALLENGED;
5771 else if (mask & NETIF_F_ALL_CSUM) {
5772 /* If one device supports v4/v6 checksumming, set for all. */
5773 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5774 !(all & NETIF_F_GEN_CSUM)) {
5775 all &= ~NETIF_F_ALL_CSUM;
5776 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5777 }
5778 6219
5779 /* If one device supports hw checksumming, set for all. */ 6220 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
5780 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) { 6221 all &= one | ~NETIF_F_ALL_FOR_ALL;
5781 all &= ~NETIF_F_ALL_CSUM;
5782 all |= NETIF_F_HW_CSUM;
5783 }
5784 }
5785 6222
5786 one |= NETIF_F_ALL_CSUM; 6223 /* If device needs checksumming, downgrade to it. */
6224 if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6225 all &= ~NETIF_F_NO_CSUM;
5787 6226
5788 one |= all & NETIF_F_ONE_FOR_ALL; 6227 /* If one device supports hw checksumming, set for all. */
5789 all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO; 6228 if (all & NETIF_F_GEN_CSUM)
5790 all |= one & mask & NETIF_F_ONE_FOR_ALL; 6229 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
5791 6230
5792 return all; 6231 return all;
5793} 6232}
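A hedged sketch of how a master device might recompute its feature set from its slaves with the simplified helper; the starting mask and the slaves array stand in for a real driver's slave list.

#include <linux/netdevice.h>

#define MY_MASTER_FEATURES      (NETIF_F_ALL_CSUM | NETIF_F_SG | NETIF_F_TSO)

static u32 my_compute_master_features(struct net_device **slaves, int count)
{
        u32 features = MY_MASTER_FEATURES;      /* everything the master allows */
        int i;

        for (i = 0; i < count; i++)
                features = netdev_increment_features(features,
                                                     slaves[i]->features,
                                                     MY_MASTER_FEATURES);
        return features;
}
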
@@ -5830,29 +6269,23 @@ err_name:
5830/** 6269/**
5831 * netdev_drivername - network driver for the device 6270 * netdev_drivername - network driver for the device
5832 * @dev: network device 6271 * @dev: network device
5833 * @buffer: buffer for resulting name
5834 * @len: size of buffer
5835 * 6272 *
5836 * Determine network driver for device. 6273 * Determine network driver for device.
5837 */ 6274 */
5838char *netdev_drivername(const struct net_device *dev, char *buffer, int len) 6275const char *netdev_drivername(const struct net_device *dev)
5839{ 6276{
5840 const struct device_driver *driver; 6277 const struct device_driver *driver;
5841 const struct device *parent; 6278 const struct device *parent;
5842 6279 const char *empty = "";
5843 if (len <= 0 || !buffer)
5844 return buffer;
5845 buffer[0] = 0;
5846 6280
5847 parent = dev->dev.parent; 6281 parent = dev->dev.parent;
5848
5849 if (!parent) 6282 if (!parent)
5850 return buffer; 6283 return empty;
5851 6284
5852 driver = parent->driver; 6285 driver = parent->driver;
5853 if (driver && driver->name) 6286 if (driver && driver->name)
5854 strlcpy(buffer, driver->name, len); 6287 return driver->name;
5855 return buffer; 6288 return empty;
5856} 6289}
5857 6290
5858static int __netdev_printk(const char *level, const struct net_device *dev, 6291static int __netdev_printk(const char *level, const struct net_device *dev,
@@ -5948,7 +6381,7 @@ static void __net_exit default_device_exit(struct net *net)
5948 if (dev->rtnl_link_ops) 6381 if (dev->rtnl_link_ops)
5949 continue; 6382 continue;
5950 6383
5951 /* Push remaing network devices to init_net */ 6384 /* Push remaining network devices to init_net */
5952 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); 6385 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5953 err = dev_change_net_namespace(dev, &init_net, fb_name); 6386 err = dev_change_net_namespace(dev, &init_net, fb_name);
5954 if (err) { 6387 if (err) {
@@ -5963,7 +6396,7 @@ static void __net_exit default_device_exit(struct net *net)
5963static void __net_exit default_device_exit_batch(struct list_head *net_list) 6396static void __net_exit default_device_exit_batch(struct list_head *net_list)
5964{ 6397{
5965 /* At exit all network devices most be removed from a network 6398 /* At exit all network devices most be removed from a network
5966 * namespace. Do this in the reverse order of registeration. 6399 * namespace. Do this in the reverse order of registration.
5967 * Do this across as many network namespaces as possible to 6400 * Do this across as many network namespaces as possible to
5968 * improve batching efficiency. 6401 * improve batching efficiency.
5969 */ 6402 */
@@ -5981,6 +6414,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
5981 } 6414 }
5982 } 6415 }
5983 unregister_netdevice_many(&dev_kill_list); 6416 unregister_netdevice_many(&dev_kill_list);
6417 list_del(&dev_kill_list);
5984 rtnl_unlock(); 6418 rtnl_unlock();
5985} 6419}
5986 6420