about | summary | refs | log | tree | commit | diff | stats
path: root/net/core
diff options
context:
space:
mode:
author Glenn Elliott <gelliott@cs.unc.edu> 2012-03-04 19:47:13 -0500
committer Glenn Elliott <gelliott@cs.unc.edu> 2012-03-04 19:47:13 -0500
commit c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /net/core
parent ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent 6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts: litmus/sched_cedf.c
Diffstat (limited to 'net/core')
-rw-r--r-- net/core/datagram.c 8
-rw-r--r-- net/core/dev.c 1758
-rw-r--r-- net/core/dev_addr_lists.c 18
-rw-r--r-- net/core/drop_monitor.c 14
-rw-r--r-- net/core/dst.c 134
-rw-r--r-- net/core/ethtool.c 927
-rw-r--r-- net/core/fib_rules.c 44
-rw-r--r-- net/core/filter.c 495
-rw-r--r-- net/core/flow.c 96
-rw-r--r-- net/core/gen_estimator.c 13
-rw-r--r-- net/core/iovec.c 26
-rw-r--r-- net/core/link_watch.c 2
-rw-r--r-- net/core/neighbour.c 488
-rw-r--r-- net/core/net-sysfs.c 533
-rw-r--r-- net/core/net-sysfs.h 4
-rw-r--r-- net/core/net-traces.c 1
-rw-r--r-- net/core/net_namespace.c 101
-rw-r--r-- net/core/netpoll.c 60
-rw-r--r-- net/core/pktgen.c 507
-rw-r--r-- net/core/request_sock.c 5
-rw-r--r-- net/core/rtnetlink.c 331
-rw-r--r-- net/core/scm.c 12
-rw-r--r-- net/core/skbuff.c 147
-rw-r--r-- net/core/sock.c 101
-rw-r--r-- net/core/sysctl_net_core.c 13
-rw-r--r-- net/core/timestamping.c 10
-rw-r--r-- net/core/utils.c 40
27 files changed, 3797 insertions, 2091 deletions
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 251997a95483..18ac112ea7ae 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -177,7 +177,7 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
177 * interrupt level will suddenly eat the receive_queue. 177 * interrupt level will suddenly eat the receive_queue.
178 * 178 *
179 * Look at current nfs client by the way... 179 * Look at current nfs client by the way...
180 * However, this function was corrent in any case. 8) 180 * However, this function was correct in any case. 8)
181 */ 181 */
182 unsigned long cpu_flags; 182 unsigned long cpu_flags;
183 183
@@ -243,6 +243,7 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
243 unlock_sock_fast(sk, slow); 243 unlock_sock_fast(sk, slow);
244 244
245 /* skb is now orphaned, can be freed outside of locked section */ 245 /* skb is now orphaned, can be freed outside of locked section */
246 trace_kfree_skb(skb, skb_free_datagram_locked);
246 __kfree_skb(skb); 247 __kfree_skb(skb);
247} 248}
248EXPORT_SYMBOL(skb_free_datagram_locked); 249EXPORT_SYMBOL(skb_free_datagram_locked);
@@ -746,13 +747,12 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
746 if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) 747 if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
747 mask |= POLLERR; 748 mask |= POLLERR;
748 if (sk->sk_shutdown & RCV_SHUTDOWN) 749 if (sk->sk_shutdown & RCV_SHUTDOWN)
749 mask |= POLLRDHUP; 750 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
750 if (sk->sk_shutdown == SHUTDOWN_MASK) 751 if (sk->sk_shutdown == SHUTDOWN_MASK)
751 mask |= POLLHUP; 752 mask |= POLLHUP;
752 753
753 /* readable? */ 754 /* readable? */
754 if (!skb_queue_empty(&sk->sk_receive_queue) || 755 if (!skb_queue_empty(&sk->sk_receive_queue))
755 (sk->sk_shutdown & RCV_SHUTDOWN))
756 mask |= POLLIN | POLLRDNORM; 756 mask |= POLLIN | POLLRDNORM;
757 757
758 /* Connection-based need to check for termination and startup */ 758 /* Connection-based need to check for termination and startup */
diff --git a/net/core/dev.c b/net/core/dev.c
index 660dd41aaaa6..9c58c1ec41a9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -128,7 +128,11 @@
128#include <linux/jhash.h> 128#include <linux/jhash.h>
129#include <linux/random.h> 129#include <linux/random.h>
130#include <trace/events/napi.h> 130#include <trace/events/napi.h>
131#include <trace/events/net.h>
132#include <trace/events/skb.h>
131#include <linux/pci.h> 133#include <linux/pci.h>
134#include <linux/inetdevice.h>
135#include <linux/cpu_rmap.h>
132 136
133#include "net-sysfs.h" 137#include "net-sysfs.h"
134 138
@@ -371,6 +375,14 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
371 * --ANK (980803) 375 * --ANK (980803)
372 */ 376 */
373 377
378static inline struct list_head *ptype_head(const struct packet_type *pt)
379{
380 if (pt->type == htons(ETH_P_ALL))
381 return &ptype_all;
382 else
383 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
384}
385
374/** 386/**
375 * dev_add_pack - add packet handler 387 * dev_add_pack - add packet handler
376 * @pt: packet type declaration 388 * @pt: packet type declaration
@@ -386,16 +398,11 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
386 398
387void dev_add_pack(struct packet_type *pt) 399void dev_add_pack(struct packet_type *pt)
388{ 400{
389 int hash; 401 struct list_head *head = ptype_head(pt);
390 402
391 spin_lock_bh(&ptype_lock); 403 spin_lock(&ptype_lock);
392 if (pt->type == htons(ETH_P_ALL)) 404 list_add_rcu(&pt->list, head);
393 list_add_rcu(&pt->list, &ptype_all); 405 spin_unlock(&ptype_lock);
394 else {
395 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
396 list_add_rcu(&pt->list, &ptype_base[hash]);
397 }
398 spin_unlock_bh(&ptype_lock);
399} 406}
400EXPORT_SYMBOL(dev_add_pack); 407EXPORT_SYMBOL(dev_add_pack);
401 408
@@ -414,15 +421,10 @@ EXPORT_SYMBOL(dev_add_pack);
414 */ 421 */
415void __dev_remove_pack(struct packet_type *pt) 422void __dev_remove_pack(struct packet_type *pt)
416{ 423{
417 struct list_head *head; 424 struct list_head *head = ptype_head(pt);
418 struct packet_type *pt1; 425 struct packet_type *pt1;
419 426
420 spin_lock_bh(&ptype_lock); 427 spin_lock(&ptype_lock);
421
422 if (pt->type == htons(ETH_P_ALL))
423 head = &ptype_all;
424 else
425 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
426 428
427 list_for_each_entry(pt1, head, list) { 429 list_for_each_entry(pt1, head, list) {
428 if (pt == pt1) { 430 if (pt == pt1) {
@@ -433,7 +435,7 @@ void __dev_remove_pack(struct packet_type *pt)
433 435
434 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); 436 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
435out: 437out:
436 spin_unlock_bh(&ptype_lock); 438 spin_unlock(&ptype_lock);
437} 439}
438EXPORT_SYMBOL(__dev_remove_pack); 440EXPORT_SYMBOL(__dev_remove_pack);
439 441
@@ -742,34 +744,32 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex)
742EXPORT_SYMBOL(dev_get_by_index); 744EXPORT_SYMBOL(dev_get_by_index);
743 745
744/** 746/**
745 * dev_getbyhwaddr - find a device by its hardware address 747 * dev_getbyhwaddr_rcu - find a device by its hardware address
746 * @net: the applicable net namespace 748 * @net: the applicable net namespace
747 * @type: media type of device 749 * @type: media type of device
748 * @ha: hardware address 750 * @ha: hardware address
749 * 751 *
750 * Search for an interface by MAC address. Returns NULL if the device 752 * Search for an interface by MAC address. Returns NULL if the device
751 * is not found or a pointer to the device. The caller must hold the 753 * is not found or a pointer to the device.
752 * rtnl semaphore. The returned device has not had its ref count increased 754 * The caller must hold RCU or RTNL.
755 * The returned device has not had its ref count increased
753 * and the caller must therefore be careful about locking 756 * and the caller must therefore be careful about locking
754 * 757 *
755 * BUGS:
756 * If the API was consistent this would be __dev_get_by_hwaddr
757 */ 758 */
758 759
759struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha) 760struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
761 const char *ha)
760{ 762{
761 struct net_device *dev; 763 struct net_device *dev;
762 764
763 ASSERT_RTNL(); 765 for_each_netdev_rcu(net, dev)
764
765 for_each_netdev(net, dev)
766 if (dev->type == type && 766 if (dev->type == type &&
767 !memcmp(dev->dev_addr, ha, dev->addr_len)) 767 !memcmp(dev->dev_addr, ha, dev->addr_len))
768 return dev; 768 return dev;
769 769
770 return NULL; 770 return NULL;
771} 771}
772EXPORT_SYMBOL(dev_getbyhwaddr); 772EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
773 773
774struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) 774struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
775{ 775{
@@ -948,7 +948,7 @@ int dev_alloc_name(struct net_device *dev, const char *name)
948} 948}
949EXPORT_SYMBOL(dev_alloc_name); 949EXPORT_SYMBOL(dev_alloc_name);
950 950
951static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt) 951static int dev_get_valid_name(struct net_device *dev, const char *name)
952{ 952{
953 struct net *net; 953 struct net *net;
954 954
@@ -958,7 +958,7 @@ static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt
958 if (!dev_valid_name(name)) 958 if (!dev_valid_name(name))
959 return -EINVAL; 959 return -EINVAL;
960 960
961 if (fmt && strchr(name, '%')) 961 if (strchr(name, '%'))
962 return dev_alloc_name(dev, name); 962 return dev_alloc_name(dev, name);
963 else if (__dev_get_by_name(net, name)) 963 else if (__dev_get_by_name(net, name))
964 return -EEXIST; 964 return -EEXIST;
@@ -995,7 +995,7 @@ int dev_change_name(struct net_device *dev, const char *newname)
995 995
996 memcpy(oldname, dev->name, IFNAMSIZ); 996 memcpy(oldname, dev->name, IFNAMSIZ);
997 997
998 err = dev_get_valid_name(dev, newname, 1); 998 err = dev_get_valid_name(dev, newname);
999 if (err < 0) 999 if (err < 0)
1000 return err; 1000 return err;
1001 1001
@@ -1007,7 +1007,7 @@ rollback:
1007 } 1007 }
1008 1008
1009 write_lock_bh(&dev_base_lock); 1009 write_lock_bh(&dev_base_lock);
1010 hlist_del(&dev->name_hlist); 1010 hlist_del_rcu(&dev->name_hlist);
1011 write_unlock_bh(&dev_base_lock); 1011 write_unlock_bh(&dev_base_lock);
1012 1012
1013 synchronize_rcu(); 1013 synchronize_rcu();
@@ -1115,13 +1115,21 @@ EXPORT_SYMBOL(netdev_bonding_change);
1115void dev_load(struct net *net, const char *name) 1115void dev_load(struct net *net, const char *name)
1116{ 1116{
1117 struct net_device *dev; 1117 struct net_device *dev;
1118 int no_module;
1118 1119
1119 rcu_read_lock(); 1120 rcu_read_lock();
1120 dev = dev_get_by_name_rcu(net, name); 1121 dev = dev_get_by_name_rcu(net, name);
1121 rcu_read_unlock(); 1122 rcu_read_unlock();
1122 1123
1123 if (!dev && capable(CAP_NET_ADMIN)) 1124 no_module = !dev;
1124 request_module("%s", name); 1125 if (no_module && capable(CAP_NET_ADMIN))
1126 no_module = request_module("netdev-%s", name);
1127 if (no_module && capable(CAP_SYS_MODULE)) {
1128 if (!request_module("%s", name))
1129 pr_err("Loading kernel module for a network device "
1130"with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s "
1131"instead\n", name);
1132 }
1125} 1133}
1126EXPORT_SYMBOL(dev_load); 1134EXPORT_SYMBOL(dev_load);
1127 1135
@@ -1132,9 +1140,6 @@ static int __dev_open(struct net_device *dev)
1132 1140
1133 ASSERT_RTNL(); 1141 ASSERT_RTNL();
1134 1142
1135 /*
1136 * Is it even present?
1137 */
1138 if (!netif_device_present(dev)) 1143 if (!netif_device_present(dev))
1139 return -ENODEV; 1144 return -ENODEV;
1140 1145
@@ -1143,9 +1148,6 @@ static int __dev_open(struct net_device *dev)
1143 if (ret) 1148 if (ret)
1144 return ret; 1149 return ret;
1145 1150
1146 /*
1147 * Call device private open method
1148 */
1149 set_bit(__LINK_STATE_START, &dev->state); 1151 set_bit(__LINK_STATE_START, &dev->state);
1150 1152
1151 if (ops->ndo_validate_addr) 1153 if (ops->ndo_validate_addr)
@@ -1154,31 +1156,12 @@ static int __dev_open(struct net_device *dev)
1154 if (!ret && ops->ndo_open) 1156 if (!ret && ops->ndo_open)
1155 ret = ops->ndo_open(dev); 1157 ret = ops->ndo_open(dev);
1156 1158
1157 /*
1158 * If it went open OK then:
1159 */
1160
1161 if (ret) 1159 if (ret)
1162 clear_bit(__LINK_STATE_START, &dev->state); 1160 clear_bit(__LINK_STATE_START, &dev->state);
1163 else { 1161 else {
1164 /*
1165 * Set the flags.
1166 */
1167 dev->flags |= IFF_UP; 1162 dev->flags |= IFF_UP;
1168
1169 /*
1170 * Enable NET_DMA
1171 */
1172 net_dmaengine_get(); 1163 net_dmaengine_get();
1173
1174 /*
1175 * Initialize multicasting status
1176 */
1177 dev_set_rx_mode(dev); 1164 dev_set_rx_mode(dev);
1178
1179 /*
1180 * Wakeup transmit queue engine
1181 */
1182 dev_activate(dev); 1165 dev_activate(dev);
1183 } 1166 }
1184 1167
@@ -1201,22 +1184,13 @@ int dev_open(struct net_device *dev)
1201{ 1184{
1202 int ret; 1185 int ret;
1203 1186
1204 /*
1205 * Is it already up?
1206 */
1207 if (dev->flags & IFF_UP) 1187 if (dev->flags & IFF_UP)
1208 return 0; 1188 return 0;
1209 1189
1210 /*
1211 * Open device
1212 */
1213 ret = __dev_open(dev); 1190 ret = __dev_open(dev);
1214 if (ret < 0) 1191 if (ret < 0)
1215 return ret; 1192 return ret;
1216 1193
1217 /*
1218 * ... and announce new interface.
1219 */
1220 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); 1194 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1221 call_netdevice_notifiers(NETDEV_UP, dev); 1195 call_netdevice_notifiers(NETDEV_UP, dev);
1222 1196
@@ -1224,52 +1198,78 @@ int dev_open(struct net_device *dev)
1224} 1198}
1225EXPORT_SYMBOL(dev_open); 1199EXPORT_SYMBOL(dev_open);
1226 1200
1227static int __dev_close(struct net_device *dev) 1201static int __dev_close_many(struct list_head *head)
1228{ 1202{
1229 const struct net_device_ops *ops = dev->netdev_ops; 1203 struct net_device *dev;
1230 1204
1231 ASSERT_RTNL(); 1205 ASSERT_RTNL();
1232 might_sleep(); 1206 might_sleep();
1233 1207
1234 /* 1208 list_for_each_entry(dev, head, unreg_list) {
1235 * Tell people we are going down, so that they can 1209 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1236 * prepare to death, when device is still operating.
1237 */
1238 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1239 1210
1240 clear_bit(__LINK_STATE_START, &dev->state); 1211 clear_bit(__LINK_STATE_START, &dev->state);
1241 1212
1242 /* Synchronize to scheduled poll. We cannot touch poll list, 1213 /* Synchronize to scheduled poll. We cannot touch poll list, it
1243 * it can be even on different cpu. So just clear netif_running(). 1214 * can be even on different cpu. So just clear netif_running().
1244 * 1215 *
1245 * dev->stop() will invoke napi_disable() on all of it's 1216 * dev->stop() will invoke napi_disable() on all of it's
1246 * napi_struct instances on this device. 1217 * napi_struct instances on this device.
1247 */ 1218 */
1248 smp_mb__after_clear_bit(); /* Commit netif_running(). */ 1219 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1220 }
1249 1221
1250 dev_deactivate(dev); 1222 dev_deactivate_many(head);
1251 1223
1252 /* 1224 list_for_each_entry(dev, head, unreg_list) {
1253 * Call the device specific close. This cannot fail. 1225 const struct net_device_ops *ops = dev->netdev_ops;
1254 * Only if device is UP
1255 *
1256 * We allow it to be called even after a DETACH hot-plug
1257 * event.
1258 */
1259 if (ops->ndo_stop)
1260 ops->ndo_stop(dev);
1261 1226
1262 /* 1227 /*
1263 * Device is now down. 1228 * Call the device specific close. This cannot fail.
1264 */ 1229 * Only if device is UP
1230 *
1231 * We allow it to be called even after a DETACH hot-plug
1232 * event.
1233 */
1234 if (ops->ndo_stop)
1235 ops->ndo_stop(dev);
1265 1236
1266 dev->flags &= ~IFF_UP; 1237 dev->flags &= ~IFF_UP;
1238 net_dmaengine_put();
1239 }
1267 1240
1268 /* 1241 return 0;
1269 * Shutdown NET_DMA 1242}
1270 */ 1243
1271 net_dmaengine_put(); 1244static int __dev_close(struct net_device *dev)
1245{
1246 int retval;
1247 LIST_HEAD(single);
1272 1248
1249 list_add(&dev->unreg_list, &single);
1250 retval = __dev_close_many(&single);
1251 list_del(&single);
1252 return retval;
1253}
1254
1255static int dev_close_many(struct list_head *head)
1256{
1257 struct net_device *dev, *tmp;
1258 LIST_HEAD(tmp_list);
1259
1260 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1261 if (!(dev->flags & IFF_UP))
1262 list_move(&dev->unreg_list, &tmp_list);
1263
1264 __dev_close_many(head);
1265
1266 list_for_each_entry(dev, head, unreg_list) {
1267 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1268 call_netdevice_notifiers(NETDEV_DOWN, dev);
1269 }
1270
1271 /* rollback_registered_many needs the complete original list */
1272 list_splice(&tmp_list, head);
1273 return 0; 1273 return 0;
1274} 1274}
1275 1275
@@ -1284,17 +1284,13 @@ static int __dev_close(struct net_device *dev)
1284 */ 1284 */
1285int dev_close(struct net_device *dev) 1285int dev_close(struct net_device *dev)
1286{ 1286{
1287 if (!(dev->flags & IFF_UP)) 1287 if (dev->flags & IFF_UP) {
1288 return 0; 1288 LIST_HEAD(single);
1289
1290 __dev_close(dev);
1291
1292 /*
1293 * Tell people we are down
1294 */
1295 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1296 call_netdevice_notifiers(NETDEV_DOWN, dev);
1297 1289
1290 list_add(&dev->unreg_list, &single);
1291 dev_close_many(&single);
1292 list_del(&single);
1293 }
1298 return 0; 1294 return 0;
1299} 1295}
1300EXPORT_SYMBOL(dev_close); 1296EXPORT_SYMBOL(dev_close);
@@ -1310,26 +1306,32 @@ EXPORT_SYMBOL(dev_close);
1310 */ 1306 */
1311void dev_disable_lro(struct net_device *dev) 1307void dev_disable_lro(struct net_device *dev)
1312{ 1308{
1313 if (dev->ethtool_ops && dev->ethtool_ops->get_flags && 1309 u32 flags;
1314 dev->ethtool_ops->set_flags) { 1310
1315 u32 flags = dev->ethtool_ops->get_flags(dev); 1311 /*
1316 if (flags & ETH_FLAG_LRO) { 1312 * If we're trying to disable lro on a vlan device
1317 flags &= ~ETH_FLAG_LRO; 1313 * use the underlying physical device instead
1318 dev->ethtool_ops->set_flags(dev, flags); 1314 */
1319 } 1315 if (is_vlan_dev(dev))
1320 } 1316 dev = vlan_dev_real_dev(dev);
1321 WARN_ON(dev->features & NETIF_F_LRO); 1317
1318 if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1319 flags = dev->ethtool_ops->get_flags(dev);
1320 else
1321 flags = ethtool_op_get_flags(dev);
1322
1323 if (!(flags & ETH_FLAG_LRO))
1324 return;
1325
1326 __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1327 if (unlikely(dev->features & NETIF_F_LRO))
1328 netdev_WARN(dev, "failed to disable LRO!\n");
1322} 1329}
1323EXPORT_SYMBOL(dev_disable_lro); 1330EXPORT_SYMBOL(dev_disable_lro);
1324 1331
1325 1332
1326static int dev_boot_phase = 1; 1333static int dev_boot_phase = 1;
1327 1334
1328/*
1329 * Device change register/unregister. These are not inline or static
1330 * as we export them to the world.
1331 */
1332
1333/** 1335/**
1334 * register_netdevice_notifier - register a network notifier block 1336 * register_netdevice_notifier - register a network notifier block
1335 * @nb: notifier 1337 * @nb: notifier
@@ -1431,6 +1433,7 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1431 ASSERT_RTNL(); 1433 ASSERT_RTNL();
1432 return raw_notifier_call_chain(&netdev_chain, val, dev); 1434 return raw_notifier_call_chain(&netdev_chain, val, dev);
1433} 1435}
1436EXPORT_SYMBOL(call_netdevice_notifiers);
1434 1437
1435/* When > 0 there are consumers of rx skb time stamps */ 1438/* When > 0 there are consumers of rx skb time stamps */
1436static atomic_t netstamp_needed = ATOMIC_INIT(0); 1439static atomic_t netstamp_needed = ATOMIC_INIT(0);
@@ -1461,6 +1464,27 @@ static inline void net_timestamp_check(struct sk_buff *skb)
1461 __net_timestamp(skb); 1464 __net_timestamp(skb);
1462} 1465}
1463 1466
1467static inline bool is_skb_forwardable(struct net_device *dev,
1468 struct sk_buff *skb)
1469{
1470 unsigned int len;
1471
1472 if (!(dev->flags & IFF_UP))
1473 return false;
1474
1475 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1476 if (skb->len <= len)
1477 return true;
1478
1479 /* if TSO is enabled, we don't care about the length as the packet
1480 * could be forwarded without being segmented before
1481 */
1482 if (skb_is_gso(skb))
1483 return true;
1484
1485 return false;
1486}
1487
1464/** 1488/**
1465 * dev_forward_skb - loopback an skb to another netif 1489 * dev_forward_skb - loopback an skb to another netif
1466 * 1490 *
@@ -1484,8 +1508,8 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1484 skb_orphan(skb); 1508 skb_orphan(skb);
1485 nf_reset(skb); 1509 nf_reset(skb);
1486 1510
1487 if (!(dev->flags & IFF_UP) || 1511 if (unlikely(!is_skb_forwardable(dev, skb))) {
1488 (skb->len > (dev->mtu + dev->hard_header_len))) { 1512 atomic_long_inc(&dev->rx_dropped);
1489 kfree_skb(skb); 1513 kfree_skb(skb);
1490 return NET_RX_DROP; 1514 return NET_RX_DROP;
1491 } 1515 }
@@ -1497,6 +1521,14 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1497} 1521}
1498EXPORT_SYMBOL_GPL(dev_forward_skb); 1522EXPORT_SYMBOL_GPL(dev_forward_skb);
1499 1523
1524static inline int deliver_skb(struct sk_buff *skb,
1525 struct packet_type *pt_prev,
1526 struct net_device *orig_dev)
1527{
1528 atomic_inc(&skb->users);
1529 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1530}
1531
1500/* 1532/*
1501 * Support routine. Sends outgoing frames to any network 1533 * Support routine. Sends outgoing frames to any network
1502 * taps currently in use. 1534 * taps currently in use.
@@ -1505,13 +1537,8 @@ EXPORT_SYMBOL_GPL(dev_forward_skb);
1505static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) 1537static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1506{ 1538{
1507 struct packet_type *ptype; 1539 struct packet_type *ptype;
1508 1540 struct sk_buff *skb2 = NULL;
1509#ifdef CONFIG_NET_CLS_ACT 1541 struct packet_type *pt_prev = NULL;
1510 if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1511 net_timestamp_set(skb);
1512#else
1513 net_timestamp_set(skb);
1514#endif
1515 1542
1516 rcu_read_lock(); 1543 rcu_read_lock();
1517 list_for_each_entry_rcu(ptype, &ptype_all, list) { 1544 list_for_each_entry_rcu(ptype, &ptype_all, list) {
@@ -1521,10 +1548,18 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1521 if ((ptype->dev == dev || !ptype->dev) && 1548 if ((ptype->dev == dev || !ptype->dev) &&
1522 (ptype->af_packet_priv == NULL || 1549 (ptype->af_packet_priv == NULL ||
1523 (struct sock *)ptype->af_packet_priv != skb->sk)) { 1550 (struct sock *)ptype->af_packet_priv != skb->sk)) {
1524 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1551 if (pt_prev) {
1552 deliver_skb(skb2, pt_prev, skb->dev);
1553 pt_prev = ptype;
1554 continue;
1555 }
1556
1557 skb2 = skb_clone(skb, GFP_ATOMIC);
1525 if (!skb2) 1558 if (!skb2)
1526 break; 1559 break;
1527 1560
1561 net_timestamp_set(skb2);
1562
1528 /* skb->nh should be correctly 1563 /* skb->nh should be correctly
1529 set by sender, so that the second statement is 1564 set by sender, so that the second statement is
1530 just protection against buggy protocols. 1565 just protection against buggy protocols.
@@ -1543,31 +1578,121 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1543 1578
1544 skb2->transport_header = skb2->network_header; 1579 skb2->transport_header = skb2->network_header;
1545 skb2->pkt_type = PACKET_OUTGOING; 1580 skb2->pkt_type = PACKET_OUTGOING;
1546 ptype->func(skb2, skb->dev, ptype, skb->dev); 1581 pt_prev = ptype;
1547 } 1582 }
1548 } 1583 }
1584 if (pt_prev)
1585 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1549 rcu_read_unlock(); 1586 rcu_read_unlock();
1550} 1587}
1551 1588
1589/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1590 * @dev: Network device
1591 * @txq: number of queues available
1592 *
1593 * If real_num_tx_queues is changed the tc mappings may no longer be
1594 * valid. To resolve this verify the tc mapping remains valid and if
1595 * not NULL the mapping. With no priorities mapping to this
1596 * offset/count pair it will no longer be used. In the worst case TC0
1597 * is invalid nothing can be done so disable priority mappings. If is
1598 * expected that drivers will fix this mapping if they can before
1599 * calling netif_set_real_num_tx_queues.
1600 */
1601static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1602{
1603 int i;
1604 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1605
1606 /* If TC0 is invalidated disable TC mapping */
1607 if (tc->offset + tc->count > txq) {
1608 pr_warning("Number of in use tx queues changed "
1609 "invalidating tc mappings. Priority "
1610 "traffic classification disabled!\n");
1611 dev->num_tc = 0;
1612 return;
1613 }
1614
1615 /* Invalidated prio to tc mappings set to TC0 */
1616 for (i = 1; i < TC_BITMASK + 1; i++) {
1617 int q = netdev_get_prio_tc_map(dev, i);
1618
1619 tc = &dev->tc_to_txq[q];
1620 if (tc->offset + tc->count > txq) {
1621 pr_warning("Number of in use tx queues "
1622 "changed. Priority %i to tc "
1623 "mapping %i is no longer valid "
1624 "setting map to 0\n",
1625 i, q);
1626 netdev_set_prio_tc_map(dev, i, 0);
1627 }
1628 }
1629}
1630
1552/* 1631/*
1553 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues 1632 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1554 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. 1633 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1555 */ 1634 */
1556void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) 1635int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1557{ 1636{
1558 unsigned int real_num = dev->real_num_tx_queues; 1637 int rc;
1638
1639 if (txq < 1 || txq > dev->num_tx_queues)
1640 return -EINVAL;
1641
1642 if (dev->reg_state == NETREG_REGISTERED ||
1643 dev->reg_state == NETREG_UNREGISTERING) {
1644 ASSERT_RTNL();
1559 1645
1560 if (unlikely(txq > dev->num_tx_queues)) 1646 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1561 ; 1647 txq);
1562 else if (txq > real_num) 1648 if (rc)
1563 dev->real_num_tx_queues = txq; 1649 return rc;
1564 else if (txq < real_num) { 1650
1565 dev->real_num_tx_queues = txq; 1651 if (dev->num_tc)
1566 qdisc_reset_all_tx_gt(dev, txq); 1652 netif_setup_tc(dev, txq);
1653
1654 if (txq < dev->real_num_tx_queues)
1655 qdisc_reset_all_tx_gt(dev, txq);
1567 } 1656 }
1657
1658 dev->real_num_tx_queues = txq;
1659 return 0;
1568} 1660}
1569EXPORT_SYMBOL(netif_set_real_num_tx_queues); 1661EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1570 1662
1663#ifdef CONFIG_RPS
1664/**
1665 * netif_set_real_num_rx_queues - set actual number of RX queues used
1666 * @dev: Network device
1667 * @rxq: Actual number of RX queues
1668 *
1669 * This must be called either with the rtnl_lock held or before
1670 * registration of the net device. Returns 0 on success, or a
1671 * negative error code. If called before registration, it always
1672 * succeeds.
1673 */
1674int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1675{
1676 int rc;
1677
1678 if (rxq < 1 || rxq > dev->num_rx_queues)
1679 return -EINVAL;
1680
1681 if (dev->reg_state == NETREG_REGISTERED) {
1682 ASSERT_RTNL();
1683
1684 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1685 rxq);
1686 if (rc)
1687 return rc;
1688 }
1689
1690 dev->real_num_rx_queues = rxq;
1691 return 0;
1692}
1693EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1694#endif
1695
1571static inline void __netif_reschedule(struct Qdisc *q) 1696static inline void __netif_reschedule(struct Qdisc *q)
1572{ 1697{
1573 struct softnet_data *sd; 1698 struct softnet_data *sd;
@@ -1646,32 +1771,6 @@ void netif_device_attach(struct net_device *dev)
1646} 1771}
1647EXPORT_SYMBOL(netif_device_attach); 1772EXPORT_SYMBOL(netif_device_attach);
1648 1773
1649static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1650{
1651 return ((features & NETIF_F_GEN_CSUM) ||
1652 ((features & NETIF_F_IP_CSUM) &&
1653 protocol == htons(ETH_P_IP)) ||
1654 ((features & NETIF_F_IPV6_CSUM) &&
1655 protocol == htons(ETH_P_IPV6)) ||
1656 ((features & NETIF_F_FCOE_CRC) &&
1657 protocol == htons(ETH_P_FCOE)));
1658}
1659
1660static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1661{
1662 if (can_checksum_protocol(dev->features, skb->protocol))
1663 return true;
1664
1665 if (skb->protocol == htons(ETH_P_8021Q)) {
1666 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1667 if (can_checksum_protocol(dev->features & dev->vlan_features,
1668 veh->h_vlan_encapsulated_proto))
1669 return true;
1670 }
1671
1672 return false;
1673}
1674
1675/** 1774/**
1676 * skb_dev_set -- assign a new device to a buffer 1775 * skb_dev_set -- assign a new device to a buffer
1677 * @skb: buffer for the new device 1776 * @skb: buffer for the new device
@@ -1719,7 +1818,7 @@ int skb_checksum_help(struct sk_buff *skb)
1719 goto out_set_summed; 1818 goto out_set_summed;
1720 } 1819 }
1721 1820
1722 offset = skb->csum_start - skb_headroom(skb); 1821 offset = skb_checksum_start_offset(skb);
1723 BUG_ON(offset >= skb_headlen(skb)); 1822 BUG_ON(offset >= skb_headlen(skb));
1724 csum = skb_checksum(skb, offset, skb->len - offset, 0); 1823 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1725 1824
@@ -1751,13 +1850,25 @@ EXPORT_SYMBOL(skb_checksum_help);
1751 * It may return NULL if the skb requires no segmentation. This is 1850 * It may return NULL if the skb requires no segmentation. This is
1752 * only possible when GSO is used for verifying header integrity. 1851 * only possible when GSO is used for verifying header integrity.
1753 */ 1852 */
1754struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) 1853struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1755{ 1854{
1756 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); 1855 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1757 struct packet_type *ptype; 1856 struct packet_type *ptype;
1758 __be16 type = skb->protocol; 1857 __be16 type = skb->protocol;
1858 int vlan_depth = ETH_HLEN;
1759 int err; 1859 int err;
1760 1860
1861 while (type == htons(ETH_P_8021Q)) {
1862 struct vlan_hdr *vh;
1863
1864 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1865 return ERR_PTR(-EINVAL);
1866
1867 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1868 type = vh->h_vlan_encapsulated_proto;
1869 vlan_depth += VLAN_HLEN;
1870 }
1871
1761 skb_reset_mac_header(skb); 1872 skb_reset_mac_header(skb);
1762 skb->mac_len = skb->network_header - skb->mac_header; 1873 skb->mac_len = skb->network_header - skb->mac_header;
1763 __skb_pull(skb, skb->mac_len); 1874 __skb_pull(skb, skb->mac_len);
@@ -1769,8 +1880,7 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1769 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo) 1880 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1770 dev->ethtool_ops->get_drvinfo(dev, &info); 1881 dev->ethtool_ops->get_drvinfo(dev, &info);
1771 1882
1772 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d " 1883 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1773 "ip_summed=%d",
1774 info.driver, dev ? dev->features : 0L, 1884 info.driver, dev ? dev->features : 0L,
1775 skb->sk ? skb->sk->sk_route_caps : 0L, 1885 skb->sk ? skb->sk->sk_route_caps : 0L,
1776 skb->len, skb->data_len, skb->ip_summed); 1886 skb->len, skb->data_len, skb->ip_summed);
@@ -1873,16 +1983,14 @@ static void dev_gso_skb_destructor(struct sk_buff *skb)
1873/** 1983/**
1874 * dev_gso_segment - Perform emulated hardware segmentation on skb. 1984 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1875 * @skb: buffer to segment 1985 * @skb: buffer to segment
1986 * @features: device features as applicable to this skb
1876 * 1987 *
1877 * This function segments the given skb and stores the list of segments 1988 * This function segments the given skb and stores the list of segments
1878 * in skb->next. 1989 * in skb->next.
1879 */ 1990 */
1880static int dev_gso_segment(struct sk_buff *skb) 1991static int dev_gso_segment(struct sk_buff *skb, int features)
1881{ 1992{
1882 struct net_device *dev = skb->dev;
1883 struct sk_buff *segs; 1993 struct sk_buff *segs;
1884 int features = dev->features & ~(illegal_highdma(dev, skb) ?
1885 NETIF_F_SG : 0);
1886 1994
1887 segs = skb_gso_segment(skb, features); 1995 segs = skb_gso_segment(skb, features);
1888 1996
@@ -1902,14 +2010,14 @@ static int dev_gso_segment(struct sk_buff *skb)
1902 2010
1903/* 2011/*
1904 * Try to orphan skb early, right before transmission by the device. 2012 * Try to orphan skb early, right before transmission by the device.
1905 * We cannot orphan skb if tx timestamp is requested, since 2013 * We cannot orphan skb if tx timestamp is requested or the sk-reference
1906 * drivers need to call skb_tstamp_tx() to send the timestamp. 2014 * is needed on driver level for other reasons, e.g. see net/can/raw.c
1907 */ 2015 */
1908static inline void skb_orphan_try(struct sk_buff *skb) 2016static inline void skb_orphan_try(struct sk_buff *skb)
1909{ 2017{
1910 struct sock *sk = skb->sk; 2018 struct sock *sk = skb->sk;
1911 2019
1912 if (sk && !skb_tx(skb)->flags) { 2020 if (sk && !skb_shinfo(skb)->tx_flags) {
1913 /* skb_tx_hash() wont be able to get sk. 2021 /* skb_tx_hash() wont be able to get sk.
1914 * We copy sk_hash into skb->rxhash 2022 * We copy sk_hash into skb->rxhash
1915 */ 2023 */
@@ -1919,6 +2027,53 @@ static inline void skb_orphan_try(struct sk_buff *skb)
1919 } 2027 }
1920} 2028}
1921 2029
2030static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2031{
2032 return ((features & NETIF_F_GEN_CSUM) ||
2033 ((features & NETIF_F_V4_CSUM) &&
2034 protocol == htons(ETH_P_IP)) ||
2035 ((features & NETIF_F_V6_CSUM) &&
2036 protocol == htons(ETH_P_IPV6)) ||
2037 ((features & NETIF_F_FCOE_CRC) &&
2038 protocol == htons(ETH_P_FCOE)));
2039}
2040
2041static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2042{
2043 if (!can_checksum_protocol(features, protocol)) {
2044 features &= ~NETIF_F_ALL_CSUM;
2045 features &= ~NETIF_F_SG;
2046 } else if (illegal_highdma(skb->dev, skb)) {
2047 features &= ~NETIF_F_SG;
2048 }
2049
2050 return features;
2051}
2052
2053u32 netif_skb_features(struct sk_buff *skb)
2054{
2055 __be16 protocol = skb->protocol;
2056 u32 features = skb->dev->features;
2057
2058 if (protocol == htons(ETH_P_8021Q)) {
2059 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2060 protocol = veh->h_vlan_encapsulated_proto;
2061 } else if (!vlan_tx_tag_present(skb)) {
2062 return harmonize_features(skb, protocol, features);
2063 }
2064
2065 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2066
2067 if (protocol != htons(ETH_P_8021Q)) {
2068 return harmonize_features(skb, protocol, features);
2069 } else {
2070 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2071 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2072 return harmonize_features(skb, protocol, features);
2073 }
2074}
2075EXPORT_SYMBOL(netif_skb_features);
2076
1922/* 2077/*
1923 * Returns true if either: 2078 * Returns true if either:
1924 * 1. skb has frag_list and the device doesn't support FRAGLIST, or 2079 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
@@ -1927,12 +2082,13 @@ static inline void skb_orphan_try(struct sk_buff *skb)
1927 * support DMA from it. 2082 * support DMA from it.
1928 */ 2083 */
1929static inline int skb_needs_linearize(struct sk_buff *skb, 2084static inline int skb_needs_linearize(struct sk_buff *skb,
1930 struct net_device *dev) 2085 int features)
1931{ 2086{
1932 return skb_is_nonlinear(skb) && 2087 return skb_is_nonlinear(skb) &&
1933 ((skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) || 2088 ((skb_has_frag_list(skb) &&
1934 (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) || 2089 !(features & NETIF_F_FRAGLIST)) ||
1935 illegal_highdma(dev, skb)))); 2090 (skb_shinfo(skb)->nr_frags &&
2091 !(features & NETIF_F_SG)));
1936} 2092}
1937 2093
1938int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, 2094int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
@@ -1940,27 +2096,41 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1940{ 2096{
1941 const struct net_device_ops *ops = dev->netdev_ops; 2097 const struct net_device_ops *ops = dev->netdev_ops;
1942 int rc = NETDEV_TX_OK; 2098 int rc = NETDEV_TX_OK;
2099 unsigned int skb_len;
1943 2100
1944 if (likely(!skb->next)) { 2101 if (likely(!skb->next)) {
1945 if (!list_empty(&ptype_all)) 2102 u32 features;
1946 dev_queue_xmit_nit(skb, dev);
1947 2103
1948 /* 2104 /*
1949 * If device doesnt need skb->dst, release it right now while 2105 * If device doesn't need skb->dst, release it right now while
1950 * its hot in this cpu cache 2106 * its hot in this cpu cache
1951 */ 2107 */
1952 if (dev->priv_flags & IFF_XMIT_DST_RELEASE) 2108 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1953 skb_dst_drop(skb); 2109 skb_dst_drop(skb);
1954 2110
2111 if (!list_empty(&ptype_all))
2112 dev_queue_xmit_nit(skb, dev);
2113
1955 skb_orphan_try(skb); 2114 skb_orphan_try(skb);
1956 2115
1957 if (netif_needs_gso(dev, skb)) { 2116 features = netif_skb_features(skb);
1958 if (unlikely(dev_gso_segment(skb))) 2117
2118 if (vlan_tx_tag_present(skb) &&
2119 !(features & NETIF_F_HW_VLAN_TX)) {
2120 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2121 if (unlikely(!skb))
2122 goto out;
2123
2124 skb->vlan_tci = 0;
2125 }
2126
2127 if (netif_needs_gso(skb, features)) {
2128 if (unlikely(dev_gso_segment(skb, features)))
1959 goto out_kfree_skb; 2129 goto out_kfree_skb;
1960 if (skb->next) 2130 if (skb->next)
1961 goto gso; 2131 goto gso;
1962 } else { 2132 } else {
1963 if (skb_needs_linearize(skb, dev) && 2133 if (skb_needs_linearize(skb, features) &&
1964 __skb_linearize(skb)) 2134 __skb_linearize(skb))
1965 goto out_kfree_skb; 2135 goto out_kfree_skb;
1966 2136
@@ -1969,15 +2139,17 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1969 * checksumming here. 2139 * checksumming here.
1970 */ 2140 */
1971 if (skb->ip_summed == CHECKSUM_PARTIAL) { 2141 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1972 skb_set_transport_header(skb, skb->csum_start - 2142 skb_set_transport_header(skb,
1973 skb_headroom(skb)); 2143 skb_checksum_start_offset(skb));
1974 if (!dev_can_checksum(dev, skb) && 2144 if (!(features & NETIF_F_ALL_CSUM) &&
1975 skb_checksum_help(skb)) 2145 skb_checksum_help(skb))
1976 goto out_kfree_skb; 2146 goto out_kfree_skb;
1977 } 2147 }
1978 } 2148 }
1979 2149
2150 skb_len = skb->len;
1980 rc = ops->ndo_start_xmit(skb, dev); 2151 rc = ops->ndo_start_xmit(skb, dev);
2152 trace_net_dev_xmit(skb, rc, dev, skb_len);
1981 if (rc == NETDEV_TX_OK) 2153 if (rc == NETDEV_TX_OK)
1982 txq_trans_update(txq); 2154 txq_trans_update(txq);
1983 return rc; 2155 return rc;
@@ -1991,13 +2163,15 @@ gso:
1991 nskb->next = NULL; 2163 nskb->next = NULL;
1992 2164
1993 /* 2165 /*
1994 * If device doesnt need nskb->dst, release it right now while 2166 * If device doesn't need nskb->dst, release it right now while
1995 * its hot in this cpu cache 2167 * its hot in this cpu cache
1996 */ 2168 */
1997 if (dev->priv_flags & IFF_XMIT_DST_RELEASE) 2169 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1998 skb_dst_drop(nskb); 2170 skb_dst_drop(nskb);
1999 2171
2172 skb_len = nskb->len;
2000 rc = ops->ndo_start_xmit(nskb, dev); 2173 rc = ops->ndo_start_xmit(nskb, dev);
2174 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2001 if (unlikely(rc != NETDEV_TX_OK)) { 2175 if (unlikely(rc != NETDEV_TX_OK)) {
2002 if (rc & ~NETDEV_TX_MASK) 2176 if (rc & ~NETDEV_TX_MASK)
2003 goto out_kfree_gso_skb; 2177 goto out_kfree_gso_skb;
@@ -2015,31 +2189,45 @@ out_kfree_gso_skb:
2015 skb->destructor = DEV_GSO_CB(skb)->destructor; 2189 skb->destructor = DEV_GSO_CB(skb)->destructor;
2016out_kfree_skb: 2190out_kfree_skb:
2017 kfree_skb(skb); 2191 kfree_skb(skb);
2192out:
2018 return rc; 2193 return rc;
2019} 2194}
2020 2195
2021static u32 hashrnd __read_mostly; 2196static u32 hashrnd __read_mostly;
2022 2197
2023u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) 2198/*
2199 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2200 * to be used as a distribution range.
2201 */
2202u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2203 unsigned int num_tx_queues)
2024{ 2204{
2025 u32 hash; 2205 u32 hash;
2206 u16 qoffset = 0;
2207 u16 qcount = num_tx_queues;
2026 2208
2027 if (skb_rx_queue_recorded(skb)) { 2209 if (skb_rx_queue_recorded(skb)) {
2028 hash = skb_get_rx_queue(skb); 2210 hash = skb_get_rx_queue(skb);
2029 while (unlikely(hash >= dev->real_num_tx_queues)) 2211 while (unlikely(hash >= num_tx_queues))
2030 hash -= dev->real_num_tx_queues; 2212 hash -= num_tx_queues;
2031 return hash; 2213 return hash;
2032 } 2214 }
2033 2215
2216 if (dev->num_tc) {
2217 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2218 qoffset = dev->tc_to_txq[tc].offset;
2219 qcount = dev->tc_to_txq[tc].count;
2220 }
2221
2034 if (skb->sk && skb->sk->sk_hash) 2222 if (skb->sk && skb->sk->sk_hash)
2035 hash = skb->sk->sk_hash; 2223 hash = skb->sk->sk_hash;
2036 else 2224 else
2037 hash = (__force u16) skb->protocol ^ skb->rxhash; 2225 hash = (__force u16) skb->protocol ^ skb->rxhash;
2038 hash = jhash_1word(hash, hashrnd); 2226 hash = jhash_1word(hash, hashrnd);
2039 2227
2040 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); 2228 return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2041} 2229}
2042EXPORT_SYMBOL(skb_tx_hash); 2230EXPORT_SYMBOL(__skb_tx_hash);
2043 2231
2044static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) 2232static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2045{ 2233{
@@ -2054,26 +2242,70 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2054 return queue_index; 2242 return queue_index;
2055} 2243}
2056 2244
2245static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2246{
2247#ifdef CONFIG_XPS
2248 struct xps_dev_maps *dev_maps;
2249 struct xps_map *map;
2250 int queue_index = -1;
2251
2252 rcu_read_lock();
2253 dev_maps = rcu_dereference(dev->xps_maps);
2254 if (dev_maps) {
2255 map = rcu_dereference(
2256 dev_maps->cpu_map[raw_smp_processor_id()]);
2257 if (map) {
2258 if (map->len == 1)
2259 queue_index = map->queues[0];
2260 else {
2261 u32 hash;
2262 if (skb->sk && skb->sk->sk_hash)
2263 hash = skb->sk->sk_hash;
2264 else
2265 hash = (__force u16) skb->protocol ^
2266 skb->rxhash;
2267 hash = jhash_1word(hash, hashrnd);
2268 queue_index = map->queues[
2269 ((u64)hash * map->len) >> 32];
2270 }
2271 if (unlikely(queue_index >= dev->real_num_tx_queues))
2272 queue_index = -1;
2273 }
2274 }
2275 rcu_read_unlock();
2276
2277 return queue_index;
2278#else
2279 return -1;
2280#endif
2281}
2282
2057static struct netdev_queue *dev_pick_tx(struct net_device *dev, 2283static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2058 struct sk_buff *skb) 2284 struct sk_buff *skb)
2059{ 2285{
2060 int queue_index; 2286 int queue_index;
2061 const struct net_device_ops *ops = dev->netdev_ops; 2287 const struct net_device_ops *ops = dev->netdev_ops;
2062 2288
2063 if (ops->ndo_select_queue) { 2289 if (dev->real_num_tx_queues == 1)
2290 queue_index = 0;
2291 else if (ops->ndo_select_queue) {
2064 queue_index = ops->ndo_select_queue(dev, skb); 2292 queue_index = ops->ndo_select_queue(dev, skb);
2065 queue_index = dev_cap_txqueue(dev, queue_index); 2293 queue_index = dev_cap_txqueue(dev, queue_index);
2066 } else { 2294 } else {
2067 struct sock *sk = skb->sk; 2295 struct sock *sk = skb->sk;
2068 queue_index = sk_tx_queue_get(sk); 2296 queue_index = sk_tx_queue_get(sk);
2069 if (queue_index < 0) {
2070 2297
2071 queue_index = 0; 2298 if (queue_index < 0 || skb->ooo_okay ||
2072 if (dev->real_num_tx_queues > 1) 2299 queue_index >= dev->real_num_tx_queues) {
2300 int old_index = queue_index;
2301
2302 queue_index = get_xps_queue(dev, skb);
2303 if (queue_index < 0)
2073 queue_index = skb_tx_hash(dev, skb); 2304 queue_index = skb_tx_hash(dev, skb);
2074 2305
2075 if (sk) { 2306 if (queue_index != old_index && sk) {
2076 struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1); 2307 struct dst_entry *dst =
2308 rcu_dereference_check(sk->sk_dst_cache, 1);
2077 2309
2078 if (dst && skb_dst(skb) == dst) 2310 if (dst && skb_dst(skb) == dst)
2079 sk_tx_queue_set(sk, queue_index); 2311 sk_tx_queue_set(sk, queue_index);
@@ -2090,15 +2322,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2090 struct netdev_queue *txq) 2322 struct netdev_queue *txq)
2091{ 2323{
2092 spinlock_t *root_lock = qdisc_lock(q); 2324 spinlock_t *root_lock = qdisc_lock(q);
2093 bool contended = qdisc_is_running(q); 2325 bool contended;
2094 int rc; 2326 int rc;
2095 2327
2328 qdisc_skb_cb(skb)->pkt_len = skb->len;
2329 qdisc_calculate_pkt_len(skb, q);
2096 /* 2330 /*
2097 * Heuristic to force contended enqueues to serialize on a 2331 * Heuristic to force contended enqueues to serialize on a
2098 * separate lock before trying to get qdisc main lock. 2332 * separate lock before trying to get qdisc main lock.
2099 * This permits __QDISC_STATE_RUNNING owner to get the lock more often 2333 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2100 * and dequeue packets faster. 2334 * and dequeue packets faster.
2101 */ 2335 */
2336 contended = qdisc_is_running(q);
2102 if (unlikely(contended)) 2337 if (unlikely(contended))
2103 spin_lock(&q->busylock); 2338 spin_lock(&q->busylock);
2104 2339
@@ -2115,7 +2350,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2115 */ 2350 */
2116 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) 2351 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2117 skb_dst_force(skb); 2352 skb_dst_force(skb);
2118 __qdisc_update_bstats(q, skb->len); 2353
2354 qdisc_bstats_update(q, skb);
2355
2119 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { 2356 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2120 if (unlikely(contended)) { 2357 if (unlikely(contended)) {
2121 spin_unlock(&q->busylock); 2358 spin_unlock(&q->busylock);
@@ -2128,7 +2365,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2128 rc = NET_XMIT_SUCCESS; 2365 rc = NET_XMIT_SUCCESS;
2129 } else { 2366 } else {
2130 skb_dst_force(skb); 2367 skb_dst_force(skb);
2131 rc = qdisc_enqueue_root(skb, q); 2368 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2132 if (qdisc_run_begin(q)) { 2369 if (qdisc_run_begin(q)) {
2133 if (unlikely(contended)) { 2370 if (unlikely(contended)) {
2134 spin_unlock(&q->busylock); 2371 spin_unlock(&q->busylock);
@@ -2143,6 +2380,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2143 return rc; 2380 return rc;
2144} 2381}
2145 2382
2383static DEFINE_PER_CPU(int, xmit_recursion);
2384#define RECURSION_LIMIT 10
2385
2146/** 2386/**
2147 * dev_queue_xmit - transmit a buffer 2387 * dev_queue_xmit - transmit a buffer
2148 * @skb: buffer to transmit 2388 * @skb: buffer to transmit
@@ -2186,6 +2426,7 @@ int dev_queue_xmit(struct sk_buff *skb)
2186#ifdef CONFIG_NET_CLS_ACT 2426#ifdef CONFIG_NET_CLS_ACT
2187 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); 2427 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2188#endif 2428#endif
2429 trace_net_dev_queue(skb);
2189 if (q->enqueue) { 2430 if (q->enqueue) {
2190 rc = __dev_xmit_skb(skb, q, dev, txq); 2431 rc = __dev_xmit_skb(skb, q, dev, txq);
2191 goto out; 2432 goto out;
@@ -2208,10 +2449,15 @@ int dev_queue_xmit(struct sk_buff *skb)
2208 2449
2209 if (txq->xmit_lock_owner != cpu) { 2450 if (txq->xmit_lock_owner != cpu) {
2210 2451
2452 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2453 goto recursion_alert;
2454
2211 HARD_TX_LOCK(dev, txq, cpu); 2455 HARD_TX_LOCK(dev, txq, cpu);
2212 2456
2213 if (!netif_tx_queue_stopped(txq)) { 2457 if (!netif_tx_queue_stopped(txq)) {
2458 __this_cpu_inc(xmit_recursion);
2214 rc = dev_hard_start_xmit(skb, dev, txq); 2459 rc = dev_hard_start_xmit(skb, dev, txq);
2460 __this_cpu_dec(xmit_recursion);
2215 if (dev_xmit_complete(rc)) { 2461 if (dev_xmit_complete(rc)) {
2216 HARD_TX_UNLOCK(dev, txq); 2462 HARD_TX_UNLOCK(dev, txq);
2217 goto out; 2463 goto out;
@@ -2223,7 +2469,9 @@ int dev_queue_xmit(struct sk_buff *skb)
2223 "queue packet!\n", dev->name); 2469 "queue packet!\n", dev->name);
2224 } else { 2470 } else {
2225 /* Recursion is detected! It is possible, 2471 /* Recursion is detected! It is possible,
2226 * unfortunately */ 2472 * unfortunately
2473 */
2474recursion_alert:
2227 if (net_ratelimit()) 2475 if (net_ratelimit())
2228 printk(KERN_CRIT "Dead loop on virtual device " 2476 printk(KERN_CRIT "Dead loop on virtual device "
2229 "%s, fix it urgently!\n", dev->name); 2477 "%s, fix it urgently!\n", dev->name);
@@ -2259,69 +2507,44 @@ static inline void ____napi_schedule(struct softnet_data *sd,
2259 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 2507 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2260} 2508}
2261 2509
2262#ifdef CONFIG_RPS
2263
2264/* One global table that all flow-based protocols share. */
2265struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
2266EXPORT_SYMBOL(rps_sock_flow_table);
2267
2268/* 2510/*
2269 * get_rps_cpu is called from netif_receive_skb and returns the target 2511 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2270 * CPU from the RPS map of the receiving queue for a given skb. 2512 * and src/dst port numbers. Returns a non-zero hash number on success
2271 * rcu_read_lock must be held on entry. 2513 * and 0 on failure.
2272 */ 2514 */
2273static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, 2515__u32 __skb_get_rxhash(struct sk_buff *skb)
2274 struct rps_dev_flow **rflowp)
2275{ 2516{
2276 struct ipv6hdr *ip6; 2517 int nhoff, hash = 0, poff;
2277 struct iphdr *ip; 2518 const struct ipv6hdr *ip6;
2278 struct netdev_rx_queue *rxqueue; 2519 const struct iphdr *ip;
2279 struct rps_map *map;
2280 struct rps_dev_flow_table *flow_table;
2281 struct rps_sock_flow_table *sock_flow_table;
2282 int cpu = -1;
2283 u8 ip_proto; 2520 u8 ip_proto;
2284 u16 tcpu;
2285 u32 addr1, addr2, ihl; 2521 u32 addr1, addr2, ihl;
2286 union { 2522 union {
2287 u32 v32; 2523 u32 v32;
2288 u16 v16[2]; 2524 u16 v16[2];
2289 } ports; 2525 } ports;
2290 2526
2291 if (skb_rx_queue_recorded(skb)) { 2527 nhoff = skb_network_offset(skb);
2292 u16 index = skb_get_rx_queue(skb);
2293 if (unlikely(index >= dev->num_rx_queues)) {
2294 WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
2295 "on queue %u, but number of RX queues is %u\n",
2296 dev->name, index, dev->num_rx_queues);
2297 goto done;
2298 }
2299 rxqueue = dev->_rx + index;
2300 } else
2301 rxqueue = dev->_rx;
2302
2303 if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
2304 goto done;
2305
2306 if (skb->rxhash)
2307 goto got_hash; /* Skip hash computation on packet header */
2308 2528
2309 switch (skb->protocol) { 2529 switch (skb->protocol) {
2310 case __constant_htons(ETH_P_IP): 2530 case __constant_htons(ETH_P_IP):
2311 if (!pskb_may_pull(skb, sizeof(*ip))) 2531 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2312 goto done; 2532 goto done;
2313 2533
2314 ip = (struct iphdr *) skb->data; 2534 ip = (const struct iphdr *) (skb->data + nhoff);
2315 ip_proto = ip->protocol; 2535 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2536 ip_proto = 0;
2537 else
2538 ip_proto = ip->protocol;
2316 addr1 = (__force u32) ip->saddr; 2539 addr1 = (__force u32) ip->saddr;
2317 addr2 = (__force u32) ip->daddr; 2540 addr2 = (__force u32) ip->daddr;
2318 ihl = ip->ihl; 2541 ihl = ip->ihl;
2319 break; 2542 break;
2320 case __constant_htons(ETH_P_IPV6): 2543 case __constant_htons(ETH_P_IPV6):
2321 if (!pskb_may_pull(skb, sizeof(*ip6))) 2544 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2322 goto done; 2545 goto done;
2323 2546
2324 ip6 = (struct ipv6hdr *) skb->data; 2547 ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2325 ip_proto = ip6->nexthdr; 2548 ip_proto = ip6->nexthdr;
2326 addr1 = (__force u32) ip6->saddr.s6_addr32[3]; 2549 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2327 addr2 = (__force u32) ip6->daddr.s6_addr32[3]; 2550 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
@@ -2330,33 +2553,130 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2330 default: 2553 default:
2331 goto done; 2554 goto done;
2332 } 2555 }
2333 switch (ip_proto) { 2556
2334 case IPPROTO_TCP: 2557 ports.v32 = 0;
2335 case IPPROTO_UDP: 2558 poff = proto_ports_offset(ip_proto);
2336 case IPPROTO_DCCP: 2559 if (poff >= 0) {
2337 case IPPROTO_ESP: 2560 nhoff += ihl * 4 + poff;
2338 case IPPROTO_AH: 2561 if (pskb_may_pull(skb, nhoff + 4)) {
2339 case IPPROTO_SCTP: 2562 ports.v32 = * (__force u32 *) (skb->data + nhoff);
2340 case IPPROTO_UDPLITE:
2341 if (pskb_may_pull(skb, (ihl * 4) + 4)) {
2342 ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
2343 if (ports.v16[1] < ports.v16[0]) 2563 if (ports.v16[1] < ports.v16[0])
2344 swap(ports.v16[0], ports.v16[1]); 2564 swap(ports.v16[0], ports.v16[1]);
2345 break;
2346 } 2565 }
2347 default:
2348 ports.v32 = 0;
2349 break;
2350 } 2566 }
2351 2567
2352 /* get a consistent hash (same value on both flow directions) */ 2568 /* get a consistent hash (same value on both flow directions) */
2353 if (addr2 < addr1) 2569 if (addr2 < addr1)
2354 swap(addr1, addr2); 2570 swap(addr1, addr2);
2355 skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2356 if (!skb->rxhash)
2357 skb->rxhash = 1;
2358 2571
2359got_hash: 2572 hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2573 if (!hash)
2574 hash = 1;
2575
2576done:
2577 return hash;
2578}
2579EXPORT_SYMBOL(__skb_get_rxhash);
2580
2581#ifdef CONFIG_RPS
2582
2583/* One global table that all flow-based protocols share. */
2584struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2585EXPORT_SYMBOL(rps_sock_flow_table);
2586
2587static struct rps_dev_flow *
2588set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2589 struct rps_dev_flow *rflow, u16 next_cpu)
2590{
2591 u16 tcpu;
2592
2593 tcpu = rflow->cpu = next_cpu;
2594 if (tcpu != RPS_NO_CPU) {
2595#ifdef CONFIG_RFS_ACCEL
2596 struct netdev_rx_queue *rxqueue;
2597 struct rps_dev_flow_table *flow_table;
2598 struct rps_dev_flow *old_rflow;
2599 u32 flow_id;
2600 u16 rxq_index;
2601 int rc;
2602
2603 /* Should we steer this flow to a different hardware queue? */
2604 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2605 !(dev->features & NETIF_F_NTUPLE))
2606 goto out;
2607 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2608 if (rxq_index == skb_get_rx_queue(skb))
2609 goto out;
2610
2611 rxqueue = dev->_rx + rxq_index;
2612 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2613 if (!flow_table)
2614 goto out;
2615 flow_id = skb->rxhash & flow_table->mask;
2616 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2617 rxq_index, flow_id);
2618 if (rc < 0)
2619 goto out;
2620 old_rflow = rflow;
2621 rflow = &flow_table->flows[flow_id];
2622 rflow->cpu = next_cpu;
2623 rflow->filter = rc;
2624 if (old_rflow->filter == rflow->filter)
2625 old_rflow->filter = RPS_NO_FILTER;
2626 out:
2627#endif
2628 rflow->last_qtail =
2629 per_cpu(softnet_data, tcpu).input_queue_head;
2630 }
2631
2632 return rflow;
2633}
2634
2635/*
2636 * get_rps_cpu is called from netif_receive_skb and returns the target
2637 * CPU from the RPS map of the receiving queue for a given skb.
2638 * rcu_read_lock must be held on entry.
2639 */
2640static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2641 struct rps_dev_flow **rflowp)
2642{
2643 struct netdev_rx_queue *rxqueue;
2644 struct rps_map *map;
2645 struct rps_dev_flow_table *flow_table;
2646 struct rps_sock_flow_table *sock_flow_table;
2647 int cpu = -1;
2648 u16 tcpu;
2649
2650 if (skb_rx_queue_recorded(skb)) {
2651 u16 index = skb_get_rx_queue(skb);
2652 if (unlikely(index >= dev->real_num_rx_queues)) {
2653 WARN_ONCE(dev->real_num_rx_queues > 1,
2654 "%s received packet on queue %u, but number "
2655 "of RX queues is %u\n",
2656 dev->name, index, dev->real_num_rx_queues);
2657 goto done;
2658 }
2659 rxqueue = dev->_rx + index;
2660 } else
2661 rxqueue = dev->_rx;
2662
2663 map = rcu_dereference(rxqueue->rps_map);
2664 if (map) {
2665 if (map->len == 1 &&
2666 !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2667 tcpu = map->cpus[0];
2668 if (cpu_online(tcpu))
2669 cpu = tcpu;
2670 goto done;
2671 }
2672 } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2673 goto done;
2674 }
2675
2676 skb_reset_network_header(skb);
2677 if (!skb_get_rxhash(skb))
2678 goto done;
2679
2360 flow_table = rcu_dereference(rxqueue->rps_flow_table); 2680 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2361 sock_flow_table = rcu_dereference(rps_sock_flow_table); 2681 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2362 if (flow_table && sock_flow_table) { 2682 if (flow_table && sock_flow_table) {
@@ -2383,12 +2703,9 @@ got_hash:
2383 if (unlikely(tcpu != next_cpu) && 2703 if (unlikely(tcpu != next_cpu) &&
2384 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || 2704 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2385 ((int)(per_cpu(softnet_data, tcpu).input_queue_head - 2705 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2386 rflow->last_qtail)) >= 0)) { 2706 rflow->last_qtail)) >= 0))
2387 tcpu = rflow->cpu = next_cpu; 2707 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2388 if (tcpu != RPS_NO_CPU) 2708
2389 rflow->last_qtail = per_cpu(softnet_data,
2390 tcpu).input_queue_head;
2391 }
2392 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { 2709 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2393 *rflowp = rflow; 2710 *rflowp = rflow;
2394 cpu = tcpu; 2711 cpu = tcpu;
@@ -2396,7 +2713,6 @@ got_hash:
2396 } 2713 }
2397 } 2714 }
2398 2715
2399 map = rcu_dereference(rxqueue->rps_map);
2400 if (map) { 2716 if (map) {
2401 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; 2717 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2402 2718
@@ -2410,6 +2726,46 @@ done:
2410 return cpu; 2726 return cpu;
2411} 2727}
2412 2728
2729#ifdef CONFIG_RFS_ACCEL
2730
2731/**
2732 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2733 * @dev: Device on which the filter was set
2734 * @rxq_index: RX queue index
2735 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2736 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2737 *
2738 * Drivers that implement ndo_rx_flow_steer() should periodically call
2739 * this function for each installed filter and remove the filters for
2740 * which it returns %true.
2741 */
2742bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2743 u32 flow_id, u16 filter_id)
2744{
2745 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2746 struct rps_dev_flow_table *flow_table;
2747 struct rps_dev_flow *rflow;
2748 bool expire = true;
2749 int cpu;
2750
2751 rcu_read_lock();
2752 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2753 if (flow_table && flow_id <= flow_table->mask) {
2754 rflow = &flow_table->flows[flow_id];
2755 cpu = ACCESS_ONCE(rflow->cpu);
2756 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2757 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2758 rflow->last_qtail) <
2759 (int)(10 * flow_table->mask)))
2760 expire = false;
2761 }
2762 rcu_read_unlock();
2763 return expire;
2764}
2765EXPORT_SYMBOL(rps_may_expire_flow);
2766
2767#endif /* CONFIG_RFS_ACCEL */
2768
2413/* Called from hardirq (IPI) context */ 2769/* Called from hardirq (IPI) context */
2414static void rps_trigger_softirq(void *data) 2770static void rps_trigger_softirq(void *data)
2415{ 2771{
@@ -2482,6 +2838,7 @@ enqueue:
2482 2838
2483 local_irq_restore(flags); 2839 local_irq_restore(flags);
2484 2840
2841 atomic_long_inc(&skb->dev->rx_dropped);
2485 kfree_skb(skb); 2842 kfree_skb(skb);
2486 return NET_RX_DROP; 2843 return NET_RX_DROP;
2487} 2844}
@@ -2512,6 +2869,7 @@ int netif_rx(struct sk_buff *skb)
2512 if (netdev_tstamp_prequeue) 2869 if (netdev_tstamp_prequeue)
2513 net_timestamp_check(skb); 2870 net_timestamp_check(skb);
2514 2871
2872 trace_netif_rx(skb);
2515#ifdef CONFIG_RPS 2873#ifdef CONFIG_RPS
2516 { 2874 {
2517 struct rps_dev_flow voidflow, *rflow = &voidflow; 2875 struct rps_dev_flow voidflow, *rflow = &voidflow;
@@ -2571,6 +2929,7 @@ static void net_tx_action(struct softirq_action *h)
2571 clist = clist->next; 2929 clist = clist->next;
2572 2930
2573 WARN_ON(atomic_read(&skb->users)); 2931 WARN_ON(atomic_read(&skb->users));
2932 trace_kfree_skb(skb, net_tx_action);
2574 __kfree_skb(skb); 2933 __kfree_skb(skb);
2575 } 2934 }
2576 } 2935 }
@@ -2611,14 +2970,6 @@ static void net_tx_action(struct softirq_action *h)
2611 } 2970 }
2612} 2971}
2613 2972
2614static inline int deliver_skb(struct sk_buff *skb,
2615 struct packet_type *pt_prev,
2616 struct net_device *orig_dev)
2617{
2618 atomic_inc(&skb->users);
2619 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2620}
2621
2622#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ 2973#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2623 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) 2974 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2624/* This hook is defined here for ATM LANE */ 2975/* This hook is defined here for ATM LANE */
@@ -2632,15 +2983,14 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2632 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions 2983 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2633 * a compare and 2 stores extra right now if we dont have it on 2984 * a compare and 2 stores extra right now if we dont have it on
2634 * but have CONFIG_NET_CLS_ACT 2985 * but have CONFIG_NET_CLS_ACT
2635 * NOTE: This doesnt stop any functionality; if you dont have 2986 * NOTE: This doesn't stop any functionality; if you dont have
2636 * the ingress scheduler, you just cant add policies on ingress. 2987 * the ingress scheduler, you just can't add policies on ingress.
2637 * 2988 *
2638 */ 2989 */
2639static int ing_filter(struct sk_buff *skb) 2990static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2640{ 2991{
2641 struct net_device *dev = skb->dev; 2992 struct net_device *dev = skb->dev;
2642 u32 ttl = G_TC_RTTL(skb->tc_verd); 2993 u32 ttl = G_TC_RTTL(skb->tc_verd);
2643 struct netdev_queue *rxq;
2644 int result = TC_ACT_OK; 2994 int result = TC_ACT_OK;
2645 struct Qdisc *q; 2995 struct Qdisc *q;
2646 2996
@@ -2654,8 +3004,6 @@ static int ing_filter(struct sk_buff *skb)
2654 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); 3004 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2655 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); 3005 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2656 3006
2657 rxq = &dev->rx_queue;
2658
2659 q = rxq->qdisc; 3007 q = rxq->qdisc;
2660 if (q != &noop_qdisc) { 3008 if (q != &noop_qdisc) {
2661 spin_lock(qdisc_lock(q)); 3009 spin_lock(qdisc_lock(q));
@@ -2671,7 +3019,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2671 struct packet_type **pt_prev, 3019 struct packet_type **pt_prev,
2672 int *ret, struct net_device *orig_dev) 3020 int *ret, struct net_device *orig_dev)
2673{ 3021{
2674 if (skb->dev->rx_queue.qdisc == &noop_qdisc) 3022 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3023
3024 if (!rxq || rxq->qdisc == &noop_qdisc)
2675 goto out; 3025 goto out;
2676 3026
2677 if (*pt_prev) { 3027 if (*pt_prev) {
@@ -2679,7 +3029,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2679 *pt_prev = NULL; 3029 *pt_prev = NULL;
2680 } 3030 }
2681 3031
2682 switch (ing_filter(skb)) { 3032 switch (ing_filter(skb, rxq)) {
2683 case TC_ACT_SHOT: 3033 case TC_ACT_SHOT:
2684 case TC_ACT_STOLEN: 3034 case TC_ACT_STOLEN:
2685 kfree_skb(skb); 3035 kfree_skb(skb);
@@ -2692,33 +3042,6 @@ out:
2692} 3042}
2693#endif 3043#endif
2694 3044
2695/*
2696 * netif_nit_deliver - deliver received packets to network taps
2697 * @skb: buffer
2698 *
2699 * This function is used to deliver incoming packets to network
2700 * taps. It should be used when the normal netif_receive_skb path
2701 * is bypassed, for example because of VLAN acceleration.
2702 */
2703void netif_nit_deliver(struct sk_buff *skb)
2704{
2705 struct packet_type *ptype;
2706
2707 if (list_empty(&ptype_all))
2708 return;
2709
2710 skb_reset_network_header(skb);
2711 skb_reset_transport_header(skb);
2712 skb->mac_len = skb->network_header - skb->mac_header;
2713
2714 rcu_read_lock();
2715 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2716 if (!ptype->dev || ptype->dev == skb->dev)
2717 deliver_skb(skb, ptype, skb->dev);
2718 }
2719 rcu_read_unlock();
2720}
2721
2722/** 3045/**
2723 * netdev_rx_handler_register - register receive handler 3046 * netdev_rx_handler_register - register receive handler
2724 * @dev: device to register a handler for 3047 * @dev: device to register a handler for
@@ -2730,6 +3053,8 @@ void netif_nit_deliver(struct sk_buff *skb)
2730 * on a failure. 3053 * on a failure.
2731 * 3054 *
2732 * The caller must hold the rtnl_mutex. 3055 * The caller must hold the rtnl_mutex.
3056 *
3057 * For a general description of rx_handler, see enum rx_handler_result.
2733 */ 3058 */
2734int netdev_rx_handler_register(struct net_device *dev, 3059int netdev_rx_handler_register(struct net_device *dev,
2735 rx_handler_func_t *rx_handler, 3060 rx_handler_func_t *rx_handler,
@@ -2764,72 +3089,20 @@ void netdev_rx_handler_unregister(struct net_device *dev)
2764} 3089}
2765EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); 3090EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2766 3091
2767static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2768 struct net_device *master)
2769{
2770 if (skb->pkt_type == PACKET_HOST) {
2771 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2772
2773 memcpy(dest, master->dev_addr, ETH_ALEN);
2774 }
2775}
2776
2777/* On bonding slaves other than the currently active slave, suppress
2778 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2779 * ARP on active-backup slaves with arp_validate enabled.
2780 */
2781int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2782{
2783 struct net_device *dev = skb->dev;
2784
2785 if (master->priv_flags & IFF_MASTER_ARPMON)
2786 dev->last_rx = jiffies;
2787
2788 if ((master->priv_flags & IFF_MASTER_ALB) &&
2789 (master->priv_flags & IFF_BRIDGE_PORT)) {
2790 /* Do address unmangle. The local destination address
2791 * will be always the one master has. Provides the right
2792 * functionality in a bridge.
2793 */
2794 skb_bond_set_mac_by_master(skb, master);
2795 }
2796
2797 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2798 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2799 skb->protocol == __cpu_to_be16(ETH_P_ARP))
2800 return 0;
2801
2802 if (master->priv_flags & IFF_MASTER_ALB) {
2803 if (skb->pkt_type != PACKET_BROADCAST &&
2804 skb->pkt_type != PACKET_MULTICAST)
2805 return 0;
2806 }
2807 if (master->priv_flags & IFF_MASTER_8023AD &&
2808 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2809 return 0;
2810
2811 return 1;
2812 }
2813 return 0;
2814}
2815EXPORT_SYMBOL(__skb_bond_should_drop);
2816
2817static int __netif_receive_skb(struct sk_buff *skb) 3092static int __netif_receive_skb(struct sk_buff *skb)
2818{ 3093{
2819 struct packet_type *ptype, *pt_prev; 3094 struct packet_type *ptype, *pt_prev;
2820 rx_handler_func_t *rx_handler; 3095 rx_handler_func_t *rx_handler;
2821 struct net_device *orig_dev; 3096 struct net_device *orig_dev;
2822 struct net_device *master; 3097 struct net_device *null_or_dev;
2823 struct net_device *null_or_orig; 3098 bool deliver_exact = false;
2824 struct net_device *orig_or_bond;
2825 int ret = NET_RX_DROP; 3099 int ret = NET_RX_DROP;
2826 __be16 type; 3100 __be16 type;
2827 3101
2828 if (!netdev_tstamp_prequeue) 3102 if (!netdev_tstamp_prequeue)
2829 net_timestamp_check(skb); 3103 net_timestamp_check(skb);
2830 3104
2831 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) 3105 trace_netif_receive_skb(skb);
2832 return NET_RX_SUCCESS;
2833 3106
2834 /* if we've gotten here through NAPI, check netpoll */ 3107 /* if we've gotten here through NAPI, check netpoll */
2835 if (netpoll_receive_skb(skb)) 3108 if (netpoll_receive_skb(skb))
@@ -2837,37 +3110,26 @@ static int __netif_receive_skb(struct sk_buff *skb)
2837 3110
2838 if (!skb->skb_iif) 3111 if (!skb->skb_iif)
2839 skb->skb_iif = skb->dev->ifindex; 3112 skb->skb_iif = skb->dev->ifindex;
2840
2841 /*
2842 * bonding note: skbs received on inactive slaves should only
2843 * be delivered to pkt handlers that are exact matches. Also
2844 * the deliver_no_wcard flag will be set. If packet handlers
2845 * are sensitive to duplicate packets these skbs will need to
2846 * be dropped at the handler. The vlan accel path may have
2847 * already set the deliver_no_wcard flag.
2848 */
2849 null_or_orig = NULL;
2850 orig_dev = skb->dev; 3113 orig_dev = skb->dev;
2851 master = ACCESS_ONCE(orig_dev->master);
2852 if (skb->deliver_no_wcard)
2853 null_or_orig = orig_dev;
2854 else if (master) {
2855 if (skb_bond_should_drop(skb, master)) {
2856 skb->deliver_no_wcard = 1;
2857 null_or_orig = orig_dev; /* deliver only exact match */
2858 } else
2859 skb->dev = master;
2860 }
2861 3114
2862 __this_cpu_inc(softnet_data.processed);
2863 skb_reset_network_header(skb); 3115 skb_reset_network_header(skb);
2864 skb_reset_transport_header(skb); 3116 skb_reset_transport_header(skb);
2865 skb->mac_len = skb->network_header - skb->mac_header; 3117 skb_reset_mac_len(skb);
2866 3118
2867 pt_prev = NULL; 3119 pt_prev = NULL;
2868 3120
2869 rcu_read_lock(); 3121 rcu_read_lock();
2870 3122
3123another_round:
3124
3125 __this_cpu_inc(softnet_data.processed);
3126
3127 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3128 skb = vlan_untag(skb);
3129 if (unlikely(!skb))
3130 goto out;
3131 }
3132
2871#ifdef CONFIG_NET_CLS_ACT 3133#ifdef CONFIG_NET_CLS_ACT
2872 if (skb->tc_verd & TC_NCLS) { 3134 if (skb->tc_verd & TC_NCLS) {
2873 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); 3135 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
@@ -2876,8 +3138,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
2876#endif 3138#endif
2877 3139
2878 list_for_each_entry_rcu(ptype, &ptype_all, list) { 3140 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2879 if (ptype->dev == null_or_orig || ptype->dev == skb->dev || 3141 if (!ptype->dev || ptype->dev == skb->dev) {
2880 ptype->dev == orig_dev) {
2881 if (pt_prev) 3142 if (pt_prev)
2882 ret = deliver_skb(skb, pt_prev, orig_dev); 3143 ret = deliver_skb(skb, pt_prev, orig_dev);
2883 pt_prev = ptype; 3144 pt_prev = ptype;
@@ -2891,36 +3152,47 @@ static int __netif_receive_skb(struct sk_buff *skb)
2891ncls: 3152ncls:
2892#endif 3153#endif
2893 3154
2894 /* Handle special case of bridge or macvlan */
2895 rx_handler = rcu_dereference(skb->dev->rx_handler); 3155 rx_handler = rcu_dereference(skb->dev->rx_handler);
2896 if (rx_handler) { 3156 if (rx_handler) {
2897 if (pt_prev) { 3157 if (pt_prev) {
2898 ret = deliver_skb(skb, pt_prev, orig_dev); 3158 ret = deliver_skb(skb, pt_prev, orig_dev);
2899 pt_prev = NULL; 3159 pt_prev = NULL;
2900 } 3160 }
2901 skb = rx_handler(skb); 3161 switch (rx_handler(&skb)) {
2902 if (!skb) 3162 case RX_HANDLER_CONSUMED:
2903 goto out; 3163 goto out;
3164 case RX_HANDLER_ANOTHER:
3165 goto another_round;
3166 case RX_HANDLER_EXACT:
3167 deliver_exact = true;
3168 case RX_HANDLER_PASS:
3169 break;
3170 default:
3171 BUG();
3172 }
2904 } 3173 }
2905 3174
2906 /* 3175 if (vlan_tx_tag_present(skb)) {
2907 * Make sure frames received on VLAN interfaces stacked on 3176 if (pt_prev) {
2908 * bonding interfaces still make their way to any base bonding 3177 ret = deliver_skb(skb, pt_prev, orig_dev);
2909 * device that may have registered for a specific ptype. The 3178 pt_prev = NULL;
2910 * handler may have to adjust skb->dev and orig_dev. 3179 }
2911 */ 3180 if (vlan_do_receive(&skb)) {
2912 orig_or_bond = orig_dev; 3181 ret = __netif_receive_skb(skb);
2913 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) && 3182 goto out;
2914 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) { 3183 } else if (unlikely(!skb))
2915 orig_or_bond = vlan_dev_real_dev(skb->dev); 3184 goto out;
2916 } 3185 }
2917 3186
3187 /* deliver only exact match when indicated */
3188 null_or_dev = deliver_exact ? skb->dev : NULL;
3189
2918 type = skb->protocol; 3190 type = skb->protocol;
2919 list_for_each_entry_rcu(ptype, 3191 list_for_each_entry_rcu(ptype,
2920 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { 3192 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2921 if (ptype->type == type && (ptype->dev == null_or_orig || 3193 if (ptype->type == type &&
2922 ptype->dev == skb->dev || ptype->dev == orig_dev || 3194 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
2923 ptype->dev == orig_or_bond)) { 3195 ptype->dev == orig_dev)) {
2924 if (pt_prev) 3196 if (pt_prev)
2925 ret = deliver_skb(skb, pt_prev, orig_dev); 3197 ret = deliver_skb(skb, pt_prev, orig_dev);
2926 pt_prev = ptype; 3198 pt_prev = ptype;
@@ -2930,6 +3202,7 @@ ncls:
2930 if (pt_prev) { 3202 if (pt_prev) {
2931 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 3203 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2932 } else { 3204 } else {
3205 atomic_long_inc(&skb->dev->rx_dropped);
2933 kfree_skb(skb); 3206 kfree_skb(skb);
2934 /* Jamal, now you will not able to escape explaining 3207 /* Jamal, now you will not able to escape explaining
2935 * me how you were going to use this. :-) 3208 * me how you were going to use this. :-)
@@ -3050,7 +3323,7 @@ out:
3050 return netif_receive_skb(skb); 3323 return netif_receive_skb(skb);
3051} 3324}
3052 3325
3053static void napi_gro_flush(struct napi_struct *napi) 3326inline void napi_gro_flush(struct napi_struct *napi)
3054{ 3327{
3055 struct sk_buff *skb, *next; 3328 struct sk_buff *skb, *next;
3056 3329
@@ -3063,6 +3336,7 @@ static void napi_gro_flush(struct napi_struct *napi)
3063 napi->gro_count = 0; 3336 napi->gro_count = 0;
3064 napi->gro_list = NULL; 3337 napi->gro_list = NULL;
3065} 3338}
3339EXPORT_SYMBOL(napi_gro_flush);
3066 3340
3067enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 3341enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3068{ 3342{
@@ -3077,7 +3351,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3077 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) 3351 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3078 goto normal; 3352 goto normal;
3079 3353
3080 if (skb_is_gso(skb) || skb_has_frags(skb)) 3354 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3081 goto normal; 3355 goto normal;
3082 3356
3083 rcu_read_lock(); 3357 rcu_read_lock();
@@ -3156,16 +3430,19 @@ normal:
3156} 3430}
3157EXPORT_SYMBOL(dev_gro_receive); 3431EXPORT_SYMBOL(dev_gro_receive);
3158 3432
3159static gro_result_t 3433static inline gro_result_t
3160__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 3434__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3161{ 3435{
3162 struct sk_buff *p; 3436 struct sk_buff *p;
3163 3437
3164 for (p = napi->gro_list; p; p = p->next) { 3438 for (p = napi->gro_list; p; p = p->next) {
3165 NAPI_GRO_CB(p)->same_flow = 3439 unsigned long diffs;
3166 (p->dev == skb->dev) && 3440
3167 !compare_ether_header(skb_mac_header(p), 3441 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3442 diffs |= p->vlan_tci ^ skb->vlan_tci;
3443 diffs |= compare_ether_header(skb_mac_header(p),
3168 skb_gro_mac_header(skb)); 3444 skb_gro_mac_header(skb));
3445 NAPI_GRO_CB(p)->same_flow = !diffs;
3169 NAPI_GRO_CB(p)->flush = 0; 3446 NAPI_GRO_CB(p)->flush = 0;
3170 } 3447 }
3171 3448
@@ -3218,14 +3495,16 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3218} 3495}
3219EXPORT_SYMBOL(napi_gro_receive); 3496EXPORT_SYMBOL(napi_gro_receive);
3220 3497
3221void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) 3498static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3222{ 3499{
3223 __skb_pull(skb, skb_headlen(skb)); 3500 __skb_pull(skb, skb_headlen(skb));
3224 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); 3501 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3502 skb->vlan_tci = 0;
3503 skb->dev = napi->dev;
3504 skb->skb_iif = 0;
3225 3505
3226 napi->skb = skb; 3506 napi->skb = skb;
3227} 3507}
3228EXPORT_SYMBOL(napi_reuse_skb);
3229 3508
3230struct sk_buff *napi_get_frags(struct napi_struct *napi) 3509struct sk_buff *napi_get_frags(struct napi_struct *napi)
3231{ 3510{
@@ -3519,7 +3798,7 @@ static void net_rx_action(struct softirq_action *h)
3519 * with netpoll's poll_napi(). Only the entity which 3798 * with netpoll's poll_napi(). Only the entity which
3520 * obtains the lock and sees NAPI_STATE_SCHED set will 3799 * obtains the lock and sees NAPI_STATE_SCHED set will
3521 * actually make the ->poll() call. Therefore we avoid 3800 * actually make the ->poll() call. Therefore we avoid
3522 * accidently calling ->poll() when NAPI is not scheduled. 3801 * accidentally calling ->poll() when NAPI is not scheduled.
3523 */ 3802 */
3524 work = 0; 3803 work = 0;
3525 if (test_bit(NAPI_STATE_SCHED, &n->state)) { 3804 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
@@ -3710,12 +3989,15 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3710 3989
3711void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3990void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3712{ 3991{
3713 struct net_device *dev = (v == SEQ_START_TOKEN) ? 3992 struct net_device *dev = v;
3714 first_net_device(seq_file_net(seq)) : 3993
3715 next_net_device((struct net_device *)v); 3994 if (v == SEQ_START_TOKEN)
3995 dev = first_net_device_rcu(seq_file_net(seq));
3996 else
3997 dev = next_net_device_rcu(dev);
3716 3998
3717 ++*pos; 3999 ++*pos;
3718 return rcu_dereference(dev); 4000 return dev;
3719} 4001}
3720 4002
3721void dev_seq_stop(struct seq_file *seq, void *v) 4003void dev_seq_stop(struct seq_file *seq, void *v)
@@ -3999,15 +4281,14 @@ static int __init dev_proc_init(void)
3999 4281
4000 4282
4001/** 4283/**
4002 * netdev_set_master - set up master/slave pair 4284 * netdev_set_master - set up master pointer
4003 * @slave: slave device 4285 * @slave: slave device
4004 * @master: new master device 4286 * @master: new master device
4005 * 4287 *
4006 * Changes the master device of the slave. Pass %NULL to break the 4288 * Changes the master device of the slave. Pass %NULL to break the
4007 * bonding. The caller must hold the RTNL semaphore. On a failure 4289 * bonding. The caller must hold the RTNL semaphore. On a failure
4008 * a negative errno code is returned. On success the reference counts 4290 * a negative errno code is returned. On success the reference counts
4009 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the 4291 * are adjusted and the function returns zero.
4010 * function returns zero.
4011 */ 4292 */
4012int netdev_set_master(struct net_device *slave, struct net_device *master) 4293int netdev_set_master(struct net_device *slave, struct net_device *master)
4013{ 4294{
@@ -4023,10 +4304,31 @@ int netdev_set_master(struct net_device *slave, struct net_device *master)
4023 4304
4024 slave->master = master; 4305 slave->master = master;
4025 4306
4026 if (old) { 4307 if (old)
4027 synchronize_net();
4028 dev_put(old); 4308 dev_put(old);
4029 } 4309 return 0;
4310}
4311EXPORT_SYMBOL(netdev_set_master);
4312
4313/**
4314 * netdev_set_bond_master - set up bonding master/slave pair
4315 * @slave: slave device
4316 * @master: new master device
4317 *
4318 * Changes the master device of the slave. Pass %NULL to break the
4319 * bonding. The caller must hold the RTNL semaphore. On a failure
4320 * a negative errno code is returned. On success %RTM_NEWLINK is sent
4321 * to the routing socket and the function returns zero.
4322 */
4323int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4324{
4325 int err;
4326
4327 ASSERT_RTNL();
4328
4329 err = netdev_set_master(slave, master);
4330 if (err)
4331 return err;
4030 if (master) 4332 if (master)
4031 slave->flags |= IFF_SLAVE; 4333 slave->flags |= IFF_SLAVE;
4032 else 4334 else
@@ -4035,7 +4337,7 @@ int netdev_set_master(struct net_device *slave, struct net_device *master)
4035 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); 4337 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4036 return 0; 4338 return 0;
4037} 4339}
4038EXPORT_SYMBOL(netdev_set_master); 4340EXPORT_SYMBOL(netdev_set_bond_master);
4039 4341
4040static void dev_change_rx_flags(struct net_device *dev, int flags) 4342static void dev_change_rx_flags(struct net_device *dev, int flags)
4041{ 4343{
@@ -4204,6 +4506,30 @@ void dev_set_rx_mode(struct net_device *dev)
4204} 4506}
4205 4507
4206/** 4508/**
4509 * dev_ethtool_get_settings - call device's ethtool_ops::get_settings()
4510 * @dev: device
4511 * @cmd: memory area for ethtool_ops::get_settings() result
4512 *
4513 * The cmd arg is initialized properly (cleared and
4514 * ethtool_cmd::cmd field set to ETHTOOL_GSET).
4515 *
4516 * Return device's ethtool_ops::get_settings() result value or
4517 * -EOPNOTSUPP when device doesn't expose
4518 * ethtool_ops::get_settings() operation.
4519 */
4520int dev_ethtool_get_settings(struct net_device *dev,
4521 struct ethtool_cmd *cmd)
4522{
4523 if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
4524 return -EOPNOTSUPP;
4525
4526 memset(cmd, 0, sizeof(struct ethtool_cmd));
4527 cmd->cmd = ETHTOOL_GSET;
4528 return dev->ethtool_ops->get_settings(dev, cmd);
4529}
4530EXPORT_SYMBOL(dev_ethtool_get_settings);
4531
4532/**
4207 * dev_get_flags - get flags reported to userspace 4533 * dev_get_flags - get flags reported to userspace
4208 * @dev: device 4534 * @dev: device
4209 * 4535 *
@@ -4372,6 +4698,17 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
4372EXPORT_SYMBOL(dev_set_mtu); 4698EXPORT_SYMBOL(dev_set_mtu);
4373 4699
4374/** 4700/**
4701 * dev_set_group - Change group this device belongs to
4702 * @dev: device
4703 * @new_group: group this device should belong to
4704 */
4705void dev_set_group(struct net_device *dev, int new_group)
4706{
4707 dev->group = new_group;
4708}
4709EXPORT_SYMBOL(dev_set_group);
4710
4711/**
4375 * dev_set_mac_address - Change Media Access Control Address 4712 * dev_set_mac_address - Change Media Access Control Address
4376 * @dev: device 4713 * @dev: device
4377 * @sa: new address 4714 * @sa: new address
@@ -4456,7 +4793,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
4456 * is never reached 4793 * is never reached
4457 */ 4794 */
4458 WARN_ON(1); 4795 WARN_ON(1);
4459 err = -EINVAL; 4796 err = -ENOTTY;
4460 break; 4797 break;
4461 4798
4462 } 4799 }
@@ -4724,7 +5061,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4724 /* Set the per device memory buffer space. 5061 /* Set the per device memory buffer space.
4725 * Not applicable in our case */ 5062 * Not applicable in our case */
4726 case SIOCSIFLINK: 5063 case SIOCSIFLINK:
4727 return -EINVAL; 5064 return -ENOTTY;
4728 5065
4729 /* 5066 /*
4730 * Unknown or private ioctl. 5067 * Unknown or private ioctl.
@@ -4745,7 +5082,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4745 /* Take care of Wireless Extensions */ 5082 /* Take care of Wireless Extensions */
4746 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) 5083 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4747 return wext_handle_ioctl(net, &ifr, cmd, arg); 5084 return wext_handle_ioctl(net, &ifr, cmd, arg);
4748 return -EINVAL; 5085 return -ENOTTY;
4749 } 5086 }
4750} 5087}
4751 5088
@@ -4797,12 +5134,14 @@ static void rollback_registered_many(struct list_head *head)
4797 list_del(&dev->unreg_list); 5134 list_del(&dev->unreg_list);
4798 continue; 5135 continue;
4799 } 5136 }
4800 5137 dev->dismantle = true;
4801 BUG_ON(dev->reg_state != NETREG_REGISTERED); 5138 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5139 }
4802 5140
4803 /* If device is running, close it first. */ 5141 /* If device is running, close it first. */
4804 dev_close(dev); 5142 dev_close_many(head);
4805 5143
5144 list_for_each_entry(dev, head, unreg_list) {
4806 /* And unlink it from device chain. */ 5145 /* And unlink it from device chain. */
4807 unlist_netdevice(dev); 5146 unlist_netdevice(dev);
4808 5147
@@ -4857,55 +5196,62 @@ static void rollback_registered(struct net_device *dev)
4857 5196
4858 list_add(&dev->unreg_list, &single); 5197 list_add(&dev->unreg_list, &single);
4859 rollback_registered_many(&single); 5198 rollback_registered_many(&single);
5199 list_del(&single);
4860} 5200}
4861 5201
4862static void __netdev_init_queue_locks_one(struct net_device *dev, 5202u32 netdev_fix_features(struct net_device *dev, u32 features)
4863 struct netdev_queue *dev_queue,
4864 void *_unused)
4865{ 5203{
4866 spin_lock_init(&dev_queue->_xmit_lock); 5204 /* Fix illegal checksum combinations */
4867 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type); 5205 if ((features & NETIF_F_HW_CSUM) &&
4868 dev_queue->xmit_lock_owner = -1; 5206 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4869} 5207 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5208 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5209 }
4870 5210
4871static void netdev_init_queue_locks(struct net_device *dev) 5211 if ((features & NETIF_F_NO_CSUM) &&
4872{ 5212 (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4873 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); 5213 netdev_warn(dev, "mixed no checksumming and other settings.\n");
4874 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL); 5214 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4875} 5215 }
4876 5216
4877unsigned long netdev_fix_features(unsigned long features, const char *name)
4878{
4879 /* Fix illegal SG+CSUM combinations. */ 5217 /* Fix illegal SG+CSUM combinations. */
4880 if ((features & NETIF_F_SG) && 5218 if ((features & NETIF_F_SG) &&
4881 !(features & NETIF_F_ALL_CSUM)) { 5219 !(features & NETIF_F_ALL_CSUM)) {
4882 if (name) 5220 netdev_dbg(dev,
4883 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no " 5221 "Dropping NETIF_F_SG since no checksum feature.\n");
4884 "checksum feature.\n", name);
4885 features &= ~NETIF_F_SG; 5222 features &= ~NETIF_F_SG;
4886 } 5223 }
4887 5224
4888 /* TSO requires that SG is present as well. */ 5225 /* TSO requires that SG is present as well. */
4889 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) { 5226 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
4890 if (name) 5227 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
4891 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no " 5228 features &= ~NETIF_F_ALL_TSO;
4892 "SG feature.\n", name);
4893 features &= ~NETIF_F_TSO;
4894 } 5229 }
4895 5230
5231 /* TSO ECN requires that TSO is present as well. */
5232 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5233 features &= ~NETIF_F_TSO_ECN;
5234
5235 /* Software GSO depends on SG. */
5236 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5237 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5238 features &= ~NETIF_F_GSO;
5239 }
5240
5241 /* UFO needs SG and checksumming */
4896 if (features & NETIF_F_UFO) { 5242 if (features & NETIF_F_UFO) {
4897 if (!(features & NETIF_F_GEN_CSUM)) { 5243 /* maybe split UFO into V4 and V6? */
4898 if (name) 5244 if (!((features & NETIF_F_GEN_CSUM) ||
4899 printk(KERN_ERR "%s: Dropping NETIF_F_UFO " 5245 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
4900 "since no NETIF_F_HW_CSUM feature.\n", 5246 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4901 name); 5247 netdev_dbg(dev,
5248 "Dropping NETIF_F_UFO since no checksum offload features.\n");
4902 features &= ~NETIF_F_UFO; 5249 features &= ~NETIF_F_UFO;
4903 } 5250 }
4904 5251
4905 if (!(features & NETIF_F_SG)) { 5252 if (!(features & NETIF_F_SG)) {
4906 if (name) 5253 netdev_dbg(dev,
4907 printk(KERN_ERR "%s: Dropping NETIF_F_UFO " 5254 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
4908 "since no NETIF_F_SG feature.\n", name);
4909 features &= ~NETIF_F_UFO; 5255 features &= ~NETIF_F_UFO;
4910 } 5256 }
4911 } 5257 }
@@ -4914,6 +5260,75 @@ unsigned long netdev_fix_features(unsigned long features, const char *name)
4914} 5260}
4915EXPORT_SYMBOL(netdev_fix_features); 5261EXPORT_SYMBOL(netdev_fix_features);
4916 5262
5263int __netdev_update_features(struct net_device *dev)
5264{
5265 u32 features;
5266 int err = 0;
5267
5268 ASSERT_RTNL();
5269
5270 features = netdev_get_wanted_features(dev);
5271
5272 if (dev->netdev_ops->ndo_fix_features)
5273 features = dev->netdev_ops->ndo_fix_features(dev, features);
5274
5275 /* driver might be less strict about feature dependencies */
5276 features = netdev_fix_features(dev, features);
5277
5278 if (dev->features == features)
5279 return 0;
5280
5281 netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5282 dev->features, features);
5283
5284 if (dev->netdev_ops->ndo_set_features)
5285 err = dev->netdev_ops->ndo_set_features(dev, features);
5286
5287 if (unlikely(err < 0)) {
5288 netdev_err(dev,
5289 "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5290 err, features, dev->features);
5291 return -1;
5292 }
5293
5294 if (!err)
5295 dev->features = features;
5296
5297 return 1;
5298}
5299
5300/**
5301 * netdev_update_features - recalculate device features
5302 * @dev: the device to check
5303 *
5304 * Recalculate dev->features set and send notifications if it
5305 * has changed. Should be called after driver or hardware dependent
5306 * conditions might have changed that influence the features.
5307 */
5308void netdev_update_features(struct net_device *dev)
5309{
5310 if (__netdev_update_features(dev))
5311 netdev_features_change(dev);
5312}
5313EXPORT_SYMBOL(netdev_update_features);
5314
5315/**
5316 * netdev_change_features - recalculate device features
5317 * @dev: the device to check
5318 *
5319 * Recalculate dev->features set and send notifications even
5320 * if they have not changed. Should be called instead of
5321 * netdev_update_features() if also dev->vlan_features might
5322 * have changed to allow the changes to be propagated to stacked
5323 * VLAN devices.
5324 */
5325void netdev_change_features(struct net_device *dev)
5326{
5327 __netdev_update_features(dev);
5328 netdev_features_change(dev);
5329}
5330EXPORT_SYMBOL(netdev_change_features);
5331
4917/** 5332/**
4918 * netif_stacked_transfer_operstate - transfer operstate 5333 * netif_stacked_transfer_operstate - transfer operstate
4919 * @rootdev: the root or lower level device to transfer state from 5334 * @rootdev: the root or lower level device to transfer state from
@@ -4941,6 +5356,59 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4941} 5356}
4942EXPORT_SYMBOL(netif_stacked_transfer_operstate); 5357EXPORT_SYMBOL(netif_stacked_transfer_operstate);
4943 5358
5359#ifdef CONFIG_RPS
5360static int netif_alloc_rx_queues(struct net_device *dev)
5361{
5362 unsigned int i, count = dev->num_rx_queues;
5363 struct netdev_rx_queue *rx;
5364
5365 BUG_ON(count < 1);
5366
5367 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5368 if (!rx) {
5369 pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5370 return -ENOMEM;
5371 }
5372 dev->_rx = rx;
5373
5374 for (i = 0; i < count; i++)
5375 rx[i].dev = dev;
5376 return 0;
5377}
5378#endif
5379
5380static void netdev_init_one_queue(struct net_device *dev,
5381 struct netdev_queue *queue, void *_unused)
5382{
5383 /* Initialize queue lock */
5384 spin_lock_init(&queue->_xmit_lock);
5385 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5386 queue->xmit_lock_owner = -1;
5387 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5388 queue->dev = dev;
5389}
5390
5391static int netif_alloc_netdev_queues(struct net_device *dev)
5392{
5393 unsigned int count = dev->num_tx_queues;
5394 struct netdev_queue *tx;
5395
5396 BUG_ON(count < 1);
5397
5398 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5399 if (!tx) {
5400 pr_err("netdev: Unable to allocate %u tx queues.\n",
5401 count);
5402 return -ENOMEM;
5403 }
5404 dev->_tx = tx;
5405
5406 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5407 spin_lock_init(&dev->tx_global_lock);
5408
5409 return 0;
5410}
5411
4944/** 5412/**
4945 * register_netdevice - register a network device 5413 * register_netdevice - register a network device
4946 * @dev: device to register 5414 * @dev: device to register
@@ -4974,28 +5442,13 @@ int register_netdevice(struct net_device *dev)
4974 5442
4975 spin_lock_init(&dev->addr_list_lock); 5443 spin_lock_init(&dev->addr_list_lock);
4976 netdev_set_addr_lockdep_class(dev); 5444 netdev_set_addr_lockdep_class(dev);
4977 netdev_init_queue_locks(dev);
4978 5445
4979 dev->iflink = -1; 5446 dev->iflink = -1;
4980 5447
4981#ifdef CONFIG_RPS 5448 ret = dev_get_valid_name(dev, dev->name);
4982 if (!dev->num_rx_queues) { 5449 if (ret < 0)
4983 /* 5450 goto out;
4984 * Allocate a single RX queue if driver never called
4985 * alloc_netdev_mq
4986 */
4987
4988 dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
4989 if (!dev->_rx) {
4990 ret = -ENOMEM;
4991 goto out;
4992 }
4993 5451
4994 dev->_rx->first = dev->_rx;
4995 atomic_set(&dev->_rx->count, 1);
4996 dev->num_rx_queues = 1;
4997 }
4998#endif
4999 /* Init, if this function is available */ 5452 /* Init, if this function is available */
5000 if (dev->netdev_ops->ndo_init) { 5453 if (dev->netdev_ops->ndo_init) {
5001 ret = dev->netdev_ops->ndo_init(dev); 5454 ret = dev->netdev_ops->ndo_init(dev);
@@ -5006,34 +5459,30 @@ int register_netdevice(struct net_device *dev)
5006 } 5459 }
5007 } 5460 }
5008 5461
5009 ret = dev_get_valid_name(dev, dev->name, 0);
5010 if (ret)
5011 goto err_uninit;
5012
5013 dev->ifindex = dev_new_index(net); 5462 dev->ifindex = dev_new_index(net);
5014 if (dev->iflink == -1) 5463 if (dev->iflink == -1)
5015 dev->iflink = dev->ifindex; 5464 dev->iflink = dev->ifindex;
5016 5465
5017 /* Fix illegal checksum combinations */ 5466 /* Transfer changeable features to wanted_features and enable
5018 if ((dev->features & NETIF_F_HW_CSUM) && 5467 * software offloads (GSO and GRO).
5019 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 5468 */
5020 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n", 5469 dev->hw_features |= NETIF_F_SOFT_FEATURES;
5021 dev->name); 5470 dev->features |= NETIF_F_SOFT_FEATURES;
5022 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); 5471 dev->wanted_features = dev->features & dev->hw_features;
5023 }
5024 5472
5025 if ((dev->features & NETIF_F_NO_CSUM) && 5473 /* Turn on no cache copy if HW is doing checksum */
5026 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 5474 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5027 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n", 5475 if ((dev->features & NETIF_F_ALL_CSUM) &&
5028 dev->name); 5476 !(dev->features & NETIF_F_NO_CSUM)) {
5029 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); 5477 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5478 dev->features |= NETIF_F_NOCACHE_COPY;
5030 } 5479 }
5031 5480
5032 dev->features = netdev_fix_features(dev->features, dev->name); 5481 /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5033 5482 * vlan_dev_init() will do the dev->features check, so these features
5034 /* Enable software GSO if SG is supported. */ 5483 * are enabled only if supported by underlying device.
5035 if (dev->features & NETIF_F_SG) 5484 */
5036 dev->features |= NETIF_F_GSO; 5485 dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5037 5486
5038 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); 5487 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5039 ret = notifier_to_errno(ret); 5488 ret = notifier_to_errno(ret);
@@ -5045,6 +5494,8 @@ int register_netdevice(struct net_device *dev)
5045 goto err_uninit; 5494 goto err_uninit;
5046 dev->reg_state = NETREG_REGISTERED; 5495 dev->reg_state = NETREG_REGISTERED;
5047 5496
5497 __netdev_update_features(dev);
5498
5048 /* 5499 /*
5049 * Default initial state at registry is that the 5500 * Default initial state at registry is that the
5050 * device is present. 5501 * device is present.
@@ -5105,9 +5556,6 @@ int init_dummy_netdev(struct net_device *dev)
5105 */ 5556 */
5106 dev->reg_state = NETREG_DUMMY; 5557 dev->reg_state = NETREG_DUMMY;
5107 5558
5108 /* initialize the ref count */
5109 atomic_set(&dev->refcnt, 1);
5110
5111 /* NAPI wants this */ 5559 /* NAPI wants this */
5112 INIT_LIST_HEAD(&dev->napi_list); 5560 INIT_LIST_HEAD(&dev->napi_list);
5113 5561
@@ -5115,6 +5563,11 @@ int init_dummy_netdev(struct net_device *dev)
5115 set_bit(__LINK_STATE_PRESENT, &dev->state); 5563 set_bit(__LINK_STATE_PRESENT, &dev->state);
5116 set_bit(__LINK_STATE_START, &dev->state); 5564 set_bit(__LINK_STATE_START, &dev->state);
5117 5565
5566 /* Note : We don't allocate pcpu_refcnt for dummy devices,
5567 * because users of this 'device' don't need to change
5568 * its refcount.
5569 */
5570
5118 return 0; 5571 return 0;
5119} 5572}
5120EXPORT_SYMBOL_GPL(init_dummy_netdev); 5573EXPORT_SYMBOL_GPL(init_dummy_netdev);
@@ -5138,24 +5591,22 @@ int register_netdev(struct net_device *dev)
5138 int err; 5591 int err;
5139 5592
5140 rtnl_lock(); 5593 rtnl_lock();
5141
5142 /*
5143 * If the name is a format string the caller wants us to do a
5144 * name allocation.
5145 */
5146 if (strchr(dev->name, '%')) {
5147 err = dev_alloc_name(dev, dev->name);
5148 if (err < 0)
5149 goto out;
5150 }
5151
5152 err = register_netdevice(dev); 5594 err = register_netdevice(dev);
5153out:
5154 rtnl_unlock(); 5595 rtnl_unlock();
5155 return err; 5596 return err;
5156} 5597}
5157EXPORT_SYMBOL(register_netdev); 5598EXPORT_SYMBOL(register_netdev);
5158 5599
5600int netdev_refcnt_read(const struct net_device *dev)
5601{
5602 int i, refcnt = 0;
5603
5604 for_each_possible_cpu(i)
5605 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5606 return refcnt;
5607}
5608EXPORT_SYMBOL(netdev_refcnt_read);
5609
5159/* 5610/*
5160 * netdev_wait_allrefs - wait until all references are gone. 5611 * netdev_wait_allrefs - wait until all references are gone.
5161 * 5612 *
@@ -5170,11 +5621,14 @@ EXPORT_SYMBOL(register_netdev);
5170static void netdev_wait_allrefs(struct net_device *dev) 5621static void netdev_wait_allrefs(struct net_device *dev)
5171{ 5622{
5172 unsigned long rebroadcast_time, warning_time; 5623 unsigned long rebroadcast_time, warning_time;
5624 int refcnt;
5173 5625
5174 linkwatch_forget_dev(dev); 5626 linkwatch_forget_dev(dev);
5175 5627
5176 rebroadcast_time = warning_time = jiffies; 5628 rebroadcast_time = warning_time = jiffies;
5177 while (atomic_read(&dev->refcnt) != 0) { 5629 refcnt = netdev_refcnt_read(dev);
5630
5631 while (refcnt != 0) {
5178 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { 5632 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5179 rtnl_lock(); 5633 rtnl_lock();
5180 5634
@@ -5201,11 +5655,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
5201 5655
5202 msleep(250); 5656 msleep(250);
5203 5657
5658 refcnt = netdev_refcnt_read(dev);
5659
5204 if (time_after(jiffies, warning_time + 10 * HZ)) { 5660 if (time_after(jiffies, warning_time + 10 * HZ)) {
5205 printk(KERN_EMERG "unregister_netdevice: " 5661 printk(KERN_EMERG "unregister_netdevice: "
5206 "waiting for %s to become free. Usage " 5662 "waiting for %s to become free. Usage "
5207 "count = %d\n", 5663 "count = %d\n",
5208 dev->name, atomic_read(&dev->refcnt)); 5664 dev->name, refcnt);
5209 warning_time = jiffies; 5665 warning_time = jiffies;
5210 } 5666 }
5211 } 5667 }
@@ -5263,9 +5719,9 @@ void netdev_run_todo(void)
5263 netdev_wait_allrefs(dev); 5719 netdev_wait_allrefs(dev);
5264 5720
5265 /* paranoia */ 5721 /* paranoia */
5266 BUG_ON(atomic_read(&dev->refcnt)); 5722 BUG_ON(netdev_refcnt_read(dev));
5267 WARN_ON(dev->ip_ptr); 5723 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5268 WARN_ON(dev->ip6_ptr); 5724 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5269 WARN_ON(dev->dn_ptr); 5725 WARN_ON(dev->dn_ptr);
5270 5726
5271 if (dev->destructor) 5727 if (dev->destructor)
@@ -5276,34 +5732,6 @@ void netdev_run_todo(void)
5276 } 5732 }
5277} 5733}
5278 5734
5279/**
5280 * dev_txq_stats_fold - fold tx_queues stats
5281 * @dev: device to get statistics from
5282 * @stats: struct rtnl_link_stats64 to hold results
5283 */
5284void dev_txq_stats_fold(const struct net_device *dev,
5285 struct rtnl_link_stats64 *stats)
5286{
5287 u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5288 unsigned int i;
5289 struct netdev_queue *txq;
5290
5291 for (i = 0; i < dev->num_tx_queues; i++) {
5292 txq = netdev_get_tx_queue(dev, i);
5293 spin_lock_bh(&txq->_xmit_lock);
5294 tx_bytes += txq->tx_bytes;
5295 tx_packets += txq->tx_packets;
5296 tx_dropped += txq->tx_dropped;
5297 spin_unlock_bh(&txq->_xmit_lock);
5298 }
5299 if (tx_bytes || tx_packets || tx_dropped) {
5300 stats->tx_bytes = tx_bytes;
5301 stats->tx_packets = tx_packets;
5302 stats->tx_dropped = tx_dropped;
5303 }
5304}
5305EXPORT_SYMBOL(dev_txq_stats_fold);
5306
5307/* Convert net_device_stats to rtnl_link_stats64. They have the same 5735/* Convert net_device_stats to rtnl_link_stats64. They have the same
5308 * fields in the same order, with only the type differing. 5736 * fields in the same order, with only the type differing.
5309 */ 5737 */
@@ -5342,57 +5770,71 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5342 5770
5343 if (ops->ndo_get_stats64) { 5771 if (ops->ndo_get_stats64) {
5344 memset(storage, 0, sizeof(*storage)); 5772 memset(storage, 0, sizeof(*storage));
5345 return ops->ndo_get_stats64(dev, storage); 5773 ops->ndo_get_stats64(dev, storage);
5346 } 5774 } else if (ops->ndo_get_stats) {
5347 if (ops->ndo_get_stats) {
5348 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); 5775 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5349 return storage; 5776 } else {
5777 netdev_stats_to_stats64(storage, &dev->stats);
5350 } 5778 }
5351 netdev_stats_to_stats64(storage, &dev->stats); 5779 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5352 dev_txq_stats_fold(dev, storage);
5353 return storage; 5780 return storage;
5354} 5781}
5355EXPORT_SYMBOL(dev_get_stats); 5782EXPORT_SYMBOL(dev_get_stats);
5356 5783
5357static void netdev_init_one_queue(struct net_device *dev, 5784struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5358 struct netdev_queue *queue,
5359 void *_unused)
5360{ 5785{
5361 queue->dev = dev; 5786 struct netdev_queue *queue = dev_ingress_queue(dev);
5362}
5363 5787
5364static void netdev_init_queues(struct net_device *dev) 5788#ifdef CONFIG_NET_CLS_ACT
5365{ 5789 if (queue)
5366 netdev_init_one_queue(dev, &dev->rx_queue, NULL); 5790 return queue;
5367 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); 5791 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5368 spin_lock_init(&dev->tx_global_lock); 5792 if (!queue)
5793 return NULL;
5794 netdev_init_one_queue(dev, queue, NULL);
5795 queue->qdisc = &noop_qdisc;
5796 queue->qdisc_sleeping = &noop_qdisc;
5797 rcu_assign_pointer(dev->ingress_queue, queue);
5798#endif
5799 return queue;
5369} 5800}
5370 5801
5371/** 5802/**
5372 * alloc_netdev_mq - allocate network device 5803 * alloc_netdev_mqs - allocate network device
5373 * @sizeof_priv: size of private data to allocate space for 5804 * @sizeof_priv: size of private data to allocate space for
5374 * @name: device name format string 5805 * @name: device name format string
5375 * @setup: callback to initialize device 5806 * @setup: callback to initialize device
5376 * @queue_count: the number of subqueues to allocate 5807 * @txqs: the number of TX subqueues to allocate
5808 * @rxqs: the number of RX subqueues to allocate
5377 * 5809 *
5378 * Allocates a struct net_device with private data area for driver use 5810 * Allocates a struct net_device with private data area for driver use
5379 * and performs basic initialization. Also allocates subqueue structs 5811 * and performs basic initialization. Also allocates subqueue structs
5380 * for each queue on the device at the end of the netdevice. 5812 * for each queue on the device.
5381 */ 5813 */
5382struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, 5814struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5383 void (*setup)(struct net_device *), unsigned int queue_count) 5815 void (*setup)(struct net_device *),
5816 unsigned int txqs, unsigned int rxqs)
5384{ 5817{
5385 struct netdev_queue *tx;
5386 struct net_device *dev; 5818 struct net_device *dev;
5387 size_t alloc_size; 5819 size_t alloc_size;
5388 struct net_device *p; 5820 struct net_device *p;
5389#ifdef CONFIG_RPS
5390 struct netdev_rx_queue *rx;
5391 int i;
5392#endif
5393 5821
5394 BUG_ON(strlen(name) >= sizeof(dev->name)); 5822 BUG_ON(strlen(name) >= sizeof(dev->name));
5395 5823
5824 if (txqs < 1) {
5825 pr_err("alloc_netdev: Unable to allocate device "
5826 "with zero queues.\n");
5827 return NULL;
5828 }
5829
5830#ifdef CONFIG_RPS
5831 if (rxqs < 1) {
5832 pr_err("alloc_netdev: Unable to allocate device "
5833 "with zero RX queues.\n");
5834 return NULL;
5835 }
5836#endif
5837
5396 alloc_size = sizeof(struct net_device); 5838 alloc_size = sizeof(struct net_device);
5397 if (sizeof_priv) { 5839 if (sizeof_priv) {
5398 /* ensure 32-byte alignment of private area */ 5840 /* ensure 32-byte alignment of private area */
@@ -5408,55 +5850,23 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5408 return NULL; 5850 return NULL;
5409 } 5851 }
5410 5852
5411 tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5412 if (!tx) {
5413 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5414 "tx qdiscs.\n");
5415 goto free_p;
5416 }
5417
5418#ifdef CONFIG_RPS
5419 rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5420 if (!rx) {
5421 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5422 "rx queues.\n");
5423 goto free_tx;
5424 }
5425
5426 atomic_set(&rx->count, queue_count);
5427
5428 /*
5429 * Set a pointer to first element in the array which holds the
5430 * reference count.
5431 */
5432 for (i = 0; i < queue_count; i++)
5433 rx[i].first = rx;
5434#endif
5435
5436 dev = PTR_ALIGN(p, NETDEV_ALIGN); 5853 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5437 dev->padded = (char *)dev - (char *)p; 5854 dev->padded = (char *)dev - (char *)p;
5438 5855
5856 dev->pcpu_refcnt = alloc_percpu(int);
5857 if (!dev->pcpu_refcnt)
5858 goto free_p;
5859
5439 if (dev_addr_init(dev)) 5860 if (dev_addr_init(dev))
5440 goto free_rx; 5861 goto free_pcpu;
5441 5862
5442 dev_mc_init(dev); 5863 dev_mc_init(dev);
5443 dev_uc_init(dev); 5864 dev_uc_init(dev);
5444 5865
5445 dev_net_set(dev, &init_net); 5866 dev_net_set(dev, &init_net);
5446 5867
5447 dev->_tx = tx;
5448 dev->num_tx_queues = queue_count;
5449 dev->real_num_tx_queues = queue_count;
5450
5451#ifdef CONFIG_RPS
5452 dev->_rx = rx;
5453 dev->num_rx_queues = queue_count;
5454#endif
5455
5456 dev->gso_max_size = GSO_MAX_SIZE; 5868 dev->gso_max_size = GSO_MAX_SIZE;
5457 5869
5458 netdev_init_queues(dev);
5459
5460 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list); 5870 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5461 dev->ethtool_ntuple_list.count = 0; 5871 dev->ethtool_ntuple_list.count = 0;
5462 INIT_LIST_HEAD(&dev->napi_list); 5872 INIT_LIST_HEAD(&dev->napi_list);
@@ -5464,20 +5874,39 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5464 INIT_LIST_HEAD(&dev->link_watch_list); 5874 INIT_LIST_HEAD(&dev->link_watch_list);
5465 dev->priv_flags = IFF_XMIT_DST_RELEASE; 5875 dev->priv_flags = IFF_XMIT_DST_RELEASE;
5466 setup(dev); 5876 setup(dev);
5877
5878 dev->num_tx_queues = txqs;
5879 dev->real_num_tx_queues = txqs;
5880 if (netif_alloc_netdev_queues(dev))
5881 goto free_all;
5882
5883#ifdef CONFIG_RPS
5884 dev->num_rx_queues = rxqs;
5885 dev->real_num_rx_queues = rxqs;
5886 if (netif_alloc_rx_queues(dev))
5887 goto free_all;
5888#endif
5889
5467 strcpy(dev->name, name); 5890 strcpy(dev->name, name);
5891 dev->group = INIT_NETDEV_GROUP;
5468 return dev; 5892 return dev;
5469 5893
5470free_rx: 5894free_all:
5895 free_netdev(dev);
5896 return NULL;
5897
5898free_pcpu:
5899 free_percpu(dev->pcpu_refcnt);
5900 kfree(dev->_tx);
5471#ifdef CONFIG_RPS 5901#ifdef CONFIG_RPS
5472 kfree(rx); 5902 kfree(dev->_rx);
5473free_tx:
5474#endif 5903#endif
5475 kfree(tx); 5904
5476free_p: 5905free_p:
5477 kfree(p); 5906 kfree(p);
5478 return NULL; 5907 return NULL;
5479} 5908}
5480EXPORT_SYMBOL(alloc_netdev_mq); 5909EXPORT_SYMBOL(alloc_netdev_mqs);
5481 5910
5482/** 5911/**
5483 * free_netdev - free network device 5912 * free_netdev - free network device
@@ -5494,6 +5923,11 @@ void free_netdev(struct net_device *dev)
5494 release_net(dev_net(dev)); 5923 release_net(dev_net(dev));
5495 5924
5496 kfree(dev->_tx); 5925 kfree(dev->_tx);
5926#ifdef CONFIG_RPS
5927 kfree(dev->_rx);
5928#endif
5929
5930 kfree(rcu_dereference_raw(dev->ingress_queue));
5497 5931
5498 /* Flush device addresses */ 5932 /* Flush device addresses */
5499 dev_addr_flush(dev); 5933 dev_addr_flush(dev);
@@ -5504,6 +5938,9 @@ void free_netdev(struct net_device *dev)
5504 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) 5938 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5505 netif_napi_del(p); 5939 netif_napi_del(p);
5506 5940
5941 free_percpu(dev->pcpu_refcnt);
5942 dev->pcpu_refcnt = NULL;
5943
5507 /* Compatibility with error handling in drivers */ 5944 /* Compatibility with error handling in drivers */
5508 if (dev->reg_state == NETREG_UNINITIALIZED) { 5945 if (dev->reg_state == NETREG_UNINITIALIZED) {
5509 kfree((char *)dev - dev->padded); 5946 kfree((char *)dev - dev->padded);
@@ -5527,7 +5964,10 @@ EXPORT_SYMBOL(free_netdev);
5527void synchronize_net(void) 5964void synchronize_net(void)
5528{ 5965{
5529 might_sleep(); 5966 might_sleep();
5530 synchronize_rcu(); 5967 if (rtnl_is_locked())
5968 synchronize_rcu_expedited();
5969 else
5970 synchronize_rcu();
5531} 5971}
5532EXPORT_SYMBOL(synchronize_net); 5972EXPORT_SYMBOL(synchronize_net);
5533 5973
@@ -5636,7 +6076,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
5636 /* We get here if we can't use the current device name */ 6076 /* We get here if we can't use the current device name */
5637 if (!pat) 6077 if (!pat)
5638 goto out; 6078 goto out;
5639 if (dev_get_valid_name(dev, pat, 1)) 6079 if (dev_get_valid_name(dev, pat) < 0)
5640 goto out; 6080 goto out;
5641 } 6081 }
5642 6082
@@ -5658,6 +6098,10 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
5658 6098
5659 /* Notify protocols, that we are about to destroy 6099 /* Notify protocols, that we are about to destroy
5660 this device. They should clean all the things. 6100 this device. They should clean all the things.
6101
6102 Note that dev->reg_state stays at NETREG_REGISTERED.
6103 This is wanted because this way 8021q and macvlan know
6104 the device is just moving and can keep their slaves up.
5661 */ 6105 */
5662 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 6106 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5663 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); 6107 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
@@ -5734,6 +6178,11 @@ static int dev_cpu_callback(struct notifier_block *nfb,
5734 oldsd->output_queue = NULL; 6178 oldsd->output_queue = NULL;
5735 oldsd->output_queue_tailp = &oldsd->output_queue; 6179 oldsd->output_queue_tailp = &oldsd->output_queue;
5736 } 6180 }
6181 /* Append NAPI poll list from offline CPU. */
6182 if (!list_empty(&oldsd->poll_list)) {
6183 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6184 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6185 }
5737 6186
5738 raise_softirq_irqoff(NET_TX_SOFTIRQ); 6187 raise_softirq_irqoff(NET_TX_SOFTIRQ);
5739 local_irq_enable(); 6188 local_irq_enable();
@@ -5762,32 +6211,22 @@ static int dev_cpu_callback(struct notifier_block *nfb,
5762 * @one to the master device with current feature set @all. Will not 6211 * @one to the master device with current feature set @all. Will not
5763 * enable anything that is off in @mask. Returns the new feature set. 6212 * enable anything that is off in @mask. Returns the new feature set.
5764 */ 6213 */
5765unsigned long netdev_increment_features(unsigned long all, unsigned long one, 6214u32 netdev_increment_features(u32 all, u32 one, u32 mask)
5766 unsigned long mask)
5767{ 6215{
5768 /* If device needs checksumming, downgrade to it. */ 6216 if (mask & NETIF_F_GEN_CSUM)
5769 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM)) 6217 mask |= NETIF_F_ALL_CSUM;
5770 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM); 6218 mask |= NETIF_F_VLAN_CHALLENGED;
5771 else if (mask & NETIF_F_ALL_CSUM) {
5772 /* If one device supports v4/v6 checksumming, set for all. */
5773 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5774 !(all & NETIF_F_GEN_CSUM)) {
5775 all &= ~NETIF_F_ALL_CSUM;
5776 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5777 }
5778 6219
5779 /* If one device supports hw checksumming, set for all. */ 6220 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
5780 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) { 6221 all &= one | ~NETIF_F_ALL_FOR_ALL;
5781 all &= ~NETIF_F_ALL_CSUM;
5782 all |= NETIF_F_HW_CSUM;
5783 }
5784 }
5785 6222
5786 one |= NETIF_F_ALL_CSUM; 6223 /* If device needs checksumming, downgrade to it. */
6224 if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6225 all &= ~NETIF_F_NO_CSUM;
5787 6226
5788 one |= all & NETIF_F_ONE_FOR_ALL; 6227 /* If one device supports hw checksumming, set for all. */
5789 all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO; 6228 if (all & NETIF_F_GEN_CSUM)
5790 all |= one & mask & NETIF_F_ONE_FOR_ALL; 6229 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
5791 6230
5792 return all; 6231 return all;
5793} 6232}
@@ -5830,29 +6269,23 @@ err_name:
5830/** 6269/**
5831 * netdev_drivername - network driver for the device 6270 * netdev_drivername - network driver for the device
5832 * @dev: network device 6271 * @dev: network device
5833 * @buffer: buffer for resulting name
5834 * @len: size of buffer
5835 * 6272 *
5836 * Determine network driver for device. 6273 * Determine network driver for device.
5837 */ 6274 */
5838char *netdev_drivername(const struct net_device *dev, char *buffer, int len) 6275const char *netdev_drivername(const struct net_device *dev)
5839{ 6276{
5840 const struct device_driver *driver; 6277 const struct device_driver *driver;
5841 const struct device *parent; 6278 const struct device *parent;
5842 6279 const char *empty = "";
5843 if (len <= 0 || !buffer)
5844 return buffer;
5845 buffer[0] = 0;
5846 6280
5847 parent = dev->dev.parent; 6281 parent = dev->dev.parent;
5848
5849 if (!parent) 6282 if (!parent)
5850 return buffer; 6283 return empty;
5851 6284
5852 driver = parent->driver; 6285 driver = parent->driver;
5853 if (driver && driver->name) 6286 if (driver && driver->name)
5854 strlcpy(buffer, driver->name, len); 6287 return driver->name;
5855 return buffer; 6288 return empty;
5856} 6289}
5857 6290
5858static int __netdev_printk(const char *level, const struct net_device *dev, 6291static int __netdev_printk(const char *level, const struct net_device *dev,
@@ -5948,7 +6381,7 @@ static void __net_exit default_device_exit(struct net *net)
5948 if (dev->rtnl_link_ops) 6381 if (dev->rtnl_link_ops)
5949 continue; 6382 continue;
5950 6383
5951 /* Push remaing network devices to init_net */ 6384 /* Push remaining network devices to init_net */
5952 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); 6385 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5953 err = dev_change_net_namespace(dev, &init_net, fb_name); 6386 err = dev_change_net_namespace(dev, &init_net, fb_name);
5954 if (err) { 6387 if (err) {
@@ -5963,7 +6396,7 @@ static void __net_exit default_device_exit(struct net *net)
5963static void __net_exit default_device_exit_batch(struct list_head *net_list) 6396static void __net_exit default_device_exit_batch(struct list_head *net_list)
5964{ 6397{
5965 /* At exit all network devices must be removed from a network 6398 /* At exit all network devices must be removed from a network
5966 * namespace. Do this in the reverse order of registeration. 6399 * namespace. Do this in the reverse order of registration.
5967 * Do this across as many network namespaces as possible to 6400 * Do this across as many network namespaces as possible to
5968 * improve batching efficiency. 6401 * improve batching efficiency.
5969 */ 6402 */
@@ -5981,6 +6414,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
5981 } 6414 }
5982 } 6415 }
5983 unregister_netdevice_many(&dev_kill_list); 6416 unregister_netdevice_many(&dev_kill_list);
6417 list_del(&dev_kill_list);
5984 rtnl_unlock(); 6418 rtnl_unlock();
5985} 6419}
5986 6420
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 508f9c18992f..e2e66939ed00 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -68,14 +68,6 @@ static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
68 return __hw_addr_add_ex(list, addr, addr_len, addr_type, false); 68 return __hw_addr_add_ex(list, addr, addr_len, addr_type, false);
69} 69}
70 70
71static void ha_rcu_free(struct rcu_head *head)
72{
73 struct netdev_hw_addr *ha;
74
75 ha = container_of(head, struct netdev_hw_addr, rcu_head);
76 kfree(ha);
77}
78
79static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, 71static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
80 unsigned char *addr, int addr_len, 72 unsigned char *addr, int addr_len,
81 unsigned char addr_type, bool global) 73 unsigned char addr_type, bool global)
@@ -94,7 +86,7 @@ static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
94 if (--ha->refcount) 86 if (--ha->refcount)
95 return 0; 87 return 0;
96 list_del_rcu(&ha->list); 88 list_del_rcu(&ha->list);
97 call_rcu(&ha->rcu_head, ha_rcu_free); 89 kfree_rcu(ha, rcu_head);
98 list->count--; 90 list->count--;
99 return 0; 91 return 0;
100 } 92 }
@@ -144,7 +136,7 @@ void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
144 136
145 list_for_each_entry(ha, &from_list->list, list) { 137 list_for_each_entry(ha, &from_list->list, list) {
146 type = addr_type ? addr_type : ha->type; 138 type = addr_type ? addr_type : ha->type;
147 __hw_addr_del(to_list, ha->addr, addr_len, addr_type); 139 __hw_addr_del(to_list, ha->addr, addr_len, type);
148 } 140 }
149} 141}
150EXPORT_SYMBOL(__hw_addr_del_multiple); 142EXPORT_SYMBOL(__hw_addr_del_multiple);
@@ -197,7 +189,7 @@ void __hw_addr_flush(struct netdev_hw_addr_list *list)
197 189
198 list_for_each_entry_safe(ha, tmp, &list->list, list) { 190 list_for_each_entry_safe(ha, tmp, &list->list, list) {
199 list_del_rcu(&ha->list); 191 list_del_rcu(&ha->list);
200 call_rcu(&ha->rcu_head, ha_rcu_free); 192 kfree_rcu(ha, rcu_head);
201 } 193 }
202 list->count = 0; 194 list->count = 0;
203} 195}
@@ -357,8 +349,8 @@ EXPORT_SYMBOL(dev_addr_add_multiple);
357/** 349/**
358 * dev_addr_del_multiple - Delete device addresses by another device 350 * dev_addr_del_multiple - Delete device addresses by another device
359 * @to_dev: device where the addresses will be deleted 351 * @to_dev: device where the addresses will be deleted
360 * @from_dev: device by which addresses the addresses will be deleted 352 * @from_dev: device supplying the addresses to be deleted
361 * @addr_type: address type - 0 means type will used from from_dev 353 * @addr_type: address type - 0 means type will be used from from_dev
362 * 354 *
363 * Deletes addresses in to device by the list of addresses in from device. 355 * Deletes addresses in to device by the list of addresses in from device.
364 * 356 *
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 36e603c78ce9..7f36b38e060f 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -207,14 +207,6 @@ static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi)
207 rcu_read_unlock(); 207 rcu_read_unlock();
208} 208}
209 209
210
211static void free_dm_hw_stat(struct rcu_head *head)
212{
213 struct dm_hw_stat_delta *n;
214 n = container_of(head, struct dm_hw_stat_delta, rcu);
215 kfree(n);
216}
217
218static int set_all_monitor_traces(int state) 210static int set_all_monitor_traces(int state)
219{ 211{
220 int rc = 0; 212 int rc = 0;
@@ -245,7 +237,7 @@ static int set_all_monitor_traces(int state)
245 list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { 237 list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
246 if (new_stat->dev == NULL) { 238 if (new_stat->dev == NULL) {
247 list_del_rcu(&new_stat->list); 239 list_del_rcu(&new_stat->list);
248 call_rcu(&new_stat->rcu, free_dm_hw_stat); 240 kfree_rcu(new_stat, rcu);
249 } 241 }
250 } 242 }
251 break; 243 break;
@@ -314,7 +306,7 @@ static int dropmon_net_event(struct notifier_block *ev_block,
314 new_stat->dev = NULL; 306 new_stat->dev = NULL;
315 if (trace_state == TRACE_OFF) { 307 if (trace_state == TRACE_OFF) {
316 list_del_rcu(&new_stat->list); 308 list_del_rcu(&new_stat->list);
317 call_rcu(&new_stat->rcu, free_dm_hw_stat); 309 kfree_rcu(new_stat, rcu);
318 break; 310 break;
319 } 311 }
320 } 312 }
@@ -350,7 +342,7 @@ static int __init init_net_drop_monitor(void)
350 struct per_cpu_dm_data *data; 342 struct per_cpu_dm_data *data;
351 int cpu, rc; 343 int cpu, rc;
352 344
353 printk(KERN_INFO "Initalizing network drop monitor service\n"); 345 printk(KERN_INFO "Initializing network drop monitor service\n");
354 346
355 if (sizeof(void *) > 8) { 347 if (sizeof(void *) > 8) {
356 printk(KERN_ERR "Unable to store program counters on this arch, Drop monitor failed\n"); 348 printk(KERN_ERR "Unable to store program counters on this arch, Drop monitor failed\n");
diff --git a/net/core/dst.c b/net/core/dst.c
index 6c41b1fac3db..6135f3671692 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -19,6 +19,7 @@
19#include <linux/types.h> 19#include <linux/types.h>
20#include <net/net_namespace.h> 20#include <net/net_namespace.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/prefetch.h>
22 23
23#include <net/dst.h> 24#include <net/dst.h>
24 25
@@ -33,9 +34,6 @@
33 * 3) This list is guarded by a mutex, 34 * 3) This list is guarded by a mutex,
34 * so that the gc_task and dst_dev_event() can be synchronized. 35 * so that the gc_task and dst_dev_event() can be synchronized.
35 */ 36 */
36#if RT_CACHE_DEBUG >= 2
37static atomic_t dst_total = ATOMIC_INIT(0);
38#endif
39 37
40/* 38/*
41 * We want to keep lock & list close together 39 * We want to keep lock & list close together
@@ -69,10 +67,6 @@ static void dst_gc_task(struct work_struct *work)
69 unsigned long expires = ~0L; 67 unsigned long expires = ~0L;
70 struct dst_entry *dst, *next, head; 68 struct dst_entry *dst, *next, head;
71 struct dst_entry *last = &head; 69 struct dst_entry *last = &head;
72#if RT_CACHE_DEBUG >= 2
73 ktime_t time_start = ktime_get();
74 struct timespec elapsed;
75#endif
76 70
77 mutex_lock(&dst_gc_mutex); 71 mutex_lock(&dst_gc_mutex);
78 next = dst_busy_list; 72 next = dst_busy_list;
@@ -146,15 +140,6 @@ loop:
146 140
147 spin_unlock_bh(&dst_garbage.lock); 141 spin_unlock_bh(&dst_garbage.lock);
148 mutex_unlock(&dst_gc_mutex); 142 mutex_unlock(&dst_gc_mutex);
149#if RT_CACHE_DEBUG >= 2
150 elapsed = ktime_to_timespec(ktime_sub(ktime_get(), time_start));
151 printk(KERN_DEBUG "dst_total: %d delayed: %d work_perf: %d"
152 " expires: %lu elapsed: %lu us\n",
153 atomic_read(&dst_total), delayed, work_performed,
154 expires,
155 elapsed.tv_sec * USEC_PER_SEC +
156 elapsed.tv_nsec / NSEC_PER_USEC);
157#endif
158} 143}
159 144
160int dst_discard(struct sk_buff *skb) 145int dst_discard(struct sk_buff *skb)
@@ -164,26 +149,49 @@ int dst_discard(struct sk_buff *skb)
164} 149}
165EXPORT_SYMBOL(dst_discard); 150EXPORT_SYMBOL(dst_discard);
166 151
167void *dst_alloc(struct dst_ops *ops) 152const u32 dst_default_metrics[RTAX_MAX];
153
154void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
155 int initial_ref, int initial_obsolete, int flags)
168{ 156{
169 struct dst_entry *dst; 157 struct dst_entry *dst;
170 158
171 if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) { 159 if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
172 if (ops->gc(ops)) 160 if (ops->gc(ops))
173 return NULL; 161 return NULL;
174 } 162 }
175 dst = kmem_cache_zalloc(ops->kmem_cachep, GFP_ATOMIC); 163 dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
176 if (!dst) 164 if (!dst)
177 return NULL; 165 return NULL;
178 atomic_set(&dst->__refcnt, 0); 166 dst->child = NULL;
167 dst->dev = dev;
168 if (dev)
169 dev_hold(dev);
179 dst->ops = ops; 170 dst->ops = ops;
180 dst->lastuse = jiffies; 171 dst_init_metrics(dst, dst_default_metrics, true);
172 dst->expires = 0UL;
181 dst->path = dst; 173 dst->path = dst;
182 dst->input = dst->output = dst_discard; 174 dst->neighbour = NULL;
183#if RT_CACHE_DEBUG >= 2 175 dst->hh = NULL;
184 atomic_inc(&dst_total); 176#ifdef CONFIG_XFRM
177 dst->xfrm = NULL;
185#endif 178#endif
186 atomic_inc(&ops->entries); 179 dst->input = dst_discard;
180 dst->output = dst_discard;
181 dst->error = 0;
182 dst->obsolete = initial_obsolete;
183 dst->header_len = 0;
184 dst->trailer_len = 0;
185#ifdef CONFIG_IP_ROUTE_CLASSID
186 dst->tclassid = 0;
187#endif
188 atomic_set(&dst->__refcnt, initial_ref);
189 dst->__use = 0;
190 dst->lastuse = jiffies;
191 dst->flags = flags;
192 dst->next = NULL;
193 if (!(flags & DST_NOCOUNT))
194 dst_entries_add(ops, 1);
187 return dst; 195 return dst;
188} 196}
189EXPORT_SYMBOL(dst_alloc); 197EXPORT_SYMBOL(dst_alloc);
@@ -228,23 +236,21 @@ again:
228 child = dst->child; 236 child = dst->child;
229 237
230 dst->hh = NULL; 238 dst->hh = NULL;
231 if (hh && atomic_dec_and_test(&hh->hh_refcnt)) 239 if (hh)
232 kfree(hh); 240 hh_cache_put(hh);
233 241
234 if (neigh) { 242 if (neigh) {
235 dst->neighbour = NULL; 243 dst->neighbour = NULL;
236 neigh_release(neigh); 244 neigh_release(neigh);
237 } 245 }
238 246
239 atomic_dec(&dst->ops->entries); 247 if (!(dst->flags & DST_NOCOUNT))
248 dst_entries_add(dst->ops, -1);
240 249
241 if (dst->ops->destroy) 250 if (dst->ops->destroy)
242 dst->ops->destroy(dst); 251 dst->ops->destroy(dst);
243 if (dst->dev) 252 if (dst->dev)
244 dev_put(dst->dev); 253 dev_put(dst->dev);
245#if RT_CACHE_DEBUG >= 2
246 atomic_dec(&dst_total);
247#endif
248 kmem_cache_free(dst->ops->kmem_cachep, dst); 254 kmem_cache_free(dst->ops->kmem_cachep, dst);
249 255
250 dst = child; 256 dst = child;
@@ -271,13 +277,76 @@ void dst_release(struct dst_entry *dst)
271 if (dst) { 277 if (dst) {
272 int newrefcnt; 278 int newrefcnt;
273 279
274 smp_mb__before_atomic_dec();
275 newrefcnt = atomic_dec_return(&dst->__refcnt); 280 newrefcnt = atomic_dec_return(&dst->__refcnt);
276 WARN_ON(newrefcnt < 0); 281 WARN_ON(newrefcnt < 0);
282 if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) {
283 dst = dst_destroy(dst);
284 if (dst)
285 __dst_free(dst);
286 }
277 } 287 }
278} 288}
279EXPORT_SYMBOL(dst_release); 289EXPORT_SYMBOL(dst_release);
280 290
291u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
292{
293 u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
294
295 if (p) {
296 u32 *old_p = __DST_METRICS_PTR(old);
297 unsigned long prev, new;
298
299 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
300
301 new = (unsigned long) p;
302 prev = cmpxchg(&dst->_metrics, old, new);
303
304 if (prev != old) {
305 kfree(p);
306 p = __DST_METRICS_PTR(prev);
307 if (prev & DST_METRICS_READ_ONLY)
308 p = NULL;
309 }
310 }
311 return p;
312}
313EXPORT_SYMBOL(dst_cow_metrics_generic);
314
315/* Caller asserts that dst_metrics_read_only(dst) is false. */
316void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
317{
318 unsigned long prev, new;
319
320 new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY;
321 prev = cmpxchg(&dst->_metrics, old, new);
322 if (prev == old)
323 kfree(__DST_METRICS_PTR(old));
324}
325EXPORT_SYMBOL(__dst_destroy_metrics_generic);
326
327/**
328 * skb_dst_set_noref - sets skb dst, without a reference
329 * @skb: buffer
330 * @dst: dst entry
331 *
332 * Sets skb dst, assuming a reference was not taken on dst
333 * skb_dst_drop() should not dst_release() this dst
334 */
335void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
336{
337 WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
338 /* If dst not in cache, we must take a reference, because
339 * dst_release() will destroy dst as soon as its refcount becomes zero
340 */
341 if (unlikely(dst->flags & DST_NOCACHE)) {
342 dst_hold(dst);
343 skb_dst_set(skb, dst);
344 } else {
345 skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
346 }
347}
348EXPORT_SYMBOL(skb_dst_set_noref);
349
281/* Dirty hack. We did it in 2.2 (in __dst_free), 350/* Dirty hack. We did it in 2.2 (in __dst_free),
282 * we have _very_ good reasons not to repeat 351 * we have _very_ good reasons not to repeat
283 * this mistake in 2.3, but we have no choice 352 * this mistake in 2.3, but we have no choice
@@ -343,6 +412,7 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event,
343 412
344static struct notifier_block dst_dev_notifier = { 413static struct notifier_block dst_dev_notifier = {
345 .notifier_call = dst_dev_event, 414 .notifier_call = dst_dev_event,
415 .priority = -10, /* must be called after other network notifiers */
346}; 416};
347 417
348void __init dst_init(void) 418void __init dst_init(void)
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 8451ab481095..fd14116ad7f0 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -19,7 +19,10 @@
19#include <linux/netdevice.h> 19#include <linux/netdevice.h>
20#include <linux/bitops.h> 20#include <linux/bitops.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/vmalloc.h>
22#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/rtnetlink.h>
25#include <linux/sched.h>
23 26
24/* 27/*
25 * Some useful ethtool_ops methods that're device independent. 28 * Some useful ethtool_ops methods that're device independent.
@@ -33,12 +36,6 @@ u32 ethtool_op_get_link(struct net_device *dev)
33} 36}
34EXPORT_SYMBOL(ethtool_op_get_link); 37EXPORT_SYMBOL(ethtool_op_get_link);
35 38
36u32 ethtool_op_get_rx_csum(struct net_device *dev)
37{
38 return (dev->features & NETIF_F_ALL_CSUM) != 0;
39}
40EXPORT_SYMBOL(ethtool_op_get_rx_csum);
41
42u32 ethtool_op_get_tx_csum(struct net_device *dev) 39u32 ethtool_op_get_tx_csum(struct net_device *dev)
43{ 40{
44 return (dev->features & NETIF_F_ALL_CSUM) != 0; 41 return (dev->features & NETIF_F_ALL_CSUM) != 0;
@@ -54,6 +51,7 @@ int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
54 51
55 return 0; 52 return 0;
56} 53}
54EXPORT_SYMBOL(ethtool_op_set_tx_csum);
57 55
58int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data) 56int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data)
59{ 57{
@@ -131,7 +129,8 @@ EXPORT_SYMBOL(ethtool_op_set_ufo);
131 * NETIF_F_xxx values in include/linux/netdevice.h 129 * NETIF_F_xxx values in include/linux/netdevice.h
132 */ 130 */
133static const u32 flags_dup_features = 131static const u32 flags_dup_features =
134 (ETH_FLAG_LRO | ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH); 132 (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE |
133 ETH_FLAG_RXHASH);
135 134
136u32 ethtool_op_get_flags(struct net_device *dev) 135u32 ethtool_op_get_flags(struct net_device *dev)
137{ 136{
@@ -144,9 +143,24 @@ u32 ethtool_op_get_flags(struct net_device *dev)
144} 143}
145EXPORT_SYMBOL(ethtool_op_get_flags); 144EXPORT_SYMBOL(ethtool_op_get_flags);
146 145
146/* Check if device can enable (or disable) particular feature coded in "data"
147 * argument. Flags "supported" describe features that can be toggled by device.
148 * If feature can not be toggled, it state (enabled or disabled) must match
149 * hardcoded device features state, otherwise flags are marked as invalid.
150 */
151bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported)
152{
153 u32 features = dev->features & flags_dup_features;
154 /* "data" can contain only flags_dup_features bits,
155 * see __ethtool_set_flags */
156
157 return (features & ~supported) != (data & ~supported);
158}
159EXPORT_SYMBOL(ethtool_invalid_flags);
160
147int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported) 161int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported)
148{ 162{
149 if (data & ~supported) 163 if (ethtool_invalid_flags(dev, data, supported))
150 return -EINVAL; 164 return -EINVAL;
151 165
152 dev->features = ((dev->features & ~flags_dup_features) | 166 dev->features = ((dev->features & ~flags_dup_features) |
@@ -169,6 +183,404 @@ EXPORT_SYMBOL(ethtool_ntuple_flush);
169 183
170/* Handlers for each ethtool command */ 184/* Handlers for each ethtool command */
171 185
186#define ETHTOOL_DEV_FEATURE_WORDS 1
187
188static void ethtool_get_features_compat(struct net_device *dev,
189 struct ethtool_get_features_block *features)
190{
191 if (!dev->ethtool_ops)
192 return;
193
194 /* getting RX checksum */
195 if (dev->ethtool_ops->get_rx_csum)
196 if (dev->ethtool_ops->get_rx_csum(dev))
197 features[0].active |= NETIF_F_RXCSUM;
198
199 /* mark legacy-changeable features */
200 if (dev->ethtool_ops->set_sg)
201 features[0].available |= NETIF_F_SG;
202 if (dev->ethtool_ops->set_tx_csum)
203 features[0].available |= NETIF_F_ALL_CSUM;
204 if (dev->ethtool_ops->set_tso)
205 features[0].available |= NETIF_F_ALL_TSO;
206 if (dev->ethtool_ops->set_rx_csum)
207 features[0].available |= NETIF_F_RXCSUM;
208 if (dev->ethtool_ops->set_flags)
209 features[0].available |= flags_dup_features;
210}
211
212static int ethtool_set_feature_compat(struct net_device *dev,
213 int (*legacy_set)(struct net_device *, u32),
214 struct ethtool_set_features_block *features, u32 mask)
215{
216 u32 do_set;
217
218 if (!legacy_set)
219 return 0;
220
221 if (!(features[0].valid & mask))
222 return 0;
223
224 features[0].valid &= ~mask;
225
226 do_set = !!(features[0].requested & mask);
227
228 if (legacy_set(dev, do_set) < 0)
229 netdev_info(dev,
230 "Legacy feature change (%s) failed for 0x%08x\n",
231 do_set ? "set" : "clear", mask);
232
233 return 1;
234}
235
236static int ethtool_set_flags_compat(struct net_device *dev,
237 int (*legacy_set)(struct net_device *, u32),
238 struct ethtool_set_features_block *features, u32 mask)
239{
240 u32 value;
241
242 if (!legacy_set)
243 return 0;
244
245 if (!(features[0].valid & mask))
246 return 0;
247
248 value = dev->features & ~features[0].valid;
249 value |= features[0].requested;
250
251 features[0].valid &= ~mask;
252
253 if (legacy_set(dev, value & mask) < 0)
254 netdev_info(dev, "Legacy flags change failed\n");
255
256 return 1;
257}
258
259static int ethtool_set_features_compat(struct net_device *dev,
260 struct ethtool_set_features_block *features)
261{
262 int compat;
263
264 if (!dev->ethtool_ops)
265 return 0;
266
267 compat = ethtool_set_feature_compat(dev, dev->ethtool_ops->set_sg,
268 features, NETIF_F_SG);
269 compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tx_csum,
270 features, NETIF_F_ALL_CSUM);
271 compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tso,
272 features, NETIF_F_ALL_TSO);
273 compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_rx_csum,
274 features, NETIF_F_RXCSUM);
275 compat |= ethtool_set_flags_compat(dev, dev->ethtool_ops->set_flags,
276 features, flags_dup_features);
277
278 return compat;
279}
280
281static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
282{
283 struct ethtool_gfeatures cmd = {
284 .cmd = ETHTOOL_GFEATURES,
285 .size = ETHTOOL_DEV_FEATURE_WORDS,
286 };
287 struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS] = {
288 {
289 .available = dev->hw_features,
290 .requested = dev->wanted_features,
291 .active = dev->features,
292 .never_changed = NETIF_F_NEVER_CHANGE,
293 },
294 };
295 u32 __user *sizeaddr;
296 u32 copy_size;
297
298 ethtool_get_features_compat(dev, features);
299
300 sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size);
301 if (get_user(copy_size, sizeaddr))
302 return -EFAULT;
303
304 if (copy_size > ETHTOOL_DEV_FEATURE_WORDS)
305 copy_size = ETHTOOL_DEV_FEATURE_WORDS;
306
307 if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
308 return -EFAULT;
309 useraddr += sizeof(cmd);
310 if (copy_to_user(useraddr, features, copy_size * sizeof(*features)))
311 return -EFAULT;
312
313 return 0;
314}
315
316static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
317{
318 struct ethtool_sfeatures cmd;
319 struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
320 int ret = 0;
321
322 if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
323 return -EFAULT;
324 useraddr += sizeof(cmd);
325
326 if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS)
327 return -EINVAL;
328
329 if (copy_from_user(features, useraddr, sizeof(features)))
330 return -EFAULT;
331
332 if (features[0].valid & ~NETIF_F_ETHTOOL_BITS)
333 return -EINVAL;
334
335 if (ethtool_set_features_compat(dev, features))
336 ret |= ETHTOOL_F_COMPAT;
337
338 if (features[0].valid & ~dev->hw_features) {
339 features[0].valid &= dev->hw_features;
340 ret |= ETHTOOL_F_UNSUPPORTED;
341 }
342
343 dev->wanted_features &= ~features[0].valid;
344 dev->wanted_features |= features[0].valid & features[0].requested;
345 __netdev_update_features(dev);
346
347 if ((dev->wanted_features ^ dev->features) & features[0].valid)
348 ret |= ETHTOOL_F_WISH;
349
350 return ret;
351}
352
353static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GSTRING_LEN] = {
354 /* NETIF_F_SG */ "tx-scatter-gather",
355 /* NETIF_F_IP_CSUM */ "tx-checksum-ipv4",
356 /* NETIF_F_NO_CSUM */ "tx-checksum-unneeded",
357 /* NETIF_F_HW_CSUM */ "tx-checksum-ip-generic",
358 /* NETIF_F_IPV6_CSUM */ "tx-checksum-ipv6",
359 /* NETIF_F_HIGHDMA */ "highdma",
360 /* NETIF_F_FRAGLIST */ "tx-scatter-gather-fraglist",
361 /* NETIF_F_HW_VLAN_TX */ "tx-vlan-hw-insert",
362
363 /* NETIF_F_HW_VLAN_RX */ "rx-vlan-hw-parse",
364 /* NETIF_F_HW_VLAN_FILTER */ "rx-vlan-filter",
365 /* NETIF_F_VLAN_CHALLENGED */ "vlan-challenged",
366 /* NETIF_F_GSO */ "tx-generic-segmentation",
367 /* NETIF_F_LLTX */ "tx-lockless",
368 /* NETIF_F_NETNS_LOCAL */ "netns-local",
369 /* NETIF_F_GRO */ "rx-gro",
370 /* NETIF_F_LRO */ "rx-lro",
371
372 /* NETIF_F_TSO */ "tx-tcp-segmentation",
373 /* NETIF_F_UFO */ "tx-udp-fragmentation",
374 /* NETIF_F_GSO_ROBUST */ "tx-gso-robust",
375 /* NETIF_F_TSO_ECN */ "tx-tcp-ecn-segmentation",
376 /* NETIF_F_TSO6 */ "tx-tcp6-segmentation",
377 /* NETIF_F_FSO */ "tx-fcoe-segmentation",
378 "",
379 "",
380
381 /* NETIF_F_FCOE_CRC */ "tx-checksum-fcoe-crc",
382 /* NETIF_F_SCTP_CSUM */ "tx-checksum-sctp",
383 /* NETIF_F_FCOE_MTU */ "fcoe-mtu",
384 /* NETIF_F_NTUPLE */ "rx-ntuple-filter",
385 /* NETIF_F_RXHASH */ "rx-hashing",
386 /* NETIF_F_RXCSUM */ "rx-checksum",
387 /* NETIF_F_NOCACHE_COPY */ "tx-nocache-copy",
388 /* NETIF_F_LOOPBACK */ "loopback",
389};
390
391static int __ethtool_get_sset_count(struct net_device *dev, int sset)
392{
393 const struct ethtool_ops *ops = dev->ethtool_ops;
394
395 if (sset == ETH_SS_FEATURES)
396 return ARRAY_SIZE(netdev_features_strings);
397
398 if (ops && ops->get_sset_count && ops->get_strings)
399 return ops->get_sset_count(dev, sset);
400 else
401 return -EOPNOTSUPP;
402}
403
404static void __ethtool_get_strings(struct net_device *dev,
405 u32 stringset, u8 *data)
406{
407 const struct ethtool_ops *ops = dev->ethtool_ops;
408
409 if (stringset == ETH_SS_FEATURES)
410 memcpy(data, netdev_features_strings,
411 sizeof(netdev_features_strings));
412 else
413 /* ops->get_strings is valid because checked earlier */
414 ops->get_strings(dev, stringset, data);
415}
416
417static u32 ethtool_get_feature_mask(u32 eth_cmd)
418{
419 /* feature masks of legacy discrete ethtool ops */
420
421 switch (eth_cmd) {
422 case ETHTOOL_GTXCSUM:
423 case ETHTOOL_STXCSUM:
424 return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM;
425 case ETHTOOL_GRXCSUM:
426 case ETHTOOL_SRXCSUM:
427 return NETIF_F_RXCSUM;
428 case ETHTOOL_GSG:
429 case ETHTOOL_SSG:
430 return NETIF_F_SG;
431 case ETHTOOL_GTSO:
432 case ETHTOOL_STSO:
433 return NETIF_F_ALL_TSO;
434 case ETHTOOL_GUFO:
435 case ETHTOOL_SUFO:
436 return NETIF_F_UFO;
437 case ETHTOOL_GGSO:
438 case ETHTOOL_SGSO:
439 return NETIF_F_GSO;
440 case ETHTOOL_GGRO:
441 case ETHTOOL_SGRO:
442 return NETIF_F_GRO;
443 default:
444 BUG();
445 }
446}
447
448static void *__ethtool_get_one_feature_actor(struct net_device *dev, u32 ethcmd)
449{
450 const struct ethtool_ops *ops = dev->ethtool_ops;
451
452 if (!ops)
453 return NULL;
454
455 switch (ethcmd) {
456 case ETHTOOL_GTXCSUM:
457 return ops->get_tx_csum;
458 case ETHTOOL_GRXCSUM:
459 return ops->get_rx_csum;
460 case ETHTOOL_SSG:
461 return ops->get_sg;
462 case ETHTOOL_STSO:
463 return ops->get_tso;
464 case ETHTOOL_SUFO:
465 return ops->get_ufo;
466 default:
467 return NULL;
468 }
469}
470
471static u32 __ethtool_get_rx_csum_oldbug(struct net_device *dev)
472{
473 return !!(dev->features & NETIF_F_ALL_CSUM);
474}
475
476static int ethtool_get_one_feature(struct net_device *dev,
477 char __user *useraddr, u32 ethcmd)
478{
479 u32 mask = ethtool_get_feature_mask(ethcmd);
480 struct ethtool_value edata = {
481 .cmd = ethcmd,
482 .data = !!(dev->features & mask),
483 };
484
485 /* compatibility with discrete get_ ops */
486 if (!(dev->hw_features & mask)) {
487 u32 (*actor)(struct net_device *);
488
489 actor = __ethtool_get_one_feature_actor(dev, ethcmd);
490
491 /* bug compatibility with old get_rx_csum */
492 if (ethcmd == ETHTOOL_GRXCSUM && !actor)
493 actor = __ethtool_get_rx_csum_oldbug;
494
495 if (actor)
496 edata.data = actor(dev);
497 }
498
499 if (copy_to_user(useraddr, &edata, sizeof(edata)))
500 return -EFAULT;
501 return 0;
502}
503
504static int __ethtool_set_tx_csum(struct net_device *dev, u32 data);
505static int __ethtool_set_rx_csum(struct net_device *dev, u32 data);
506static int __ethtool_set_sg(struct net_device *dev, u32 data);
507static int __ethtool_set_tso(struct net_device *dev, u32 data);
508static int __ethtool_set_ufo(struct net_device *dev, u32 data);
509
510static int ethtool_set_one_feature(struct net_device *dev,
511 void __user *useraddr, u32 ethcmd)
512{
513 struct ethtool_value edata;
514 u32 mask;
515
516 if (copy_from_user(&edata, useraddr, sizeof(edata)))
517 return -EFAULT;
518
519 mask = ethtool_get_feature_mask(ethcmd);
520 mask &= dev->hw_features;
521 if (mask) {
522 if (edata.data)
523 dev->wanted_features |= mask;
524 else
525 dev->wanted_features &= ~mask;
526
527 __netdev_update_features(dev);
528 return 0;
529 }
530
531 /* Driver is not converted to ndo_fix_features or does not
532 * support changing this offload. In the latter case it won't
533 * have corresponding ethtool_ops field set.
534 *
535 * Following part is to be removed after all drivers advertise
536 * their changeable features in netdev->hw_features and stop
537 * using discrete offload setting ops.
538 */
539
540 switch (ethcmd) {
541 case ETHTOOL_STXCSUM:
542 return __ethtool_set_tx_csum(dev, edata.data);
543 case ETHTOOL_SRXCSUM:
544 return __ethtool_set_rx_csum(dev, edata.data);
545 case ETHTOOL_SSG:
546 return __ethtool_set_sg(dev, edata.data);
547 case ETHTOOL_STSO:
548 return __ethtool_set_tso(dev, edata.data);
549 case ETHTOOL_SUFO:
550 return __ethtool_set_ufo(dev, edata.data);
551 default:
552 return -EOPNOTSUPP;
553 }
554}
555
556int __ethtool_set_flags(struct net_device *dev, u32 data)
557{
558 u32 changed;
559
560 if (data & ~flags_dup_features)
561 return -EINVAL;
562
563 /* legacy set_flags() op */
564 if (dev->ethtool_ops->set_flags) {
565 if (unlikely(dev->hw_features & flags_dup_features))
566 netdev_warn(dev,
567 "driver BUG: mixed hw_features and set_flags()\n");
568 return dev->ethtool_ops->set_flags(dev, data);
569 }
570
571 /* allow changing only bits set in hw_features */
572 changed = (data ^ dev->features) & flags_dup_features;
573 if (changed & ~dev->hw_features)
574 return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP;
575
576 dev->wanted_features =
577 (dev->wanted_features & ~changed) | (data & dev->hw_features);
578
579 __netdev_update_features(dev);
580
581 return 0;
582}
583
172static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) 584static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
173{ 585{
174 struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET }; 586 struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET };
@@ -205,18 +617,24 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
205 struct ethtool_drvinfo info; 617 struct ethtool_drvinfo info;
206 const struct ethtool_ops *ops = dev->ethtool_ops; 618 const struct ethtool_ops *ops = dev->ethtool_ops;
207 619
208 if (!ops->get_drvinfo)
209 return -EOPNOTSUPP;
210
211 memset(&info, 0, sizeof(info)); 620 memset(&info, 0, sizeof(info));
212 info.cmd = ETHTOOL_GDRVINFO; 621 info.cmd = ETHTOOL_GDRVINFO;
213 ops->get_drvinfo(dev, &info); 622 if (ops && ops->get_drvinfo) {
623 ops->get_drvinfo(dev, &info);
624 } else if (dev->dev.parent && dev->dev.parent->driver) {
625 strlcpy(info.bus_info, dev_name(dev->dev.parent),
626 sizeof(info.bus_info));
627 strlcpy(info.driver, dev->dev.parent->driver->name,
628 sizeof(info.driver));
629 } else {
630 return -EOPNOTSUPP;
631 }
214 632
215 /* 633 /*
216 * this method of obtaining string set info is deprecated; 634 * this method of obtaining string set info is deprecated;
217 * Use ETHTOOL_GSSET_INFO instead. 635 * Use ETHTOOL_GSSET_INFO instead.
218 */ 636 */
219 if (ops->get_sset_count) { 637 if (ops && ops->get_sset_count) {
220 int rc; 638 int rc;
221 639
222 rc = ops->get_sset_count(dev, ETH_SS_TEST); 640 rc = ops->get_sset_count(dev, ETH_SS_TEST);
@@ -229,9 +647,9 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
229 if (rc >= 0) 647 if (rc >= 0)
230 info.n_priv_flags = rc; 648 info.n_priv_flags = rc;
231 } 649 }
232 if (ops->get_regs_len) 650 if (ops && ops->get_regs_len)
233 info.regdump_len = ops->get_regs_len(dev); 651 info.regdump_len = ops->get_regs_len(dev);
234 if (ops->get_eeprom_len) 652 if (ops && ops->get_eeprom_len)
235 info.eedump_len = ops->get_eeprom_len(dev); 653 info.eedump_len = ops->get_eeprom_len(dev);
236 654
237 if (copy_to_user(useraddr, &info, sizeof(info))) 655 if (copy_to_user(useraddr, &info, sizeof(info)))
@@ -243,14 +661,10 @@ static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev,
243 void __user *useraddr) 661 void __user *useraddr)
244{ 662{
245 struct ethtool_sset_info info; 663 struct ethtool_sset_info info;
246 const struct ethtool_ops *ops = dev->ethtool_ops;
247 u64 sset_mask; 664 u64 sset_mask;
248 int i, idx = 0, n_bits = 0, ret, rc; 665 int i, idx = 0, n_bits = 0, ret, rc;
249 u32 *info_buf = NULL; 666 u32 *info_buf = NULL;
250 667
251 if (!ops->get_sset_count)
252 return -EOPNOTSUPP;
253
254 if (copy_from_user(&info, useraddr, sizeof(info))) 668 if (copy_from_user(&info, useraddr, sizeof(info)))
255 return -EFAULT; 669 return -EFAULT;
256 670
@@ -277,7 +691,7 @@ static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev,
277 if (!(sset_mask & (1ULL << i))) 691 if (!(sset_mask & (1ULL << i)))
278 continue; 692 continue;
279 693
280 rc = ops->get_sset_count(dev, i); 694 rc = __ethtool_get_sset_count(dev, i);
281 if (rc >= 0) { 695 if (rc >= 0) {
282 info.sset_mask |= (1ULL << i); 696 info.sset_mask |= (1ULL << i);
283 info_buf[idx++] = rc; 697 info_buf[idx++] = rc;
@@ -479,6 +893,38 @@ static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list,
479 list->count++; 893 list->count++;
480} 894}
481 895
896/*
897 * ethtool does not (or did not) set masks for flow parameters that are
898 * not specified, so if both value and mask are 0 then this must be
899 * treated as equivalent to a mask with all bits set. Implement that
900 * here rather than in drivers.
901 */
902static void rx_ntuple_fix_masks(struct ethtool_rx_ntuple_flow_spec *fs)
903{
904 struct ethtool_tcpip4_spec *entry = &fs->h_u.tcp_ip4_spec;
905 struct ethtool_tcpip4_spec *mask = &fs->m_u.tcp_ip4_spec;
906
907 if (fs->flow_type != TCP_V4_FLOW &&
908 fs->flow_type != UDP_V4_FLOW &&
909 fs->flow_type != SCTP_V4_FLOW)
910 return;
911
912 if (!(entry->ip4src | mask->ip4src))
913 mask->ip4src = htonl(0xffffffff);
914 if (!(entry->ip4dst | mask->ip4dst))
915 mask->ip4dst = htonl(0xffffffff);
916 if (!(entry->psrc | mask->psrc))
917 mask->psrc = htons(0xffff);
918 if (!(entry->pdst | mask->pdst))
919 mask->pdst = htons(0xffff);
920 if (!(entry->tos | mask->tos))
921 mask->tos = 0xff;
922 if (!(fs->vlan_tag | fs->vlan_tag_mask))
923 fs->vlan_tag_mask = 0xffff;
924 if (!(fs->data | fs->data_mask))
925 fs->data_mask = 0xffffffffffffffffULL;
926}
927
482static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev, 928static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
483 void __user *useraddr) 929 void __user *useraddr)
484{ 930{
@@ -487,12 +933,17 @@ static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
487 struct ethtool_rx_ntuple_flow_spec_container *fsc = NULL; 933 struct ethtool_rx_ntuple_flow_spec_container *fsc = NULL;
488 int ret; 934 int ret;
489 935
936 if (!ops->set_rx_ntuple)
937 return -EOPNOTSUPP;
938
490 if (!(dev->features & NETIF_F_NTUPLE)) 939 if (!(dev->features & NETIF_F_NTUPLE))
491 return -EINVAL; 940 return -EINVAL;
492 941
493 if (copy_from_user(&cmd, useraddr, sizeof(cmd))) 942 if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
494 return -EFAULT; 943 return -EFAULT;
495 944
945 rx_ntuple_fix_masks(&cmd.fs);
946
496 /* 947 /*
497 * Cache filter in dev struct for GET operation only if 948 * Cache filter in dev struct for GET operation only if
498 * the underlying driver doesn't have its own GET operation, and 949 * the underlying driver doesn't have its own GET operation, and
@@ -667,19 +1118,19 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr)
667 break; 1118 break;
668 case IP_USER_FLOW: 1119 case IP_USER_FLOW:
669 sprintf(p, "\tSrc IP addr: 0x%x\n", 1120 sprintf(p, "\tSrc IP addr: 0x%x\n",
670 fsc->fs.h_u.raw_ip4_spec.ip4src); 1121 fsc->fs.h_u.usr_ip4_spec.ip4src);
671 p += ETH_GSTRING_LEN; 1122 p += ETH_GSTRING_LEN;
672 num_strings++; 1123 num_strings++;
673 sprintf(p, "\tSrc IP mask: 0x%x\n", 1124 sprintf(p, "\tSrc IP mask: 0x%x\n",
674 fsc->fs.m_u.raw_ip4_spec.ip4src); 1125 fsc->fs.m_u.usr_ip4_spec.ip4src);
675 p += ETH_GSTRING_LEN; 1126 p += ETH_GSTRING_LEN;
676 num_strings++; 1127 num_strings++;
677 sprintf(p, "\tDest IP addr: 0x%x\n", 1128 sprintf(p, "\tDest IP addr: 0x%x\n",
678 fsc->fs.h_u.raw_ip4_spec.ip4dst); 1129 fsc->fs.h_u.usr_ip4_spec.ip4dst);
679 p += ETH_GSTRING_LEN; 1130 p += ETH_GSTRING_LEN;
680 num_strings++; 1131 num_strings++;
681 sprintf(p, "\tDest IP mask: 0x%x\n", 1132 sprintf(p, "\tDest IP mask: 0x%x\n",
682 fsc->fs.m_u.raw_ip4_spec.ip4dst); 1133 fsc->fs.m_u.usr_ip4_spec.ip4dst);
683 p += ETH_GSTRING_LEN; 1134 p += ETH_GSTRING_LEN;
684 num_strings++; 1135 num_strings++;
685 break; 1136 break;
@@ -775,7 +1226,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
775 if (regs.len > reglen) 1226 if (regs.len > reglen)
776 regs.len = reglen; 1227 regs.len = reglen;
777 1228
778 regbuf = kzalloc(reglen, GFP_USER); 1229 regbuf = vzalloc(reglen);
779 if (!regbuf) 1230 if (!regbuf)
780 return -ENOMEM; 1231 return -ENOMEM;
781 1232
@@ -790,7 +1241,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
790 ret = 0; 1241 ret = 0;
791 1242
792 out: 1243 out:
793 kfree(regbuf); 1244 vfree(regbuf);
794 return ret; 1245 return ret;
795} 1246}
796 1247
@@ -849,6 +1300,20 @@ static int ethtool_nway_reset(struct net_device *dev)
849 return dev->ethtool_ops->nway_reset(dev); 1300 return dev->ethtool_ops->nway_reset(dev);
850} 1301}
851 1302
1303static int ethtool_get_link(struct net_device *dev, char __user *useraddr)
1304{
1305 struct ethtool_value edata = { .cmd = ETHTOOL_GLINK };
1306
1307 if (!dev->ethtool_ops->get_link)
1308 return -EOPNOTSUPP;
1309
1310 edata.data = netif_running(dev) && dev->ethtool_ops->get_link(dev);
1311
1312 if (copy_to_user(useraddr, &edata, sizeof(edata)))
1313 return -EFAULT;
1314 return 0;
1315}
1316
852static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr) 1317static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)
853{ 1318{
854 struct ethtool_eeprom eeprom; 1319 struct ethtool_eeprom eeprom;
@@ -1004,6 +1469,35 @@ static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
1004 return dev->ethtool_ops->set_ringparam(dev, &ringparam); 1469 return dev->ethtool_ops->set_ringparam(dev, &ringparam);
1005} 1470}
1006 1471
1472static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
1473 void __user *useraddr)
1474{
1475 struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
1476
1477 if (!dev->ethtool_ops->get_channels)
1478 return -EOPNOTSUPP;
1479
1480 dev->ethtool_ops->get_channels(dev, &channels);
1481
1482 if (copy_to_user(useraddr, &channels, sizeof(channels)))
1483 return -EFAULT;
1484 return 0;
1485}
1486
1487static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
1488 void __user *useraddr)
1489{
1490 struct ethtool_channels channels;
1491
1492 if (!dev->ethtool_ops->set_channels)
1493 return -EOPNOTSUPP;
1494
1495 if (copy_from_user(&channels, useraddr, sizeof(channels)))
1496 return -EFAULT;
1497
1498 return dev->ethtool_ops->set_channels(dev, &channels);
1499}
1500
1007static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr) 1501static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr)
1008{ 1502{
1009 struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM }; 1503 struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM };
@@ -1035,6 +1529,12 @@ static int __ethtool_set_sg(struct net_device *dev, u32 data)
1035{ 1529{
1036 int err; 1530 int err;
1037 1531
1532 if (!dev->ethtool_ops->set_sg)
1533 return -EOPNOTSUPP;
1534
1535 if (data && !(dev->features & NETIF_F_ALL_CSUM))
1536 return -EINVAL;
1537
1038 if (!data && dev->ethtool_ops->set_tso) { 1538 if (!data && dev->ethtool_ops->set_tso) {
1039 err = dev->ethtool_ops->set_tso(dev, 0); 1539 err = dev->ethtool_ops->set_tso(dev, 0);
1040 if (err) 1540 if (err)
@@ -1049,140 +1549,55 @@ static int __ethtool_set_sg(struct net_device *dev, u32 data)
1049 return dev->ethtool_ops->set_sg(dev, data); 1549 return dev->ethtool_ops->set_sg(dev, data);
1050} 1550}
1051 1551
1052static int ethtool_set_tx_csum(struct net_device *dev, char __user *useraddr) 1552static int __ethtool_set_tx_csum(struct net_device *dev, u32 data)
1053{ 1553{
1054 struct ethtool_value edata;
1055 int err; 1554 int err;
1056 1555
1057 if (!dev->ethtool_ops->set_tx_csum) 1556 if (!dev->ethtool_ops->set_tx_csum)
1058 return -EOPNOTSUPP; 1557 return -EOPNOTSUPP;
1059 1558
1060 if (copy_from_user(&edata, useraddr, sizeof(edata))) 1559 if (!data && dev->ethtool_ops->set_sg) {
1061 return -EFAULT;
1062
1063 if (!edata.data && dev->ethtool_ops->set_sg) {
1064 err = __ethtool_set_sg(dev, 0); 1560 err = __ethtool_set_sg(dev, 0);
1065 if (err) 1561 if (err)
1066 return err; 1562 return err;
1067 } 1563 }
1068 1564
1069 return dev->ethtool_ops->set_tx_csum(dev, edata.data); 1565 return dev->ethtool_ops->set_tx_csum(dev, data);
1070} 1566}
1071EXPORT_SYMBOL(ethtool_op_set_tx_csum);
1072 1567
1073static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr) 1568static int __ethtool_set_rx_csum(struct net_device *dev, u32 data)
1074{ 1569{
1075 struct ethtool_value edata;
1076
1077 if (!dev->ethtool_ops->set_rx_csum) 1570 if (!dev->ethtool_ops->set_rx_csum)
1078 return -EOPNOTSUPP; 1571 return -EOPNOTSUPP;
1079 1572
1080 if (copy_from_user(&edata, useraddr, sizeof(edata))) 1573 if (!data)
1081 return -EFAULT;
1082
1083 if (!edata.data && dev->ethtool_ops->set_sg)
1084 dev->features &= ~NETIF_F_GRO; 1574 dev->features &= ~NETIF_F_GRO;
1085 1575
1086 return dev->ethtool_ops->set_rx_csum(dev, edata.data); 1576 return dev->ethtool_ops->set_rx_csum(dev, data);
1087} 1577}
1088 1578
1089static int ethtool_set_sg(struct net_device *dev, char __user *useraddr) 1579static int __ethtool_set_tso(struct net_device *dev, u32 data)
1090{ 1580{
1091 struct ethtool_value edata;
1092
1093 if (!dev->ethtool_ops->set_sg)
1094 return -EOPNOTSUPP;
1095
1096 if (copy_from_user(&edata, useraddr, sizeof(edata)))
1097 return -EFAULT;
1098
1099 if (edata.data &&
1100 !(dev->features & NETIF_F_ALL_CSUM))
1101 return -EINVAL;
1102
1103 return __ethtool_set_sg(dev, edata.data);
1104}
1105
1106static int ethtool_set_tso(struct net_device *dev, char __user *useraddr)
1107{
1108 struct ethtool_value edata;
1109
1110 if (!dev->ethtool_ops->set_tso) 1581 if (!dev->ethtool_ops->set_tso)
1111 return -EOPNOTSUPP; 1582 return -EOPNOTSUPP;
1112 1583
1113 if (copy_from_user(&edata, useraddr, sizeof(edata))) 1584 if (data && !(dev->features & NETIF_F_SG))
1114 return -EFAULT;
1115
1116 if (edata.data && !(dev->features & NETIF_F_SG))
1117 return -EINVAL; 1585 return -EINVAL;
1118 1586
1119 return dev->ethtool_ops->set_tso(dev, edata.data); 1587 return dev->ethtool_ops->set_tso(dev, data);
1120} 1588}
1121 1589
1122static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr) 1590static int __ethtool_set_ufo(struct net_device *dev, u32 data)
1123{ 1591{
1124 struct ethtool_value edata;
1125
1126 if (!dev->ethtool_ops->set_ufo) 1592 if (!dev->ethtool_ops->set_ufo)
1127 return -EOPNOTSUPP; 1593 return -EOPNOTSUPP;
1128 if (copy_from_user(&edata, useraddr, sizeof(edata))) 1594 if (data && !(dev->features & NETIF_F_SG))
1129 return -EFAULT;
1130 if (edata.data && !(dev->features & NETIF_F_SG))
1131 return -EINVAL; 1595 return -EINVAL;
1132 if (edata.data && !(dev->features & NETIF_F_HW_CSUM)) 1596 if (data && !((dev->features & NETIF_F_GEN_CSUM) ||
1597 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
1598 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)))
1133 return -EINVAL; 1599 return -EINVAL;
1134 return dev->ethtool_ops->set_ufo(dev, edata.data); 1600 return dev->ethtool_ops->set_ufo(dev, data);
1135}
1136
1137static int ethtool_get_gso(struct net_device *dev, char __user *useraddr)
1138{
1139 struct ethtool_value edata = { ETHTOOL_GGSO };
1140
1141 edata.data = dev->features & NETIF_F_GSO;
1142 if (copy_to_user(useraddr, &edata, sizeof(edata)))
1143 return -EFAULT;
1144 return 0;
1145}
1146
1147static int ethtool_set_gso(struct net_device *dev, char __user *useraddr)
1148{
1149 struct ethtool_value edata;
1150
1151 if (copy_from_user(&edata, useraddr, sizeof(edata)))
1152 return -EFAULT;
1153 if (edata.data)
1154 dev->features |= NETIF_F_GSO;
1155 else
1156 dev->features &= ~NETIF_F_GSO;
1157 return 0;
1158}
1159
1160static int ethtool_get_gro(struct net_device *dev, char __user *useraddr)
1161{
1162 struct ethtool_value edata = { ETHTOOL_GGRO };
1163
1164 edata.data = dev->features & NETIF_F_GRO;
1165 if (copy_to_user(useraddr, &edata, sizeof(edata)))
1166 return -EFAULT;
1167 return 0;
1168}
1169
1170static int ethtool_set_gro(struct net_device *dev, char __user *useraddr)
1171{
1172 struct ethtool_value edata;
1173
1174 if (copy_from_user(&edata, useraddr, sizeof(edata)))
1175 return -EFAULT;
1176
1177 if (edata.data) {
1178 if (!dev->ethtool_ops->get_rx_csum ||
1179 !dev->ethtool_ops->get_rx_csum(dev))
1180 return -EINVAL;
1181 dev->features |= NETIF_F_GRO;
1182 } else
1183 dev->features &= ~NETIF_F_GRO;
1184
1185 return 0;
1186} 1601}
1187 1602
1188static int ethtool_self_test(struct net_device *dev, char __user *useraddr) 1603static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
@@ -1226,17 +1641,13 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
1226static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) 1641static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
1227{ 1642{
1228 struct ethtool_gstrings gstrings; 1643 struct ethtool_gstrings gstrings;
1229 const struct ethtool_ops *ops = dev->ethtool_ops;
1230 u8 *data; 1644 u8 *data;
1231 int ret; 1645 int ret;
1232 1646
1233 if (!ops->get_strings || !ops->get_sset_count)
1234 return -EOPNOTSUPP;
1235
1236 if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) 1647 if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
1237 return -EFAULT; 1648 return -EFAULT;
1238 1649
1239 ret = ops->get_sset_count(dev, gstrings.string_set); 1650 ret = __ethtool_get_sset_count(dev, gstrings.string_set);
1240 if (ret < 0) 1651 if (ret < 0)
1241 return ret; 1652 return ret;
1242 1653
@@ -1246,7 +1657,7 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
1246 if (!data) 1657 if (!data)
1247 return -ENOMEM; 1658 return -ENOMEM;
1248 1659
1249 ops->get_strings(dev, gstrings.string_set, data); 1660 __ethtool_get_strings(dev, gstrings.string_set, data);
1250 1661
1251 ret = -EFAULT; 1662 ret = -EFAULT;
1252 if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) 1663 if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
@@ -1256,7 +1667,7 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
1256 goto out; 1667 goto out;
1257 ret = 0; 1668 ret = 0;
1258 1669
1259 out: 1670out:
1260 kfree(data); 1671 kfree(data);
1261 return ret; 1672 return ret;
1262} 1673}
@@ -1264,14 +1675,60 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
1264static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) 1675static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
1265{ 1676{
1266 struct ethtool_value id; 1677 struct ethtool_value id;
1678 static bool busy;
1679 int rc;
1267 1680
1268 if (!dev->ethtool_ops->phys_id) 1681 if (!dev->ethtool_ops->set_phys_id)
1269 return -EOPNOTSUPP; 1682 return -EOPNOTSUPP;
1270 1683
1684 if (busy)
1685 return -EBUSY;
1686
1271 if (copy_from_user(&id, useraddr, sizeof(id))) 1687 if (copy_from_user(&id, useraddr, sizeof(id)))
1272 return -EFAULT; 1688 return -EFAULT;
1273 1689
1274 return dev->ethtool_ops->phys_id(dev, id.data); 1690 rc = dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE);
1691 if (rc < 0)
1692 return rc;
1693
1694 /* Drop the RTNL lock while waiting, but prevent reentry or
1695 * removal of the device.
1696 */
1697 busy = true;
1698 dev_hold(dev);
1699 rtnl_unlock();
1700
1701 if (rc == 0) {
1702 /* Driver will handle this itself */
1703 schedule_timeout_interruptible(
1704 id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT);
1705 } else {
1706 /* Driver expects to be called at twice the frequency in rc */
1707 int n = rc * 2, i, interval = HZ / n;
1708
1709 /* Count down seconds */
1710 do {
1711 /* Count down iterations per second */
1712 i = n;
1713 do {
1714 rtnl_lock();
1715 rc = dev->ethtool_ops->set_phys_id(dev,
1716 (i & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON);
1717 rtnl_unlock();
1718 if (rc)
1719 break;
1720 schedule_timeout_interruptible(interval);
1721 } while (!signal_pending(current) && --i != 0);
1722 } while (!signal_pending(current) &&
1723 (id.data == 0 || --id.data != 0));
1724 }
1725
1726 rtnl_lock();
1727 dev_put(dev);
1728 busy = false;
1729
1730 (void)dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE);
1731 return rc;
1275} 1732}
1276 1733
1277static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) 1734static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
@@ -1389,6 +1846,87 @@ static noinline_for_stack int ethtool_flash_device(struct net_device *dev,
1389 return dev->ethtool_ops->flash_device(dev, &efl); 1846 return dev->ethtool_ops->flash_device(dev, &efl);
1390} 1847}
1391 1848
1849static int ethtool_set_dump(struct net_device *dev,
1850 void __user *useraddr)
1851{
1852 struct ethtool_dump dump;
1853
1854 if (!dev->ethtool_ops->set_dump)
1855 return -EOPNOTSUPP;
1856
1857 if (copy_from_user(&dump, useraddr, sizeof(dump)))
1858 return -EFAULT;
1859
1860 return dev->ethtool_ops->set_dump(dev, &dump);
1861}
1862
1863static int ethtool_get_dump_flag(struct net_device *dev,
1864 void __user *useraddr)
1865{
1866 int ret;
1867 struct ethtool_dump dump;
1868 const struct ethtool_ops *ops = dev->ethtool_ops;
1869
1870 if (!dev->ethtool_ops->get_dump_flag)
1871 return -EOPNOTSUPP;
1872
1873 if (copy_from_user(&dump, useraddr, sizeof(dump)))
1874 return -EFAULT;
1875
1876 ret = ops->get_dump_flag(dev, &dump);
1877 if (ret)
1878 return ret;
1879
1880 if (copy_to_user(useraddr, &dump, sizeof(dump)))
1881 return -EFAULT;
1882 return 0;
1883}
1884
1885static int ethtool_get_dump_data(struct net_device *dev,
1886 void __user *useraddr)
1887{
1888 int ret;
1889 __u32 len;
1890 struct ethtool_dump dump, tmp;
1891 const struct ethtool_ops *ops = dev->ethtool_ops;
1892 void *data = NULL;
1893
1894 if (!dev->ethtool_ops->get_dump_data ||
1895 !dev->ethtool_ops->get_dump_flag)
1896 return -EOPNOTSUPP;
1897
1898 if (copy_from_user(&dump, useraddr, sizeof(dump)))
1899 return -EFAULT;
1900
1901 memset(&tmp, 0, sizeof(tmp));
1902 tmp.cmd = ETHTOOL_GET_DUMP_FLAG;
1903 ret = ops->get_dump_flag(dev, &tmp);
1904 if (ret)
1905 return ret;
1906
1907 len = (tmp.len > dump.len) ? dump.len : tmp.len;
1908 if (!len)
1909 return -EFAULT;
1910
1911 data = vzalloc(tmp.len);
1912 if (!data)
1913 return -ENOMEM;
1914 ret = ops->get_dump_data(dev, &dump, data);
1915 if (ret)
1916 goto out;
1917
1918 if (copy_to_user(useraddr, &dump, sizeof(dump))) {
1919 ret = -EFAULT;
1920 goto out;
1921 }
1922 useraddr += offsetof(struct ethtool_dump, data);
1923 if (copy_to_user(useraddr, data, len))
1924 ret = -EFAULT;
1925out:
1926 vfree(data);
1927 return ret;
1928}
1929
1392/* The main entry point in this file. Called from net/core/dev.c */ 1930/* The main entry point in this file. Called from net/core/dev.c */
1393 1931
1394int dev_ethtool(struct net *net, struct ifreq *ifr) 1932int dev_ethtool(struct net *net, struct ifreq *ifr)
@@ -1397,19 +1935,27 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
1397 void __user *useraddr = ifr->ifr_data; 1935 void __user *useraddr = ifr->ifr_data;
1398 u32 ethcmd; 1936 u32 ethcmd;
1399 int rc; 1937 int rc;
1400 unsigned long old_features; 1938 u32 old_features;
1401 1939
1402 if (!dev || !netif_device_present(dev)) 1940 if (!dev || !netif_device_present(dev))
1403 return -ENODEV; 1941 return -ENODEV;
1404 1942
1405 if (!dev->ethtool_ops)
1406 return -EOPNOTSUPP;
1407
1408 if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd))) 1943 if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
1409 return -EFAULT; 1944 return -EFAULT;
1410 1945
1946 if (!dev->ethtool_ops) {
1947 /* ETHTOOL_GDRVINFO does not require any driver support.
1948 * It is also unprivileged and does not change anything,
1949 * so we can take a shortcut to it. */
1950 if (ethcmd == ETHTOOL_GDRVINFO)
1951 return ethtool_get_drvinfo(dev, useraddr);
1952 else
1953 return -EOPNOTSUPP;
1954 }
1955
1411 /* Allow some commands to be done by anyone */ 1956 /* Allow some commands to be done by anyone */
1412 switch (ethcmd) { 1957 switch (ethcmd) {
1958 case ETHTOOL_GSET:
1413 case ETHTOOL_GDRVINFO: 1959 case ETHTOOL_GDRVINFO:
1414 case ETHTOOL_GMSGLVL: 1960 case ETHTOOL_GMSGLVL:
1415 case ETHTOOL_GCOALESCE: 1961 case ETHTOOL_GCOALESCE:
@@ -1431,6 +1977,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
1431 case ETHTOOL_GRXCLSRLCNT: 1977 case ETHTOOL_GRXCLSRLCNT:
1432 case ETHTOOL_GRXCLSRULE: 1978 case ETHTOOL_GRXCLSRULE:
1433 case ETHTOOL_GRXCLSRLALL: 1979 case ETHTOOL_GRXCLSRLALL:
1980 case ETHTOOL_GFEATURES:
1434 break; 1981 break;
1435 default: 1982 default:
1436 if (!capable(CAP_NET_ADMIN)) 1983 if (!capable(CAP_NET_ADMIN))
@@ -1475,8 +2022,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
1475 rc = ethtool_nway_reset(dev); 2022 rc = ethtool_nway_reset(dev);
1476 break; 2023 break;
1477 case ETHTOOL_GLINK: 2024 case ETHTOOL_GLINK:
1478 rc = ethtool_get_value(dev, useraddr, ethcmd, 2025 rc = ethtool_get_link(dev, useraddr);
1479 dev->ethtool_ops->get_link);
1480 break; 2026 break;
1481 case ETHTOOL_GEEPROM: 2027 case ETHTOOL_GEEPROM:
1482 rc = ethtool_get_eeprom(dev, useraddr); 2028 rc = ethtool_get_eeprom(dev, useraddr);
@@ -1502,42 +2048,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
1502 case ETHTOOL_SPAUSEPARAM: 2048 case ETHTOOL_SPAUSEPARAM:
1503 rc = ethtool_set_pauseparam(dev, useraddr); 2049 rc = ethtool_set_pauseparam(dev, useraddr);
1504 break; 2050 break;
1505 case ETHTOOL_GRXCSUM:
1506 rc = ethtool_get_value(dev, useraddr, ethcmd,
1507 (dev->ethtool_ops->get_rx_csum ?
1508 dev->ethtool_ops->get_rx_csum :
1509 ethtool_op_get_rx_csum));
1510 break;
1511 case ETHTOOL_SRXCSUM:
1512 rc = ethtool_set_rx_csum(dev, useraddr);
1513 break;
1514 case ETHTOOL_GTXCSUM:
1515 rc = ethtool_get_value(dev, useraddr, ethcmd,
1516 (dev->ethtool_ops->get_tx_csum ?
1517 dev->ethtool_ops->get_tx_csum :
1518 ethtool_op_get_tx_csum));
1519 break;
1520 case ETHTOOL_STXCSUM:
1521 rc = ethtool_set_tx_csum(dev, useraddr);
1522 break;
1523 case ETHTOOL_GSG:
1524 rc = ethtool_get_value(dev, useraddr, ethcmd,
1525 (dev->ethtool_ops->get_sg ?
1526 dev->ethtool_ops->get_sg :
1527 ethtool_op_get_sg));
1528 break;
1529 case ETHTOOL_SSG:
1530 rc = ethtool_set_sg(dev, useraddr);
1531 break;
1532 case ETHTOOL_GTSO:
1533 rc = ethtool_get_value(dev, useraddr, ethcmd,
1534 (dev->ethtool_ops->get_tso ?
1535 dev->ethtool_ops->get_tso :
1536 ethtool_op_get_tso));
1537 break;
1538 case ETHTOOL_STSO:
1539 rc = ethtool_set_tso(dev, useraddr);
1540 break;
1541 case ETHTOOL_TEST: 2051 case ETHTOOL_TEST:
1542 rc = ethtool_self_test(dev, useraddr); 2052 rc = ethtool_self_test(dev, useraddr);
1543 break; 2053 break;
@@ -1553,21 +2063,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
1553 case ETHTOOL_GPERMADDR: 2063 case ETHTOOL_GPERMADDR:
1554 rc = ethtool_get_perm_addr(dev, useraddr); 2064 rc = ethtool_get_perm_addr(dev, useraddr);
1555 break; 2065 break;
1556 case ETHTOOL_GUFO:
1557 rc = ethtool_get_value(dev, useraddr, ethcmd,
1558 (dev->ethtool_ops->get_ufo ?
1559 dev->ethtool_ops->get_ufo :
1560 ethtool_op_get_ufo));
1561 break;
1562 case ETHTOOL_SUFO:
1563 rc = ethtool_set_ufo(dev, useraddr);
1564 break;
1565 case ETHTOOL_GGSO:
1566 rc = ethtool_get_gso(dev, useraddr);
1567 break;
1568 case ETHTOOL_SGSO:
1569 rc = ethtool_set_gso(dev, useraddr);
1570 break;
1571 case ETHTOOL_GFLAGS: 2066 case ETHTOOL_GFLAGS:
1572 rc = ethtool_get_value(dev, useraddr, ethcmd, 2067 rc = ethtool_get_value(dev, useraddr, ethcmd,
1573 (dev->ethtool_ops->get_flags ? 2068 (dev->ethtool_ops->get_flags ?
@@ -1575,8 +2070,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
1575 ethtool_op_get_flags)); 2070 ethtool_op_get_flags));
1576 break; 2071 break;
1577 case ETHTOOL_SFLAGS: 2072 case ETHTOOL_SFLAGS:
1578 rc = ethtool_set_value(dev, useraddr, 2073 rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags);
1579 dev->ethtool_ops->set_flags);
1580 break; 2074 break;
1581 case ETHTOOL_GPFLAGS: 2075 case ETHTOOL_GPFLAGS:
1582 rc = ethtool_get_value(dev, useraddr, ethcmd, 2076 rc = ethtool_get_value(dev, useraddr, ethcmd,
@@ -1598,12 +2092,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
1598 case ETHTOOL_SRXCLSRLINS: 2092 case ETHTOOL_SRXCLSRLINS:
1599 rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); 2093 rc = ethtool_set_rxnfc(dev, ethcmd, useraddr);
1600 break; 2094 break;
1601 case ETHTOOL_GGRO:
1602 rc = ethtool_get_gro(dev, useraddr);
1603 break;
1604 case ETHTOOL_SGRO:
1605 rc = ethtool_set_gro(dev, useraddr);
1606 break;
1607 case ETHTOOL_FLASHDEV: 2095 case ETHTOOL_FLASHDEV:
1608 rc = ethtool_flash_device(dev, useraddr); 2096 rc = ethtool_flash_device(dev, useraddr);
1609 break; 2097 break;
@@ -1625,6 +2113,45 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
1625 case ETHTOOL_SRXFHINDIR: 2113 case ETHTOOL_SRXFHINDIR:
1626 rc = ethtool_set_rxfh_indir(dev, useraddr); 2114 rc = ethtool_set_rxfh_indir(dev, useraddr);
1627 break; 2115 break;
2116 case ETHTOOL_GFEATURES:
2117 rc = ethtool_get_features(dev, useraddr);
2118 break;
2119 case ETHTOOL_SFEATURES:
2120 rc = ethtool_set_features(dev, useraddr);
2121 break;
2122 case ETHTOOL_GTXCSUM:
2123 case ETHTOOL_GRXCSUM:
2124 case ETHTOOL_GSG:
2125 case ETHTOOL_GTSO:
2126 case ETHTOOL_GUFO:
2127 case ETHTOOL_GGSO:
2128 case ETHTOOL_GGRO:
2129 rc = ethtool_get_one_feature(dev, useraddr, ethcmd);
2130 break;
2131 case ETHTOOL_STXCSUM:
2132 case ETHTOOL_SRXCSUM:
2133 case ETHTOOL_SSG:
2134 case ETHTOOL_STSO:
2135 case ETHTOOL_SUFO:
2136 case ETHTOOL_SGSO:
2137 case ETHTOOL_SGRO:
2138 rc = ethtool_set_one_feature(dev, useraddr, ethcmd);
2139 break;
2140 case ETHTOOL_GCHANNELS:
2141 rc = ethtool_get_channels(dev, useraddr);
2142 break;
2143 case ETHTOOL_SCHANNELS:
2144 rc = ethtool_set_channels(dev, useraddr);
2145 break;
2146 case ETHTOOL_SET_DUMP:
2147 rc = ethtool_set_dump(dev, useraddr);
2148 break;
2149 case ETHTOOL_GET_DUMP_FLAG:
2150 rc = ethtool_get_dump_flag(dev, useraddr);
2151 break;
2152 case ETHTOOL_GET_DUMP_DATA:
2153 rc = ethtool_get_dump_data(dev, useraddr);
2154 break;
1628 default: 2155 default:
1629 rc = -EOPNOTSUPP; 2156 rc = -EOPNOTSUPP;
1630 } 2157 }
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 42e84e08a1be..008dc70b064b 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -144,7 +144,7 @@ fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net)
144} 144}
145EXPORT_SYMBOL_GPL(fib_rules_register); 145EXPORT_SYMBOL_GPL(fib_rules_register);
146 146
147void fib_rules_cleanup_ops(struct fib_rules_ops *ops) 147static void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
148{ 148{
149 struct fib_rule *rule, *tmp; 149 struct fib_rule *rule, *tmp;
150 150
@@ -153,7 +153,6 @@ void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
153 fib_rule_put(rule); 153 fib_rule_put(rule);
154 } 154 }
155} 155}
156EXPORT_SYMBOL_GPL(fib_rules_cleanup_ops);
157 156
158static void fib_rules_put_rcu(struct rcu_head *head) 157static void fib_rules_put_rcu(struct rcu_head *head)
159{ 158{
@@ -182,13 +181,13 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
182{ 181{
183 int ret = 0; 182 int ret = 0;
184 183
185 if (rule->iifindex && (rule->iifindex != fl->iif)) 184 if (rule->iifindex && (rule->iifindex != fl->flowi_iif))
186 goto out; 185 goto out;
187 186
188 if (rule->oifindex && (rule->oifindex != fl->oif)) 187 if (rule->oifindex && (rule->oifindex != fl->flowi_oif))
189 goto out; 188 goto out;
190 189
191 if ((rule->mark ^ fl->mark) & rule->mark_mask) 190 if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
192 goto out; 191 goto out;
193 192
194 ret = ops->match(rule, fl, flags); 193 ret = ops->match(rule, fl, flags);
@@ -225,9 +224,12 @@ jumped:
225 err = ops->action(rule, fl, flags, arg); 224 err = ops->action(rule, fl, flags, arg);
226 225
227 if (err != -EAGAIN) { 226 if (err != -EAGAIN) {
228 fib_rule_get(rule); 227 if ((arg->flags & FIB_LOOKUP_NOREF) ||
229 arg->rule = rule; 228 likely(atomic_inc_not_zero(&rule->refcnt))) {
230 goto out; 229 arg->rule = rule;
230 goto out;
231 }
232 break;
231 } 233 }
232 } 234 }
233 235
@@ -348,12 +350,12 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
348 350
349 list_for_each_entry(r, &ops->rules_list, list) { 351 list_for_each_entry(r, &ops->rules_list, list) {
350 if (r->pref == rule->target) { 352 if (r->pref == rule->target) {
351 rule->ctarget = r; 353 RCU_INIT_POINTER(rule->ctarget, r);
352 break; 354 break;
353 } 355 }
354 } 356 }
355 357
356 if (rule->ctarget == NULL) 358 if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
357 unresolved = 1; 359 unresolved = 1;
358 } else if (rule->action == FR_ACT_GOTO) 360 } else if (rule->action == FR_ACT_GOTO)
359 goto errout_free; 361 goto errout_free;
@@ -370,6 +372,11 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
370 372
371 fib_rule_get(rule); 373 fib_rule_get(rule);
372 374
375 if (last)
376 list_add_rcu(&rule->list, &last->list);
377 else
378 list_add_rcu(&rule->list, &ops->rules_list);
379
373 if (ops->unresolved_rules) { 380 if (ops->unresolved_rules) {
374 /* 381 /*
375 * There are unresolved goto rules in the list, check if 382 * There are unresolved goto rules in the list, check if
@@ -378,7 +385,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
378 list_for_each_entry(r, &ops->rules_list, list) { 385 list_for_each_entry(r, &ops->rules_list, list) {
379 if (r->action == FR_ACT_GOTO && 386 if (r->action == FR_ACT_GOTO &&
380 r->target == rule->pref) { 387 r->target == rule->pref) {
381 BUG_ON(r->ctarget != NULL); 388 BUG_ON(rtnl_dereference(r->ctarget) != NULL);
382 rcu_assign_pointer(r->ctarget, rule); 389 rcu_assign_pointer(r->ctarget, rule);
383 if (--ops->unresolved_rules == 0) 390 if (--ops->unresolved_rules == 0)
384 break; 391 break;
@@ -392,11 +399,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
392 if (unresolved) 399 if (unresolved)
393 ops->unresolved_rules++; 400 ops->unresolved_rules++;
394 401
395 if (last)
396 list_add_rcu(&rule->list, &last->list);
397 else
398 list_add_rcu(&rule->list, &ops->rules_list);
399
400 notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid); 402 notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid);
401 flush_route_cache(ops); 403 flush_route_cache(ops);
402 rules_ops_put(ops); 404 rules_ops_put(ops);
@@ -484,14 +486,13 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
484 */ 486 */
485 if (ops->nr_goto_rules > 0) { 487 if (ops->nr_goto_rules > 0) {
486 list_for_each_entry(tmp, &ops->rules_list, list) { 488 list_for_each_entry(tmp, &ops->rules_list, list) {
487 if (tmp->ctarget == rule) { 489 if (rtnl_dereference(tmp->ctarget) == rule) {
488 rcu_assign_pointer(tmp->ctarget, NULL); 490 rcu_assign_pointer(tmp->ctarget, NULL);
489 ops->unresolved_rules++; 491 ops->unresolved_rules++;
490 } 492 }
491 } 493 }
492 } 494 }
493 495
494 synchronize_rcu();
495 notify_rule_change(RTM_DELRULE, rule, ops, nlh, 496 notify_rule_change(RTM_DELRULE, rule, ops, nlh,
496 NETLINK_CB(skb).pid); 497 NETLINK_CB(skb).pid);
497 fib_rule_put(rule); 498 fib_rule_put(rule);
@@ -543,7 +544,8 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
543 frh->action = rule->action; 544 frh->action = rule->action;
544 frh->flags = rule->flags; 545 frh->flags = rule->flags;
545 546
546 if (rule->action == FR_ACT_GOTO && rule->ctarget == NULL) 547 if (rule->action == FR_ACT_GOTO &&
548 rcu_dereference_raw(rule->ctarget) == NULL)
547 frh->flags |= FIB_RULE_UNRESOLVED; 549 frh->flags |= FIB_RULE_UNRESOLVED;
548 550
549 if (rule->iifname[0]) { 551 if (rule->iifname[0]) {
@@ -588,7 +590,8 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb,
588 int idx = 0; 590 int idx = 0;
589 struct fib_rule *rule; 591 struct fib_rule *rule;
590 592
591 list_for_each_entry(rule, &ops->rules_list, list) { 593 rcu_read_lock();
594 list_for_each_entry_rcu(rule, &ops->rules_list, list) {
592 if (idx < cb->args[1]) 595 if (idx < cb->args[1])
593 goto skip; 596 goto skip;
594 597
@@ -599,6 +602,7 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb,
599skip: 602skip:
600 idx++; 603 idx++;
601 } 604 }
605 rcu_read_unlock();
602 cb->args[1] = idx; 606 cb->args[1] = idx;
603 rules_ops_put(ops); 607 rules_ops_put(ops);
604 608
diff --git a/net/core/filter.c b/net/core/filter.c
index 52b051f82a01..36f975fa87cb 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -37,9 +37,11 @@
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38#include <asm/unaligned.h> 38#include <asm/unaligned.h>
39#include <linux/filter.h> 39#include <linux/filter.h>
40#include <linux/reciprocal_div.h>
41#include <linux/ratelimit.h>
40 42
41/* No hurry in this branch */ 43/* No hurry in this branch */
42static void *__load_pointer(struct sk_buff *skb, int k) 44static void *__load_pointer(const struct sk_buff *skb, int k, unsigned int size)
43{ 45{
44 u8 *ptr = NULL; 46 u8 *ptr = NULL;
45 47
@@ -48,21 +50,17 @@ static void *__load_pointer(struct sk_buff *skb, int k)
48 else if (k >= SKF_LL_OFF) 50 else if (k >= SKF_LL_OFF)
49 ptr = skb_mac_header(skb) + k - SKF_LL_OFF; 51 ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
50 52
51 if (ptr >= skb->head && ptr < skb_tail_pointer(skb)) 53 if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
52 return ptr; 54 return ptr;
53 return NULL; 55 return NULL;
54} 56}
55 57
56static inline void *load_pointer(struct sk_buff *skb, int k, 58static inline void *load_pointer(const struct sk_buff *skb, int k,
57 unsigned int size, void *buffer) 59 unsigned int size, void *buffer)
58{ 60{
59 if (k >= 0) 61 if (k >= 0)
60 return skb_header_pointer(skb, k, size, buffer); 62 return skb_header_pointer(skb, k, size, buffer);
61 else { 63 return __load_pointer(skb, k, size);
62 if (k >= SKF_AD_OFF)
63 return NULL;
64 return __load_pointer(skb, k);
65 }
66} 64}
67 65
68/** 66/**
@@ -86,14 +84,14 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
86 if (err) 84 if (err)
87 return err; 85 return err;
88 86
89 rcu_read_lock_bh(); 87 rcu_read_lock();
90 filter = rcu_dereference_bh(sk->sk_filter); 88 filter = rcu_dereference(sk->sk_filter);
91 if (filter) { 89 if (filter) {
92 unsigned int pkt_len = sk_run_filter(skb, filter->insns, 90 unsigned int pkt_len = SK_RUN_FILTER(filter, skb);
93 filter->len); 91
94 err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; 92 err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
95 } 93 }
96 rcu_read_unlock_bh(); 94 rcu_read_unlock();
97 95
98 return err; 96 return err;
99} 97}
@@ -102,49 +100,53 @@ EXPORT_SYMBOL(sk_filter);
102/** 100/**
103 * sk_run_filter - run a filter on a socket 101 * sk_run_filter - run a filter on a socket
104 * @skb: buffer to run the filter on 102 * @skb: buffer to run the filter on
105 * @filter: filter to apply 103 * @fentry: filter to apply
106 * @flen: length of filter
107 * 104 *
108 * Decode and apply filter instructions to the skb->data. 105 * Decode and apply filter instructions to the skb->data.
109 * Return length to keep, 0 for none. skb is the data we are 106 * Return length to keep, 0 for none. @skb is the data we are
110 * filtering, filter is the array of filter instructions, and 107 * filtering, @filter is the array of filter instructions.
111 * len is the number of filter blocks in the array. 108 * Because all jumps are guaranteed to be before last instruction,
109 * and last instruction guaranteed to be a RET, we dont need to check
110 * flen. (We used to pass to this function the length of filter)
112 */ 111 */
113unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) 112unsigned int sk_run_filter(const struct sk_buff *skb,
113 const struct sock_filter *fentry)
114{ 114{
115 struct sock_filter *fentry; /* We walk down these */
116 void *ptr; 115 void *ptr;
117 u32 A = 0; /* Accumulator */ 116 u32 A = 0; /* Accumulator */
118 u32 X = 0; /* Index Register */ 117 u32 X = 0; /* Index Register */
119 u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ 118 u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */
120 u32 tmp; 119 u32 tmp;
121 int k; 120 int k;
122 int pc;
123 121
124 /* 122 /*
125 * Process array of filter instructions. 123 * Process array of filter instructions.
126 */ 124 */
127 for (pc = 0; pc < flen; pc++) { 125 for (;; fentry++) {
128 fentry = &filter[pc]; 126#if defined(CONFIG_X86_32)
127#define K (fentry->k)
128#else
129 const u32 K = fentry->k;
130#endif
129 131
130 switch (fentry->code) { 132 switch (fentry->code) {
131 case BPF_S_ALU_ADD_X: 133 case BPF_S_ALU_ADD_X:
132 A += X; 134 A += X;
133 continue; 135 continue;
134 case BPF_S_ALU_ADD_K: 136 case BPF_S_ALU_ADD_K:
135 A += fentry->k; 137 A += K;
136 continue; 138 continue;
137 case BPF_S_ALU_SUB_X: 139 case BPF_S_ALU_SUB_X:
138 A -= X; 140 A -= X;
139 continue; 141 continue;
140 case BPF_S_ALU_SUB_K: 142 case BPF_S_ALU_SUB_K:
141 A -= fentry->k; 143 A -= K;
142 continue; 144 continue;
143 case BPF_S_ALU_MUL_X: 145 case BPF_S_ALU_MUL_X:
144 A *= X; 146 A *= X;
145 continue; 147 continue;
146 case BPF_S_ALU_MUL_K: 148 case BPF_S_ALU_MUL_K:
147 A *= fentry->k; 149 A *= K;
148 continue; 150 continue;
149 case BPF_S_ALU_DIV_X: 151 case BPF_S_ALU_DIV_X:
150 if (X == 0) 152 if (X == 0)
@@ -152,89 +154,89 @@ unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int
152 A /= X; 154 A /= X;
153 continue; 155 continue;
154 case BPF_S_ALU_DIV_K: 156 case BPF_S_ALU_DIV_K:
155 A /= fentry->k; 157 A = reciprocal_divide(A, K);
156 continue; 158 continue;
157 case BPF_S_ALU_AND_X: 159 case BPF_S_ALU_AND_X:
158 A &= X; 160 A &= X;
159 continue; 161 continue;
160 case BPF_S_ALU_AND_K: 162 case BPF_S_ALU_AND_K:
161 A &= fentry->k; 163 A &= K;
162 continue; 164 continue;
163 case BPF_S_ALU_OR_X: 165 case BPF_S_ALU_OR_X:
164 A |= X; 166 A |= X;
165 continue; 167 continue;
166 case BPF_S_ALU_OR_K: 168 case BPF_S_ALU_OR_K:
167 A |= fentry->k; 169 A |= K;
168 continue; 170 continue;
169 case BPF_S_ALU_LSH_X: 171 case BPF_S_ALU_LSH_X:
170 A <<= X; 172 A <<= X;
171 continue; 173 continue;
172 case BPF_S_ALU_LSH_K: 174 case BPF_S_ALU_LSH_K:
173 A <<= fentry->k; 175 A <<= K;
174 continue; 176 continue;
175 case BPF_S_ALU_RSH_X: 177 case BPF_S_ALU_RSH_X:
176 A >>= X; 178 A >>= X;
177 continue; 179 continue;
178 case BPF_S_ALU_RSH_K: 180 case BPF_S_ALU_RSH_K:
179 A >>= fentry->k; 181 A >>= K;
180 continue; 182 continue;
181 case BPF_S_ALU_NEG: 183 case BPF_S_ALU_NEG:
182 A = -A; 184 A = -A;
183 continue; 185 continue;
184 case BPF_S_JMP_JA: 186 case BPF_S_JMP_JA:
185 pc += fentry->k; 187 fentry += K;
186 continue; 188 continue;
187 case BPF_S_JMP_JGT_K: 189 case BPF_S_JMP_JGT_K:
188 pc += (A > fentry->k) ? fentry->jt : fentry->jf; 190 fentry += (A > K) ? fentry->jt : fentry->jf;
189 continue; 191 continue;
190 case BPF_S_JMP_JGE_K: 192 case BPF_S_JMP_JGE_K:
191 pc += (A >= fentry->k) ? fentry->jt : fentry->jf; 193 fentry += (A >= K) ? fentry->jt : fentry->jf;
192 continue; 194 continue;
193 case BPF_S_JMP_JEQ_K: 195 case BPF_S_JMP_JEQ_K:
194 pc += (A == fentry->k) ? fentry->jt : fentry->jf; 196 fentry += (A == K) ? fentry->jt : fentry->jf;
195 continue; 197 continue;
196 case BPF_S_JMP_JSET_K: 198 case BPF_S_JMP_JSET_K:
197 pc += (A & fentry->k) ? fentry->jt : fentry->jf; 199 fentry += (A & K) ? fentry->jt : fentry->jf;
198 continue; 200 continue;
199 case BPF_S_JMP_JGT_X: 201 case BPF_S_JMP_JGT_X:
200 pc += (A > X) ? fentry->jt : fentry->jf; 202 fentry += (A > X) ? fentry->jt : fentry->jf;
201 continue; 203 continue;
202 case BPF_S_JMP_JGE_X: 204 case BPF_S_JMP_JGE_X:
203 pc += (A >= X) ? fentry->jt : fentry->jf; 205 fentry += (A >= X) ? fentry->jt : fentry->jf;
204 continue; 206 continue;
205 case BPF_S_JMP_JEQ_X: 207 case BPF_S_JMP_JEQ_X:
206 pc += (A == X) ? fentry->jt : fentry->jf; 208 fentry += (A == X) ? fentry->jt : fentry->jf;
207 continue; 209 continue;
208 case BPF_S_JMP_JSET_X: 210 case BPF_S_JMP_JSET_X:
209 pc += (A & X) ? fentry->jt : fentry->jf; 211 fentry += (A & X) ? fentry->jt : fentry->jf;
210 continue; 212 continue;
211 case BPF_S_LD_W_ABS: 213 case BPF_S_LD_W_ABS:
212 k = fentry->k; 214 k = K;
213load_w: 215load_w:
214 ptr = load_pointer(skb, k, 4, &tmp); 216 ptr = load_pointer(skb, k, 4, &tmp);
215 if (ptr != NULL) { 217 if (ptr != NULL) {
216 A = get_unaligned_be32(ptr); 218 A = get_unaligned_be32(ptr);
217 continue; 219 continue;
218 } 220 }
219 break; 221 return 0;
220 case BPF_S_LD_H_ABS: 222 case BPF_S_LD_H_ABS:
221 k = fentry->k; 223 k = K;
222load_h: 224load_h:
223 ptr = load_pointer(skb, k, 2, &tmp); 225 ptr = load_pointer(skb, k, 2, &tmp);
224 if (ptr != NULL) { 226 if (ptr != NULL) {
225 A = get_unaligned_be16(ptr); 227 A = get_unaligned_be16(ptr);
226 continue; 228 continue;
227 } 229 }
228 break; 230 return 0;
229 case BPF_S_LD_B_ABS: 231 case BPF_S_LD_B_ABS:
230 k = fentry->k; 232 k = K;
231load_b: 233load_b:
232 ptr = load_pointer(skb, k, 1, &tmp); 234 ptr = load_pointer(skb, k, 1, &tmp);
233 if (ptr != NULL) { 235 if (ptr != NULL) {
234 A = *(u8 *)ptr; 236 A = *(u8 *)ptr;
235 continue; 237 continue;
236 } 238 }
237 break; 239 return 0;
238 case BPF_S_LD_W_LEN: 240 case BPF_S_LD_W_LEN:
239 A = skb->len; 241 A = skb->len;
240 continue; 242 continue;
@@ -242,32 +244,32 @@ load_b:
242 X = skb->len; 244 X = skb->len;
243 continue; 245 continue;
244 case BPF_S_LD_W_IND: 246 case BPF_S_LD_W_IND:
245 k = X + fentry->k; 247 k = X + K;
246 goto load_w; 248 goto load_w;
247 case BPF_S_LD_H_IND: 249 case BPF_S_LD_H_IND:
248 k = X + fentry->k; 250 k = X + K;
249 goto load_h; 251 goto load_h;
250 case BPF_S_LD_B_IND: 252 case BPF_S_LD_B_IND:
251 k = X + fentry->k; 253 k = X + K;
252 goto load_b; 254 goto load_b;
253 case BPF_S_LDX_B_MSH: 255 case BPF_S_LDX_B_MSH:
254 ptr = load_pointer(skb, fentry->k, 1, &tmp); 256 ptr = load_pointer(skb, K, 1, &tmp);
255 if (ptr != NULL) { 257 if (ptr != NULL) {
256 X = (*(u8 *)ptr & 0xf) << 2; 258 X = (*(u8 *)ptr & 0xf) << 2;
257 continue; 259 continue;
258 } 260 }
259 return 0; 261 return 0;
260 case BPF_S_LD_IMM: 262 case BPF_S_LD_IMM:
261 A = fentry->k; 263 A = K;
262 continue; 264 continue;
263 case BPF_S_LDX_IMM: 265 case BPF_S_LDX_IMM:
264 X = fentry->k; 266 X = K;
265 continue; 267 continue;
266 case BPF_S_LD_MEM: 268 case BPF_S_LD_MEM:
267 A = mem[fentry->k]; 269 A = mem[K];
268 continue; 270 continue;
269 case BPF_S_LDX_MEM: 271 case BPF_S_LDX_MEM:
270 X = mem[fentry->k]; 272 X = mem[K];
271 continue; 273 continue;
272 case BPF_S_MISC_TAX: 274 case BPF_S_MISC_TAX:
273 X = A; 275 X = A;
@@ -276,48 +278,44 @@ load_b:
276 A = X; 278 A = X;
277 continue; 279 continue;
278 case BPF_S_RET_K: 280 case BPF_S_RET_K:
279 return fentry->k; 281 return K;
280 case BPF_S_RET_A: 282 case BPF_S_RET_A:
281 return A; 283 return A;
282 case BPF_S_ST: 284 case BPF_S_ST:
283 mem[fentry->k] = A; 285 mem[K] = A;
284 continue; 286 continue;
285 case BPF_S_STX: 287 case BPF_S_STX:
286 mem[fentry->k] = X; 288 mem[K] = X;
287 continue; 289 continue;
288 default: 290 case BPF_S_ANC_PROTOCOL:
289 WARN_ON(1);
290 return 0;
291 }
292
293 /*
294 * Handle ancillary data, which are impossible
295 * (or very difficult) to get parsing packet contents.
296 */
297 switch (k-SKF_AD_OFF) {
298 case SKF_AD_PROTOCOL:
299 A = ntohs(skb->protocol); 291 A = ntohs(skb->protocol);
300 continue; 292 continue;
301 case SKF_AD_PKTTYPE: 293 case BPF_S_ANC_PKTTYPE:
302 A = skb->pkt_type; 294 A = skb->pkt_type;
303 continue; 295 continue;
304 case SKF_AD_IFINDEX: 296 case BPF_S_ANC_IFINDEX:
305 if (!skb->dev) 297 if (!skb->dev)
306 return 0; 298 return 0;
307 A = skb->dev->ifindex; 299 A = skb->dev->ifindex;
308 continue; 300 continue;
309 case SKF_AD_MARK: 301 case BPF_S_ANC_MARK:
310 A = skb->mark; 302 A = skb->mark;
311 continue; 303 continue;
312 case SKF_AD_QUEUE: 304 case BPF_S_ANC_QUEUE:
313 A = skb->queue_mapping; 305 A = skb->queue_mapping;
314 continue; 306 continue;
315 case SKF_AD_HATYPE: 307 case BPF_S_ANC_HATYPE:
316 if (!skb->dev) 308 if (!skb->dev)
317 return 0; 309 return 0;
318 A = skb->dev->type; 310 A = skb->dev->type;
319 continue; 311 continue;
320 case SKF_AD_NLATTR: { 312 case BPF_S_ANC_RXHASH:
313 A = skb->rxhash;
314 continue;
315 case BPF_S_ANC_CPU:
316 A = raw_smp_processor_id();
317 continue;
318 case BPF_S_ANC_NLATTR: {
321 struct nlattr *nla; 319 struct nlattr *nla;
322 320
323 if (skb_is_nonlinear(skb)) 321 if (skb_is_nonlinear(skb))
@@ -333,7 +331,7 @@ load_b:
333 A = 0; 331 A = 0;
334 continue; 332 continue;
335 } 333 }
336 case SKF_AD_NLATTR_NEST: { 334 case BPF_S_ANC_NLATTR_NEST: {
337 struct nlattr *nla; 335 struct nlattr *nla;
338 336
339 if (skb_is_nonlinear(skb)) 337 if (skb_is_nonlinear(skb))
@@ -353,6 +351,9 @@ load_b:
353 continue; 351 continue;
354 } 352 }
355 default: 353 default:
354 WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
355 fentry->code, fentry->jt,
356 fentry->jf, fentry->k);
356 return 0; 357 return 0;
357 } 358 }
358 } 359 }
@@ -361,6 +362,66 @@ load_b:
361} 362}
362EXPORT_SYMBOL(sk_run_filter); 363EXPORT_SYMBOL(sk_run_filter);
363 364
365/*
366 * Security :
367 * A BPF program is able to use 16 cells of memory to store intermediate
368 * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter())
369 * As we dont want to clear mem[] array for each packet going through
370 * sk_run_filter(), we check that filter loaded by user never try to read
371 * a cell if not previously written, and we check all branches to be sure
372 * a malicious user doesn't try to abuse us.
373 */
374static int check_load_and_stores(struct sock_filter *filter, int flen)
375{
376 u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */
377 int pc, ret = 0;
378
379 BUILD_BUG_ON(BPF_MEMWORDS > 16);
380 masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL);
381 if (!masks)
382 return -ENOMEM;
383 memset(masks, 0xff, flen * sizeof(*masks));
384
385 for (pc = 0; pc < flen; pc++) {
386 memvalid &= masks[pc];
387
388 switch (filter[pc].code) {
389 case BPF_S_ST:
390 case BPF_S_STX:
391 memvalid |= (1 << filter[pc].k);
392 break;
393 case BPF_S_LD_MEM:
394 case BPF_S_LDX_MEM:
395 if (!(memvalid & (1 << filter[pc].k))) {
396 ret = -EINVAL;
397 goto error;
398 }
399 break;
400 case BPF_S_JMP_JA:
401 /* a jump must set masks on target */
402 masks[pc + 1 + filter[pc].k] &= memvalid;
403 memvalid = ~0;
404 break;
405 case BPF_S_JMP_JEQ_K:
406 case BPF_S_JMP_JEQ_X:
407 case BPF_S_JMP_JGE_K:
408 case BPF_S_JMP_JGE_X:
409 case BPF_S_JMP_JGT_K:
410 case BPF_S_JMP_JGT_X:
411 case BPF_S_JMP_JSET_X:
412 case BPF_S_JMP_JSET_K:
413 /* a jump must set masks on targets */
414 masks[pc + 1 + filter[pc].jt] &= memvalid;
415 masks[pc + 1 + filter[pc].jf] &= memvalid;
416 memvalid = ~0;
417 break;
418 }
419 }
420error:
421 kfree(masks);
422 return ret;
423}
424
364/** 425/**
365 * sk_chk_filter - verify socket filter code 426 * sk_chk_filter - verify socket filter code
366 * @filter: filter to verify 427 * @filter: filter to verify
@@ -377,7 +438,57 @@ EXPORT_SYMBOL(sk_run_filter);
377 */ 438 */
378int sk_chk_filter(struct sock_filter *filter, int flen) 439int sk_chk_filter(struct sock_filter *filter, int flen)
379{ 440{
380 struct sock_filter *ftest; 441 /*
442 * Valid instructions are initialized to non-0.
443 * Invalid instructions are initialized to 0.
444 */
445 static const u8 codes[] = {
446 [BPF_ALU|BPF_ADD|BPF_K] = BPF_S_ALU_ADD_K,
447 [BPF_ALU|BPF_ADD|BPF_X] = BPF_S_ALU_ADD_X,
448 [BPF_ALU|BPF_SUB|BPF_K] = BPF_S_ALU_SUB_K,
449 [BPF_ALU|BPF_SUB|BPF_X] = BPF_S_ALU_SUB_X,
450 [BPF_ALU|BPF_MUL|BPF_K] = BPF_S_ALU_MUL_K,
451 [BPF_ALU|BPF_MUL|BPF_X] = BPF_S_ALU_MUL_X,
452 [BPF_ALU|BPF_DIV|BPF_X] = BPF_S_ALU_DIV_X,
453 [BPF_ALU|BPF_AND|BPF_K] = BPF_S_ALU_AND_K,
454 [BPF_ALU|BPF_AND|BPF_X] = BPF_S_ALU_AND_X,
455 [BPF_ALU|BPF_OR|BPF_K] = BPF_S_ALU_OR_K,
456 [BPF_ALU|BPF_OR|BPF_X] = BPF_S_ALU_OR_X,
457 [BPF_ALU|BPF_LSH|BPF_K] = BPF_S_ALU_LSH_K,
458 [BPF_ALU|BPF_LSH|BPF_X] = BPF_S_ALU_LSH_X,
459 [BPF_ALU|BPF_RSH|BPF_K] = BPF_S_ALU_RSH_K,
460 [BPF_ALU|BPF_RSH|BPF_X] = BPF_S_ALU_RSH_X,
461 [BPF_ALU|BPF_NEG] = BPF_S_ALU_NEG,
462 [BPF_LD|BPF_W|BPF_ABS] = BPF_S_LD_W_ABS,
463 [BPF_LD|BPF_H|BPF_ABS] = BPF_S_LD_H_ABS,
464 [BPF_LD|BPF_B|BPF_ABS] = BPF_S_LD_B_ABS,
465 [BPF_LD|BPF_W|BPF_LEN] = BPF_S_LD_W_LEN,
466 [BPF_LD|BPF_W|BPF_IND] = BPF_S_LD_W_IND,
467 [BPF_LD|BPF_H|BPF_IND] = BPF_S_LD_H_IND,
468 [BPF_LD|BPF_B|BPF_IND] = BPF_S_LD_B_IND,
469 [BPF_LD|BPF_IMM] = BPF_S_LD_IMM,
470 [BPF_LDX|BPF_W|BPF_LEN] = BPF_S_LDX_W_LEN,
471 [BPF_LDX|BPF_B|BPF_MSH] = BPF_S_LDX_B_MSH,
472 [BPF_LDX|BPF_IMM] = BPF_S_LDX_IMM,
473 [BPF_MISC|BPF_TAX] = BPF_S_MISC_TAX,
474 [BPF_MISC|BPF_TXA] = BPF_S_MISC_TXA,
475 [BPF_RET|BPF_K] = BPF_S_RET_K,
476 [BPF_RET|BPF_A] = BPF_S_RET_A,
477 [BPF_ALU|BPF_DIV|BPF_K] = BPF_S_ALU_DIV_K,
478 [BPF_LD|BPF_MEM] = BPF_S_LD_MEM,
479 [BPF_LDX|BPF_MEM] = BPF_S_LDX_MEM,
480 [BPF_ST] = BPF_S_ST,
481 [BPF_STX] = BPF_S_STX,
482 [BPF_JMP|BPF_JA] = BPF_S_JMP_JA,
483 [BPF_JMP|BPF_JEQ|BPF_K] = BPF_S_JMP_JEQ_K,
484 [BPF_JMP|BPF_JEQ|BPF_X] = BPF_S_JMP_JEQ_X,
485 [BPF_JMP|BPF_JGE|BPF_K] = BPF_S_JMP_JGE_K,
486 [BPF_JMP|BPF_JGE|BPF_X] = BPF_S_JMP_JGE_X,
487 [BPF_JMP|BPF_JGT|BPF_K] = BPF_S_JMP_JGT_K,
488 [BPF_JMP|BPF_JGT|BPF_X] = BPF_S_JMP_JGT_X,
489 [BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K,
490 [BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X,
491 };
381 int pc; 492 int pc;
382 493
383 if (flen == 0 || flen > BPF_MAXINSNS) 494 if (flen == 0 || flen > BPF_MAXINSNS)
@@ -385,136 +496,31 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
385 496
386 /* check the filter code now */ 497 /* check the filter code now */
387 for (pc = 0; pc < flen; pc++) { 498 for (pc = 0; pc < flen; pc++) {
388 ftest = &filter[pc]; 499 struct sock_filter *ftest = &filter[pc];
389 500 u16 code = ftest->code;
390 /* Only allow valid instructions */
391 switch (ftest->code) {
392 case BPF_ALU|BPF_ADD|BPF_K:
393 ftest->code = BPF_S_ALU_ADD_K;
394 break;
395 case BPF_ALU|BPF_ADD|BPF_X:
396 ftest->code = BPF_S_ALU_ADD_X;
397 break;
398 case BPF_ALU|BPF_SUB|BPF_K:
399 ftest->code = BPF_S_ALU_SUB_K;
400 break;
401 case BPF_ALU|BPF_SUB|BPF_X:
402 ftest->code = BPF_S_ALU_SUB_X;
403 break;
404 case BPF_ALU|BPF_MUL|BPF_K:
405 ftest->code = BPF_S_ALU_MUL_K;
406 break;
407 case BPF_ALU|BPF_MUL|BPF_X:
408 ftest->code = BPF_S_ALU_MUL_X;
409 break;
410 case BPF_ALU|BPF_DIV|BPF_X:
411 ftest->code = BPF_S_ALU_DIV_X;
412 break;
413 case BPF_ALU|BPF_AND|BPF_K:
414 ftest->code = BPF_S_ALU_AND_K;
415 break;
416 case BPF_ALU|BPF_AND|BPF_X:
417 ftest->code = BPF_S_ALU_AND_X;
418 break;
419 case BPF_ALU|BPF_OR|BPF_K:
420 ftest->code = BPF_S_ALU_OR_K;
421 break;
422 case BPF_ALU|BPF_OR|BPF_X:
423 ftest->code = BPF_S_ALU_OR_X;
424 break;
425 case BPF_ALU|BPF_LSH|BPF_K:
426 ftest->code = BPF_S_ALU_LSH_K;
427 break;
428 case BPF_ALU|BPF_LSH|BPF_X:
429 ftest->code = BPF_S_ALU_LSH_X;
430 break;
431 case BPF_ALU|BPF_RSH|BPF_K:
432 ftest->code = BPF_S_ALU_RSH_K;
433 break;
434 case BPF_ALU|BPF_RSH|BPF_X:
435 ftest->code = BPF_S_ALU_RSH_X;
436 break;
437 case BPF_ALU|BPF_NEG:
438 ftest->code = BPF_S_ALU_NEG;
439 break;
440 case BPF_LD|BPF_W|BPF_ABS:
441 ftest->code = BPF_S_LD_W_ABS;
442 break;
443 case BPF_LD|BPF_H|BPF_ABS:
444 ftest->code = BPF_S_LD_H_ABS;
445 break;
446 case BPF_LD|BPF_B|BPF_ABS:
447 ftest->code = BPF_S_LD_B_ABS;
448 break;
449 case BPF_LD|BPF_W|BPF_LEN:
450 ftest->code = BPF_S_LD_W_LEN;
451 break;
452 case BPF_LD|BPF_W|BPF_IND:
453 ftest->code = BPF_S_LD_W_IND;
454 break;
455 case BPF_LD|BPF_H|BPF_IND:
456 ftest->code = BPF_S_LD_H_IND;
457 break;
458 case BPF_LD|BPF_B|BPF_IND:
459 ftest->code = BPF_S_LD_B_IND;
460 break;
461 case BPF_LD|BPF_IMM:
462 ftest->code = BPF_S_LD_IMM;
463 break;
464 case BPF_LDX|BPF_W|BPF_LEN:
465 ftest->code = BPF_S_LDX_W_LEN;
466 break;
467 case BPF_LDX|BPF_B|BPF_MSH:
468 ftest->code = BPF_S_LDX_B_MSH;
469 break;
470 case BPF_LDX|BPF_IMM:
471 ftest->code = BPF_S_LDX_IMM;
472 break;
473 case BPF_MISC|BPF_TAX:
474 ftest->code = BPF_S_MISC_TAX;
475 break;
476 case BPF_MISC|BPF_TXA:
477 ftest->code = BPF_S_MISC_TXA;
478 break;
479 case BPF_RET|BPF_K:
480 ftest->code = BPF_S_RET_K;
481 break;
482 case BPF_RET|BPF_A:
483 ftest->code = BPF_S_RET_A;
484 break;
485 501
502 if (code >= ARRAY_SIZE(codes))
503 return -EINVAL;
504 code = codes[code];
505 if (!code)
506 return -EINVAL;
486 /* Some instructions need special checks */ 507 /* Some instructions need special checks */
487 508 switch (code) {
509 case BPF_S_ALU_DIV_K:
488 /* check for division by zero */ 510 /* check for division by zero */
489 case BPF_ALU|BPF_DIV|BPF_K:
490 if (ftest->k == 0) 511 if (ftest->k == 0)
491 return -EINVAL; 512 return -EINVAL;
492 ftest->code = BPF_S_ALU_DIV_K; 513 ftest->k = reciprocal_value(ftest->k);
493 break;
494
495 /* check for invalid memory addresses */
496 case BPF_LD|BPF_MEM:
497 if (ftest->k >= BPF_MEMWORDS)
498 return -EINVAL;
499 ftest->code = BPF_S_LD_MEM;
500 break;
501 case BPF_LDX|BPF_MEM:
502 if (ftest->k >= BPF_MEMWORDS)
503 return -EINVAL;
504 ftest->code = BPF_S_LDX_MEM;
505 break;
506 case BPF_ST:
507 if (ftest->k >= BPF_MEMWORDS)
508 return -EINVAL;
509 ftest->code = BPF_S_ST;
510 break; 514 break;
511 case BPF_STX: 515 case BPF_S_LD_MEM:
516 case BPF_S_LDX_MEM:
517 case BPF_S_ST:
518 case BPF_S_STX:
519 /* check for invalid memory addresses */
512 if (ftest->k >= BPF_MEMWORDS) 520 if (ftest->k >= BPF_MEMWORDS)
513 return -EINVAL; 521 return -EINVAL;
514 ftest->code = BPF_S_STX;
515 break; 522 break;
516 523 case BPF_S_JMP_JA:
517 case BPF_JMP|BPF_JA:
518 /* 524 /*
519 * Note, the large ftest->k might cause loops. 525 * Note, the large ftest->k might cause loops.
520 * Compare this with conditional jumps below, 526 * Compare this with conditional jumps below,
@@ -522,40 +528,7 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
522 */ 528 */
523 if (ftest->k >= (unsigned)(flen-pc-1)) 529 if (ftest->k >= (unsigned)(flen-pc-1))
524 return -EINVAL; 530 return -EINVAL;
525 ftest->code = BPF_S_JMP_JA;
526 break;
527
528 case BPF_JMP|BPF_JEQ|BPF_K:
529 ftest->code = BPF_S_JMP_JEQ_K;
530 break;
531 case BPF_JMP|BPF_JEQ|BPF_X:
532 ftest->code = BPF_S_JMP_JEQ_X;
533 break;
534 case BPF_JMP|BPF_JGE|BPF_K:
535 ftest->code = BPF_S_JMP_JGE_K;
536 break;
537 case BPF_JMP|BPF_JGE|BPF_X:
538 ftest->code = BPF_S_JMP_JGE_X;
539 break;
540 case BPF_JMP|BPF_JGT|BPF_K:
541 ftest->code = BPF_S_JMP_JGT_K;
542 break; 531 break;
543 case BPF_JMP|BPF_JGT|BPF_X:
544 ftest->code = BPF_S_JMP_JGT_X;
545 break;
546 case BPF_JMP|BPF_JSET|BPF_K:
547 ftest->code = BPF_S_JMP_JSET_K;
548 break;
549 case BPF_JMP|BPF_JSET|BPF_X:
550 ftest->code = BPF_S_JMP_JSET_X;
551 break;
552
553 default:
554 return -EINVAL;
555 }
556
557 /* for conditionals both must be safe */
558 switch (ftest->code) {
559 case BPF_S_JMP_JEQ_K: 532 case BPF_S_JMP_JEQ_K:
560 case BPF_S_JMP_JEQ_X: 533 case BPF_S_JMP_JEQ_X:
561 case BPF_S_JMP_JGE_K: 534 case BPF_S_JMP_JGE_K:
@@ -564,42 +537,55 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
564 case BPF_S_JMP_JGT_X: 537 case BPF_S_JMP_JGT_X:
565 case BPF_S_JMP_JSET_X: 538 case BPF_S_JMP_JSET_X:
566 case BPF_S_JMP_JSET_K: 539 case BPF_S_JMP_JSET_K:
540 /* for conditionals both must be safe */
567 if (pc + ftest->jt + 1 >= flen || 541 if (pc + ftest->jt + 1 >= flen ||
568 pc + ftest->jf + 1 >= flen) 542 pc + ftest->jf + 1 >= flen)
569 return -EINVAL; 543 return -EINVAL;
544 break;
545 case BPF_S_LD_W_ABS:
546 case BPF_S_LD_H_ABS:
547 case BPF_S_LD_B_ABS:
548#define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \
549 code = BPF_S_ANC_##CODE; \
550 break
551 switch (ftest->k) {
552 ANCILLARY(PROTOCOL);
553 ANCILLARY(PKTTYPE);
554 ANCILLARY(IFINDEX);
555 ANCILLARY(NLATTR);
556 ANCILLARY(NLATTR_NEST);
557 ANCILLARY(MARK);
558 ANCILLARY(QUEUE);
559 ANCILLARY(HATYPE);
560 ANCILLARY(RXHASH);
561 ANCILLARY(CPU);
562 }
570 } 563 }
564 ftest->code = code;
571 } 565 }
572 566
573 /* last instruction must be a RET code */ 567 /* last instruction must be a RET code */
574 switch (filter[flen - 1].code) { 568 switch (filter[flen - 1].code) {
575 case BPF_S_RET_K: 569 case BPF_S_RET_K:
576 case BPF_S_RET_A: 570 case BPF_S_RET_A:
577 return 0; 571 return check_load_and_stores(filter, flen);
578 break; 572 }
579 default: 573 return -EINVAL;
580 return -EINVAL;
581 }
582} 574}
583EXPORT_SYMBOL(sk_chk_filter); 575EXPORT_SYMBOL(sk_chk_filter);
584 576
585/** 577/**
586 * sk_filter_rcu_release: Release a socket filter by rcu_head 578 * sk_filter_release_rcu - Release a socket filter by rcu_head
587 * @rcu: rcu_head that contains the sk_filter to free 579 * @rcu: rcu_head that contains the sk_filter to free
588 */ 580 */
589static void sk_filter_rcu_release(struct rcu_head *rcu) 581void sk_filter_release_rcu(struct rcu_head *rcu)
590{ 582{
591 struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); 583 struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
592 584
593 sk_filter_release(fp); 585 bpf_jit_free(fp);
594} 586 kfree(fp);
595
596static void sk_filter_delayed_uncharge(struct sock *sk, struct sk_filter *fp)
597{
598 unsigned int size = sk_filter_len(fp);
599
600 atomic_sub(size, &sk->sk_omem_alloc);
601 call_rcu_bh(&fp->rcu, sk_filter_rcu_release);
602} 587}
588EXPORT_SYMBOL(sk_filter_release_rcu);
603 589
604/** 590/**
605 * sk_attach_filter - attach a socket filter 591 * sk_attach_filter - attach a socket filter
@@ -631,6 +617,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
631 617
632 atomic_set(&fp->refcnt, 1); 618 atomic_set(&fp->refcnt, 1);
633 fp->len = fprog->len; 619 fp->len = fprog->len;
620 fp->bpf_func = sk_run_filter;
634 621
635 err = sk_chk_filter(fp->insns, fp->len); 622 err = sk_chk_filter(fp->insns, fp->len);
636 if (err) { 623 if (err) {
@@ -638,13 +625,14 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
638 return err; 625 return err;
639 } 626 }
640 627
641 rcu_read_lock_bh(); 628 bpf_jit_compile(fp);
642 old_fp = rcu_dereference_bh(sk->sk_filter); 629
630 old_fp = rcu_dereference_protected(sk->sk_filter,
631 sock_owned_by_user(sk));
643 rcu_assign_pointer(sk->sk_filter, fp); 632 rcu_assign_pointer(sk->sk_filter, fp);
644 rcu_read_unlock_bh();
645 633
646 if (old_fp) 634 if (old_fp)
647 sk_filter_delayed_uncharge(sk, old_fp); 635 sk_filter_uncharge(sk, old_fp);
648 return 0; 636 return 0;
649} 637}
650EXPORT_SYMBOL_GPL(sk_attach_filter); 638EXPORT_SYMBOL_GPL(sk_attach_filter);
@@ -654,14 +642,13 @@ int sk_detach_filter(struct sock *sk)
654 int ret = -ENOENT; 642 int ret = -ENOENT;
655 struct sk_filter *filter; 643 struct sk_filter *filter;
656 644
657 rcu_read_lock_bh(); 645 filter = rcu_dereference_protected(sk->sk_filter,
658 filter = rcu_dereference_bh(sk->sk_filter); 646 sock_owned_by_user(sk));
659 if (filter) { 647 if (filter) {
660 rcu_assign_pointer(sk->sk_filter, NULL); 648 rcu_assign_pointer(sk->sk_filter, NULL);
661 sk_filter_delayed_uncharge(sk, filter); 649 sk_filter_uncharge(sk, filter);
662 ret = 0; 650 ret = 0;
663 } 651 }
664 rcu_read_unlock_bh();
665 return ret; 652 return ret;
666} 653}
667EXPORT_SYMBOL_GPL(sk_detach_filter); 654EXPORT_SYMBOL_GPL(sk_detach_filter);
diff --git a/net/core/flow.c b/net/core/flow.c
index f67dcbfe54ef..990703b8863b 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -53,8 +53,7 @@ struct flow_flush_info {
53 53
54struct flow_cache { 54struct flow_cache {
55 u32 hash_shift; 55 u32 hash_shift;
56 unsigned long order; 56 struct flow_cache_percpu __percpu *percpu;
57 struct flow_cache_percpu *percpu;
58 struct notifier_block hotcpu_notifier; 57 struct notifier_block hotcpu_notifier;
59 int low_watermark; 58 int low_watermark;
60 int high_watermark; 59 int high_watermark;
@@ -64,7 +63,7 @@ struct flow_cache {
64atomic_t flow_cache_genid = ATOMIC_INIT(0); 63atomic_t flow_cache_genid = ATOMIC_INIT(0);
65EXPORT_SYMBOL(flow_cache_genid); 64EXPORT_SYMBOL(flow_cache_genid);
66static struct flow_cache flow_cache_global; 65static struct flow_cache flow_cache_global;
67static struct kmem_cache *flow_cachep; 66static struct kmem_cache *flow_cachep __read_mostly;
68 67
69static DEFINE_SPINLOCK(flow_cache_gc_lock); 68static DEFINE_SPINLOCK(flow_cache_gc_lock);
70static LIST_HEAD(flow_cache_gc_list); 69static LIST_HEAD(flow_cache_gc_list);
@@ -173,35 +172,31 @@ static void flow_new_hash_rnd(struct flow_cache *fc,
173 172
174static u32 flow_hash_code(struct flow_cache *fc, 173static u32 flow_hash_code(struct flow_cache *fc,
175 struct flow_cache_percpu *fcp, 174 struct flow_cache_percpu *fcp,
176 struct flowi *key) 175 const struct flowi *key)
177{ 176{
178 u32 *k = (u32 *) key; 177 const u32 *k = (const u32 *) key;
179 178
180 return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd) 179 return jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd)
181 & (flow_cache_hash_size(fc) - 1)); 180 & (flow_cache_hash_size(fc) - 1);
182} 181}
183 182
184#if (BITS_PER_LONG == 64) 183typedef unsigned long flow_compare_t;
185typedef u64 flow_compare_t;
186#else
187typedef u32 flow_compare_t;
188#endif
189 184
190/* I hear what you're saying, use memcmp. But memcmp cannot make 185/* I hear what you're saying, use memcmp. But memcmp cannot make
191 * important assumptions that we can here, such as alignment and 186 * important assumptions that we can here, such as alignment and
192 * constant size. 187 * constant size.
193 */ 188 */
194static int flow_key_compare(struct flowi *key1, struct flowi *key2) 189static int flow_key_compare(const struct flowi *key1, const struct flowi *key2)
195{ 190{
196 flow_compare_t *k1, *k1_lim, *k2; 191 const flow_compare_t *k1, *k1_lim, *k2;
197 const int n_elem = sizeof(struct flowi) / sizeof(flow_compare_t); 192 const int n_elem = sizeof(struct flowi) / sizeof(flow_compare_t);
198 193
199 BUILD_BUG_ON(sizeof(struct flowi) % sizeof(flow_compare_t)); 194 BUILD_BUG_ON(sizeof(struct flowi) % sizeof(flow_compare_t));
200 195
201 k1 = (flow_compare_t *) key1; 196 k1 = (const flow_compare_t *) key1;
202 k1_lim = k1 + n_elem; 197 k1_lim = k1 + n_elem;
203 198
204 k2 = (flow_compare_t *) key2; 199 k2 = (const flow_compare_t *) key2;
205 200
206 do { 201 do {
207 if (*k1++ != *k2++) 202 if (*k1++ != *k2++)
@@ -212,7 +207,7 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
212} 207}
213 208
214struct flow_cache_object * 209struct flow_cache_object *
215flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, 210flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
216 flow_resolve_t resolver, void *ctx) 211 flow_resolve_t resolver, void *ctx)
217{ 212{
218 struct flow_cache *fc = &flow_cache_global; 213 struct flow_cache *fc = &flow_cache_global;
@@ -357,62 +352,73 @@ void flow_cache_flush(void)
357 put_online_cpus(); 352 put_online_cpus();
358} 353}
359 354
360static void __init flow_cache_cpu_prepare(struct flow_cache *fc, 355static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
361 struct flow_cache_percpu *fcp)
362{ 356{
363 fcp->hash_table = (struct hlist_head *) 357 struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
364 __get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order); 358 size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc);
365 if (!fcp->hash_table)
366 panic("NET: failed to allocate flow cache order %lu\n", fc->order);
367 359
368 fcp->hash_rnd_recalc = 1; 360 if (!fcp->hash_table) {
369 fcp->hash_count = 0; 361 fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
370 tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0); 362 if (!fcp->hash_table) {
363 pr_err("NET: failed to allocate flow cache sz %zu\n", sz);
364 return -ENOMEM;
365 }
366 fcp->hash_rnd_recalc = 1;
367 fcp->hash_count = 0;
368 tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
369 }
370 return 0;
371} 371}
372 372
373static int flow_cache_cpu(struct notifier_block *nfb, 373static int __cpuinit flow_cache_cpu(struct notifier_block *nfb,
374 unsigned long action, 374 unsigned long action,
375 void *hcpu) 375 void *hcpu)
376{ 376{
377 struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier); 377 struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier);
378 int cpu = (unsigned long) hcpu; 378 int res, cpu = (unsigned long) hcpu;
379 struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); 379 struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
380 380
381 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) 381 switch (action) {
382 case CPU_UP_PREPARE:
383 case CPU_UP_PREPARE_FROZEN:
384 res = flow_cache_cpu_prepare(fc, cpu);
385 if (res)
386 return notifier_from_errno(res);
387 break;
388 case CPU_DEAD:
389 case CPU_DEAD_FROZEN:
382 __flow_cache_shrink(fc, fcp, 0); 390 __flow_cache_shrink(fc, fcp, 0);
391 break;
392 }
383 return NOTIFY_OK; 393 return NOTIFY_OK;
384} 394}
385 395
386static int flow_cache_init(struct flow_cache *fc) 396static int __init flow_cache_init(struct flow_cache *fc)
387{ 397{
388 unsigned long order;
389 int i; 398 int i;
390 399
391 fc->hash_shift = 10; 400 fc->hash_shift = 10;
392 fc->low_watermark = 2 * flow_cache_hash_size(fc); 401 fc->low_watermark = 2 * flow_cache_hash_size(fc);
393 fc->high_watermark = 4 * flow_cache_hash_size(fc); 402 fc->high_watermark = 4 * flow_cache_hash_size(fc);
394 403
395 for (order = 0;
396 (PAGE_SIZE << order) <
397 (sizeof(struct hlist_head)*flow_cache_hash_size(fc));
398 order++)
399 /* NOTHING */;
400 fc->order = order;
401 fc->percpu = alloc_percpu(struct flow_cache_percpu); 404 fc->percpu = alloc_percpu(struct flow_cache_percpu);
405 if (!fc->percpu)
406 return -ENOMEM;
402 407
403 setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, 408 for_each_online_cpu(i) {
404 (unsigned long) fc); 409 if (flow_cache_cpu_prepare(fc, i))
405 fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; 410 return -ENOMEM;
406 add_timer(&fc->rnd_timer); 411 }
407
408 for_each_possible_cpu(i)
409 flow_cache_cpu_prepare(fc, per_cpu_ptr(fc->percpu, i));
410
411 fc->hotcpu_notifier = (struct notifier_block){ 412 fc->hotcpu_notifier = (struct notifier_block){
412 .notifier_call = flow_cache_cpu, 413 .notifier_call = flow_cache_cpu,
413 }; 414 };
414 register_hotcpu_notifier(&fc->hotcpu_notifier); 415 register_hotcpu_notifier(&fc->hotcpu_notifier);
415 416
417 setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
418 (unsigned long) fc);
419 fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
420 add_timer(&fc->rnd_timer);
421
416 return 0; 422 return 0;
417} 423}
418 424
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 6743146e4d6b..43b03dd71e85 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -249,13 +249,6 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
249} 249}
250EXPORT_SYMBOL(gen_new_estimator); 250EXPORT_SYMBOL(gen_new_estimator);
251 251
252static void __gen_kill_estimator(struct rcu_head *head)
253{
254 struct gen_estimator *e = container_of(head,
255 struct gen_estimator, e_rcu);
256 kfree(e);
257}
258
259/** 252/**
260 * gen_kill_estimator - remove a rate estimator 253 * gen_kill_estimator - remove a rate estimator
261 * @bstats: basic statistics 254 * @bstats: basic statistics
@@ -274,12 +267,12 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
274 while ((e = gen_find_node(bstats, rate_est))) { 267 while ((e = gen_find_node(bstats, rate_est))) {
275 rb_erase(&e->node, &est_root); 268 rb_erase(&e->node, &est_root);
276 269
277 write_lock_bh(&est_lock); 270 write_lock(&est_lock);
278 e->bstats = NULL; 271 e->bstats = NULL;
279 write_unlock_bh(&est_lock); 272 write_unlock(&est_lock);
280 273
281 list_del_rcu(&e->list); 274 list_del_rcu(&e->list);
282 call_rcu(&e->e_rcu, __gen_kill_estimator); 275 kfree_rcu(e, e_rcu);
283 } 276 }
284 spin_unlock_bh(&est_tree_lock); 277 spin_unlock_bh(&est_tree_lock);
285} 278}
diff --git a/net/core/iovec.c b/net/core/iovec.c
index e6b133b77ccb..c40f27e7d208 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -35,14 +35,15 @@
35 * in any case. 35 * in any case.
36 */ 36 */
37 37
38long verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode) 38int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode)
39{ 39{
40 int size, ct; 40 int size, ct, err;
41 long err;
42 41
43 if (m->msg_namelen) { 42 if (m->msg_namelen) {
44 if (mode == VERIFY_READ) { 43 if (mode == VERIFY_READ) {
45 err = move_addr_to_kernel(m->msg_name, m->msg_namelen, 44 void __user *namep;
45 namep = (void __user __force *) m->msg_name;
46 err = move_addr_to_kernel(namep, m->msg_namelen,
46 address); 47 address);
47 if (err < 0) 48 if (err < 0)
48 return err; 49 return err;
@@ -53,21 +54,20 @@ long verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address,
53 } 54 }
54 55
55 size = m->msg_iovlen * sizeof(struct iovec); 56 size = m->msg_iovlen * sizeof(struct iovec);
56 if (copy_from_user(iov, m->msg_iov, size)) 57 if (copy_from_user(iov, (void __user __force *) m->msg_iov, size))
57 return -EFAULT; 58 return -EFAULT;
58 59
59 m->msg_iov = iov; 60 m->msg_iov = iov;
60 err = 0; 61 err = 0;
61 62
62 for (ct = 0; ct < m->msg_iovlen; ct++) { 63 for (ct = 0; ct < m->msg_iovlen; ct++) {
63 err += iov[ct].iov_len; 64 size_t len = iov[ct].iov_len;
64 /* 65
65 * Goal is not to verify user data, but to prevent returning 66 if (len > INT_MAX - err) {
66 * negative value, which is interpreted as errno. 67 len = INT_MAX - err;
67 * Overflow is still possible, but it is harmless. 68 iov[ct].iov_len = len;
68 */ 69 }
69 if (err < 0) 70 err += len;
70 return -EMSGSIZE;
71 } 71 }
72 72
73 return err; 73 return err;
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 01a1101b5936..a7b342131869 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -129,7 +129,7 @@ static void linkwatch_schedule_work(int urgent)
129 if (!cancel_delayed_work(&linkwatch_work)) 129 if (!cancel_delayed_work(&linkwatch_work))
130 return; 130 return;
131 131
132 /* Otherwise we reschedule it again for immediate exection. */ 132 /* Otherwise we reschedule it again for immediate execution. */
133 schedule_delayed_work(&linkwatch_work, 0); 133 schedule_delayed_work(&linkwatch_work, 0);
134} 134}
135 135
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index a4e0a7482c2b..799f06e03a22 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -41,7 +41,6 @@
41 41
42#define NEIGH_PRINTK(x...) printk(x) 42#define NEIGH_PRINTK(x...) printk(x)
43#define NEIGH_NOPRINTK(x...) do { ; } while(0) 43#define NEIGH_NOPRINTK(x...) do { ; } while(0)
44#define NEIGH_PRINTK0 NEIGH_PRINTK
45#define NEIGH_PRINTK1 NEIGH_NOPRINTK 44#define NEIGH_PRINTK1 NEIGH_NOPRINTK
46#define NEIGH_PRINTK2 NEIGH_NOPRINTK 45#define NEIGH_PRINTK2 NEIGH_NOPRINTK
47 46
@@ -122,7 +121,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh)
122 121
123unsigned long neigh_rand_reach_time(unsigned long base) 122unsigned long neigh_rand_reach_time(unsigned long base)
124{ 123{
125 return (base ? (net_random() % base) + (base >> 1) : 0); 124 return base ? (net_random() % base) + (base >> 1) : 0;
126} 125}
127EXPORT_SYMBOL(neigh_rand_reach_time); 126EXPORT_SYMBOL(neigh_rand_reach_time);
128 127
@@ -131,15 +130,20 @@ static int neigh_forced_gc(struct neigh_table *tbl)
131{ 130{
132 int shrunk = 0; 131 int shrunk = 0;
133 int i; 132 int i;
133 struct neigh_hash_table *nht;
134 134
135 NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); 135 NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
136 136
137 write_lock_bh(&tbl->lock); 137 write_lock_bh(&tbl->lock);
138 for (i = 0; i <= tbl->hash_mask; i++) { 138 nht = rcu_dereference_protected(tbl->nht,
139 struct neighbour *n, **np; 139 lockdep_is_held(&tbl->lock));
140 for (i = 0; i <= nht->hash_mask; i++) {
141 struct neighbour *n;
142 struct neighbour __rcu **np;
140 143
141 np = &tbl->hash_buckets[i]; 144 np = &nht->hash_buckets[i];
142 while ((n = *np) != NULL) { 145 while ((n = rcu_dereference_protected(*np,
146 lockdep_is_held(&tbl->lock))) != NULL) {
143 /* Neighbour record may be discarded if: 147 /* Neighbour record may be discarded if:
144 * - nobody refers to it. 148 * - nobody refers to it.
145 * - it is not permanent 149 * - it is not permanent
@@ -147,7 +151,9 @@ static int neigh_forced_gc(struct neigh_table *tbl)
147 write_lock(&n->lock); 151 write_lock(&n->lock);
148 if (atomic_read(&n->refcnt) == 1 && 152 if (atomic_read(&n->refcnt) == 1 &&
149 !(n->nud_state & NUD_PERMANENT)) { 153 !(n->nud_state & NUD_PERMANENT)) {
150 *np = n->next; 154 rcu_assign_pointer(*np,
155 rcu_dereference_protected(n->next,
156 lockdep_is_held(&tbl->lock)));
151 n->dead = 1; 157 n->dead = 1;
152 shrunk = 1; 158 shrunk = 1;
153 write_unlock(&n->lock); 159 write_unlock(&n->lock);
@@ -199,16 +205,24 @@ static void pneigh_queue_purge(struct sk_buff_head *list)
199static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) 205static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
200{ 206{
201 int i; 207 int i;
208 struct neigh_hash_table *nht;
202 209
203 for (i = 0; i <= tbl->hash_mask; i++) { 210 nht = rcu_dereference_protected(tbl->nht,
204 struct neighbour *n, **np = &tbl->hash_buckets[i]; 211 lockdep_is_held(&tbl->lock));
205 212
206 while ((n = *np) != NULL) { 213 for (i = 0; i <= nht->hash_mask; i++) {
214 struct neighbour *n;
215 struct neighbour __rcu **np = &nht->hash_buckets[i];
216
217 while ((n = rcu_dereference_protected(*np,
218 lockdep_is_held(&tbl->lock))) != NULL) {
207 if (dev && n->dev != dev) { 219 if (dev && n->dev != dev) {
208 np = &n->next; 220 np = &n->next;
209 continue; 221 continue;
210 } 222 }
211 *np = n->next; 223 rcu_assign_pointer(*np,
224 rcu_dereference_protected(n->next,
225 lockdep_is_held(&tbl->lock)));
212 write_lock(&n->lock); 226 write_lock(&n->lock);
213 neigh_del_timer(n); 227 neigh_del_timer(n);
214 n->dead = 1; 228 n->dead = 1;
@@ -279,6 +293,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
279 293
280 skb_queue_head_init(&n->arp_queue); 294 skb_queue_head_init(&n->arp_queue);
281 rwlock_init(&n->lock); 295 rwlock_init(&n->lock);
296 seqlock_init(&n->ha_lock);
282 n->updated = n->used = now; 297 n->updated = n->used = now;
283 n->nud_state = NUD_NONE; 298 n->nud_state = NUD_NONE;
284 n->output = neigh_blackhole; 299 n->output = neigh_blackhole;
@@ -297,64 +312,86 @@ out_entries:
297 goto out; 312 goto out;
298} 313}
299 314
300static struct neighbour **neigh_hash_alloc(unsigned int entries) 315static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries)
301{ 316{
302 unsigned long size = entries * sizeof(struct neighbour *); 317 size_t size = entries * sizeof(struct neighbour *);
303 struct neighbour **ret; 318 struct neigh_hash_table *ret;
319 struct neighbour __rcu **buckets;
304 320
305 if (size <= PAGE_SIZE) { 321 ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
306 ret = kzalloc(size, GFP_ATOMIC); 322 if (!ret)
307 } else { 323 return NULL;
308 ret = (struct neighbour **) 324 if (size <= PAGE_SIZE)
309 __get_free_pages(GFP_ATOMIC|__GFP_ZERO, get_order(size)); 325 buckets = kzalloc(size, GFP_ATOMIC);
326 else
327 buckets = (struct neighbour __rcu **)
328 __get_free_pages(GFP_ATOMIC | __GFP_ZERO,
329 get_order(size));
330 if (!buckets) {
331 kfree(ret);
332 return NULL;
310 } 333 }
334 ret->hash_buckets = buckets;
335 ret->hash_mask = entries - 1;
336 get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd));
311 return ret; 337 return ret;
312} 338}
313 339
314static void neigh_hash_free(struct neighbour **hash, unsigned int entries) 340static void neigh_hash_free_rcu(struct rcu_head *head)
315{ 341{
316 unsigned long size = entries * sizeof(struct neighbour *); 342 struct neigh_hash_table *nht = container_of(head,
343 struct neigh_hash_table,
344 rcu);
345 size_t size = (nht->hash_mask + 1) * sizeof(struct neighbour *);
346 struct neighbour __rcu **buckets = nht->hash_buckets;
317 347
318 if (size <= PAGE_SIZE) 348 if (size <= PAGE_SIZE)
319 kfree(hash); 349 kfree(buckets);
320 else 350 else
321 free_pages((unsigned long)hash, get_order(size)); 351 free_pages((unsigned long)buckets, get_order(size));
352 kfree(nht);
322} 353}
323 354
324static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries) 355static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
356 unsigned long new_entries)
325{ 357{
326 struct neighbour **new_hash, **old_hash; 358 unsigned int i, hash;
327 unsigned int i, new_hash_mask, old_entries; 359 struct neigh_hash_table *new_nht, *old_nht;
328 360
329 NEIGH_CACHE_STAT_INC(tbl, hash_grows); 361 NEIGH_CACHE_STAT_INC(tbl, hash_grows);
330 362
331 BUG_ON(!is_power_of_2(new_entries)); 363 BUG_ON(!is_power_of_2(new_entries));
332 new_hash = neigh_hash_alloc(new_entries); 364 old_nht = rcu_dereference_protected(tbl->nht,
333 if (!new_hash) 365 lockdep_is_held(&tbl->lock));
334 return; 366 new_nht = neigh_hash_alloc(new_entries);
335 367 if (!new_nht)
336 old_entries = tbl->hash_mask + 1; 368 return old_nht;
337 new_hash_mask = new_entries - 1;
338 old_hash = tbl->hash_buckets;
339 369
340 get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); 370 for (i = 0; i <= old_nht->hash_mask; i++) {
341 for (i = 0; i < old_entries; i++) {
342 struct neighbour *n, *next; 371 struct neighbour *n, *next;
343 372
344 for (n = old_hash[i]; n; n = next) { 373 for (n = rcu_dereference_protected(old_nht->hash_buckets[i],
345 unsigned int hash_val = tbl->hash(n->primary_key, n->dev); 374 lockdep_is_held(&tbl->lock));
346 375 n != NULL;
347 hash_val &= new_hash_mask; 376 n = next) {
348 next = n->next; 377 hash = tbl->hash(n->primary_key, n->dev,
349 378 new_nht->hash_rnd);
350 n->next = new_hash[hash_val]; 379
351 new_hash[hash_val] = n; 380 hash &= new_nht->hash_mask;
381 next = rcu_dereference_protected(n->next,
382 lockdep_is_held(&tbl->lock));
383
384 rcu_assign_pointer(n->next,
385 rcu_dereference_protected(
386 new_nht->hash_buckets[hash],
387 lockdep_is_held(&tbl->lock)));
388 rcu_assign_pointer(new_nht->hash_buckets[hash], n);
352 } 389 }
353 } 390 }
354 tbl->hash_buckets = new_hash;
355 tbl->hash_mask = new_hash_mask;
356 391
357 neigh_hash_free(old_hash, old_entries); 392 rcu_assign_pointer(tbl->nht, new_nht);
393 call_rcu(&old_nht->rcu, neigh_hash_free_rcu);
394 return new_nht;
358} 395}
359 396
360struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, 397struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
@@ -363,19 +400,26 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
363 struct neighbour *n; 400 struct neighbour *n;
364 int key_len = tbl->key_len; 401 int key_len = tbl->key_len;
365 u32 hash_val; 402 u32 hash_val;
403 struct neigh_hash_table *nht;
366 404
367 NEIGH_CACHE_STAT_INC(tbl, lookups); 405 NEIGH_CACHE_STAT_INC(tbl, lookups);
368 406
369 read_lock_bh(&tbl->lock); 407 rcu_read_lock_bh();
370 hash_val = tbl->hash(pkey, dev); 408 nht = rcu_dereference_bh(tbl->nht);
371 for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) { 409 hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask;
410
411 for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
412 n != NULL;
413 n = rcu_dereference_bh(n->next)) {
372 if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { 414 if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
373 neigh_hold(n); 415 if (!atomic_inc_not_zero(&n->refcnt))
416 n = NULL;
374 NEIGH_CACHE_STAT_INC(tbl, hits); 417 NEIGH_CACHE_STAT_INC(tbl, hits);
375 break; 418 break;
376 } 419 }
377 } 420 }
378 read_unlock_bh(&tbl->lock); 421
422 rcu_read_unlock_bh();
379 return n; 423 return n;
380} 424}
381EXPORT_SYMBOL(neigh_lookup); 425EXPORT_SYMBOL(neigh_lookup);
@@ -386,20 +430,27 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
386 struct neighbour *n; 430 struct neighbour *n;
387 int key_len = tbl->key_len; 431 int key_len = tbl->key_len;
388 u32 hash_val; 432 u32 hash_val;
433 struct neigh_hash_table *nht;
389 434
390 NEIGH_CACHE_STAT_INC(tbl, lookups); 435 NEIGH_CACHE_STAT_INC(tbl, lookups);
391 436
392 read_lock_bh(&tbl->lock); 437 rcu_read_lock_bh();
393 hash_val = tbl->hash(pkey, NULL); 438 nht = rcu_dereference_bh(tbl->nht);
394 for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) { 439 hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) & nht->hash_mask;
440
441 for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
442 n != NULL;
443 n = rcu_dereference_bh(n->next)) {
395 if (!memcmp(n->primary_key, pkey, key_len) && 444 if (!memcmp(n->primary_key, pkey, key_len) &&
396 net_eq(dev_net(n->dev), net)) { 445 net_eq(dev_net(n->dev), net)) {
397 neigh_hold(n); 446 if (!atomic_inc_not_zero(&n->refcnt))
447 n = NULL;
398 NEIGH_CACHE_STAT_INC(tbl, hits); 448 NEIGH_CACHE_STAT_INC(tbl, hits);
399 break; 449 break;
400 } 450 }
401 } 451 }
402 read_unlock_bh(&tbl->lock); 452
453 rcu_read_unlock_bh();
403 return n; 454 return n;
404} 455}
405EXPORT_SYMBOL(neigh_lookup_nodev); 456EXPORT_SYMBOL(neigh_lookup_nodev);
@@ -411,6 +462,7 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
411 int key_len = tbl->key_len; 462 int key_len = tbl->key_len;
412 int error; 463 int error;
413 struct neighbour *n1, *rc, *n = neigh_alloc(tbl); 464 struct neighbour *n1, *rc, *n = neigh_alloc(tbl);
465 struct neigh_hash_table *nht;
414 466
415 if (!n) { 467 if (!n) {
416 rc = ERR_PTR(-ENOBUFS); 468 rc = ERR_PTR(-ENOBUFS);
@@ -437,18 +489,24 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
437 n->confirmed = jiffies - (n->parms->base_reachable_time << 1); 489 n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
438 490
439 write_lock_bh(&tbl->lock); 491 write_lock_bh(&tbl->lock);
492 nht = rcu_dereference_protected(tbl->nht,
493 lockdep_is_held(&tbl->lock));
440 494
441 if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1)) 495 if (atomic_read(&tbl->entries) > (nht->hash_mask + 1))
442 neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1); 496 nht = neigh_hash_grow(tbl, (nht->hash_mask + 1) << 1);
443 497
444 hash_val = tbl->hash(pkey, dev) & tbl->hash_mask; 498 hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask;
445 499
446 if (n->parms->dead) { 500 if (n->parms->dead) {
447 rc = ERR_PTR(-EINVAL); 501 rc = ERR_PTR(-EINVAL);
448 goto out_tbl_unlock; 502 goto out_tbl_unlock;
449 } 503 }
450 504
451 for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) { 505 for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
506 lockdep_is_held(&tbl->lock));
507 n1 != NULL;
508 n1 = rcu_dereference_protected(n1->next,
509 lockdep_is_held(&tbl->lock))) {
452 if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { 510 if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
453 neigh_hold(n1); 511 neigh_hold(n1);
454 rc = n1; 512 rc = n1;
@@ -456,10 +514,12 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
456 } 514 }
457 } 515 }
458 516
459 n->next = tbl->hash_buckets[hash_val];
460 tbl->hash_buckets[hash_val] = n;
461 n->dead = 0; 517 n->dead = 0;
462 neigh_hold(n); 518 neigh_hold(n);
519 rcu_assign_pointer(n->next,
520 rcu_dereference_protected(nht->hash_buckets[hash_val],
521 lockdep_is_held(&tbl->lock)));
522 rcu_assign_pointer(nht->hash_buckets[hash_val], n);
463 write_unlock_bh(&tbl->lock); 523 write_unlock_bh(&tbl->lock);
464 NEIGH_PRINTK2("neigh %p is created.\n", n); 524 NEIGH_PRINTK2("neigh %p is created.\n", n);
465 rc = n; 525 rc = n;
@@ -616,6 +676,12 @@ static inline void neigh_parms_put(struct neigh_parms *parms)
616 neigh_parms_destroy(parms); 676 neigh_parms_destroy(parms);
617} 677}
618 678
679static void neigh_destroy_rcu(struct rcu_head *head)
680{
681 struct neighbour *neigh = container_of(head, struct neighbour, rcu);
682
683 kmem_cache_free(neigh->tbl->kmem_cachep, neigh);
684}
619/* 685/*
620 * neighbour must already be out of the table; 686 * neighbour must already be out of the table;
621 * 687 *
@@ -643,8 +709,7 @@ void neigh_destroy(struct neighbour *neigh)
643 write_seqlock_bh(&hh->hh_lock); 709 write_seqlock_bh(&hh->hh_lock);
644 hh->hh_output = neigh_blackhole; 710 hh->hh_output = neigh_blackhole;
645 write_sequnlock_bh(&hh->hh_lock); 711 write_sequnlock_bh(&hh->hh_lock);
646 if (atomic_dec_and_test(&hh->hh_refcnt)) 712 hh_cache_put(hh);
647 kfree(hh);
648 } 713 }
649 714
650 skb_queue_purge(&neigh->arp_queue); 715 skb_queue_purge(&neigh->arp_queue);
@@ -655,7 +720,7 @@ void neigh_destroy(struct neighbour *neigh)
655 NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); 720 NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);
656 721
657 atomic_dec(&neigh->tbl->entries); 722 atomic_dec(&neigh->tbl->entries);
658 kmem_cache_free(neigh->tbl->kmem_cachep, neigh); 723 call_rcu(&neigh->rcu, neigh_destroy_rcu);
659} 724}
660EXPORT_SYMBOL(neigh_destroy); 725EXPORT_SYMBOL(neigh_destroy);
661 726
@@ -696,12 +761,16 @@ static void neigh_connect(struct neighbour *neigh)
696static void neigh_periodic_work(struct work_struct *work) 761static void neigh_periodic_work(struct work_struct *work)
697{ 762{
698 struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); 763 struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
699 struct neighbour *n, **np; 764 struct neighbour *n;
765 struct neighbour __rcu **np;
700 unsigned int i; 766 unsigned int i;
767 struct neigh_hash_table *nht;
701 768
702 NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); 769 NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
703 770
704 write_lock_bh(&tbl->lock); 771 write_lock_bh(&tbl->lock);
772 nht = rcu_dereference_protected(tbl->nht,
773 lockdep_is_held(&tbl->lock));
705 774
706 /* 775 /*
707 * periodically recompute ReachableTime from random function 776 * periodically recompute ReachableTime from random function
@@ -715,10 +784,11 @@ static void neigh_periodic_work(struct work_struct *work)
715 neigh_rand_reach_time(p->base_reachable_time); 784 neigh_rand_reach_time(p->base_reachable_time);
716 } 785 }
717 786
718 for (i = 0 ; i <= tbl->hash_mask; i++) { 787 for (i = 0 ; i <= nht->hash_mask; i++) {
719 np = &tbl->hash_buckets[i]; 788 np = &nht->hash_buckets[i];
720 789
721 while ((n = *np) != NULL) { 790 while ((n = rcu_dereference_protected(*np,
791 lockdep_is_held(&tbl->lock))) != NULL) {
722 unsigned int state; 792 unsigned int state;
723 793
724 write_lock(&n->lock); 794 write_lock(&n->lock);
@@ -766,9 +836,9 @@ next_elt:
766static __inline__ int neigh_max_probes(struct neighbour *n) 836static __inline__ int neigh_max_probes(struct neighbour *n)
767{ 837{
768 struct neigh_parms *p = n->parms; 838 struct neigh_parms *p = n->parms;
769 return (n->nud_state & NUD_PROBE ? 839 return (n->nud_state & NUD_PROBE) ?
770 p->ucast_probes : 840 p->ucast_probes :
771 p->ucast_probes + p->app_probes + p->mcast_probes); 841 p->ucast_probes + p->app_probes + p->mcast_probes;
772} 842}
773 843
774static void neigh_invalidate(struct neighbour *neigh) 844static void neigh_invalidate(struct neighbour *neigh)
@@ -945,7 +1015,7 @@ out_unlock_bh:
945} 1015}
946EXPORT_SYMBOL(__neigh_event_send); 1016EXPORT_SYMBOL(__neigh_event_send);
947 1017
948static void neigh_update_hhs(struct neighbour *neigh) 1018static void neigh_update_hhs(const struct neighbour *neigh)
949{ 1019{
950 struct hh_cache *hh; 1020 struct hh_cache *hh;
951 void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) 1021 void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *)
@@ -1081,7 +1151,9 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
1081 } 1151 }
1082 1152
1083 if (lladdr != neigh->ha) { 1153 if (lladdr != neigh->ha) {
1154 write_seqlock(&neigh->ha_lock);
1084 memcpy(&neigh->ha, lladdr, dev->addr_len); 1155 memcpy(&neigh->ha, lladdr, dev->addr_len);
1156 write_sequnlock(&neigh->ha_lock);
1085 neigh_update_hhs(neigh); 1157 neigh_update_hhs(neigh);
1086 if (!(new & NUD_CONNECTED)) 1158 if (!(new & NUD_CONNECTED))
1087 neigh->confirmed = jiffies - 1159 neigh->confirmed = jiffies -
@@ -1139,44 +1211,73 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl,
1139} 1211}
1140EXPORT_SYMBOL(neigh_event_ns); 1212EXPORT_SYMBOL(neigh_event_ns);
1141 1213
1214static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst,
1215 __be16 protocol)
1216{
1217 struct hh_cache *hh;
1218
1219 smp_rmb(); /* paired with smp_wmb() in neigh_hh_init() */
1220 for (hh = n->hh; hh; hh = hh->hh_next) {
1221 if (hh->hh_type == protocol) {
1222 atomic_inc(&hh->hh_refcnt);
1223 if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
1224 hh_cache_put(hh);
1225 return true;
1226 }
1227 }
1228 return false;
1229}
1230
1231/* called with read_lock_bh(&n->lock); */
1142static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, 1232static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
1143 __be16 protocol) 1233 __be16 protocol)
1144{ 1234{
1145 struct hh_cache *hh; 1235 struct hh_cache *hh;
1146 struct net_device *dev = dst->dev; 1236 struct net_device *dev = dst->dev;
1147 1237
1148 for (hh = n->hh; hh; hh = hh->hh_next) 1238 if (likely(neigh_hh_lookup(n, dst, protocol)))
1149 if (hh->hh_type == protocol) 1239 return;
1150 break;
1151 1240
1152 if (!hh && (hh = kzalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) { 1241 /* slow path */
1153 seqlock_init(&hh->hh_lock); 1242 hh = kzalloc(sizeof(*hh), GFP_ATOMIC);
1154 hh->hh_type = protocol; 1243 if (!hh)
1155 atomic_set(&hh->hh_refcnt, 0); 1244 return;
1156 hh->hh_next = NULL;
1157 1245
1158 if (dev->header_ops->cache(n, hh)) { 1246 seqlock_init(&hh->hh_lock);
1159 kfree(hh); 1247 hh->hh_type = protocol;
1160 hh = NULL; 1248 atomic_set(&hh->hh_refcnt, 2);
1161 } else { 1249
1162 atomic_inc(&hh->hh_refcnt); 1250 if (dev->header_ops->cache(n, hh)) {
1163 hh->hh_next = n->hh; 1251 kfree(hh);
1164 n->hh = hh; 1252 return;
1165 if (n->nud_state & NUD_CONNECTED)
1166 hh->hh_output = n->ops->hh_output;
1167 else
1168 hh->hh_output = n->ops->output;
1169 }
1170 } 1253 }
1171 if (hh) { 1254
1172 atomic_inc(&hh->hh_refcnt); 1255 write_lock_bh(&n->lock);
1173 dst->hh = hh; 1256
1257 /* must check if another thread already did the insert */
1258 if (neigh_hh_lookup(n, dst, protocol)) {
1259 kfree(hh);
1260 goto end;
1174 } 1261 }
1262
1263 if (n->nud_state & NUD_CONNECTED)
1264 hh->hh_output = n->ops->hh_output;
1265 else
1266 hh->hh_output = n->ops->output;
1267
1268 hh->hh_next = n->hh;
1269 smp_wmb(); /* paired with smp_rmb() in neigh_hh_lookup() */
1270 n->hh = hh;
1271
1272 if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
1273 hh_cache_put(hh);
1274end:
1275 write_unlock_bh(&n->lock);
1175} 1276}
1176 1277
1177/* This function can be used in contexts, where only old dev_queue_xmit 1278/* This function can be used in contexts, where only old dev_queue_xmit
1178 worked, f.e. if you want to override normal output path (eql, shaper), 1279 * worked, f.e. if you want to override normal output path (eql, shaper),
1179 but resolution is not made yet. 1280 * but resolution is not made yet.
1180 */ 1281 */
1181 1282
1182int neigh_compat_output(struct sk_buff *skb) 1283int neigh_compat_output(struct sk_buff *skb)
@@ -1210,19 +1311,19 @@ int neigh_resolve_output(struct sk_buff *skb)
1210 if (!neigh_event_send(neigh, skb)) { 1311 if (!neigh_event_send(neigh, skb)) {
1211 int err; 1312 int err;
1212 struct net_device *dev = neigh->dev; 1313 struct net_device *dev = neigh->dev;
1213 if (dev->header_ops->cache && !dst->hh) { 1314 unsigned int seq;
1214 write_lock_bh(&neigh->lock); 1315
1215 if (!dst->hh) 1316 if (dev->header_ops->cache &&
1216 neigh_hh_init(neigh, dst, dst->ops->protocol); 1317 !dst->hh &&
1217 err = dev_hard_header(skb, dev, ntohs(skb->protocol), 1318 !(dst->flags & DST_NOCACHE))
1218 neigh->ha, NULL, skb->len); 1319 neigh_hh_init(neigh, dst, dst->ops->protocol);
1219 write_unlock_bh(&neigh->lock); 1320
1220 } else { 1321 do {
1221 read_lock_bh(&neigh->lock); 1322 seq = read_seqbegin(&neigh->ha_lock);
1222 err = dev_hard_header(skb, dev, ntohs(skb->protocol), 1323 err = dev_hard_header(skb, dev, ntohs(skb->protocol),
1223 neigh->ha, NULL, skb->len); 1324 neigh->ha, NULL, skb->len);
1224 read_unlock_bh(&neigh->lock); 1325 } while (read_seqretry(&neigh->ha_lock, seq));
1225 } 1326
1226 if (err >= 0) 1327 if (err >= 0)
1227 rc = neigh->ops->queue_xmit(skb); 1328 rc = neigh->ops->queue_xmit(skb);
1228 else 1329 else
@@ -1248,13 +1349,16 @@ int neigh_connected_output(struct sk_buff *skb)
1248 struct dst_entry *dst = skb_dst(skb); 1349 struct dst_entry *dst = skb_dst(skb);
1249 struct neighbour *neigh = dst->neighbour; 1350 struct neighbour *neigh = dst->neighbour;
1250 struct net_device *dev = neigh->dev; 1351 struct net_device *dev = neigh->dev;
1352 unsigned int seq;
1251 1353
1252 __skb_pull(skb, skb_network_offset(skb)); 1354 __skb_pull(skb, skb_network_offset(skb));
1253 1355
1254 read_lock_bh(&neigh->lock); 1356 do {
1255 err = dev_hard_header(skb, dev, ntohs(skb->protocol), 1357 seq = read_seqbegin(&neigh->ha_lock);
1256 neigh->ha, NULL, skb->len); 1358 err = dev_hard_header(skb, dev, ntohs(skb->protocol),
1257 read_unlock_bh(&neigh->lock); 1359 neigh->ha, NULL, skb->len);
1360 } while (read_seqretry(&neigh->ha_lock, seq));
1361
1258 if (err >= 0) 1362 if (err >= 0)
1259 err = neigh->ops->queue_xmit(skb); 1363 err = neigh->ops->queue_xmit(skb);
1260 else { 1364 else {
@@ -1436,17 +1540,14 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
1436 panic("cannot create neighbour proc dir entry"); 1540 panic("cannot create neighbour proc dir entry");
1437#endif 1541#endif
1438 1542
1439 tbl->hash_mask = 1; 1543 RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(8));
1440 tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1);
1441 1544
1442 phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); 1545 phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
1443 tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); 1546 tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
1444 1547
1445 if (!tbl->hash_buckets || !tbl->phash_buckets) 1548 if (!tbl->nht || !tbl->phash_buckets)
1446 panic("cannot allocate neighbour cache hashes"); 1549 panic("cannot allocate neighbour cache hashes");
1447 1550
1448 get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
1449
1450 rwlock_init(&tbl->lock); 1551 rwlock_init(&tbl->lock);
1451 INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work); 1552 INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work);
1452 schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time); 1553 schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time);
@@ -1486,8 +1587,7 @@ int neigh_table_clear(struct neigh_table *tbl)
1486 struct neigh_table **tp; 1587 struct neigh_table **tp;
1487 1588
1488 /* It is not clean... Fix it to unload IPv6 module safely */ 1589 /* It is not clean... Fix it to unload IPv6 module safely */
1489 cancel_delayed_work(&tbl->gc_work); 1590 cancel_delayed_work_sync(&tbl->gc_work);
1490 flush_scheduled_work();
1491 del_timer_sync(&tbl->proxy_timer); 1591 del_timer_sync(&tbl->proxy_timer);
1492 pneigh_queue_purge(&tbl->proxy_queue); 1592 pneigh_queue_purge(&tbl->proxy_queue);
1493 neigh_ifdown(tbl, NULL); 1593 neigh_ifdown(tbl, NULL);
@@ -1502,8 +1602,9 @@ int neigh_table_clear(struct neigh_table *tbl)
1502 } 1602 }
1503 write_unlock(&neigh_tbl_lock); 1603 write_unlock(&neigh_tbl_lock);
1504 1604
1505 neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1); 1605 call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu,
1506 tbl->hash_buckets = NULL; 1606 neigh_hash_free_rcu);
1607 tbl->nht = NULL;
1507 1608
1508 kfree(tbl->phash_buckets); 1609 kfree(tbl->phash_buckets);
1509 tbl->phash_buckets = NULL; 1610 tbl->phash_buckets = NULL;
@@ -1529,6 +1630,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1529 struct net_device *dev = NULL; 1630 struct net_device *dev = NULL;
1530 int err = -EINVAL; 1631 int err = -EINVAL;
1531 1632
1633 ASSERT_RTNL();
1532 if (nlmsg_len(nlh) < sizeof(*ndm)) 1634 if (nlmsg_len(nlh) < sizeof(*ndm))
1533 goto out; 1635 goto out;
1534 1636
@@ -1538,7 +1640,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1538 1640
1539 ndm = nlmsg_data(nlh); 1641 ndm = nlmsg_data(nlh);
1540 if (ndm->ndm_ifindex) { 1642 if (ndm->ndm_ifindex) {
1541 dev = dev_get_by_index(net, ndm->ndm_ifindex); 1643 dev = __dev_get_by_index(net, ndm->ndm_ifindex);
1542 if (dev == NULL) { 1644 if (dev == NULL) {
1543 err = -ENODEV; 1645 err = -ENODEV;
1544 goto out; 1646 goto out;
@@ -1554,34 +1656,31 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1554 read_unlock(&neigh_tbl_lock); 1656 read_unlock(&neigh_tbl_lock);
1555 1657
1556 if (nla_len(dst_attr) < tbl->key_len) 1658 if (nla_len(dst_attr) < tbl->key_len)
1557 goto out_dev_put; 1659 goto out;
1558 1660
1559 if (ndm->ndm_flags & NTF_PROXY) { 1661 if (ndm->ndm_flags & NTF_PROXY) {
1560 err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); 1662 err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
1561 goto out_dev_put; 1663 goto out;
1562 } 1664 }
1563 1665
1564 if (dev == NULL) 1666 if (dev == NULL)
1565 goto out_dev_put; 1667 goto out;
1566 1668
1567 neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); 1669 neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
1568 if (neigh == NULL) { 1670 if (neigh == NULL) {
1569 err = -ENOENT; 1671 err = -ENOENT;
1570 goto out_dev_put; 1672 goto out;
1571 } 1673 }
1572 1674
1573 err = neigh_update(neigh, NULL, NUD_FAILED, 1675 err = neigh_update(neigh, NULL, NUD_FAILED,
1574 NEIGH_UPDATE_F_OVERRIDE | 1676 NEIGH_UPDATE_F_OVERRIDE |
1575 NEIGH_UPDATE_F_ADMIN); 1677 NEIGH_UPDATE_F_ADMIN);
1576 neigh_release(neigh); 1678 neigh_release(neigh);
1577 goto out_dev_put; 1679 goto out;
1578 } 1680 }
1579 read_unlock(&neigh_tbl_lock); 1681 read_unlock(&neigh_tbl_lock);
1580 err = -EAFNOSUPPORT; 1682 err = -EAFNOSUPPORT;
1581 1683
1582out_dev_put:
1583 if (dev)
1584 dev_put(dev);
1585out: 1684out:
1586 return err; 1685 return err;
1587} 1686}
@@ -1595,6 +1694,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1595 struct net_device *dev = NULL; 1694 struct net_device *dev = NULL;
1596 int err; 1695 int err;
1597 1696
1697 ASSERT_RTNL();
1598 err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); 1698 err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
1599 if (err < 0) 1699 if (err < 0)
1600 goto out; 1700 goto out;
@@ -1605,14 +1705,14 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1605 1705
1606 ndm = nlmsg_data(nlh); 1706 ndm = nlmsg_data(nlh);
1607 if (ndm->ndm_ifindex) { 1707 if (ndm->ndm_ifindex) {
1608 dev = dev_get_by_index(net, ndm->ndm_ifindex); 1708 dev = __dev_get_by_index(net, ndm->ndm_ifindex);
1609 if (dev == NULL) { 1709 if (dev == NULL) {
1610 err = -ENODEV; 1710 err = -ENODEV;
1611 goto out; 1711 goto out;
1612 } 1712 }
1613 1713
1614 if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) 1714 if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len)
1615 goto out_dev_put; 1715 goto out;
1616 } 1716 }
1617 1717
1618 read_lock(&neigh_tbl_lock); 1718 read_lock(&neigh_tbl_lock);
@@ -1626,7 +1726,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1626 read_unlock(&neigh_tbl_lock); 1726 read_unlock(&neigh_tbl_lock);
1627 1727
1628 if (nla_len(tb[NDA_DST]) < tbl->key_len) 1728 if (nla_len(tb[NDA_DST]) < tbl->key_len)
1629 goto out_dev_put; 1729 goto out;
1630 dst = nla_data(tb[NDA_DST]); 1730 dst = nla_data(tb[NDA_DST]);
1631 lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; 1731 lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
1632 1732
@@ -1639,29 +1739,29 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1639 pn->flags = ndm->ndm_flags; 1739 pn->flags = ndm->ndm_flags;
1640 err = 0; 1740 err = 0;
1641 } 1741 }
1642 goto out_dev_put; 1742 goto out;
1643 } 1743 }
1644 1744
1645 if (dev == NULL) 1745 if (dev == NULL)
1646 goto out_dev_put; 1746 goto out;
1647 1747
1648 neigh = neigh_lookup(tbl, dst, dev); 1748 neigh = neigh_lookup(tbl, dst, dev);
1649 if (neigh == NULL) { 1749 if (neigh == NULL) {
1650 if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { 1750 if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
1651 err = -ENOENT; 1751 err = -ENOENT;
1652 goto out_dev_put; 1752 goto out;
1653 } 1753 }
1654 1754
1655 neigh = __neigh_lookup_errno(tbl, dst, dev); 1755 neigh = __neigh_lookup_errno(tbl, dst, dev);
1656 if (IS_ERR(neigh)) { 1756 if (IS_ERR(neigh)) {
1657 err = PTR_ERR(neigh); 1757 err = PTR_ERR(neigh);
1658 goto out_dev_put; 1758 goto out;
1659 } 1759 }
1660 } else { 1760 } else {
1661 if (nlh->nlmsg_flags & NLM_F_EXCL) { 1761 if (nlh->nlmsg_flags & NLM_F_EXCL) {
1662 err = -EEXIST; 1762 err = -EEXIST;
1663 neigh_release(neigh); 1763 neigh_release(neigh);
1664 goto out_dev_put; 1764 goto out;
1665 } 1765 }
1666 1766
1667 if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) 1767 if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
@@ -1674,15 +1774,11 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1674 } else 1774 } else
1675 err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); 1775 err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
1676 neigh_release(neigh); 1776 neigh_release(neigh);
1677 goto out_dev_put; 1777 goto out;
1678 } 1778 }
1679 1779
1680 read_unlock(&neigh_tbl_lock); 1780 read_unlock(&neigh_tbl_lock);
1681 err = -EAFNOSUPPORT; 1781 err = -EAFNOSUPPORT;
1682
1683out_dev_put:
1684 if (dev)
1685 dev_put(dev);
1686out: 1782out:
1687 return err; 1783 return err;
1688} 1784}
@@ -1748,18 +1844,22 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
1748 unsigned long now = jiffies; 1844 unsigned long now = jiffies;
1749 unsigned int flush_delta = now - tbl->last_flush; 1845 unsigned int flush_delta = now - tbl->last_flush;
1750 unsigned int rand_delta = now - tbl->last_rand; 1846 unsigned int rand_delta = now - tbl->last_rand;
1751 1847 struct neigh_hash_table *nht;
1752 struct ndt_config ndc = { 1848 struct ndt_config ndc = {
1753 .ndtc_key_len = tbl->key_len, 1849 .ndtc_key_len = tbl->key_len,
1754 .ndtc_entry_size = tbl->entry_size, 1850 .ndtc_entry_size = tbl->entry_size,
1755 .ndtc_entries = atomic_read(&tbl->entries), 1851 .ndtc_entries = atomic_read(&tbl->entries),
1756 .ndtc_last_flush = jiffies_to_msecs(flush_delta), 1852 .ndtc_last_flush = jiffies_to_msecs(flush_delta),
1757 .ndtc_last_rand = jiffies_to_msecs(rand_delta), 1853 .ndtc_last_rand = jiffies_to_msecs(rand_delta),
1758 .ndtc_hash_rnd = tbl->hash_rnd,
1759 .ndtc_hash_mask = tbl->hash_mask,
1760 .ndtc_proxy_qlen = tbl->proxy_queue.qlen, 1854 .ndtc_proxy_qlen = tbl->proxy_queue.qlen,
1761 }; 1855 };
1762 1856
1857 rcu_read_lock_bh();
1858 nht = rcu_dereference_bh(tbl->nht);
1859 ndc.ndtc_hash_rnd = nht->hash_rnd;
1860 ndc.ndtc_hash_mask = nht->hash_mask;
1861 rcu_read_unlock_bh();
1862
1763 NLA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc); 1863 NLA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc);
1764 } 1864 }
1765 1865
@@ -2056,10 +2156,14 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
2056 2156
2057 read_lock_bh(&neigh->lock); 2157 read_lock_bh(&neigh->lock);
2058 ndm->ndm_state = neigh->nud_state; 2158 ndm->ndm_state = neigh->nud_state;
2059 if ((neigh->nud_state & NUD_VALID) && 2159 if (neigh->nud_state & NUD_VALID) {
2060 nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, neigh->ha) < 0) { 2160 char haddr[MAX_ADDR_LEN];
2061 read_unlock_bh(&neigh->lock); 2161
2062 goto nla_put_failure; 2162 neigh_ha_snapshot(haddr, neigh, neigh->dev);
2163 if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) {
2164 read_unlock_bh(&neigh->lock);
2165 goto nla_put_failure;
2166 }
2063 } 2167 }
2064 2168
2065 ci.ndm_used = jiffies_to_clock_t(now - neigh->used); 2169 ci.ndm_used = jiffies_to_clock_t(now - neigh->used);
@@ -2087,18 +2191,23 @@ static void neigh_update_notify(struct neighbour *neigh)
2087static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, 2191static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
2088 struct netlink_callback *cb) 2192 struct netlink_callback *cb)
2089{ 2193{
2090 struct net * net = sock_net(skb->sk); 2194 struct net *net = sock_net(skb->sk);
2091 struct neighbour *n; 2195 struct neighbour *n;
2092 int rc, h, s_h = cb->args[1]; 2196 int rc, h, s_h = cb->args[1];
2093 int idx, s_idx = idx = cb->args[2]; 2197 int idx, s_idx = idx = cb->args[2];
2198 struct neigh_hash_table *nht;
2094 2199
2095 read_lock_bh(&tbl->lock); 2200 rcu_read_lock_bh();
2096 for (h = 0; h <= tbl->hash_mask; h++) { 2201 nht = rcu_dereference_bh(tbl->nht);
2202
2203 for (h = 0; h <= nht->hash_mask; h++) {
2097 if (h < s_h) 2204 if (h < s_h)
2098 continue; 2205 continue;
2099 if (h > s_h) 2206 if (h > s_h)
2100 s_idx = 0; 2207 s_idx = 0;
2101 for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next) { 2208 for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
2209 n != NULL;
2210 n = rcu_dereference_bh(n->next)) {
2102 if (!net_eq(dev_net(n->dev), net)) 2211 if (!net_eq(dev_net(n->dev), net))
2103 continue; 2212 continue;
2104 if (idx < s_idx) 2213 if (idx < s_idx)
@@ -2107,17 +2216,16 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
2107 cb->nlh->nlmsg_seq, 2216 cb->nlh->nlmsg_seq,
2108 RTM_NEWNEIGH, 2217 RTM_NEWNEIGH,
2109 NLM_F_MULTI) <= 0) { 2218 NLM_F_MULTI) <= 0) {
2110 read_unlock_bh(&tbl->lock);
2111 rc = -1; 2219 rc = -1;
2112 goto out; 2220 goto out;
2113 } 2221 }
2114 next: 2222next:
2115 idx++; 2223 idx++;
2116 } 2224 }
2117 } 2225 }
2118 read_unlock_bh(&tbl->lock);
2119 rc = skb->len; 2226 rc = skb->len;
2120out: 2227out:
2228 rcu_read_unlock_bh();
2121 cb->args[1] = h; 2229 cb->args[1] = h;
2122 cb->args[2] = idx; 2230 cb->args[2] = idx;
2123 return rc; 2231 return rc;
@@ -2150,15 +2258,22 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
2150void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) 2258void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie)
2151{ 2259{
2152 int chain; 2260 int chain;
2261 struct neigh_hash_table *nht;
2153 2262
2154 read_lock_bh(&tbl->lock); 2263 rcu_read_lock_bh();
2155 for (chain = 0; chain <= tbl->hash_mask; chain++) { 2264 nht = rcu_dereference_bh(tbl->nht);
2265
2266 read_lock(&tbl->lock); /* avoid resizes */
2267 for (chain = 0; chain <= nht->hash_mask; chain++) {
2156 struct neighbour *n; 2268 struct neighbour *n;
2157 2269
2158 for (n = tbl->hash_buckets[chain]; n; n = n->next) 2270 for (n = rcu_dereference_bh(nht->hash_buckets[chain]);
2271 n != NULL;
2272 n = rcu_dereference_bh(n->next))
2159 cb(n, cookie); 2273 cb(n, cookie);
2160 } 2274 }
2161 read_unlock_bh(&tbl->lock); 2275 read_unlock(&tbl->lock);
2276 rcu_read_unlock_bh();
2162} 2277}
2163EXPORT_SYMBOL(neigh_for_each); 2278EXPORT_SYMBOL(neigh_for_each);
2164 2279
@@ -2167,18 +2282,25 @@ void __neigh_for_each_release(struct neigh_table *tbl,
2167 int (*cb)(struct neighbour *)) 2282 int (*cb)(struct neighbour *))
2168{ 2283{
2169 int chain; 2284 int chain;
2285 struct neigh_hash_table *nht;
2170 2286
2171 for (chain = 0; chain <= tbl->hash_mask; chain++) { 2287 nht = rcu_dereference_protected(tbl->nht,
2172 struct neighbour *n, **np; 2288 lockdep_is_held(&tbl->lock));
2289 for (chain = 0; chain <= nht->hash_mask; chain++) {
2290 struct neighbour *n;
2291 struct neighbour __rcu **np;
2173 2292
2174 np = &tbl->hash_buckets[chain]; 2293 np = &nht->hash_buckets[chain];
2175 while ((n = *np) != NULL) { 2294 while ((n = rcu_dereference_protected(*np,
2295 lockdep_is_held(&tbl->lock))) != NULL) {
2176 int release; 2296 int release;
2177 2297
2178 write_lock(&n->lock); 2298 write_lock(&n->lock);
2179 release = cb(n); 2299 release = cb(n);
2180 if (release) { 2300 if (release) {
2181 *np = n->next; 2301 rcu_assign_pointer(*np,
2302 rcu_dereference_protected(n->next,
2303 lockdep_is_held(&tbl->lock)));
2182 n->dead = 1; 2304 n->dead = 1;
2183 } else 2305 } else
2184 np = &n->next; 2306 np = &n->next;
@@ -2196,13 +2318,13 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)
2196{ 2318{
2197 struct neigh_seq_state *state = seq->private; 2319 struct neigh_seq_state *state = seq->private;
2198 struct net *net = seq_file_net(seq); 2320 struct net *net = seq_file_net(seq);
2199 struct neigh_table *tbl = state->tbl; 2321 struct neigh_hash_table *nht = state->nht;
2200 struct neighbour *n = NULL; 2322 struct neighbour *n = NULL;
2201 int bucket = state->bucket; 2323 int bucket = state->bucket;
2202 2324
2203 state->flags &= ~NEIGH_SEQ_IS_PNEIGH; 2325 state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
2204 for (bucket = 0; bucket <= tbl->hash_mask; bucket++) { 2326 for (bucket = 0; bucket <= nht->hash_mask; bucket++) {
2205 n = tbl->hash_buckets[bucket]; 2327 n = rcu_dereference_bh(nht->hash_buckets[bucket]);
2206 2328
2207 while (n) { 2329 while (n) {
2208 if (!net_eq(dev_net(n->dev), net)) 2330 if (!net_eq(dev_net(n->dev), net))
@@ -2219,8 +2341,8 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)
2219 break; 2341 break;
2220 if (n->nud_state & ~NUD_NOARP) 2342 if (n->nud_state & ~NUD_NOARP)
2221 break; 2343 break;
2222 next: 2344next:
2223 n = n->next; 2345 n = rcu_dereference_bh(n->next);
2224 } 2346 }
2225 2347
2226 if (n) 2348 if (n)
@@ -2237,14 +2359,14 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
2237{ 2359{
2238 struct neigh_seq_state *state = seq->private; 2360 struct neigh_seq_state *state = seq->private;
2239 struct net *net = seq_file_net(seq); 2361 struct net *net = seq_file_net(seq);
2240 struct neigh_table *tbl = state->tbl; 2362 struct neigh_hash_table *nht = state->nht;
2241 2363
2242 if (state->neigh_sub_iter) { 2364 if (state->neigh_sub_iter) {
2243 void *v = state->neigh_sub_iter(state, n, pos); 2365 void *v = state->neigh_sub_iter(state, n, pos);
2244 if (v) 2366 if (v)
2245 return n; 2367 return n;
2246 } 2368 }
2247 n = n->next; 2369 n = rcu_dereference_bh(n->next);
2248 2370
2249 while (1) { 2371 while (1) {
2250 while (n) { 2372 while (n) {
@@ -2261,17 +2383,17 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
2261 2383
2262 if (n->nud_state & ~NUD_NOARP) 2384 if (n->nud_state & ~NUD_NOARP)
2263 break; 2385 break;
2264 next: 2386next:
2265 n = n->next; 2387 n = rcu_dereference_bh(n->next);
2266 } 2388 }
2267 2389
2268 if (n) 2390 if (n)
2269 break; 2391 break;
2270 2392
2271 if (++state->bucket > tbl->hash_mask) 2393 if (++state->bucket > nht->hash_mask)
2272 break; 2394 break;
2273 2395
2274 n = tbl->hash_buckets[state->bucket]; 2396 n = rcu_dereference_bh(nht->hash_buckets[state->bucket]);
2275 } 2397 }
2276 2398
2277 if (n && pos) 2399 if (n && pos)
@@ -2369,7 +2491,7 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos)
2369} 2491}
2370 2492
2371void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) 2493void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags)
2372 __acquires(tbl->lock) 2494 __acquires(rcu_bh)
2373{ 2495{
2374 struct neigh_seq_state *state = seq->private; 2496 struct neigh_seq_state *state = seq->private;
2375 2497
@@ -2377,7 +2499,8 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl
2377 state->bucket = 0; 2499 state->bucket = 0;
2378 state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); 2500 state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
2379 2501
2380 read_lock_bh(&tbl->lock); 2502 rcu_read_lock_bh();
2503 state->nht = rcu_dereference_bh(tbl->nht);
2381 2504
2382 return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; 2505 return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN;
2383} 2506}
@@ -2411,12 +2534,9 @@ out:
2411EXPORT_SYMBOL(neigh_seq_next); 2534EXPORT_SYMBOL(neigh_seq_next);
2412 2535
2413void neigh_seq_stop(struct seq_file *seq, void *v) 2536void neigh_seq_stop(struct seq_file *seq, void *v)
2414 __releases(tbl->lock) 2537 __releases(rcu_bh)
2415{ 2538{
2416 struct neigh_seq_state *state = seq->private; 2539 rcu_read_unlock_bh();
2417 struct neigh_table *tbl = state->tbl;
2418
2419 read_unlock_bh(&tbl->lock);
2420} 2540}
2421EXPORT_SYMBOL(neigh_seq_stop); 2541EXPORT_SYMBOL(neigh_seq_stop);
2422 2542
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index af4dfbadf2a0..33d2a1fba131 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -28,6 +28,7 @@
28static const char fmt_hex[] = "%#x\n"; 28static const char fmt_hex[] = "%#x\n";
29static const char fmt_long_hex[] = "%#lx\n"; 29static const char fmt_long_hex[] = "%#lx\n";
30static const char fmt_dec[] = "%d\n"; 30static const char fmt_dec[] = "%d\n";
31static const char fmt_udec[] = "%u\n";
31static const char fmt_ulong[] = "%lu\n"; 32static const char fmt_ulong[] = "%lu\n";
32static const char fmt_u64[] = "%llu\n"; 33static const char fmt_u64[] = "%llu\n";
33 34
@@ -99,7 +100,7 @@ NETDEVICE_SHOW(addr_assign_type, fmt_dec);
99NETDEVICE_SHOW(addr_len, fmt_dec); 100NETDEVICE_SHOW(addr_len, fmt_dec);
100NETDEVICE_SHOW(iflink, fmt_dec); 101NETDEVICE_SHOW(iflink, fmt_dec);
101NETDEVICE_SHOW(ifindex, fmt_dec); 102NETDEVICE_SHOW(ifindex, fmt_dec);
102NETDEVICE_SHOW(features, fmt_long_hex); 103NETDEVICE_SHOW(features, fmt_hex);
103NETDEVICE_SHOW(type, fmt_dec); 104NETDEVICE_SHOW(type, fmt_dec);
104NETDEVICE_SHOW(link_mode, fmt_dec); 105NETDEVICE_SHOW(link_mode, fmt_dec);
105 106
@@ -145,13 +146,10 @@ static ssize_t show_speed(struct device *dev,
145 if (!rtnl_trylock()) 146 if (!rtnl_trylock())
146 return restart_syscall(); 147 return restart_syscall();
147 148
148 if (netif_running(netdev) && 149 if (netif_running(netdev)) {
149 netdev->ethtool_ops && 150 struct ethtool_cmd cmd;
150 netdev->ethtool_ops->get_settings) { 151 if (!dev_ethtool_get_settings(netdev, &cmd))
151 struct ethtool_cmd cmd = { ETHTOOL_GSET }; 152 ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd));
152
153 if (!netdev->ethtool_ops->get_settings(netdev, &cmd))
154 ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd));
155 } 153 }
156 rtnl_unlock(); 154 rtnl_unlock();
157 return ret; 155 return ret;
@@ -166,13 +164,11 @@ static ssize_t show_duplex(struct device *dev,
166 if (!rtnl_trylock()) 164 if (!rtnl_trylock())
167 return restart_syscall(); 165 return restart_syscall();
168 166
169 if (netif_running(netdev) && 167 if (netif_running(netdev)) {
170 netdev->ethtool_ops && 168 struct ethtool_cmd cmd;
171 netdev->ethtool_ops->get_settings) { 169 if (!dev_ethtool_get_settings(netdev, &cmd))
172 struct ethtool_cmd cmd = { ETHTOOL_GSET }; 170 ret = sprintf(buf, "%s\n",
173 171 cmd.duplex ? "full" : "half");
174 if (!netdev->ethtool_ops->get_settings(netdev, &cmd))
175 ret = sprintf(buf, "%s\n", cmd.duplex ? "full" : "half");
176 } 172 }
177 rtnl_unlock(); 173 rtnl_unlock();
178 return ret; 174 return ret;
@@ -295,6 +291,20 @@ static ssize_t show_ifalias(struct device *dev,
295 return ret; 291 return ret;
296} 292}
297 293
294NETDEVICE_SHOW(group, fmt_dec);
295
296static int change_group(struct net_device *net, unsigned long new_group)
297{
298 dev_set_group(net, (int) new_group);
299 return 0;
300}
301
302static ssize_t store_group(struct device *dev, struct device_attribute *attr,
303 const char *buf, size_t len)
304{
305 return netdev_store(dev, attr, buf, len, change_group);
306}
307
298static struct device_attribute net_class_attributes[] = { 308static struct device_attribute net_class_attributes[] = {
299 __ATTR(addr_assign_type, S_IRUGO, show_addr_assign_type, NULL), 309 __ATTR(addr_assign_type, S_IRUGO, show_addr_assign_type, NULL),
300 __ATTR(addr_len, S_IRUGO, show_addr_len, NULL), 310 __ATTR(addr_len, S_IRUGO, show_addr_len, NULL),
@@ -316,6 +326,7 @@ static struct device_attribute net_class_attributes[] = {
316 __ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags), 326 __ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags),
317 __ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len, 327 __ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
318 store_tx_queue_len), 328 store_tx_queue_len),
329 __ATTR(netdev_group, S_IRUGO | S_IWUSR, show_group, store_group),
319 {} 330 {}
320}; 331};
321 332
@@ -515,7 +526,7 @@ static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
515 return attribute->store(queue, attribute, buf, count); 526 return attribute->store(queue, attribute, buf, count);
516} 527}
517 528
518static struct sysfs_ops rx_queue_sysfs_ops = { 529static const struct sysfs_ops rx_queue_sysfs_ops = {
519 .show = rx_queue_attr_show, 530 .show = rx_queue_attr_show,
520 .store = rx_queue_attr_store, 531 .store = rx_queue_attr_store,
521}; 532};
@@ -550,13 +561,6 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue,
550 return len; 561 return len;
551} 562}
552 563
553static void rps_map_release(struct rcu_head *rcu)
554{
555 struct rps_map *map = container_of(rcu, struct rps_map, rcu);
556
557 kfree(map);
558}
559
560static ssize_t store_rps_map(struct netdev_rx_queue *queue, 564static ssize_t store_rps_map(struct netdev_rx_queue *queue,
561 struct rx_queue_attribute *attribute, 565 struct rx_queue_attribute *attribute,
562 const char *buf, size_t len) 566 const char *buf, size_t len)
@@ -598,12 +602,13 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
598 } 602 }
599 603
600 spin_lock(&rps_map_lock); 604 spin_lock(&rps_map_lock);
601 old_map = queue->rps_map; 605 old_map = rcu_dereference_protected(queue->rps_map,
606 lockdep_is_held(&rps_map_lock));
602 rcu_assign_pointer(queue->rps_map, map); 607 rcu_assign_pointer(queue->rps_map, map);
603 spin_unlock(&rps_map_lock); 608 spin_unlock(&rps_map_lock);
604 609
605 if (old_map) 610 if (old_map)
606 call_rcu(&old_map->rcu, rps_map_release); 611 kfree_rcu(old_map, rcu);
607 612
608 free_cpumask_var(mask); 613 free_cpumask_var(mask);
609 return len; 614 return len;
@@ -677,7 +682,8 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
677 table = NULL; 682 table = NULL;
678 683
679 spin_lock(&rps_dev_flow_lock); 684 spin_lock(&rps_dev_flow_lock);
680 old_table = queue->rps_flow_table; 685 old_table = rcu_dereference_protected(queue->rps_flow_table,
686 lockdep_is_held(&rps_dev_flow_lock));
681 rcu_assign_pointer(queue->rps_flow_table, table); 687 rcu_assign_pointer(queue->rps_flow_table, table);
682 spin_unlock(&rps_dev_flow_lock); 688 spin_unlock(&rps_dev_flow_lock);
683 689
@@ -704,17 +710,24 @@ static struct attribute *rx_queue_default_attrs[] = {
704static void rx_queue_release(struct kobject *kobj) 710static void rx_queue_release(struct kobject *kobj)
705{ 711{
706 struct netdev_rx_queue *queue = to_rx_queue(kobj); 712 struct netdev_rx_queue *queue = to_rx_queue(kobj);
707 struct netdev_rx_queue *first = queue->first; 713 struct rps_map *map;
714 struct rps_dev_flow_table *flow_table;
708 715
709 if (queue->rps_map)
710 call_rcu(&queue->rps_map->rcu, rps_map_release);
711 716
712 if (queue->rps_flow_table) 717 map = rcu_dereference_raw(queue->rps_map);
713 call_rcu(&queue->rps_flow_table->rcu, 718 if (map) {
714 rps_dev_flow_table_release); 719 RCU_INIT_POINTER(queue->rps_map, NULL);
720 kfree_rcu(map, rcu);
721 }
715 722
716 if (atomic_dec_and_test(&first->count)) 723 flow_table = rcu_dereference_raw(queue->rps_flow_table);
717 kfree(first); 724 if (flow_table) {
725 RCU_INIT_POINTER(queue->rps_flow_table, NULL);
726 call_rcu(&flow_table->rcu, rps_dev_flow_table_release);
727 }
728
729 memset(kobj, 0, sizeof(*kobj));
730 dev_put(queue->dev);
718} 731}
719 732
720static struct kobj_type rx_queue_ktype = { 733static struct kobj_type rx_queue_ktype = {
@@ -738,45 +751,442 @@ static int rx_queue_add_kobject(struct net_device *net, int index)
738 } 751 }
739 752
740 kobject_uevent(kobj, KOBJ_ADD); 753 kobject_uevent(kobj, KOBJ_ADD);
754 dev_hold(queue->dev);
741 755
742 return error; 756 return error;
743} 757}
758#endif /* CONFIG_RPS */
744 759
745static int rx_queue_register_kobjects(struct net_device *net) 760int
761net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
746{ 762{
763#ifdef CONFIG_RPS
747 int i; 764 int i;
748 int error = 0; 765 int error = 0;
749 766
767 for (i = old_num; i < new_num; i++) {
768 error = rx_queue_add_kobject(net, i);
769 if (error) {
770 new_num = old_num;
771 break;
772 }
773 }
774
775 while (--i >= new_num)
776 kobject_put(&net->_rx[i].kobj);
777
778 return error;
779#else
780 return 0;
781#endif
782}
783
784#ifdef CONFIG_XPS
785/*
786 * netdev_queue sysfs structures and functions.
787 */
788struct netdev_queue_attribute {
789 struct attribute attr;
790 ssize_t (*show)(struct netdev_queue *queue,
791 struct netdev_queue_attribute *attr, char *buf);
792 ssize_t (*store)(struct netdev_queue *queue,
793 struct netdev_queue_attribute *attr, const char *buf, size_t len);
794};
795#define to_netdev_queue_attr(_attr) container_of(_attr, \
796 struct netdev_queue_attribute, attr)
797
798#define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj)
799
800static ssize_t netdev_queue_attr_show(struct kobject *kobj,
801 struct attribute *attr, char *buf)
802{
803 struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
804 struct netdev_queue *queue = to_netdev_queue(kobj);
805
806 if (!attribute->show)
807 return -EIO;
808
809 return attribute->show(queue, attribute, buf);
810}
811
812static ssize_t netdev_queue_attr_store(struct kobject *kobj,
813 struct attribute *attr,
814 const char *buf, size_t count)
815{
816 struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
817 struct netdev_queue *queue = to_netdev_queue(kobj);
818
819 if (!attribute->store)
820 return -EIO;
821
822 return attribute->store(queue, attribute, buf, count);
823}
824
825static const struct sysfs_ops netdev_queue_sysfs_ops = {
826 .show = netdev_queue_attr_show,
827 .store = netdev_queue_attr_store,
828};
829
830static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
831{
832 struct net_device *dev = queue->dev;
833 int i;
834
835 for (i = 0; i < dev->num_tx_queues; i++)
836 if (queue == &dev->_tx[i])
837 break;
838
839 BUG_ON(i >= dev->num_tx_queues);
840
841 return i;
842}
843
844
845static ssize_t show_xps_map(struct netdev_queue *queue,
846 struct netdev_queue_attribute *attribute, char *buf)
847{
848 struct net_device *dev = queue->dev;
849 struct xps_dev_maps *dev_maps;
850 cpumask_var_t mask;
851 unsigned long index;
852 size_t len = 0;
853 int i;
854
855 if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
856 return -ENOMEM;
857
858 index = get_netdev_queue_index(queue);
859
860 rcu_read_lock();
861 dev_maps = rcu_dereference(dev->xps_maps);
862 if (dev_maps) {
863 for_each_possible_cpu(i) {
864 struct xps_map *map =
865 rcu_dereference(dev_maps->cpu_map[i]);
866 if (map) {
867 int j;
868 for (j = 0; j < map->len; j++) {
869 if (map->queues[j] == index) {
870 cpumask_set_cpu(i, mask);
871 break;
872 }
873 }
874 }
875 }
876 }
877 rcu_read_unlock();
878
879 len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
880 if (PAGE_SIZE - len < 3) {
881 free_cpumask_var(mask);
882 return -EINVAL;
883 }
884
885 free_cpumask_var(mask);
886 len += sprintf(buf + len, "\n");
887 return len;
888}
889
890static DEFINE_MUTEX(xps_map_mutex);
891#define xmap_dereference(P) \
892 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
893
894static ssize_t store_xps_map(struct netdev_queue *queue,
895 struct netdev_queue_attribute *attribute,
896 const char *buf, size_t len)
897{
898 struct net_device *dev = queue->dev;
899 cpumask_var_t mask;
900 int err, i, cpu, pos, map_len, alloc_len, need_set;
901 unsigned long index;
902 struct xps_map *map, *new_map;
903 struct xps_dev_maps *dev_maps, *new_dev_maps;
904 int nonempty = 0;
905 int numa_node = -2;
906
907 if (!capable(CAP_NET_ADMIN))
908 return -EPERM;
909
910 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
911 return -ENOMEM;
912
913 index = get_netdev_queue_index(queue);
914
915 err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
916 if (err) {
917 free_cpumask_var(mask);
918 return err;
919 }
920
921 new_dev_maps = kzalloc(max_t(unsigned,
922 XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES), GFP_KERNEL);
923 if (!new_dev_maps) {
924 free_cpumask_var(mask);
925 return -ENOMEM;
926 }
927
928 mutex_lock(&xps_map_mutex);
929
930 dev_maps = xmap_dereference(dev->xps_maps);
931
932 for_each_possible_cpu(cpu) {
933 map = dev_maps ?
934 xmap_dereference(dev_maps->cpu_map[cpu]) : NULL;
935 new_map = map;
936 if (map) {
937 for (pos = 0; pos < map->len; pos++)
938 if (map->queues[pos] == index)
939 break;
940 map_len = map->len;
941 alloc_len = map->alloc_len;
942 } else
943 pos = map_len = alloc_len = 0;
944
945 need_set = cpumask_test_cpu(cpu, mask) && cpu_online(cpu);
946#ifdef CONFIG_NUMA
947 if (need_set) {
948 if (numa_node == -2)
949 numa_node = cpu_to_node(cpu);
950 else if (numa_node != cpu_to_node(cpu))
951 numa_node = -1;
952 }
953#endif
954 if (need_set && pos >= map_len) {
955 /* Need to add queue to this CPU's map */
956 if (map_len >= alloc_len) {
957 alloc_len = alloc_len ?
958 2 * alloc_len : XPS_MIN_MAP_ALLOC;
959 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len),
960 GFP_KERNEL,
961 cpu_to_node(cpu));
962 if (!new_map)
963 goto error;
964 new_map->alloc_len = alloc_len;
965 for (i = 0; i < map_len; i++)
966 new_map->queues[i] = map->queues[i];
967 new_map->len = map_len;
968 }
969 new_map->queues[new_map->len++] = index;
970 } else if (!need_set && pos < map_len) {
971 /* Need to remove queue from this CPU's map */
972 if (map_len > 1)
973 new_map->queues[pos] =
974 new_map->queues[--new_map->len];
975 else
976 new_map = NULL;
977 }
978 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], new_map);
979 }
980
981 /* Cleanup old maps */
982 for_each_possible_cpu(cpu) {
983 map = dev_maps ?
984 xmap_dereference(dev_maps->cpu_map[cpu]) : NULL;
985 if (map && xmap_dereference(new_dev_maps->cpu_map[cpu]) != map)
986 kfree_rcu(map, rcu);
987 if (new_dev_maps->cpu_map[cpu])
988 nonempty = 1;
989 }
990
991 if (nonempty)
992 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
993 else {
994 kfree(new_dev_maps);
995 rcu_assign_pointer(dev->xps_maps, NULL);
996 }
997
998 if (dev_maps)
999 kfree_rcu(dev_maps, rcu);
1000
1001 netdev_queue_numa_node_write(queue, (numa_node >= 0) ? numa_node :
1002 NUMA_NO_NODE);
1003
1004 mutex_unlock(&xps_map_mutex);
1005
1006 free_cpumask_var(mask);
1007 return len;
1008
1009error:
1010 mutex_unlock(&xps_map_mutex);
1011
1012 if (new_dev_maps)
1013 for_each_possible_cpu(i)
1014 kfree(rcu_dereference_protected(
1015 new_dev_maps->cpu_map[i],
1016 1));
1017 kfree(new_dev_maps);
1018 free_cpumask_var(mask);
1019 return -ENOMEM;
1020}
1021
1022static struct netdev_queue_attribute xps_cpus_attribute =
1023 __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map);
1024
1025static struct attribute *netdev_queue_default_attrs[] = {
1026 &xps_cpus_attribute.attr,
1027 NULL
1028};
1029
1030static void netdev_queue_release(struct kobject *kobj)
1031{
1032 struct netdev_queue *queue = to_netdev_queue(kobj);
1033 struct net_device *dev = queue->dev;
1034 struct xps_dev_maps *dev_maps;
1035 struct xps_map *map;
1036 unsigned long index;
1037 int i, pos, nonempty = 0;
1038
1039 index = get_netdev_queue_index(queue);
1040
1041 mutex_lock(&xps_map_mutex);
1042 dev_maps = xmap_dereference(dev->xps_maps);
1043
1044 if (dev_maps) {
1045 for_each_possible_cpu(i) {
1046 map = xmap_dereference(dev_maps->cpu_map[i]);
1047 if (!map)
1048 continue;
1049
1050 for (pos = 0; pos < map->len; pos++)
1051 if (map->queues[pos] == index)
1052 break;
1053
1054 if (pos < map->len) {
1055 if (map->len > 1)
1056 map->queues[pos] =
1057 map->queues[--map->len];
1058 else {
1059 RCU_INIT_POINTER(dev_maps->cpu_map[i],
1060 NULL);
1061 kfree_rcu(map, rcu);
1062 map = NULL;
1063 }
1064 }
1065 if (map)
1066 nonempty = 1;
1067 }
1068
1069 if (!nonempty) {
1070 RCU_INIT_POINTER(dev->xps_maps, NULL);
1071 kfree_rcu(dev_maps, rcu);
1072 }
1073 }
1074
1075 mutex_unlock(&xps_map_mutex);
1076
1077 memset(kobj, 0, sizeof(*kobj));
1078 dev_put(queue->dev);
1079}
1080
1081static struct kobj_type netdev_queue_ktype = {
1082 .sysfs_ops = &netdev_queue_sysfs_ops,
1083 .release = netdev_queue_release,
1084 .default_attrs = netdev_queue_default_attrs,
1085};
1086
1087static int netdev_queue_add_kobject(struct net_device *net, int index)
1088{
1089 struct netdev_queue *queue = net->_tx + index;
1090 struct kobject *kobj = &queue->kobj;
1091 int error = 0;
1092
1093 kobj->kset = net->queues_kset;
1094 error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
1095 "tx-%u", index);
1096 if (error) {
1097 kobject_put(kobj);
1098 return error;
1099 }
1100
1101 kobject_uevent(kobj, KOBJ_ADD);
1102 dev_hold(queue->dev);
1103
1104 return error;
1105}
1106#endif /* CONFIG_XPS */
1107
1108int
1109netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
1110{
1111#ifdef CONFIG_XPS
1112 int i;
1113 int error = 0;
1114
1115 for (i = old_num; i < new_num; i++) {
1116 error = netdev_queue_add_kobject(net, i);
1117 if (error) {
1118 new_num = old_num;
1119 break;
1120 }
1121 }
1122
1123 while (--i >= new_num)
1124 kobject_put(&net->_tx[i].kobj);
1125
1126 return error;
1127#else
1128 return 0;
1129#endif
1130}
1131
1132static int register_queue_kobjects(struct net_device *net)
1133{
1134 int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;
1135
1136#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
750 net->queues_kset = kset_create_and_add("queues", 1137 net->queues_kset = kset_create_and_add("queues",
751 NULL, &net->dev.kobj); 1138 NULL, &net->dev.kobj);
752 if (!net->queues_kset) 1139 if (!net->queues_kset)
753 return -ENOMEM; 1140 return -ENOMEM;
754 for (i = 0; i < net->num_rx_queues; i++) { 1141#endif
755 error = rx_queue_add_kobject(net, i); 1142
756 if (error) 1143#ifdef CONFIG_RPS
757 break; 1144 real_rx = net->real_num_rx_queues;
758 } 1145#endif
1146 real_tx = net->real_num_tx_queues;
759 1147
1148 error = net_rx_queue_update_kobjects(net, 0, real_rx);
760 if (error) 1149 if (error)
761 while (--i >= 0) 1150 goto error;
762 kobject_put(&net->_rx[i].kobj); 1151 rxq = real_rx;
763 1152
1153 error = netdev_queue_update_kobjects(net, 0, real_tx);
1154 if (error)
1155 goto error;
1156 txq = real_tx;
1157
1158 return 0;
1159
1160error:
1161 netdev_queue_update_kobjects(net, txq, 0);
1162 net_rx_queue_update_kobjects(net, rxq, 0);
764 return error; 1163 return error;
765} 1164}
766 1165
767static void rx_queue_remove_kobjects(struct net_device *net) 1166static void remove_queue_kobjects(struct net_device *net)
768{ 1167{
769 int i; 1168 int real_rx = 0, real_tx = 0;
770 1169
771 for (i = 0; i < net->num_rx_queues; i++) 1170#ifdef CONFIG_RPS
772 kobject_put(&net->_rx[i].kobj); 1171 real_rx = net->real_num_rx_queues;
1172#endif
1173 real_tx = net->real_num_tx_queues;
1174
1175 net_rx_queue_update_kobjects(net, real_rx, 0);
1176 netdev_queue_update_kobjects(net, real_tx, 0);
1177#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
773 kset_unregister(net->queues_kset); 1178 kset_unregister(net->queues_kset);
1179#endif
774} 1180}
775#endif /* CONFIG_RPS */
776 1181
777static const void *net_current_ns(void) 1182static void *net_grab_current_ns(void)
778{ 1183{
779 return current->nsproxy->net_ns; 1184 struct net *ns = current->nsproxy->net_ns;
1185#ifdef CONFIG_NET_NS
1186 if (ns)
1187 atomic_inc(&ns->passive);
1188#endif
1189 return ns;
780} 1190}
781 1191
782static const void *net_initial_ns(void) 1192static const void *net_initial_ns(void)
@@ -789,22 +1199,14 @@ static const void *net_netlink_ns(struct sock *sk)
789 return sock_net(sk); 1199 return sock_net(sk);
790} 1200}
791 1201
792static struct kobj_ns_type_operations net_ns_type_operations = { 1202struct kobj_ns_type_operations net_ns_type_operations = {
793 .type = KOBJ_NS_TYPE_NET, 1203 .type = KOBJ_NS_TYPE_NET,
794 .current_ns = net_current_ns, 1204 .grab_current_ns = net_grab_current_ns,
795 .netlink_ns = net_netlink_ns, 1205 .netlink_ns = net_netlink_ns,
796 .initial_ns = net_initial_ns, 1206 .initial_ns = net_initial_ns,
1207 .drop_ns = net_drop_ns,
797}; 1208};
798 1209EXPORT_SYMBOL_GPL(net_ns_type_operations);
799static void net_kobj_ns_exit(struct net *net)
800{
801 kobj_ns_exit(KOBJ_NS_TYPE_NET, net);
802}
803
804static struct pernet_operations kobj_net_ops = {
805 .exit = net_kobj_ns_exit,
806};
807
808 1210
809#ifdef CONFIG_HOTPLUG 1211#ifdef CONFIG_HOTPLUG
810static int netdev_uevent(struct device *d, struct kobj_uevent_env *env) 1212static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)
@@ -870,9 +1272,7 @@ void netdev_unregister_kobject(struct net_device * net)
870 1272
871 kobject_get(&dev->kobj); 1273 kobject_get(&dev->kobj);
872 1274
873#ifdef CONFIG_RPS 1275 remove_queue_kobjects(net);
874 rx_queue_remove_kobjects(net);
875#endif
876 1276
877 device_del(dev); 1277 device_del(dev);
878} 1278}
@@ -911,13 +1311,11 @@ int netdev_register_kobject(struct net_device *net)
911 if (error) 1311 if (error)
912 return error; 1312 return error;
913 1313
914#ifdef CONFIG_RPS 1314 error = register_queue_kobjects(net);
915 error = rx_queue_register_kobjects(net);
916 if (error) { 1315 if (error) {
917 device_del(dev); 1316 device_del(dev);
918 return error; 1317 return error;
919 } 1318 }
920#endif
921 1319
922 return error; 1320 return error;
923} 1321}
@@ -937,6 +1335,5 @@ EXPORT_SYMBOL(netdev_class_remove_file);
937int netdev_kobject_init(void) 1335int netdev_kobject_init(void)
938{ 1336{
939 kobj_ns_type_register(&net_ns_type_operations); 1337 kobj_ns_type_register(&net_ns_type_operations);
940 register_pernet_subsys(&kobj_net_ops);
941 return class_register(&net_class); 1338 return class_register(&net_class);
942} 1339}
diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h
index 805555e8b187..bd7751ec1c4d 100644
--- a/net/core/net-sysfs.h
+++ b/net/core/net-sysfs.h
@@ -4,4 +4,8 @@
4int netdev_kobject_init(void); 4int netdev_kobject_init(void);
5int netdev_register_kobject(struct net_device *); 5int netdev_register_kobject(struct net_device *);
6void netdev_unregister_kobject(struct net_device *); 6void netdev_unregister_kobject(struct net_device *);
7int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num);
8int netdev_queue_update_kobjects(struct net_device *net,
9 int old_num, int new_num);
10
7#endif 11#endif
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index afa6380ed88a..7f1bb2aba03b 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -26,6 +26,7 @@
26 26
27#define CREATE_TRACE_POINTS 27#define CREATE_TRACE_POINTS
28#include <trace/events/skb.h> 28#include <trace/events/skb.h>
29#include <trace/events/net.h>
29#include <trace/events/napi.h> 30#include <trace/events/napi.h>
30 31
31EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); 32EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index c988e685433a..ea489db1bc23 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -8,6 +8,8 @@
8#include <linux/idr.h> 8#include <linux/idr.h>
9#include <linux/rculist.h> 9#include <linux/rculist.h>
10#include <linux/nsproxy.h> 10#include <linux/nsproxy.h>
11#include <linux/proc_fs.h>
12#include <linux/file.h>
11#include <net/net_namespace.h> 13#include <net/net_namespace.h>
12#include <net/netns/generic.h> 14#include <net/netns/generic.h>
13 15
@@ -27,14 +29,6 @@ EXPORT_SYMBOL(init_net);
27 29
28#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ 30#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
29 31
30static void net_generic_release(struct rcu_head *rcu)
31{
32 struct net_generic *ng;
33
34 ng = container_of(rcu, struct net_generic, rcu);
35 kfree(ng);
36}
37
38static int net_assign_generic(struct net *net, int id, void *data) 32static int net_assign_generic(struct net *net, int id, void *data)
39{ 33{
40 struct net_generic *ng, *old_ng; 34 struct net_generic *ng, *old_ng;
@@ -42,7 +36,9 @@ static int net_assign_generic(struct net *net, int id, void *data)
42 BUG_ON(!mutex_is_locked(&net_mutex)); 36 BUG_ON(!mutex_is_locked(&net_mutex));
43 BUG_ON(id == 0); 37 BUG_ON(id == 0);
44 38
45 ng = old_ng = net->gen; 39 old_ng = rcu_dereference_protected(net->gen,
40 lockdep_is_held(&net_mutex));
41 ng = old_ng;
46 if (old_ng->len >= id) 42 if (old_ng->len >= id)
47 goto assign; 43 goto assign;
48 44
@@ -66,7 +62,7 @@ static int net_assign_generic(struct net *net, int id, void *data)
66 memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*)); 62 memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
67 63
68 rcu_assign_pointer(net->gen, ng); 64 rcu_assign_pointer(net->gen, ng);
69 call_rcu(&old_ng->rcu, net_generic_release); 65 kfree_rcu(old_ng, rcu);
70assign: 66assign:
71 ng->ptr[id - 1] = data; 67 ng->ptr[id - 1] = data;
72 return 0; 68 return 0;
@@ -132,6 +128,7 @@ static __net_init int setup_net(struct net *net)
132 LIST_HEAD(net_exit_list); 128 LIST_HEAD(net_exit_list);
133 129
134 atomic_set(&net->count, 1); 130 atomic_set(&net->count, 1);
131 atomic_set(&net->passive, 1);
135 132
136#ifdef NETNS_REFCNT_DEBUG 133#ifdef NETNS_REFCNT_DEBUG
137 atomic_set(&net->use_count, 0); 134 atomic_set(&net->use_count, 0);
@@ -214,11 +211,21 @@ static void net_free(struct net *net)
214 kmem_cache_free(net_cachep, net); 211 kmem_cache_free(net_cachep, net);
215} 212}
216 213
217static struct net *net_create(void) 214void net_drop_ns(void *p)
215{
216 struct net *ns = p;
217 if (ns && atomic_dec_and_test(&ns->passive))
218 net_free(ns);
219}
220
221struct net *copy_net_ns(unsigned long flags, struct net *old_net)
218{ 222{
219 struct net *net; 223 struct net *net;
220 int rv; 224 int rv;
221 225
226 if (!(flags & CLONE_NEWNET))
227 return get_net(old_net);
228
222 net = net_alloc(); 229 net = net_alloc();
223 if (!net) 230 if (!net)
224 return ERR_PTR(-ENOMEM); 231 return ERR_PTR(-ENOMEM);
@@ -231,19 +238,12 @@ static struct net *net_create(void)
231 } 238 }
232 mutex_unlock(&net_mutex); 239 mutex_unlock(&net_mutex);
233 if (rv < 0) { 240 if (rv < 0) {
234 net_free(net); 241 net_drop_ns(net);
235 return ERR_PTR(rv); 242 return ERR_PTR(rv);
236 } 243 }
237 return net; 244 return net;
238} 245}
239 246
240struct net *copy_net_ns(unsigned long flags, struct net *old_net)
241{
242 if (!(flags & CLONE_NEWNET))
243 return get_net(old_net);
244 return net_create();
245}
246
247static DEFINE_SPINLOCK(cleanup_list_lock); 247static DEFINE_SPINLOCK(cleanup_list_lock);
248static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ 248static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */
249 249
@@ -294,7 +294,7 @@ static void cleanup_net(struct work_struct *work)
294 /* Finally it is safe to free my network namespace structure */ 294 /* Finally it is safe to free my network namespace structure */
295 list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { 295 list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
296 list_del_init(&net->exit_list); 296 list_del_init(&net->exit_list);
297 net_free(net); 297 net_drop_ns(net);
298 } 298 }
299} 299}
300static DECLARE_WORK(net_cleanup_work, cleanup_net); 300static DECLARE_WORK(net_cleanup_work, cleanup_net);
@@ -312,6 +312,26 @@ void __put_net(struct net *net)
312} 312}
313EXPORT_SYMBOL_GPL(__put_net); 313EXPORT_SYMBOL_GPL(__put_net);
314 314
315struct net *get_net_ns_by_fd(int fd)
316{
317 struct proc_inode *ei;
318 struct file *file;
319 struct net *net;
320
321 file = proc_ns_fget(fd);
322 if (IS_ERR(file))
323 return ERR_CAST(file);
324
325 ei = PROC_I(file->f_dentry->d_inode);
326 if (ei->ns_ops == &netns_operations)
327 net = get_net(ei->ns);
328 else
329 net = ERR_PTR(-EINVAL);
330
331 fput(file);
332 return net;
333}
334
315#else 335#else
316struct net *copy_net_ns(unsigned long flags, struct net *old_net) 336struct net *copy_net_ns(unsigned long flags, struct net *old_net)
317{ 337{
@@ -319,6 +339,11 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)
319 return ERR_PTR(-EINVAL); 339 return ERR_PTR(-EINVAL);
320 return old_net; 340 return old_net;
321} 341}
342
343struct net *get_net_ns_by_fd(int fd)
344{
345 return ERR_PTR(-EINVAL);
346}
322#endif 347#endif
323 348
324struct net *get_net_ns_by_pid(pid_t pid) 349struct net *get_net_ns_by_pid(pid_t pid)
@@ -571,3 +596,39 @@ void unregister_pernet_device(struct pernet_operations *ops)
571 mutex_unlock(&net_mutex); 596 mutex_unlock(&net_mutex);
572} 597}
573EXPORT_SYMBOL_GPL(unregister_pernet_device); 598EXPORT_SYMBOL_GPL(unregister_pernet_device);
599
600#ifdef CONFIG_NET_NS
601static void *netns_get(struct task_struct *task)
602{
603 struct net *net = NULL;
604 struct nsproxy *nsproxy;
605
606 rcu_read_lock();
607 nsproxy = task_nsproxy(task);
608 if (nsproxy)
609 net = get_net(nsproxy->net_ns);
610 rcu_read_unlock();
611
612 return net;
613}
614
615static void netns_put(void *ns)
616{
617 put_net(ns);
618}
619
620static int netns_install(struct nsproxy *nsproxy, void *ns)
621{
622 put_net(nsproxy->net_ns);
623 nsproxy->net_ns = get_net(ns);
624 return 0;
625}
626
627const struct proc_ns_operations netns_operations = {
628 .name = "net",
629 .type = CLONE_NEWNET,
630 .get = netns_get,
631 .put = netns_put,
632 .install = netns_install,
633};
634#endif
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 537e01afd81b..18d9cbda3a39 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -35,7 +35,6 @@
35 35
36#define MAX_UDP_CHUNK 1460 36#define MAX_UDP_CHUNK 1460
37#define MAX_SKBS 32 37#define MAX_SKBS 32
38#define MAX_QUEUE_DEPTH (MAX_SKBS / 2)
39 38
40static struct sk_buff_head skb_pool; 39static struct sk_buff_head skb_pool;
41 40
@@ -76,8 +75,7 @@ static void queue_process(struct work_struct *work)
76 75
77 local_irq_save(flags); 76 local_irq_save(flags);
78 __netif_tx_lock(txq, smp_processor_id()); 77 __netif_tx_lock(txq, smp_processor_id());
79 if (netif_tx_queue_stopped(txq) || 78 if (netif_tx_queue_frozen_or_stopped(txq) ||
80 netif_tx_queue_frozen(txq) ||
81 ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) { 79 ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) {
82 skb_queue_head(&npinfo->txq, skb); 80 skb_queue_head(&npinfo->txq, skb);
83 __netif_tx_unlock(txq); 81 __netif_tx_unlock(txq);
@@ -195,6 +193,17 @@ void netpoll_poll_dev(struct net_device *dev)
195 193
196 poll_napi(dev); 194 poll_napi(dev);
197 195
196 if (dev->priv_flags & IFF_SLAVE) {
197 if (dev->npinfo) {
198 struct net_device *bond_dev = dev->master;
199 struct sk_buff *skb;
200 while ((skb = skb_dequeue(&dev->npinfo->arp_tx))) {
201 skb->dev = bond_dev;
202 skb_queue_tail(&bond_dev->npinfo->arp_tx, skb);
203 }
204 }
205 }
206
198 service_arp_queue(dev->npinfo); 207 service_arp_queue(dev->npinfo);
199 208
200 zap_completion_queue(); 209 zap_completion_queue();
@@ -288,11 +297,11 @@ static int netpoll_owner_active(struct net_device *dev)
288 return 0; 297 return 0;
289} 298}
290 299
291void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) 300void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
301 struct net_device *dev)
292{ 302{
293 int status = NETDEV_TX_BUSY; 303 int status = NETDEV_TX_BUSY;
294 unsigned long tries; 304 unsigned long tries;
295 struct net_device *dev = np->dev;
296 const struct net_device_ops *ops = dev->netdev_ops; 305 const struct net_device_ops *ops = dev->netdev_ops;
297 /* It is up to the caller to keep npinfo alive. */ 306 /* It is up to the caller to keep npinfo alive. */
298 struct netpoll_info *npinfo = np->dev->npinfo; 307 struct netpoll_info *npinfo = np->dev->npinfo;
@@ -315,9 +324,7 @@ void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
315 tries > 0; --tries) { 324 tries > 0; --tries) {
316 if (__netif_tx_trylock(txq)) { 325 if (__netif_tx_trylock(txq)) {
317 if (!netif_tx_queue_stopped(txq)) { 326 if (!netif_tx_queue_stopped(txq)) {
318 dev->priv_flags |= IFF_IN_NETPOLL;
319 status = ops->ndo_start_xmit(skb, dev); 327 status = ops->ndo_start_xmit(skb, dev);
320 dev->priv_flags &= ~IFF_IN_NETPOLL;
321 if (status == NETDEV_TX_OK) 328 if (status == NETDEV_TX_OK)
322 txq_trans_update(txq); 329 txq_trans_update(txq);
323 } 330 }
@@ -346,7 +353,7 @@ void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
346 schedule_delayed_work(&npinfo->tx_work,0); 353 schedule_delayed_work(&npinfo->tx_work,0);
347 } 354 }
348} 355}
349EXPORT_SYMBOL(netpoll_send_skb); 356EXPORT_SYMBOL(netpoll_send_skb_on_dev);
350 357
351void netpoll_send_udp(struct netpoll *np, const char *msg, int len) 358void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
352{ 359{
@@ -532,7 +539,7 @@ int __netpoll_rx(struct sk_buff *skb)
532{ 539{
533 int proto, len, ulen; 540 int proto, len, ulen;
534 int hits = 0; 541 int hits = 0;
535 struct iphdr *iph; 542 const struct iphdr *iph;
536 struct udphdr *uh; 543 struct udphdr *uh;
537 struct netpoll_info *npinfo = skb->dev->npinfo; 544 struct netpoll_info *npinfo = skb->dev->npinfo;
538 struct netpoll *np, *tmp; 545 struct netpoll *np, *tmp;
@@ -691,32 +698,8 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
691 698
692 if (*cur != 0) { 699 if (*cur != 0) {
693 /* MAC address */ 700 /* MAC address */
694 if ((delim = strchr(cur, ':')) == NULL) 701 if (!mac_pton(cur, np->remote_mac))
695 goto parse_failed;
696 *delim = 0;
697 np->remote_mac[0] = simple_strtol(cur, NULL, 16);
698 cur = delim + 1;
699 if ((delim = strchr(cur, ':')) == NULL)
700 goto parse_failed;
701 *delim = 0;
702 np->remote_mac[1] = simple_strtol(cur, NULL, 16);
703 cur = delim + 1;
704 if ((delim = strchr(cur, ':')) == NULL)
705 goto parse_failed; 702 goto parse_failed;
706 *delim = 0;
707 np->remote_mac[2] = simple_strtol(cur, NULL, 16);
708 cur = delim + 1;
709 if ((delim = strchr(cur, ':')) == NULL)
710 goto parse_failed;
711 *delim = 0;
712 np->remote_mac[3] = simple_strtol(cur, NULL, 16);
713 cur = delim + 1;
714 if ((delim = strchr(cur, ':')) == NULL)
715 goto parse_failed;
716 *delim = 0;
717 np->remote_mac[4] = simple_strtol(cur, NULL, 16);
718 cur = delim + 1;
719 np->remote_mac[5] = simple_strtol(cur, NULL, 16);
720 } 703 }
721 704
722 netpoll_print_options(np); 705 netpoll_print_options(np);
@@ -809,6 +792,13 @@ int netpoll_setup(struct netpoll *np)
809 return -ENODEV; 792 return -ENODEV;
810 } 793 }
811 794
795 if (ndev->master) {
796 printk(KERN_ERR "%s: %s is a slave device, aborting.\n",
797 np->name, np->dev_name);
798 err = -EBUSY;
799 goto put;
800 }
801
812 if (!netif_running(ndev)) { 802 if (!netif_running(ndev)) {
813 unsigned long atmost, atleast; 803 unsigned long atmost, atleast;
814 804
@@ -925,7 +915,7 @@ void __netpoll_cleanup(struct netpoll *np)
925 915
926 skb_queue_purge(&npinfo->arp_tx); 916 skb_queue_purge(&npinfo->arp_tx);
927 skb_queue_purge(&npinfo->txq); 917 skb_queue_purge(&npinfo->txq);
928 cancel_rearming_delayed_work(&npinfo->tx_work); 918 cancel_delayed_work_sync(&npinfo->tx_work);
929 919
930 /* clean after last, unfinished work */ 920 /* clean after last, unfinished work */
931 __skb_queue_purge(&npinfo->txq); 921 __skb_queue_purge(&npinfo->txq);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 10a1ea72010d..f76079cd750c 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -156,6 +156,7 @@
156#include <linux/wait.h> 156#include <linux/wait.h>
157#include <linux/etherdevice.h> 157#include <linux/etherdevice.h>
158#include <linux/kthread.h> 158#include <linux/kthread.h>
159#include <linux/prefetch.h>
159#include <net/net_namespace.h> 160#include <net/net_namespace.h>
160#include <net/checksum.h> 161#include <net/checksum.h>
161#include <net/ipv6.h> 162#include <net/ipv6.h>
@@ -251,6 +252,7 @@ struct pktgen_dev {
251 int max_pkt_size; /* = ETH_ZLEN; */ 252 int max_pkt_size; /* = ETH_ZLEN; */
252 int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ 253 int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */
253 int nfrags; 254 int nfrags;
255 struct page *page;
254 u64 delay; /* nano-seconds */ 256 u64 delay; /* nano-seconds */
255 257
256 __u64 count; /* Default No packets to send */ 258 __u64 count; /* Default No packets to send */
@@ -378,6 +380,7 @@ struct pktgen_dev {
378 380
379 u16 queue_map_min; 381 u16 queue_map_min;
380 u16 queue_map_max; 382 u16 queue_map_max;
383 __u32 skb_priority; /* skb priority field */
381 int node; /* Memory node */ 384 int node; /* Memory node */
382 385
383#ifdef CONFIG_XFRM 386#ifdef CONFIG_XFRM
@@ -394,6 +397,8 @@ struct pktgen_hdr {
394 __be32 tv_usec; 397 __be32 tv_usec;
395}; 398};
396 399
400static bool pktgen_exiting __read_mostly;
401
397struct pktgen_thread { 402struct pktgen_thread {
398 spinlock_t if_lock; /* for list of devices */ 403 spinlock_t if_lock; /* for list of devices */
399 struct list_head if_list; /* All device here */ 404 struct list_head if_list; /* All device here */
@@ -445,7 +450,6 @@ static void pktgen_stop(struct pktgen_thread *t);
445static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); 450static void pktgen_clear_counters(struct pktgen_dev *pkt_dev);
446 451
447static unsigned int scan_ip6(const char *s, char ip[16]); 452static unsigned int scan_ip6(const char *s, char ip[16]);
448static unsigned int fmt_ip6(char *s, const char ip[16]);
449 453
450/* Module parameters, defaults. */ 454/* Module parameters, defaults. */
451static int pg_count_d __read_mostly = 1000; 455static int pg_count_d __read_mostly = 1000;
@@ -547,22 +551,18 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
547 pkt_dev->queue_map_min, 551 pkt_dev->queue_map_min,
548 pkt_dev->queue_map_max); 552 pkt_dev->queue_map_max);
549 553
550 if (pkt_dev->flags & F_IPV6) { 554 if (pkt_dev->skb_priority)
551 char b1[128], b2[128], b3[128]; 555 seq_printf(seq, " skb_priority: %u\n",
552 fmt_ip6(b1, pkt_dev->in6_saddr.s6_addr); 556 pkt_dev->skb_priority);
553 fmt_ip6(b2, pkt_dev->min_in6_saddr.s6_addr);
554 fmt_ip6(b3, pkt_dev->max_in6_saddr.s6_addr);
555 seq_printf(seq,
556 " saddr: %s min_saddr: %s max_saddr: %s\n", b1,
557 b2, b3);
558 557
559 fmt_ip6(b1, pkt_dev->in6_daddr.s6_addr); 558 if (pkt_dev->flags & F_IPV6) {
560 fmt_ip6(b2, pkt_dev->min_in6_daddr.s6_addr);
561 fmt_ip6(b3, pkt_dev->max_in6_daddr.s6_addr);
562 seq_printf(seq, 559 seq_printf(seq,
563 " daddr: %s min_daddr: %s max_daddr: %s\n", b1, 560 " saddr: %pI6c min_saddr: %pI6c max_saddr: %pI6c\n"
564 b2, b3); 561 " daddr: %pI6c min_daddr: %pI6c max_daddr: %pI6c\n",
565 562 &pkt_dev->in6_saddr,
563 &pkt_dev->min_in6_saddr, &pkt_dev->max_in6_saddr,
564 &pkt_dev->in6_daddr,
565 &pkt_dev->min_in6_daddr, &pkt_dev->max_in6_daddr);
566 } else { 566 } else {
567 seq_printf(seq, 567 seq_printf(seq,
568 " dst_min: %s dst_max: %s\n", 568 " dst_min: %s dst_max: %s\n",
@@ -698,10 +698,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
698 pkt_dev->cur_src_mac_offset); 698 pkt_dev->cur_src_mac_offset);
699 699
700 if (pkt_dev->flags & F_IPV6) { 700 if (pkt_dev->flags & F_IPV6) {
701 char b1[128], b2[128]; 701 seq_printf(seq, " cur_saddr: %pI6c cur_daddr: %pI6c\n",
702 fmt_ip6(b1, pkt_dev->cur_in6_daddr.s6_addr); 702 &pkt_dev->cur_in6_saddr,
703 fmt_ip6(b2, pkt_dev->cur_in6_saddr.s6_addr); 703 &pkt_dev->cur_in6_daddr);
704 seq_printf(seq, " cur_saddr: %s cur_daddr: %s\n", b2, b1);
705 } else 704 } else
706 seq_printf(seq, " cur_saddr: 0x%x cur_daddr: 0x%x\n", 705 seq_printf(seq, " cur_saddr: 0x%x cur_daddr: 0x%x\n",
707 pkt_dev->cur_saddr, pkt_dev->cur_daddr); 706 pkt_dev->cur_saddr, pkt_dev->cur_daddr);
@@ -729,16 +728,14 @@ static int hex32_arg(const char __user *user_buffer, unsigned long maxlen,
729 *num = 0; 728 *num = 0;
730 729
731 for (; i < maxlen; i++) { 730 for (; i < maxlen; i++) {
731 int value;
732 char c; 732 char c;
733 *num <<= 4; 733 *num <<= 4;
734 if (get_user(c, &user_buffer[i])) 734 if (get_user(c, &user_buffer[i]))
735 return -EFAULT; 735 return -EFAULT;
736 if ((c >= '0') && (c <= '9')) 736 value = hex_to_bin(c);
737 *num |= c - '0'; 737 if (value >= 0)
738 else if ((c >= 'a') && (c <= 'f')) 738 *num |= value;
739 *num |= c - 'a' + 10;
740 else if ((c >= 'A') && (c <= 'F'))
741 *num |= c - 'A' + 10;
742 else 739 else
743 break; 740 break;
744 } 741 }
@@ -773,10 +770,10 @@ done:
773static unsigned long num_arg(const char __user * user_buffer, 770static unsigned long num_arg(const char __user * user_buffer,
774 unsigned long maxlen, unsigned long *num) 771 unsigned long maxlen, unsigned long *num)
775{ 772{
776 int i = 0; 773 int i;
777 *num = 0; 774 *num = 0;
778 775
779 for (; i < maxlen; i++) { 776 for (i = 0; i < maxlen; i++) {
780 char c; 777 char c;
781 if (get_user(c, &user_buffer[i])) 778 if (get_user(c, &user_buffer[i]))
782 return -EFAULT; 779 return -EFAULT;
@@ -791,9 +788,9 @@ static unsigned long num_arg(const char __user * user_buffer,
791 788
792static int strn_len(const char __user * user_buffer, unsigned int maxlen) 789static int strn_len(const char __user * user_buffer, unsigned int maxlen)
793{ 790{
794 int i = 0; 791 int i;
795 792
796 for (; i < maxlen; i++) { 793 for (i = 0; i < maxlen; i++) {
797 char c; 794 char c;
798 if (get_user(c, &user_buffer[i])) 795 if (get_user(c, &user_buffer[i]))
799 return -EFAULT; 796 return -EFAULT;
@@ -848,7 +845,7 @@ static ssize_t pktgen_if_write(struct file *file,
848{ 845{
849 struct seq_file *seq = file->private_data; 846 struct seq_file *seq = file->private_data;
850 struct pktgen_dev *pkt_dev = seq->private; 847 struct pktgen_dev *pkt_dev = seq->private;
851 int i = 0, max, len; 848 int i, max, len;
852 char name[16], valstr[32]; 849 char name[16], valstr[32];
853 unsigned long value = 0; 850 unsigned long value = 0;
854 char *pg_result = NULL; 851 char *pg_result = NULL;
@@ -862,13 +859,13 @@ static ssize_t pktgen_if_write(struct file *file,
862 return -EINVAL; 859 return -EINVAL;
863 } 860 }
864 861
865 max = count - i; 862 max = count;
866 tmp = count_trail_chars(&user_buffer[i], max); 863 tmp = count_trail_chars(user_buffer, max);
867 if (tmp < 0) { 864 if (tmp < 0) {
868 pr_warning("illegal format\n"); 865 pr_warning("illegal format\n");
869 return tmp; 866 return tmp;
870 } 867 }
871 i += tmp; 868 i = tmp;
872 869
873 /* Read variable name */ 870 /* Read variable name */
874 871
@@ -889,10 +886,11 @@ static ssize_t pktgen_if_write(struct file *file,
889 i += len; 886 i += len;
890 887
891 if (debug) { 888 if (debug) {
892 char tb[count + 1]; 889 size_t copy = min_t(size_t, count, 1023);
893 if (copy_from_user(tb, user_buffer, count)) 890 char tb[copy + 1];
891 if (copy_from_user(tb, user_buffer, copy))
894 return -EFAULT; 892 return -EFAULT;
895 tb[count] = 0; 893 tb[copy] = 0;
896 printk(KERN_DEBUG "pktgen: %s,%lu buffer -:%s:-\n", name, 894 printk(KERN_DEBUG "pktgen: %s,%lu buffer -:%s:-\n", name,
897 (unsigned long)count, tb); 895 (unsigned long)count, tb);
898 } 896 }
@@ -1128,6 +1126,10 @@ static ssize_t pktgen_if_write(struct file *file,
1128 if (node_possible(value)) { 1126 if (node_possible(value)) {
1129 pkt_dev->node = value; 1127 pkt_dev->node = value;
1130 sprintf(pg_result, "OK: node=%d", pkt_dev->node); 1128 sprintf(pg_result, "OK: node=%d", pkt_dev->node);
1129 if (pkt_dev->page) {
1130 put_page(pkt_dev->page);
1131 pkt_dev->page = NULL;
1132 }
1131 } 1133 }
1132 else 1134 else
1133 sprintf(pg_result, "ERROR: node not possible"); 1135 sprintf(pg_result, "ERROR: node not possible");
@@ -1298,7 +1300,7 @@ static ssize_t pktgen_if_write(struct file *file,
1298 buf[len] = 0; 1300 buf[len] = 0;
1299 1301
1300 scan_ip6(buf, pkt_dev->in6_daddr.s6_addr); 1302 scan_ip6(buf, pkt_dev->in6_daddr.s6_addr);
1301 fmt_ip6(buf, pkt_dev->in6_daddr.s6_addr); 1303 snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr);
1302 1304
1303 ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr); 1305 ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr);
1304 1306
@@ -1321,7 +1323,7 @@ static ssize_t pktgen_if_write(struct file *file,
1321 buf[len] = 0; 1323 buf[len] = 0;
1322 1324
1323 scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr); 1325 scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr);
1324 fmt_ip6(buf, pkt_dev->min_in6_daddr.s6_addr); 1326 snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr);
1325 1327
1326 ipv6_addr_copy(&pkt_dev->cur_in6_daddr, 1328 ipv6_addr_copy(&pkt_dev->cur_in6_daddr,
1327 &pkt_dev->min_in6_daddr); 1329 &pkt_dev->min_in6_daddr);
@@ -1344,7 +1346,7 @@ static ssize_t pktgen_if_write(struct file *file,
1344 buf[len] = 0; 1346 buf[len] = 0;
1345 1347
1346 scan_ip6(buf, pkt_dev->max_in6_daddr.s6_addr); 1348 scan_ip6(buf, pkt_dev->max_in6_daddr.s6_addr);
1347 fmt_ip6(buf, pkt_dev->max_in6_daddr.s6_addr); 1349 snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->max_in6_daddr);
1348 1350
1349 if (debug) 1351 if (debug)
1350 printk(KERN_DEBUG "pktgen: dst6_max set to: %s\n", buf); 1352 printk(KERN_DEBUG "pktgen: dst6_max set to: %s\n", buf);
@@ -1365,7 +1367,7 @@ static ssize_t pktgen_if_write(struct file *file,
1365 buf[len] = 0; 1367 buf[len] = 0;
1366 1368
1367 scan_ip6(buf, pkt_dev->in6_saddr.s6_addr); 1369 scan_ip6(buf, pkt_dev->in6_saddr.s6_addr);
1368 fmt_ip6(buf, pkt_dev->in6_saddr.s6_addr); 1370 snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr);
1369 1371
1370 ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr); 1372 ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr);
1371 1373
@@ -1419,11 +1421,6 @@ static ssize_t pktgen_if_write(struct file *file,
1419 return count; 1421 return count;
1420 } 1422 }
1421 if (!strcmp(name, "dst_mac")) { 1423 if (!strcmp(name, "dst_mac")) {
1422 char *v = valstr;
1423 unsigned char old_dmac[ETH_ALEN];
1424 unsigned char *m = pkt_dev->dst_mac;
1425 memcpy(old_dmac, pkt_dev->dst_mac, ETH_ALEN);
1426
1427 len = strn_len(&user_buffer[i], sizeof(valstr) - 1); 1424 len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
1428 if (len < 0) 1425 if (len < 0)
1429 return len; 1426 return len;
@@ -1431,35 +1428,16 @@ static ssize_t pktgen_if_write(struct file *file,
1431 memset(valstr, 0, sizeof(valstr)); 1428 memset(valstr, 0, sizeof(valstr));
1432 if (copy_from_user(valstr, &user_buffer[i], len)) 1429 if (copy_from_user(valstr, &user_buffer[i], len))
1433 return -EFAULT; 1430 return -EFAULT;
1434 i += len;
1435
1436 for (*m = 0; *v && m < pkt_dev->dst_mac + 6; v++) {
1437 int value;
1438
1439 value = hex_to_bin(*v);
1440 if (value >= 0)
1441 *m = *m * 16 + value;
1442
1443 if (*v == ':') {
1444 m++;
1445 *m = 0;
1446 }
1447 }
1448 1431
1432 if (!mac_pton(valstr, pkt_dev->dst_mac))
1433 return -EINVAL;
1449 /* Set up Dest MAC */ 1434 /* Set up Dest MAC */
1450 if (compare_ether_addr(old_dmac, pkt_dev->dst_mac)) 1435 memcpy(&pkt_dev->hh[0], pkt_dev->dst_mac, ETH_ALEN);
1451 memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN);
1452 1436
1453 sprintf(pg_result, "OK: dstmac"); 1437 sprintf(pg_result, "OK: dstmac %pM", pkt_dev->dst_mac);
1454 return count; 1438 return count;
1455 } 1439 }
1456 if (!strcmp(name, "src_mac")) { 1440 if (!strcmp(name, "src_mac")) {
1457 char *v = valstr;
1458 unsigned char old_smac[ETH_ALEN];
1459 unsigned char *m = pkt_dev->src_mac;
1460
1461 memcpy(old_smac, pkt_dev->src_mac, ETH_ALEN);
1462
1463 len = strn_len(&user_buffer[i], sizeof(valstr) - 1); 1441 len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
1464 if (len < 0) 1442 if (len < 0)
1465 return len; 1443 return len;
@@ -1467,26 +1445,13 @@ static ssize_t pktgen_if_write(struct file *file,
1467 memset(valstr, 0, sizeof(valstr)); 1445 memset(valstr, 0, sizeof(valstr));
1468 if (copy_from_user(valstr, &user_buffer[i], len)) 1446 if (copy_from_user(valstr, &user_buffer[i], len))
1469 return -EFAULT; 1447 return -EFAULT;
1470 i += len;
1471
1472 for (*m = 0; *v && m < pkt_dev->src_mac + 6; v++) {
1473 int value;
1474
1475 value = hex_to_bin(*v);
1476 if (value >= 0)
1477 *m = *m * 16 + value;
1478
1479 if (*v == ':') {
1480 m++;
1481 *m = 0;
1482 }
1483 }
1484 1448
1449 if (!mac_pton(valstr, pkt_dev->src_mac))
1450 return -EINVAL;
1485 /* Set up Src MAC */ 1451 /* Set up Src MAC */
1486 if (compare_ether_addr(old_smac, pkt_dev->src_mac)) 1452 memcpy(&pkt_dev->hh[6], pkt_dev->src_mac, ETH_ALEN);
1487 memcpy(&(pkt_dev->hh[6]), pkt_dev->src_mac, ETH_ALEN);
1488 1453
1489 sprintf(pg_result, "OK: srcmac"); 1454 sprintf(pg_result, "OK: srcmac %pM", pkt_dev->src_mac);
1490 return count; 1455 return count;
1491 } 1456 }
1492 1457
@@ -1712,6 +1677,18 @@ static ssize_t pktgen_if_write(struct file *file,
1712 return count; 1677 return count;
1713 } 1678 }
1714 1679
1680 if (!strcmp(name, "skb_priority")) {
1681 len = num_arg(&user_buffer[i], 9, &value);
1682 if (len < 0)
1683 return len;
1684
1685 i += len;
1686 pkt_dev->skb_priority = value;
1687 sprintf(pg_result, "OK: skb_priority=%i",
1688 pkt_dev->skb_priority);
1689 return count;
1690 }
1691
1715 sprintf(pkt_dev->result, "No such parameter \"%s\"", name); 1692 sprintf(pkt_dev->result, "No such parameter \"%s\"", name);
1716 return -EINVAL; 1693 return -EINVAL;
1717} 1694}
@@ -1766,7 +1743,7 @@ static ssize_t pktgen_thread_write(struct file *file,
1766{ 1743{
1767 struct seq_file *seq = file->private_data; 1744 struct seq_file *seq = file->private_data;
1768 struct pktgen_thread *t = seq->private; 1745 struct pktgen_thread *t = seq->private;
1769 int i = 0, max, len, ret; 1746 int i, max, len, ret;
1770 char name[40]; 1747 char name[40];
1771 char *pg_result; 1748 char *pg_result;
1772 1749
@@ -1775,12 +1752,12 @@ static ssize_t pktgen_thread_write(struct file *file,
1775 return -EINVAL; 1752 return -EINVAL;
1776 } 1753 }
1777 1754
1778 max = count - i; 1755 max = count;
1779 len = count_trail_chars(&user_buffer[i], max); 1756 len = count_trail_chars(user_buffer, max);
1780 if (len < 0) 1757 if (len < 0)
1781 return len; 1758 return len;
1782 1759
1783 i += len; 1760 i = len;
1784 1761
1785 /* Read variable name */ 1762 /* Read variable name */
1786 1763
@@ -1977,7 +1954,7 @@ static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev,
1977 const char *ifname) 1954 const char *ifname)
1978{ 1955{
1979 char b[IFNAMSIZ+5]; 1956 char b[IFNAMSIZ+5];
1980 int i = 0; 1957 int i;
1981 1958
1982 for (i = 0; ifname[i] != '@'; i++) { 1959 for (i = 0; ifname[i] != '@'; i++) {
1983 if (i == IFNAMSIZ) 1960 if (i == IFNAMSIZ)
@@ -2491,7 +2468,6 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)
2491{ 2468{
2492 struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; 2469 struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
2493 int err = 0; 2470 int err = 0;
2494 struct iphdr *iph;
2495 2471
2496 if (!x) 2472 if (!x)
2497 return 0; 2473 return 0;
@@ -2501,7 +2477,6 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)
2501 return 0; 2477 return 0;
2502 2478
2503 spin_lock(&x->lock); 2479 spin_lock(&x->lock);
2504 iph = ip_hdr(skb);
2505 2480
2506 err = x->outer_mode->output(x, skb); 2481 err = x->outer_mode->output(x, skb);
2507 if (err) 2482 if (err)
@@ -2521,8 +2496,8 @@ static void free_SAs(struct pktgen_dev *pkt_dev)
2521{ 2496{
2522 if (pkt_dev->cflows) { 2497 if (pkt_dev->cflows) {
2523 /* let go of the SAs if we have them */ 2498 /* let go of the SAs if we have them */
2524 int i = 0; 2499 int i;
2525 for (; i < pkt_dev->cflows; i++) { 2500 for (i = 0; i < pkt_dev->cflows; i++) {
2526 struct xfrm_state *x = pkt_dev->flows[i].x; 2501 struct xfrm_state *x = pkt_dev->flows[i].x;
2527 if (x) { 2502 if (x) {
2528 xfrm_state_put(x); 2503 xfrm_state_put(x);
@@ -2587,6 +2562,72 @@ static inline __be16 build_tci(unsigned int id, unsigned int cfi,
2587 return htons(id | (cfi << 12) | (prio << 13)); 2562 return htons(id | (cfi << 12) | (prio << 13));
2588} 2563}
2589 2564
2565static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
2566 int datalen)
2567{
2568 struct timeval timestamp;
2569 struct pktgen_hdr *pgh;
2570
2571 pgh = (struct pktgen_hdr *)skb_put(skb, sizeof(*pgh));
2572 datalen -= sizeof(*pgh);
2573
2574 if (pkt_dev->nfrags <= 0) {
2575 memset(skb_put(skb, datalen), 0, datalen);
2576 } else {
2577 int frags = pkt_dev->nfrags;
2578 int i, len;
2579 int frag_len;
2580
2581
2582 if (frags > MAX_SKB_FRAGS)
2583 frags = MAX_SKB_FRAGS;
2584 len = datalen - frags * PAGE_SIZE;
2585 if (len > 0) {
2586 memset(skb_put(skb, len), 0, len);
2587 datalen = frags * PAGE_SIZE;
2588 }
2589
2590 i = 0;
2591 frag_len = (datalen/frags) < PAGE_SIZE ?
2592 (datalen/frags) : PAGE_SIZE;
2593 while (datalen > 0) {
2594 if (unlikely(!pkt_dev->page)) {
2595 int node = numa_node_id();
2596
2597 if (pkt_dev->node >= 0 && (pkt_dev->flags & F_NODE))
2598 node = pkt_dev->node;
2599 pkt_dev->page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
2600 if (!pkt_dev->page)
2601 break;
2602 }
2603 skb_shinfo(skb)->frags[i].page = pkt_dev->page;
2604 get_page(pkt_dev->page);
2605 skb_shinfo(skb)->frags[i].page_offset = 0;
2606 /*last fragment, fill rest of data*/
2607 if (i == (frags - 1))
2608 skb_shinfo(skb)->frags[i].size =
2609 (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
2610 else
2611 skb_shinfo(skb)->frags[i].size = frag_len;
2612 datalen -= skb_shinfo(skb)->frags[i].size;
2613 skb->len += skb_shinfo(skb)->frags[i].size;
2614 skb->data_len += skb_shinfo(skb)->frags[i].size;
2615 i++;
2616 skb_shinfo(skb)->nr_frags = i;
2617 }
2618 }
2619
2620 /* Stamp the time, and sequence number,
2621 * convert them to network byte order
2622 */
2623 pgh->pgh_magic = htonl(PKTGEN_MAGIC);
2624 pgh->seq_num = htonl(pkt_dev->seq_num);
2625
2626 do_gettimeofday(&timestamp);
2627 pgh->tv_sec = htonl(timestamp.tv_sec);
2628 pgh->tv_usec = htonl(timestamp.tv_usec);
2629}
2630
2590static struct sk_buff *fill_packet_ipv4(struct net_device *odev, 2631static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
2591 struct pktgen_dev *pkt_dev) 2632 struct pktgen_dev *pkt_dev)
2592{ 2633{
@@ -2595,7 +2636,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
2595 struct udphdr *udph; 2636 struct udphdr *udph;
2596 int datalen, iplen; 2637 int datalen, iplen;
2597 struct iphdr *iph; 2638 struct iphdr *iph;
2598 struct pktgen_hdr *pgh = NULL;
2599 __be16 protocol = htons(ETH_P_IP); 2639 __be16 protocol = htons(ETH_P_IP);
2600 __be32 *mpls; 2640 __be32 *mpls;
2601 __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */ 2641 __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */
@@ -2613,8 +2653,8 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
2613 /* Update any of the values, used when we're incrementing various 2653 /* Update any of the values, used when we're incrementing various
2614 * fields. 2654 * fields.
2615 */ 2655 */
2616 queue_map = pkt_dev->cur_queue_map;
2617 mod_cur_headers(pkt_dev); 2656 mod_cur_headers(pkt_dev);
2657 queue_map = pkt_dev->cur_queue_map;
2618 2658
2619 datalen = (odev->hard_header_len + 16) & ~0xf; 2659 datalen = (odev->hard_header_len + 16) & ~0xf;
2620 2660
@@ -2642,6 +2682,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
2642 sprintf(pkt_dev->result, "No memory"); 2682 sprintf(pkt_dev->result, "No memory");
2643 return NULL; 2683 return NULL;
2644 } 2684 }
2685 prefetchw(skb->data);
2645 2686
2646 skb_reserve(skb, datalen); 2687 skb_reserve(skb, datalen);
2647 2688
@@ -2672,6 +2713,8 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
2672 skb->transport_header = skb->network_header + sizeof(struct iphdr); 2713 skb->transport_header = skb->network_header + sizeof(struct iphdr);
2673 skb_put(skb, sizeof(struct iphdr) + sizeof(struct udphdr)); 2714 skb_put(skb, sizeof(struct iphdr) + sizeof(struct udphdr));
2674 skb_set_queue_mapping(skb, queue_map); 2715 skb_set_queue_mapping(skb, queue_map);
2716 skb->priority = pkt_dev->skb_priority;
2717
2675 iph = ip_hdr(skb); 2718 iph = ip_hdr(skb);
2676 udph = udp_hdr(skb); 2719 udph = udp_hdr(skb);
2677 2720
@@ -2708,76 +2751,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
2708 pkt_dev->pkt_overhead); 2751 pkt_dev->pkt_overhead);
2709 skb->dev = odev; 2752 skb->dev = odev;
2710 skb->pkt_type = PACKET_HOST; 2753 skb->pkt_type = PACKET_HOST;
2711 2754 pktgen_finalize_skb(pkt_dev, skb, datalen);
2712 if (pkt_dev->nfrags <= 0) {
2713 pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
2714 memset(pgh + 1, 0, datalen - sizeof(struct pktgen_hdr));
2715 } else {
2716 int frags = pkt_dev->nfrags;
2717 int i, len;
2718
2719 pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8);
2720
2721 if (frags > MAX_SKB_FRAGS)
2722 frags = MAX_SKB_FRAGS;
2723 if (datalen > frags * PAGE_SIZE) {
2724 len = datalen - frags * PAGE_SIZE;
2725 memset(skb_put(skb, len), 0, len);
2726 datalen = frags * PAGE_SIZE;
2727 }
2728
2729 i = 0;
2730 while (datalen > 0) {
2731 struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0);
2732 skb_shinfo(skb)->frags[i].page = page;
2733 skb_shinfo(skb)->frags[i].page_offset = 0;
2734 skb_shinfo(skb)->frags[i].size =
2735 (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
2736 datalen -= skb_shinfo(skb)->frags[i].size;
2737 skb->len += skb_shinfo(skb)->frags[i].size;
2738 skb->data_len += skb_shinfo(skb)->frags[i].size;
2739 i++;
2740 skb_shinfo(skb)->nr_frags = i;
2741 }
2742
2743 while (i < frags) {
2744 int rem;
2745
2746 if (i == 0)
2747 break;
2748
2749 rem = skb_shinfo(skb)->frags[i - 1].size / 2;
2750 if (rem == 0)
2751 break;
2752
2753 skb_shinfo(skb)->frags[i - 1].size -= rem;
2754
2755 skb_shinfo(skb)->frags[i] =
2756 skb_shinfo(skb)->frags[i - 1];
2757 get_page(skb_shinfo(skb)->frags[i].page);
2758 skb_shinfo(skb)->frags[i].page =
2759 skb_shinfo(skb)->frags[i - 1].page;
2760 skb_shinfo(skb)->frags[i].page_offset +=
2761 skb_shinfo(skb)->frags[i - 1].size;
2762 skb_shinfo(skb)->frags[i].size = rem;
2763 i++;
2764 skb_shinfo(skb)->nr_frags = i;
2765 }
2766 }
2767
2768 /* Stamp the time, and sequence number,
2769 * convert them to network byte order
2770 */
2771 if (pgh) {
2772 struct timeval timestamp;
2773
2774 pgh->pgh_magic = htonl(PKTGEN_MAGIC);
2775 pgh->seq_num = htonl(pkt_dev->seq_num);
2776
2777 do_gettimeofday(&timestamp);
2778 pgh->tv_sec = htonl(timestamp.tv_sec);
2779 pgh->tv_usec = htonl(timestamp.tv_usec);
2780 }
2781 2755
2782#ifdef CONFIG_XFRM 2756#ifdef CONFIG_XFRM
2783 if (!process_ipsec(pkt_dev, skb, protocol)) 2757 if (!process_ipsec(pkt_dev, skb, protocol))
@@ -2878,79 +2852,6 @@ static unsigned int scan_ip6(const char *s, char ip[16])
2878 return len; 2852 return len;
2879} 2853}
2880 2854
2881static char tohex(char hexdigit)
2882{
2883 return hexdigit > 9 ? hexdigit + 'a' - 10 : hexdigit + '0';
2884}
2885
2886static int fmt_xlong(char *s, unsigned int i)
2887{
2888 char *bak = s;
2889 *s = tohex((i >> 12) & 0xf);
2890 if (s != bak || *s != '0')
2891 ++s;
2892 *s = tohex((i >> 8) & 0xf);
2893 if (s != bak || *s != '0')
2894 ++s;
2895 *s = tohex((i >> 4) & 0xf);
2896 if (s != bak || *s != '0')
2897 ++s;
2898 *s = tohex(i & 0xf);
2899 return s - bak + 1;
2900}
2901
2902static unsigned int fmt_ip6(char *s, const char ip[16])
2903{
2904 unsigned int len;
2905 unsigned int i;
2906 unsigned int temp;
2907 unsigned int compressing;
2908 int j;
2909
2910 len = 0;
2911 compressing = 0;
2912 for (j = 0; j < 16; j += 2) {
2913
2914#ifdef V4MAPPEDPREFIX
2915 if (j == 12 && !memcmp(ip, V4mappedprefix, 12)) {
2916 inet_ntoa_r(*(struct in_addr *)(ip + 12), s);
2917 temp = strlen(s);
2918 return len + temp;
2919 }
2920#endif
2921 temp = ((unsigned long)(unsigned char)ip[j] << 8) +
2922 (unsigned long)(unsigned char)ip[j + 1];
2923 if (temp == 0) {
2924 if (!compressing) {
2925 compressing = 1;
2926 if (j == 0) {
2927 *s++ = ':';
2928 ++len;
2929 }
2930 }
2931 } else {
2932 if (compressing) {
2933 compressing = 0;
2934 *s++ = ':';
2935 ++len;
2936 }
2937 i = fmt_xlong(s, temp);
2938 len += i;
2939 s += i;
2940 if (j < 14) {
2941 *s++ = ':';
2942 ++len;
2943 }
2944 }
2945 }
2946 if (compressing) {
2947 *s++ = ':';
2948 ++len;
2949 }
2950 *s = 0;
2951 return len;
2952}
2953
2954static struct sk_buff *fill_packet_ipv6(struct net_device *odev, 2855static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
2955 struct pktgen_dev *pkt_dev) 2856 struct pktgen_dev *pkt_dev)
2956{ 2857{
@@ -2959,7 +2860,6 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
2959 struct udphdr *udph; 2860 struct udphdr *udph;
2960 int datalen; 2861 int datalen;
2961 struct ipv6hdr *iph; 2862 struct ipv6hdr *iph;
2962 struct pktgen_hdr *pgh = NULL;
2963 __be16 protocol = htons(ETH_P_IPV6); 2863 __be16 protocol = htons(ETH_P_IPV6);
2964 __be32 *mpls; 2864 __be32 *mpls;
2965 __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */ 2865 __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */
@@ -2977,8 +2877,8 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
2977 /* Update any of the values, used when we're incrementing various 2877 /* Update any of the values, used when we're incrementing various
2978 * fields. 2878 * fields.
2979 */ 2879 */
2980 queue_map = pkt_dev->cur_queue_map;
2981 mod_cur_headers(pkt_dev); 2880 mod_cur_headers(pkt_dev);
2881 queue_map = pkt_dev->cur_queue_map;
2982 2882
2983 skb = __netdev_alloc_skb(odev, 2883 skb = __netdev_alloc_skb(odev,
2984 pkt_dev->cur_pkt_size + 64 2884 pkt_dev->cur_pkt_size + 64
@@ -2987,6 +2887,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
2987 sprintf(pkt_dev->result, "No memory"); 2887 sprintf(pkt_dev->result, "No memory");
2988 return NULL; 2888 return NULL;
2989 } 2889 }
2890 prefetchw(skb->data);
2990 2891
2991 skb_reserve(skb, 16); 2892 skb_reserve(skb, 16);
2992 2893
@@ -3017,6 +2918,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
3017 skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); 2918 skb->transport_header = skb->network_header + sizeof(struct ipv6hdr);
3018 skb_put(skb, sizeof(struct ipv6hdr) + sizeof(struct udphdr)); 2919 skb_put(skb, sizeof(struct ipv6hdr) + sizeof(struct udphdr));
3019 skb_set_queue_mapping(skb, queue_map); 2920 skb_set_queue_mapping(skb, queue_map);
2921 skb->priority = pkt_dev->skb_priority;
3020 iph = ipv6_hdr(skb); 2922 iph = ipv6_hdr(skb);
3021 udph = udp_hdr(skb); 2923 udph = udp_hdr(skb);
3022 2924
@@ -3060,75 +2962,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
3060 skb->dev = odev; 2962 skb->dev = odev;
3061 skb->pkt_type = PACKET_HOST; 2963 skb->pkt_type = PACKET_HOST;
3062 2964
3063 if (pkt_dev->nfrags <= 0) 2965 pktgen_finalize_skb(pkt_dev, skb, datalen);
3064 pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
3065 else {
3066 int frags = pkt_dev->nfrags;
3067 int i;
3068
3069 pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8);
3070
3071 if (frags > MAX_SKB_FRAGS)
3072 frags = MAX_SKB_FRAGS;
3073 if (datalen > frags * PAGE_SIZE) {
3074 skb_put(skb, datalen - frags * PAGE_SIZE);
3075 datalen = frags * PAGE_SIZE;
3076 }
3077
3078 i = 0;
3079 while (datalen > 0) {
3080 struct page *page = alloc_pages(GFP_KERNEL, 0);
3081 skb_shinfo(skb)->frags[i].page = page;
3082 skb_shinfo(skb)->frags[i].page_offset = 0;
3083 skb_shinfo(skb)->frags[i].size =
3084 (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
3085 datalen -= skb_shinfo(skb)->frags[i].size;
3086 skb->len += skb_shinfo(skb)->frags[i].size;
3087 skb->data_len += skb_shinfo(skb)->frags[i].size;
3088 i++;
3089 skb_shinfo(skb)->nr_frags = i;
3090 }
3091
3092 while (i < frags) {
3093 int rem;
3094
3095 if (i == 0)
3096 break;
3097
3098 rem = skb_shinfo(skb)->frags[i - 1].size / 2;
3099 if (rem == 0)
3100 break;
3101
3102 skb_shinfo(skb)->frags[i - 1].size -= rem;
3103
3104 skb_shinfo(skb)->frags[i] =
3105 skb_shinfo(skb)->frags[i - 1];
3106 get_page(skb_shinfo(skb)->frags[i].page);
3107 skb_shinfo(skb)->frags[i].page =
3108 skb_shinfo(skb)->frags[i - 1].page;
3109 skb_shinfo(skb)->frags[i].page_offset +=
3110 skb_shinfo(skb)->frags[i - 1].size;
3111 skb_shinfo(skb)->frags[i].size = rem;
3112 i++;
3113 skb_shinfo(skb)->nr_frags = i;
3114 }
3115 }
3116
3117 /* Stamp the time, and sequence number,
3118 * convert them to network byte order
3119 * should we update cloned packets too ?
3120 */
3121 if (pgh) {
3122 struct timeval timestamp;
3123
3124 pgh->pgh_magic = htonl(PKTGEN_MAGIC);
3125 pgh->seq_num = htonl(pkt_dev->seq_num);
3126
3127 do_gettimeofday(&timestamp);
3128 pgh->tv_sec = htonl(timestamp.tv_sec);
3129 pgh->tv_usec = htonl(timestamp.tv_usec);
3130 }
3131 /* pkt_dev->seq_num++; FF: you really mean this? */
3132 2966
3133 return skb; 2967 return skb;
3134} 2968}
@@ -3298,7 +3132,7 @@ static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)
3298 pkt_dev->started_at); 3132 pkt_dev->started_at);
3299 ktime_t idle = ns_to_ktime(pkt_dev->idle_acc); 3133 ktime_t idle = ns_to_ktime(pkt_dev->idle_acc);
3300 3134
3301 p += sprintf(p, "OK: %llu(c%llu+d%llu) nsec, %llu (%dbyte,%dfrags)\n", 3135 p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n",
3302 (unsigned long long)ktime_to_us(elapsed), 3136 (unsigned long long)ktime_to_us(elapsed),
3303 (unsigned long long)ktime_to_us(ktime_sub(elapsed, idle)), 3137 (unsigned long long)ktime_to_us(ktime_sub(elapsed, idle)),
3304 (unsigned long long)ktime_to_us(idle), 3138 (unsigned long long)ktime_to_us(idle),
@@ -3432,11 +3266,6 @@ static void pktgen_rem_thread(struct pktgen_thread *t)
3432 3266
3433 remove_proc_entry(t->tsk->comm, pg_proc_dir); 3267 remove_proc_entry(t->tsk->comm, pg_proc_dir);
3434 3268
3435 mutex_lock(&pktgen_thread_lock);
3436
3437 list_del(&t->th_list);
3438
3439 mutex_unlock(&pktgen_thread_lock);
3440} 3269}
3441 3270
3442static void pktgen_resched(struct pktgen_dev *pkt_dev) 3271static void pktgen_resched(struct pktgen_dev *pkt_dev)
@@ -3511,7 +3340,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
3511 3340
3512 __netif_tx_lock_bh(txq); 3341 __netif_tx_lock_bh(txq);
3513 3342
3514 if (unlikely(netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq))) { 3343 if (unlikely(netif_tx_queue_frozen_or_stopped(txq))) {
3515 ret = NETDEV_TX_BUSY; 3344 ret = NETDEV_TX_BUSY;
3516 pkt_dev->last_ok = 0; 3345 pkt_dev->last_ok = 0;
3517 goto unlock; 3346 goto unlock;
@@ -3535,8 +3364,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
3535 break; 3364 break;
3536 default: /* Drivers are not supposed to return other values! */ 3365 default: /* Drivers are not supposed to return other values! */
3537 if (net_ratelimit()) 3366 if (net_ratelimit())
3538 pr_info("pktgen: %s xmit error: %d\n", 3367 pr_info("%s xmit error: %d\n", pkt_dev->odevname, ret);
3539 pkt_dev->odevname, ret);
3540 pkt_dev->errors++; 3368 pkt_dev->errors++;
3541 /* fallthru */ 3369 /* fallthru */
3542 case NETDEV_TX_LOCKED: 3370 case NETDEV_TX_LOCKED:
@@ -3583,6 +3411,8 @@ static int pktgen_thread_worker(void *arg)
3583 pkt_dev = next_to_run(t); 3411 pkt_dev = next_to_run(t);
3584 3412
3585 if (unlikely(!pkt_dev && t->control == 0)) { 3413 if (unlikely(!pkt_dev && t->control == 0)) {
3414 if (pktgen_exiting)
3415 break;
3586 wait_event_interruptible_timeout(t->queue, 3416 wait_event_interruptible_timeout(t->queue,
3587 t->control != 0, 3417 t->control != 0,
3588 HZ/10); 3418 HZ/10);
@@ -3635,6 +3465,13 @@ static int pktgen_thread_worker(void *arg)
3635 pr_debug("%s removing thread\n", t->tsk->comm); 3465 pr_debug("%s removing thread\n", t->tsk->comm);
3636 pktgen_rem_thread(t); 3466 pktgen_rem_thread(t);
3637 3467
3468 /* Wait for kthread_stop */
3469 while (!kthread_should_stop()) {
3470 set_current_state(TASK_INTERRUPTIBLE);
3471 schedule();
3472 }
3473 __set_current_state(TASK_RUNNING);
3474
3638 return 0; 3475 return 0;
3639} 3476}
3640 3477
@@ -3707,13 +3544,12 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
3707 return -ENOMEM; 3544 return -ENOMEM;
3708 3545
3709 strcpy(pkt_dev->odevname, ifname); 3546 strcpy(pkt_dev->odevname, ifname);
3710 pkt_dev->flows = vmalloc_node(MAX_CFLOWS * sizeof(struct flow_state), 3547 pkt_dev->flows = vzalloc_node(MAX_CFLOWS * sizeof(struct flow_state),
3711 node); 3548 node);
3712 if (pkt_dev->flows == NULL) { 3549 if (pkt_dev->flows == NULL) {
3713 kfree(pkt_dev); 3550 kfree(pkt_dev);
3714 return -ENOMEM; 3551 return -ENOMEM;
3715 } 3552 }
3716 memset(pkt_dev->flows, 0, MAX_CFLOWS * sizeof(struct flow_state));
3717 3553
3718 pkt_dev->removal_mark = 0; 3554 pkt_dev->removal_mark = 0;
3719 pkt_dev->min_pkt_size = ETH_ZLEN; 3555 pkt_dev->min_pkt_size = ETH_ZLEN;
@@ -3786,7 +3622,10 @@ static int __init pktgen_create_thread(int cpu)
3786 list_add_tail(&t->th_list, &pktgen_threads); 3622 list_add_tail(&t->th_list, &pktgen_threads);
3787 init_completion(&t->start_done); 3623 init_completion(&t->start_done);
3788 3624
3789 p = kthread_create(pktgen_thread_worker, t, "kpktgend_%d", cpu); 3625 p = kthread_create_on_node(pktgen_thread_worker,
3626 t,
3627 cpu_to_node(cpu),
3628 "kpktgend_%d", cpu);
3790 if (IS_ERR(p)) { 3629 if (IS_ERR(p)) {
3791 pr_err("kernel_thread() failed for cpu %d\n", t->cpu); 3630 pr_err("kernel_thread() failed for cpu %d\n", t->cpu);
3792 list_del(&t->th_list); 3631 list_del(&t->th_list);
@@ -3858,6 +3697,8 @@ static int pktgen_remove_device(struct pktgen_thread *t,
3858 free_SAs(pkt_dev); 3697 free_SAs(pkt_dev);
3859#endif 3698#endif
3860 vfree(pkt_dev->flows); 3699 vfree(pkt_dev->flows);
3700 if (pkt_dev->page)
3701 put_page(pkt_dev->page);
3861 kfree(pkt_dev); 3702 kfree(pkt_dev);
3862 return 0; 3703 return 0;
3863} 3704}
@@ -3866,6 +3707,7 @@ static int __init pg_init(void)
3866{ 3707{
3867 int cpu; 3708 int cpu;
3868 struct proc_dir_entry *pe; 3709 struct proc_dir_entry *pe;
3710 int ret = 0;
3869 3711
3870 pr_info("%s", version); 3712 pr_info("%s", version);
3871 3713
@@ -3876,11 +3718,10 @@ static int __init pg_init(void)
3876 pe = proc_create(PGCTRL, 0600, pg_proc_dir, &pktgen_fops); 3718 pe = proc_create(PGCTRL, 0600, pg_proc_dir, &pktgen_fops);
3877 if (pe == NULL) { 3719 if (pe == NULL) {
3878 pr_err("ERROR: cannot create %s procfs entry\n", PGCTRL); 3720 pr_err("ERROR: cannot create %s procfs entry\n", PGCTRL);
3879 proc_net_remove(&init_net, PG_PROC_DIR); 3721 ret = -EINVAL;
3880 return -EINVAL; 3722 goto remove_dir;
3881 } 3723 }
3882 3724
3883 /* Register us to receive netdevice events */
3884 register_netdevice_notifier(&pktgen_notifier_block); 3725 register_netdevice_notifier(&pktgen_notifier_block);
3885 3726
3886 for_each_online_cpu(cpu) { 3727 for_each_online_cpu(cpu) {
@@ -3894,23 +3735,27 @@ static int __init pg_init(void)
3894 3735
3895 if (list_empty(&pktgen_threads)) { 3736 if (list_empty(&pktgen_threads)) {
3896 pr_err("ERROR: Initialization failed for all threads\n"); 3737 pr_err("ERROR: Initialization failed for all threads\n");
3897 unregister_netdevice_notifier(&pktgen_notifier_block); 3738 ret = -ENODEV;
3898 remove_proc_entry(PGCTRL, pg_proc_dir); 3739 goto unregister;
3899 proc_net_remove(&init_net, PG_PROC_DIR);
3900 return -ENODEV;
3901 } 3740 }
3902 3741
3903 return 0; 3742 return 0;
3743
3744 unregister:
3745 unregister_netdevice_notifier(&pktgen_notifier_block);
3746 remove_proc_entry(PGCTRL, pg_proc_dir);
3747 remove_dir:
3748 proc_net_remove(&init_net, PG_PROC_DIR);
3749 return ret;
3904} 3750}
3905 3751
3906static void __exit pg_cleanup(void) 3752static void __exit pg_cleanup(void)
3907{ 3753{
3908 struct pktgen_thread *t; 3754 struct pktgen_thread *t;
3909 struct list_head *q, *n; 3755 struct list_head *q, *n;
3910 wait_queue_head_t queue;
3911 init_waitqueue_head(&queue);
3912 3756
3913 /* Stop all interfaces & threads */ 3757 /* Stop all interfaces & threads */
3758 pktgen_exiting = true;
3914 3759
3915 list_for_each_safe(q, n, &pktgen_threads) { 3760 list_for_each_safe(q, n, &pktgen_threads) {
3916 t = list_entry(q, struct pktgen_thread, th_list); 3761 t = list_entry(q, struct pktgen_thread, th_list);
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 7552495aff7a..182236b2510a 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -33,6 +33,7 @@
33 * Note : Dont forget somaxconn that may limit backlog too. 33 * Note : Dont forget somaxconn that may limit backlog too.
34 */ 34 */
35int sysctl_max_syn_backlog = 256; 35int sysctl_max_syn_backlog = 256;
36EXPORT_SYMBOL(sysctl_max_syn_backlog);
36 37
37int reqsk_queue_alloc(struct request_sock_queue *queue, 38int reqsk_queue_alloc(struct request_sock_queue *queue,
38 unsigned int nr_table_entries) 39 unsigned int nr_table_entries)
@@ -45,9 +46,7 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
45 nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); 46 nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
46 lopt_size += nr_table_entries * sizeof(struct request_sock *); 47 lopt_size += nr_table_entries * sizeof(struct request_sock *);
47 if (lopt_size > PAGE_SIZE) 48 if (lopt_size > PAGE_SIZE)
48 lopt = __vmalloc(lopt_size, 49 lopt = vzalloc(lopt_size);
49 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
50 PAGE_KERNEL);
51 else 50 else
52 lopt = kzalloc(lopt_size, GFP_KERNEL); 51 lopt = kzalloc(lopt_size, GFP_KERNEL);
53 if (lopt == NULL) 52 if (lopt == NULL)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f78d821bd935..abd936d8a716 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -196,7 +196,7 @@ EXPORT_SYMBOL_GPL(__rtnl_register);
196 * as failure of this function is very unlikely, it can only happen due 196 * as failure of this function is very unlikely, it can only happen due
197 * to lack of memory when allocating the chain to store all message 197 * to lack of memory when allocating the chain to store all message
198 * handlers for a protocol. Meant for use in init functions where lack 198 * handlers for a protocol. Meant for use in init functions where lack
199 * of memory implies no sense in continueing. 199 * of memory implies no sense in continuing.
200 */ 200 */
201void rtnl_register(int protocol, int msgtype, 201void rtnl_register(int protocol, int msgtype,
202 rtnl_doit_func doit, rtnl_dumpit_func dumpit) 202 rtnl_doit_func doit, rtnl_dumpit_func dumpit)
@@ -299,14 +299,6 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
299 unregister_netdevice_many(&list_kill); 299 unregister_netdevice_many(&list_kill);
300} 300}
301 301
302void rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
303{
304 rtnl_lock();
305 __rtnl_kill_links(net, ops);
306 rtnl_unlock();
307}
308EXPORT_SYMBOL_GPL(rtnl_kill_links);
309
310/** 302/**
311 * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. 303 * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
312 * @ops: struct rtnl_link_ops * to unregister 304 * @ops: struct rtnl_link_ops * to unregister
@@ -355,16 +347,106 @@ static size_t rtnl_link_get_size(const struct net_device *dev)
355 if (!ops) 347 if (!ops)
356 return 0; 348 return 0;
357 349
358 size = nlmsg_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ 350 size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */
359 nlmsg_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ 351 nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */
360 352
361 if (ops->get_size) 353 if (ops->get_size)
362 /* IFLA_INFO_DATA + nested data */ 354 /* IFLA_INFO_DATA + nested data */
363 size += nlmsg_total_size(sizeof(struct nlattr)) + 355 size += nla_total_size(sizeof(struct nlattr)) +
364 ops->get_size(dev); 356 ops->get_size(dev);
365 357
366 if (ops->get_xstats_size) 358 if (ops->get_xstats_size)
367 size += ops->get_xstats_size(dev); /* IFLA_INFO_XSTATS */ 359 /* IFLA_INFO_XSTATS */
360 size += nla_total_size(ops->get_xstats_size(dev));
361
362 return size;
363}
364
365static LIST_HEAD(rtnl_af_ops);
366
367static const struct rtnl_af_ops *rtnl_af_lookup(const int family)
368{
369 const struct rtnl_af_ops *ops;
370
371 list_for_each_entry(ops, &rtnl_af_ops, list) {
372 if (ops->family == family)
373 return ops;
374 }
375
376 return NULL;
377}
378
379/**
380 * __rtnl_af_register - Register rtnl_af_ops with rtnetlink.
381 * @ops: struct rtnl_af_ops * to register
382 *
383 * The caller must hold the rtnl_mutex.
384 *
385 * Returns 0 on success or a negative error code.
386 */
387int __rtnl_af_register(struct rtnl_af_ops *ops)
388{
389 list_add_tail(&ops->list, &rtnl_af_ops);
390 return 0;
391}
392EXPORT_SYMBOL_GPL(__rtnl_af_register);
393
394/**
395 * rtnl_af_register - Register rtnl_af_ops with rtnetlink.
396 * @ops: struct rtnl_af_ops * to register
397 *
398 * Returns 0 on success or a negative error code.
399 */
400int rtnl_af_register(struct rtnl_af_ops *ops)
401{
402 int err;
403
404 rtnl_lock();
405 err = __rtnl_af_register(ops);
406 rtnl_unlock();
407 return err;
408}
409EXPORT_SYMBOL_GPL(rtnl_af_register);
410
411/**
412 * __rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink.
413 * @ops: struct rtnl_af_ops * to unregister
414 *
415 * The caller must hold the rtnl_mutex.
416 */
417void __rtnl_af_unregister(struct rtnl_af_ops *ops)
418{
419 list_del(&ops->list);
420}
421EXPORT_SYMBOL_GPL(__rtnl_af_unregister);
422
423/**
424 * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink.
425 * @ops: struct rtnl_af_ops * to unregister
426 */
427void rtnl_af_unregister(struct rtnl_af_ops *ops)
428{
429 rtnl_lock();
430 __rtnl_af_unregister(ops);
431 rtnl_unlock();
432}
433EXPORT_SYMBOL_GPL(rtnl_af_unregister);
434
435static size_t rtnl_link_get_af_size(const struct net_device *dev)
436{
437 struct rtnl_af_ops *af_ops;
438 size_t size;
439
440 /* IFLA_AF_SPEC */
441 size = nla_total_size(sizeof(struct nlattr));
442
443 list_for_each_entry(af_ops, &rtnl_af_ops, list) {
444 if (af_ops->get_link_af_size) {
445 /* AF_* + nested data */
446 size += nla_total_size(sizeof(struct nlattr)) +
447 af_ops->get_link_af_size(dev);
448 }
449 }
368 450
369 return size; 451 return size;
370} 452}
@@ -612,36 +694,7 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
612 694
613static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b) 695static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
614{ 696{
615 struct rtnl_link_stats64 a; 697 memcpy(v, b, sizeof(*b));
616
617 a.rx_packets = b->rx_packets;
618 a.tx_packets = b->tx_packets;
619 a.rx_bytes = b->rx_bytes;
620 a.tx_bytes = b->tx_bytes;
621 a.rx_errors = b->rx_errors;
622 a.tx_errors = b->tx_errors;
623 a.rx_dropped = b->rx_dropped;
624 a.tx_dropped = b->tx_dropped;
625
626 a.multicast = b->multicast;
627 a.collisions = b->collisions;
628
629 a.rx_length_errors = b->rx_length_errors;
630 a.rx_over_errors = b->rx_over_errors;
631 a.rx_crc_errors = b->rx_crc_errors;
632 a.rx_frame_errors = b->rx_frame_errors;
633 a.rx_fifo_errors = b->rx_fifo_errors;
634 a.rx_missed_errors = b->rx_missed_errors;
635
636 a.tx_aborted_errors = b->tx_aborted_errors;
637 a.tx_carrier_errors = b->tx_carrier_errors;
638 a.tx_fifo_errors = b->tx_fifo_errors;
639 a.tx_heartbeat_errors = b->tx_heartbeat_errors;
640 a.tx_window_errors = b->tx_window_errors;
641
642 a.rx_compressed = b->rx_compressed;
643 a.tx_compressed = b->tx_compressed;
644 memcpy(v, &a, sizeof(a));
645} 698}
646 699
647/* All VF info */ 700/* All VF info */
@@ -707,7 +760,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev)
707 + nla_total_size(4) /* IFLA_NUM_VF */ 760 + nla_total_size(4) /* IFLA_NUM_VF */
708 + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */ 761 + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */
709 + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ 762 + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
710 + rtnl_link_get_size(dev); /* IFLA_LINKINFO */ 763 + rtnl_link_get_size(dev) /* IFLA_LINKINFO */
764 + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */
711} 765}
712 766
713static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) 767static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
@@ -793,8 +847,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
793 struct nlmsghdr *nlh; 847 struct nlmsghdr *nlh;
794 struct rtnl_link_stats64 temp; 848 struct rtnl_link_stats64 temp;
795 const struct rtnl_link_stats64 *stats; 849 const struct rtnl_link_stats64 *stats;
796 struct nlattr *attr; 850 struct nlattr *attr, *af_spec;
851 struct rtnl_af_ops *af_ops;
797 852
853 ASSERT_RTNL();
798 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); 854 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);
799 if (nlh == NULL) 855 if (nlh == NULL)
800 return -EMSGSIZE; 856 return -EMSGSIZE;
@@ -813,6 +869,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
813 netif_running(dev) ? dev->operstate : IF_OPER_DOWN); 869 netif_running(dev) ? dev->operstate : IF_OPER_DOWN);
814 NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode); 870 NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode);
815 NLA_PUT_U32(skb, IFLA_MTU, dev->mtu); 871 NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
872 NLA_PUT_U32(skb, IFLA_GROUP, dev->group);
816 873
817 if (dev->ifindex != dev->iflink) 874 if (dev->ifindex != dev->iflink)
818 NLA_PUT_U32(skb, IFLA_LINK, dev->iflink); 875 NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
@@ -902,6 +959,36 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
902 goto nla_put_failure; 959 goto nla_put_failure;
903 } 960 }
904 961
962 if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC)))
963 goto nla_put_failure;
964
965 list_for_each_entry(af_ops, &rtnl_af_ops, list) {
966 if (af_ops->fill_link_af) {
967 struct nlattr *af;
968 int err;
969
970 if (!(af = nla_nest_start(skb, af_ops->family)))
971 goto nla_put_failure;
972
973 err = af_ops->fill_link_af(skb, dev);
974
975 /*
976 * Caller may return ENODATA to indicate that there
977 * was no data to be dumped. This is not an error, it
978 * means we should trim the attribute header and
979 * continue.
980 */
981 if (err == -ENODATA)
982 nla_nest_cancel(skb, af);
983 else if (err < 0)
984 goto nla_put_failure;
985
986 nla_nest_end(skb, af);
987 }
988 }
989
990 nla_nest_end(skb, af_spec);
991
905 return nlmsg_end(skb, nlh); 992 return nlmsg_end(skb, nlh);
906 993
907nla_put_failure: 994nla_put_failure:
@@ -921,10 +1008,11 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
921 s_h = cb->args[0]; 1008 s_h = cb->args[0];
922 s_idx = cb->args[1]; 1009 s_idx = cb->args[1];
923 1010
1011 rcu_read_lock();
924 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { 1012 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
925 idx = 0; 1013 idx = 0;
926 head = &net->dev_index_head[h]; 1014 head = &net->dev_index_head[h];
927 hlist_for_each_entry(dev, node, head, index_hlist) { 1015 hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
928 if (idx < s_idx) 1016 if (idx < s_idx)
929 goto cont; 1017 goto cont;
930 if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, 1018 if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
@@ -937,6 +1025,7 @@ cont:
937 } 1025 }
938 } 1026 }
939out: 1027out:
1028 rcu_read_unlock();
940 cb->args[1] = idx; 1029 cb->args[1] = idx;
941 cb->args[0] = h; 1030 cb->args[0] = h;
942 1031
@@ -950,16 +1039,19 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
950 [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) }, 1039 [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) },
951 [IFLA_MTU] = { .type = NLA_U32 }, 1040 [IFLA_MTU] = { .type = NLA_U32 },
952 [IFLA_LINK] = { .type = NLA_U32 }, 1041 [IFLA_LINK] = { .type = NLA_U32 },
1042 [IFLA_MASTER] = { .type = NLA_U32 },
953 [IFLA_TXQLEN] = { .type = NLA_U32 }, 1043 [IFLA_TXQLEN] = { .type = NLA_U32 },
954 [IFLA_WEIGHT] = { .type = NLA_U32 }, 1044 [IFLA_WEIGHT] = { .type = NLA_U32 },
955 [IFLA_OPERSTATE] = { .type = NLA_U8 }, 1045 [IFLA_OPERSTATE] = { .type = NLA_U8 },
956 [IFLA_LINKMODE] = { .type = NLA_U8 }, 1046 [IFLA_LINKMODE] = { .type = NLA_U8 },
957 [IFLA_LINKINFO] = { .type = NLA_NESTED }, 1047 [IFLA_LINKINFO] = { .type = NLA_NESTED },
958 [IFLA_NET_NS_PID] = { .type = NLA_U32 }, 1048 [IFLA_NET_NS_PID] = { .type = NLA_U32 },
1049 [IFLA_NET_NS_FD] = { .type = NLA_U32 },
959 [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, 1050 [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 },
960 [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, 1051 [IFLA_VFINFO_LIST] = {. type = NLA_NESTED },
961 [IFLA_VF_PORTS] = { .type = NLA_NESTED }, 1052 [IFLA_VF_PORTS] = { .type = NLA_NESTED },
962 [IFLA_PORT_SELF] = { .type = NLA_NESTED }, 1053 [IFLA_PORT_SELF] = { .type = NLA_NESTED },
1054 [IFLA_AF_SPEC] = { .type = NLA_NESTED },
963}; 1055};
964EXPORT_SYMBOL(ifla_policy); 1056EXPORT_SYMBOL(ifla_policy);
965 1057
@@ -1003,6 +1095,8 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
1003 */ 1095 */
1004 if (tb[IFLA_NET_NS_PID]) 1096 if (tb[IFLA_NET_NS_PID])
1005 net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); 1097 net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
1098 else if (tb[IFLA_NET_NS_FD])
1099 net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));
1006 else 1100 else
1007 net = get_net(src_net); 1101 net = get_net(src_net);
1008 return net; 1102 return net;
@@ -1021,6 +1115,27 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
1021 return -EINVAL; 1115 return -EINVAL;
1022 } 1116 }
1023 1117
1118 if (tb[IFLA_AF_SPEC]) {
1119 struct nlattr *af;
1120 int rem, err;
1121
1122 nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
1123 const struct rtnl_af_ops *af_ops;
1124
1125 if (!(af_ops = rtnl_af_lookup(nla_type(af))))
1126 return -EAFNOSUPPORT;
1127
1128 if (!af_ops->set_link_af)
1129 return -EOPNOTSUPP;
1130
1131 if (af_ops->validate_link_af) {
1132 err = af_ops->validate_link_af(dev, af);
1133 if (err < 0)
1134 return err;
1135 }
1136 }
1137 }
1138
1024 return 0; 1139 return 0;
1025} 1140}
1026 1141
@@ -1070,6 +1185,41 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)
1070 return err; 1185 return err;
1071} 1186}
1072 1187
1188static int do_set_master(struct net_device *dev, int ifindex)
1189{
1190 struct net_device *master_dev;
1191 const struct net_device_ops *ops;
1192 int err;
1193
1194 if (dev->master) {
1195 if (dev->master->ifindex == ifindex)
1196 return 0;
1197 ops = dev->master->netdev_ops;
1198 if (ops->ndo_del_slave) {
1199 err = ops->ndo_del_slave(dev->master, dev);
1200 if (err)
1201 return err;
1202 } else {
1203 return -EOPNOTSUPP;
1204 }
1205 }
1206
1207 if (ifindex) {
1208 master_dev = __dev_get_by_index(dev_net(dev), ifindex);
1209 if (!master_dev)
1210 return -EINVAL;
1211 ops = master_dev->netdev_ops;
1212 if (ops->ndo_add_slave) {
1213 err = ops->ndo_add_slave(master_dev, dev);
1214 if (err)
1215 return err;
1216 } else {
1217 return -EOPNOTSUPP;
1218 }
1219 }
1220 return 0;
1221}
1222
1073static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, 1223static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
1074 struct nlattr **tb, char *ifname, int modified) 1224 struct nlattr **tb, char *ifname, int modified)
1075{ 1225{
@@ -1077,7 +1227,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
1077 int send_addr_notify = 0; 1227 int send_addr_notify = 0;
1078 int err; 1228 int err;
1079 1229
1080 if (tb[IFLA_NET_NS_PID]) { 1230 if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) {
1081 struct net *net = rtnl_link_get_net(dev_net(dev), tb); 1231 struct net *net = rtnl_link_get_net(dev_net(dev), tb);
1082 if (IS_ERR(net)) { 1232 if (IS_ERR(net)) {
1083 err = PTR_ERR(net); 1233 err = PTR_ERR(net);
@@ -1157,6 +1307,11 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
1157 modified = 1; 1307 modified = 1;
1158 } 1308 }
1159 1309
1310 if (tb[IFLA_GROUP]) {
1311 dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
1312 modified = 1;
1313 }
1314
1160 /* 1315 /*
1161 * Interface selected by interface index but interface 1316 * Interface selected by interface index but interface
1162 * name provided implies that a name change has been 1317 * name provided implies that a name change has been
@@ -1188,6 +1343,13 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
1188 goto errout; 1343 goto errout;
1189 } 1344 }
1190 1345
1346 if (tb[IFLA_MASTER]) {
1347 err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]));
1348 if (err)
1349 goto errout;
1350 modified = 1;
1351 }
1352
1191 if (tb[IFLA_TXQLEN]) 1353 if (tb[IFLA_TXQLEN])
1192 dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); 1354 dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
1193 1355
@@ -1261,12 +1423,30 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
1261 goto errout; 1423 goto errout;
1262 modified = 1; 1424 modified = 1;
1263 } 1425 }
1426
1427 if (tb[IFLA_AF_SPEC]) {
1428 struct nlattr *af;
1429 int rem;
1430
1431 nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
1432 const struct rtnl_af_ops *af_ops;
1433
1434 if (!(af_ops = rtnl_af_lookup(nla_type(af))))
1435 BUG();
1436
1437 err = af_ops->set_link_af(dev, af);
1438 if (err < 0)
1439 goto errout;
1440
1441 modified = 1;
1442 }
1443 }
1264 err = 0; 1444 err = 0;
1265 1445
1266errout: 1446errout:
1267 if (err < 0 && modified && net_ratelimit()) 1447 if (err < 0 && modified && net_ratelimit())
1268 printk(KERN_WARNING "A link change request failed with " 1448 printk(KERN_WARNING "A link change request failed with "
1269 "some changes comitted already. Interface %s may " 1449 "some changes committed already. Interface %s may "
1270 "have been left with an inconsistent configuration, " 1450 "have been left with an inconsistent configuration, "
1271 "please check.\n", dev->name); 1451 "please check.\n", dev->name);
1272 1452
@@ -1325,6 +1505,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1325 char ifname[IFNAMSIZ]; 1505 char ifname[IFNAMSIZ];
1326 struct nlattr *tb[IFLA_MAX+1]; 1506 struct nlattr *tb[IFLA_MAX+1];
1327 int err; 1507 int err;
1508 LIST_HEAD(list_kill);
1328 1509
1329 err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); 1510 err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
1330 if (err < 0) 1511 if (err < 0)
@@ -1348,7 +1529,9 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1348 if (!ops) 1529 if (!ops)
1349 return -EOPNOTSUPP; 1530 return -EOPNOTSUPP;
1350 1531
1351 ops->dellink(dev, NULL); 1532 ops->dellink(dev, &list_kill);
1533 unregister_netdevice_many(&list_kill);
1534 list_del(&list_kill);
1352 return 0; 1535 return 0;
1353} 1536}
1354 1537
@@ -1396,12 +1579,6 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net,
1396 dev->rtnl_link_state = RTNL_LINK_INITIALIZING; 1579 dev->rtnl_link_state = RTNL_LINK_INITIALIZING;
1397 dev->real_num_tx_queues = real_num_queues; 1580 dev->real_num_tx_queues = real_num_queues;
1398 1581
1399 if (strchr(dev->name, '%')) {
1400 err = dev_alloc_name(dev, dev->name);
1401 if (err < 0)
1402 goto err_free;
1403 }
1404
1405 if (tb[IFLA_MTU]) 1582 if (tb[IFLA_MTU])
1406 dev->mtu = nla_get_u32(tb[IFLA_MTU]); 1583 dev->mtu = nla_get_u32(tb[IFLA_MTU]);
1407 if (tb[IFLA_ADDRESS]) 1584 if (tb[IFLA_ADDRESS])
@@ -1416,16 +1593,34 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net,
1416 set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); 1593 set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
1417 if (tb[IFLA_LINKMODE]) 1594 if (tb[IFLA_LINKMODE])
1418 dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); 1595 dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
1596 if (tb[IFLA_GROUP])
1597 dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
1419 1598
1420 return dev; 1599 return dev;
1421 1600
1422err_free:
1423 free_netdev(dev);
1424err: 1601err:
1425 return ERR_PTR(err); 1602 return ERR_PTR(err);
1426} 1603}
1427EXPORT_SYMBOL(rtnl_create_link); 1604EXPORT_SYMBOL(rtnl_create_link);
1428 1605
1606static int rtnl_group_changelink(struct net *net, int group,
1607 struct ifinfomsg *ifm,
1608 struct nlattr **tb)
1609{
1610 struct net_device *dev;
1611 int err;
1612
1613 for_each_netdev(net, dev) {
1614 if (dev->group == group) {
1615 err = do_setlink(dev, ifm, tb, NULL, 0);
1616 if (err < 0)
1617 return err;
1618 }
1619 }
1620
1621 return 0;
1622}
1623
1429static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) 1624static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1430{ 1625{
1431 struct net *net = sock_net(skb->sk); 1626 struct net *net = sock_net(skb->sk);
@@ -1453,10 +1648,12 @@ replay:
1453 ifm = nlmsg_data(nlh); 1648 ifm = nlmsg_data(nlh);
1454 if (ifm->ifi_index > 0) 1649 if (ifm->ifi_index > 0)
1455 dev = __dev_get_by_index(net, ifm->ifi_index); 1650 dev = __dev_get_by_index(net, ifm->ifi_index);
1456 else if (ifname[0]) 1651 else {
1457 dev = __dev_get_by_name(net, ifname); 1652 if (ifname[0])
1458 else 1653 dev = __dev_get_by_name(net, ifname);
1459 dev = NULL; 1654 else
1655 dev = NULL;
1656 }
1460 1657
1461 err = validate_linkmsg(dev, tb); 1658 err = validate_linkmsg(dev, tb);
1462 if (err < 0) 1659 if (err < 0)
@@ -1520,8 +1717,13 @@ replay:
1520 return do_setlink(dev, ifm, tb, ifname, modified); 1717 return do_setlink(dev, ifm, tb, ifname, modified);
1521 } 1718 }
1522 1719
1523 if (!(nlh->nlmsg_flags & NLM_F_CREATE)) 1720 if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
1721 if (ifm->ifi_index == 0 && tb[IFLA_GROUP])
1722 return rtnl_group_changelink(net,
1723 nla_get_u32(tb[IFLA_GROUP]),
1724 ifm, tb);
1524 return -ENODEV; 1725 return -ENODEV;
1726 }
1525 1727
1526 if (ifm->ifi_index) 1728 if (ifm->ifi_index)
1527 return -EOPNOTSUPP; 1729 return -EOPNOTSUPP;
@@ -1546,6 +1748,9 @@ replay:
1546 snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); 1748 snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
1547 1749
1548 dest_net = rtnl_link_get_net(net, tb); 1750 dest_net = rtnl_link_get_net(net, tb);
1751 if (IS_ERR(dest_net))
1752 return PTR_ERR(dest_net);
1753
1549 dev = rtnl_create_link(net, dest_net, ifname, ops, tb); 1754 dev = rtnl_create_link(net, dest_net, ifname, ops, tb);
1550 1755
1551 if (IS_ERR(dev)) 1756 if (IS_ERR(dev))
@@ -1759,6 +1964,8 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
1759 case NETDEV_GOING_DOWN: 1964 case NETDEV_GOING_DOWN:
1760 case NETDEV_UNREGISTER: 1965 case NETDEV_UNREGISTER:
1761 case NETDEV_UNREGISTER_BATCH: 1966 case NETDEV_UNREGISTER_BATCH:
1967 case NETDEV_RELEASE:
1968 case NETDEV_JOIN:
1762 break; 1969 break;
1763 default: 1970 default:
1764 rtmsg_ifinfo(RTM_NEWLINK, dev, 0); 1971 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
diff --git a/net/core/scm.c b/net/core/scm.c
index 413cab89017d..4c1ef026d695 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -79,10 +79,11 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
79 return -ENOMEM; 79 return -ENOMEM;
80 *fplp = fpl; 80 *fplp = fpl;
81 fpl->count = 0; 81 fpl->count = 0;
82 fpl->max = SCM_MAX_FD;
82 } 83 }
83 fpp = &fpl->fp[fpl->count]; 84 fpp = &fpl->fp[fpl->count];
84 85
85 if (fpl->count + num > SCM_MAX_FD) 86 if (fpl->count + num > fpl->max)
86 return -EINVAL; 87 return -EINVAL;
87 88
88 /* 89 /*
@@ -94,7 +95,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
94 int fd = fdp[i]; 95 int fd = fdp[i];
95 struct file *file; 96 struct file *file;
96 97
97 if (fd < 0 || !(file = fget(fd))) 98 if (fd < 0 || !(file = fget_raw(fd)))
98 return -EBADF; 99 return -EBADF;
99 *fpp++ = file; 100 *fpp++ = file;
100 fpl->count++; 101 fpl->count++;
@@ -331,11 +332,12 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
331 if (!fpl) 332 if (!fpl)
332 return NULL; 333 return NULL;
333 334
334 new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL); 335 new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
336 GFP_KERNEL);
335 if (new_fpl) { 337 if (new_fpl) {
336 for (i=fpl->count-1; i>=0; i--) 338 for (i = 0; i < fpl->count; i++)
337 get_file(fpl->fp[i]); 339 get_file(fpl->fp[i]);
338 memcpy(new_fpl, fpl, sizeof(*fpl)); 340 new_fpl->max = new_fpl->count;
339 } 341 }
340 return new_fpl; 342 return new_fpl;
341} 343}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c83b421341c0..46cbd28f40f9 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -57,6 +57,7 @@
57#include <linux/init.h> 57#include <linux/init.h>
58#include <linux/scatterlist.h> 58#include <linux/scatterlist.h>
59#include <linux/errqueue.h> 59#include <linux/errqueue.h>
60#include <linux/prefetch.h>
60 61
61#include <net/protocol.h> 62#include <net/protocol.h>
62#include <net/dst.h> 63#include <net/dst.h>
@@ -202,8 +203,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
202 skb->data = data; 203 skb->data = data;
203 skb_reset_tail_pointer(skb); 204 skb_reset_tail_pointer(skb);
204 skb->end = skb->tail + size; 205 skb->end = skb->tail + size;
205 kmemcheck_annotate_bitfield(skb, flags1);
206 kmemcheck_annotate_bitfield(skb, flags2);
207#ifdef NET_SKBUFF_DATA_USES_OFFSET 206#ifdef NET_SKBUFF_DATA_USES_OFFSET
208 skb->mac_header = ~0U; 207 skb->mac_header = ~0U;
209#endif 208#endif
@@ -212,6 +211,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
212 shinfo = skb_shinfo(skb); 211 shinfo = skb_shinfo(skb);
213 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); 212 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
214 atomic_set(&shinfo->dataref, 1); 213 atomic_set(&shinfo->dataref, 1);
214 kmemcheck_annotate_variable(shinfo->destructor_arg);
215 215
216 if (fclone) { 216 if (fclone) {
217 struct sk_buff *child = skb + 1; 217 struct sk_buff *child = skb + 1;
@@ -249,10 +249,9 @@ EXPORT_SYMBOL(__alloc_skb);
249struct sk_buff *__netdev_alloc_skb(struct net_device *dev, 249struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
250 unsigned int length, gfp_t gfp_mask) 250 unsigned int length, gfp_t gfp_mask)
251{ 251{
252 int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
253 struct sk_buff *skb; 252 struct sk_buff *skb;
254 253
255 skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node); 254 skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE);
256 if (likely(skb)) { 255 if (likely(skb)) {
257 skb_reserve(skb, NET_SKB_PAD); 256 skb_reserve(skb, NET_SKB_PAD);
258 skb->dev = dev; 257 skb->dev = dev;
@@ -261,16 +260,6 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
261} 260}
262EXPORT_SYMBOL(__netdev_alloc_skb); 261EXPORT_SYMBOL(__netdev_alloc_skb);
263 262
264struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
265{
266 int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
267 struct page *page;
268
269 page = alloc_pages_node(node, gfp_mask, 0);
270 return page;
271}
272EXPORT_SYMBOL(__netdev_alloc_page);
273
274void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, 263void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
275 int size) 264 int size)
276{ 265{
@@ -340,7 +329,7 @@ static void skb_release_data(struct sk_buff *skb)
340 put_page(skb_shinfo(skb)->frags[i].page); 329 put_page(skb_shinfo(skb)->frags[i].page);
341 } 330 }
342 331
343 if (skb_has_frags(skb)) 332 if (skb_has_frag_list(skb))
344 skb_drop_fraglist(skb); 333 skb_drop_fraglist(skb);
345 334
346 kfree(skb->head); 335 kfree(skb->head);
@@ -393,6 +382,8 @@ static void skb_release_head_state(struct sk_buff *skb)
393 } 382 }
394#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 383#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
395 nf_conntrack_put(skb->nfct); 384 nf_conntrack_put(skb->nfct);
385#endif
386#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
396 nf_conntrack_put_reasm(skb->nfct_reasm); 387 nf_conntrack_put_reasm(skb->nfct_reasm);
397#endif 388#endif
398#ifdef CONFIG_BRIDGE_NETFILTER 389#ifdef CONFIG_BRIDGE_NETFILTER
@@ -466,6 +457,7 @@ void consume_skb(struct sk_buff *skb)
466 smp_rmb(); 457 smp_rmb();
467 else if (likely(!atomic_dec_and_test(&skb->users))) 458 else if (likely(!atomic_dec_and_test(&skb->users)))
468 return; 459 return;
460 trace_consume_skb(skb);
469 __kfree_skb(skb); 461 __kfree_skb(skb);
470} 462}
471EXPORT_SYMBOL(consume_skb); 463EXPORT_SYMBOL(consume_skb);
@@ -532,7 +524,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
532 new->ip_summed = old->ip_summed; 524 new->ip_summed = old->ip_summed;
533 skb_copy_queue_mapping(new, old); 525 skb_copy_queue_mapping(new, old);
534 new->priority = old->priority; 526 new->priority = old->priority;
535 new->deliver_no_wcard = old->deliver_no_wcard;
536#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) 527#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
537 new->ipvs_property = old->ipvs_property; 528 new->ipvs_property = old->ipvs_property;
538#endif 529#endif
@@ -685,16 +676,10 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
685 676
686struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) 677struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
687{ 678{
688 int headerlen = skb->data - skb->head; 679 int headerlen = skb_headroom(skb);
689 /* 680 unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len;
690 * Allocate the copy buffer 681 struct sk_buff *n = alloc_skb(size, gfp_mask);
691 */ 682
692 struct sk_buff *n;
693#ifdef NET_SKBUFF_DATA_USES_OFFSET
694 n = alloc_skb(skb->end + skb->data_len, gfp_mask);
695#else
696 n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask);
697#endif
698 if (!n) 683 if (!n)
699 return NULL; 684 return NULL;
700 685
@@ -726,20 +711,14 @@ EXPORT_SYMBOL(skb_copy);
726 711
727struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) 712struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
728{ 713{
729 /* 714 unsigned int size = skb_end_pointer(skb) - skb->head;
730 * Allocate the copy buffer 715 struct sk_buff *n = alloc_skb(size, gfp_mask);
731 */ 716
732 struct sk_buff *n;
733#ifdef NET_SKBUFF_DATA_USES_OFFSET
734 n = alloc_skb(skb->end, gfp_mask);
735#else
736 n = alloc_skb(skb->end - skb->head, gfp_mask);
737#endif
738 if (!n) 717 if (!n)
739 goto out; 718 goto out;
740 719
741 /* Set the data pointer */ 720 /* Set the data pointer */
742 skb_reserve(n, skb->data - skb->head); 721 skb_reserve(n, skb_headroom(skb));
743 /* Set the tail pointer and length */ 722 /* Set the tail pointer and length */
744 skb_put(n, skb_headlen(skb)); 723 skb_put(n, skb_headlen(skb));
745 /* Copy the bytes */ 724 /* Copy the bytes */
@@ -759,7 +738,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
759 skb_shinfo(n)->nr_frags = i; 738 skb_shinfo(n)->nr_frags = i;
760 } 739 }
761 740
762 if (skb_has_frags(skb)) { 741 if (skb_has_frag_list(skb)) {
763 skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; 742 skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
764 skb_clone_fraglist(n); 743 skb_clone_fraglist(n);
765 } 744 }
@@ -791,12 +770,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
791{ 770{
792 int i; 771 int i;
793 u8 *data; 772 u8 *data;
794#ifdef NET_SKBUFF_DATA_USES_OFFSET 773 int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail;
795 int size = nhead + skb->end + ntail;
796#else
797 int size = nhead + (skb->end - skb->head) + ntail;
798#endif
799 long off; 774 long off;
775 bool fastpath;
800 776
801 BUG_ON(nhead < 0); 777 BUG_ON(nhead < 0);
802 778
@@ -805,31 +781,56 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
805 781
806 size = SKB_DATA_ALIGN(size); 782 size = SKB_DATA_ALIGN(size);
807 783
784 /* Check if we can avoid taking references on fragments if we own
785 * the last reference on skb->head. (see skb_release_data())
786 */
787 if (!skb->cloned)
788 fastpath = true;
789 else {
790 int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1;
791
792 fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
793 }
794
795 if (fastpath &&
796 size + sizeof(struct skb_shared_info) <= ksize(skb->head)) {
797 memmove(skb->head + size, skb_shinfo(skb),
798 offsetof(struct skb_shared_info,
799 frags[skb_shinfo(skb)->nr_frags]));
800 memmove(skb->head + nhead, skb->head,
801 skb_tail_pointer(skb) - skb->head);
802 off = nhead;
803 goto adjust_others;
804 }
805
808 data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); 806 data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
809 if (!data) 807 if (!data)
810 goto nodata; 808 goto nodata;
811 809
812 /* Copy only real data... and, alas, header. This should be 810 /* Copy only real data... and, alas, header. This should be
813 * optimized for the cases when header is void. */ 811 * optimized for the cases when header is void.
814#ifdef NET_SKBUFF_DATA_USES_OFFSET 812 */
815 memcpy(data + nhead, skb->head, skb->tail); 813 memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
816#else
817 memcpy(data + nhead, skb->head, skb->tail - skb->head);
818#endif
819 memcpy(data + size, skb_end_pointer(skb),
820 offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
821 814
822 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 815 memcpy((struct skb_shared_info *)(data + size),
823 get_page(skb_shinfo(skb)->frags[i].page); 816 skb_shinfo(skb),
817 offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
824 818
825 if (skb_has_frags(skb)) 819 if (fastpath) {
826 skb_clone_fraglist(skb); 820 kfree(skb->head);
821 } else {
822 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
823 get_page(skb_shinfo(skb)->frags[i].page);
827 824
828 skb_release_data(skb); 825 if (skb_has_frag_list(skb))
826 skb_clone_fraglist(skb);
829 827
828 skb_release_data(skb);
829 }
830 off = (data + nhead) - skb->head; 830 off = (data + nhead) - skb->head;
831 831
832 skb->head = data; 832 skb->head = data;
833adjust_others:
833 skb->data += off; 834 skb->data += off;
834#ifdef NET_SKBUFF_DATA_USES_OFFSET 835#ifdef NET_SKBUFF_DATA_USES_OFFSET
835 skb->end = size; 836 skb->end = size;
@@ -1099,7 +1100,7 @@ drop_pages:
1099 for (; i < nfrags; i++) 1100 for (; i < nfrags; i++)
1100 put_page(skb_shinfo(skb)->frags[i].page); 1101 put_page(skb_shinfo(skb)->frags[i].page);
1101 1102
1102 if (skb_has_frags(skb)) 1103 if (skb_has_frag_list(skb))
1103 skb_drop_fraglist(skb); 1104 skb_drop_fraglist(skb);
1104 goto done; 1105 goto done;
1105 } 1106 }
@@ -1194,7 +1195,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
1194 /* Optimization: no fragments, no reasons to preestimate 1195 /* Optimization: no fragments, no reasons to preestimate
1195 * size of pulled pages. Superb. 1196 * size of pulled pages. Superb.
1196 */ 1197 */
1197 if (!skb_has_frags(skb)) 1198 if (!skb_has_frag_list(skb))
1198 goto pull_pages; 1199 goto pull_pages;
1199 1200
1200 /* Estimate size of pulled pages. */ 1201 /* Estimate size of pulled pages. */
@@ -1826,7 +1827,7 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
1826 long csstart; 1827 long csstart;
1827 1828
1828 if (skb->ip_summed == CHECKSUM_PARTIAL) 1829 if (skb->ip_summed == CHECKSUM_PARTIAL)
1829 csstart = skb->csum_start - skb_headroom(skb); 1830 csstart = skb_checksum_start_offset(skb);
1830 else 1831 else
1831 csstart = skb_headlen(skb); 1832 csstart = skb_headlen(skb);
1832 1833
@@ -2267,7 +2268,7 @@ EXPORT_SYMBOL(skb_prepare_seq_read);
2267 * of bytes already consumed and the next call to 2268 * of bytes already consumed and the next call to
2268 * skb_seq_read() will return the remaining part of the block. 2269 * skb_seq_read() will return the remaining part of the block.
2269 * 2270 *
2270 * Note 1: The size of each block of data returned can be arbitary, 2271 * Note 1: The size of each block of data returned can be arbitrary,
2271 * this limitation is the cost for zerocopy seqeuental 2272 * this limitation is the cost for zerocopy seqeuental
2272 * reads of potentially non linear data. 2273 * reads of potentially non linear data.
2273 * 2274 *
@@ -2323,7 +2324,7 @@ next_skb:
2323 st->frag_data = NULL; 2324 st->frag_data = NULL;
2324 } 2325 }
2325 2326
2326 if (st->root_skb == st->cur_skb && skb_has_frags(st->root_skb)) { 2327 if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
2327 st->cur_skb = skb_shinfo(st->root_skb)->frag_list; 2328 st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
2328 st->frag_idx = 0; 2329 st->frag_idx = 0;
2329 goto next_skb; 2330 goto next_skb;
@@ -2433,8 +2434,6 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
2433 return -ENOMEM; 2434 return -ENOMEM;
2434 2435
2435 /* initialize the next frag */ 2436 /* initialize the next frag */
2436 sk->sk_sndmsg_page = page;
2437 sk->sk_sndmsg_off = 0;
2438 skb_fill_page_desc(skb, frg_cnt, page, 0, 0); 2437 skb_fill_page_desc(skb, frg_cnt, page, 0, 0);
2439 skb->truesize += PAGE_SIZE; 2438 skb->truesize += PAGE_SIZE;
2440 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); 2439 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
@@ -2454,7 +2453,6 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
2454 return -EFAULT; 2453 return -EFAULT;
2455 2454
2456 /* copy was successful so update the size parameters */ 2455 /* copy was successful so update the size parameters */
2457 sk->sk_sndmsg_off += copy;
2458 frag->size += copy; 2456 frag->size += copy;
2459 skb->len += copy; 2457 skb->len += copy;
2460 skb->data_len += copy; 2458 skb->data_len += copy;
@@ -2497,7 +2495,7 @@ EXPORT_SYMBOL_GPL(skb_pull_rcsum);
2497 * a pointer to the first in a list of new skbs for the segments. 2495 * a pointer to the first in a list of new skbs for the segments.
2498 * In case of error it returns ERR_PTR(err). 2496 * In case of error it returns ERR_PTR(err).
2499 */ 2497 */
2500struct sk_buff *skb_segment(struct sk_buff *skb, int features) 2498struct sk_buff *skb_segment(struct sk_buff *skb, u32 features)
2501{ 2499{
2502 struct sk_buff *segs = NULL; 2500 struct sk_buff *segs = NULL;
2503 struct sk_buff *tail = NULL; 2501 struct sk_buff *tail = NULL;
@@ -2507,7 +2505,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
2507 unsigned int offset = doffset; 2505 unsigned int offset = doffset;
2508 unsigned int headroom; 2506 unsigned int headroom;
2509 unsigned int len; 2507 unsigned int len;
2510 int sg = features & NETIF_F_SG; 2508 int sg = !!(features & NETIF_F_SG);
2511 int nfrags = skb_shinfo(skb)->nr_frags; 2509 int nfrags = skb_shinfo(skb)->nr_frags;
2512 int err = -ENOMEM; 2510 int err = -ENOMEM;
2513 int i = 0; 2511 int i = 0;
@@ -2744,8 +2742,12 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2744 2742
2745merge: 2743merge:
2746 if (offset > headlen) { 2744 if (offset > headlen) {
2747 skbinfo->frags[0].page_offset += offset - headlen; 2745 unsigned int eat = offset - headlen;
2748 skbinfo->frags[0].size -= offset - headlen; 2746
2747 skbinfo->frags[0].page_offset += eat;
2748 skbinfo->frags[0].size -= eat;
2749 skb->data_len -= eat;
2750 skb->len -= eat;
2749 offset = headlen; 2751 offset = headlen;
2750 } 2752 }
2751 2753
@@ -2893,7 +2895,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
2893 return -ENOMEM; 2895 return -ENOMEM;
2894 2896
2895 /* Easy case. Most of packets will go this way. */ 2897 /* Easy case. Most of packets will go this way. */
2896 if (!skb_has_frags(skb)) { 2898 if (!skb_has_frag_list(skb)) {
2897 /* A little of trouble, not enough of space for trailer. 2899 /* A little of trouble, not enough of space for trailer.
2898 * This should not happen, when stack is tuned to generate 2900 * This should not happen, when stack is tuned to generate
2899 * good frames. OK, on miss we reallocate and reserve even more 2901 * good frames. OK, on miss we reallocate and reserve even more
@@ -2928,7 +2930,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
2928 2930
2929 if (skb1->next == NULL && tailbits) { 2931 if (skb1->next == NULL && tailbits) {
2930 if (skb_shinfo(skb1)->nr_frags || 2932 if (skb_shinfo(skb1)->nr_frags ||
2931 skb_has_frags(skb1) || 2933 skb_has_frag_list(skb1) ||
2932 skb_tailroom(skb1) < tailbits) 2934 skb_tailroom(skb1) < tailbits)
2933 ntail = tailbits + 128; 2935 ntail = tailbits + 128;
2934 } 2936 }
@@ -2937,7 +2939,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
2937 skb_cloned(skb1) || 2939 skb_cloned(skb1) ||
2938 ntail || 2940 ntail ||
2939 skb_shinfo(skb1)->nr_frags || 2941 skb_shinfo(skb1)->nr_frags ||
2940 skb_has_frags(skb1)) { 2942 skb_has_frag_list(skb1)) {
2941 struct sk_buff *skb2; 2943 struct sk_buff *skb2;
2942 2944
2943 /* Fuck, we are miserable poor guys... */ 2945 /* Fuck, we are miserable poor guys... */
@@ -2992,6 +2994,9 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
2992 skb->destructor = sock_rmem_free; 2994 skb->destructor = sock_rmem_free;
2993 atomic_add(skb->truesize, &sk->sk_rmem_alloc); 2995 atomic_add(skb->truesize, &sk->sk_rmem_alloc);
2994 2996
2997 /* before exiting rcu section, make sure dst is refcounted */
2998 skb_dst_force(skb);
2999
2995 skb_queue_tail(&sk->sk_error_queue, skb); 3000 skb_queue_tail(&sk->sk_error_queue, skb);
2996 if (!sock_flag(sk, SOCK_DEAD)) 3001 if (!sock_flag(sk, SOCK_DEAD))
2997 sk->sk_data_ready(sk, skb->len); 3002 sk->sk_data_ready(sk, skb->len);
@@ -3020,7 +3025,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
3020 } else { 3025 } else {
3021 /* 3026 /*
3022 * no hardware time stamps available, 3027 * no hardware time stamps available,
3023 * so keep the skb_shared_tx and only 3028 * so keep the shared tx_flags and only
3024 * store software time stamp 3029 * store software time stamp
3025 */ 3030 */
3026 skb->tstamp = ktime_get_real(); 3031 skb->tstamp = ktime_get_real();
diff --git a/net/core/sock.c b/net/core/sock.c
index ef30e9d286e7..6e819780c232 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -157,7 +157,7 @@ static const char *const af_family_key_strings[AF_MAX+1] = {
157 "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" , 157 "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
158 "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , 158 "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
159 "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , 159 "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
160 "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , 160 "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
161 "sk_lock-AF_MAX" 161 "sk_lock-AF_MAX"
162}; 162};
163static const char *const af_family_slock_key_strings[AF_MAX+1] = { 163static const char *const af_family_slock_key_strings[AF_MAX+1] = {
@@ -173,7 +173,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
173 "slock-27" , "slock-28" , "slock-AF_CAN" , 173 "slock-27" , "slock-28" , "slock-AF_CAN" ,
174 "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , 174 "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
175 "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , 175 "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
176 "slock-AF_IEEE802154", "slock-AF_CAIF" , 176 "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
177 "slock-AF_MAX" 177 "slock-AF_MAX"
178}; 178};
179static const char *const af_family_clock_key_strings[AF_MAX+1] = { 179static const char *const af_family_clock_key_strings[AF_MAX+1] = {
@@ -189,7 +189,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
189 "clock-27" , "clock-28" , "clock-AF_CAN" , 189 "clock-27" , "clock-28" , "clock-AF_CAN" ,
190 "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , 190 "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
191 "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , 191 "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
192 "clock-AF_IEEE802154", "clock-AF_CAIF" , 192 "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
193 "clock-AF_MAX" 193 "clock-AF_MAX"
194}; 194};
195 195
@@ -215,7 +215,7 @@ __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
215__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; 215__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
216__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; 216__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
217 217
218/* Maximal space eaten by iovec or ancilliary data plus some space */ 218/* Maximal space eaten by iovec or ancillary data plus some space */
219int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 219int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
220EXPORT_SYMBOL(sysctl_optmem_max); 220EXPORT_SYMBOL(sysctl_optmem_max);
221 221
@@ -992,23 +992,54 @@ static inline void sock_lock_init(struct sock *sk)
992/* 992/*
993 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 993 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
994 * even temporarly, because of RCU lookups. sk_node should also be left as is. 994 * even temporarly, because of RCU lookups. sk_node should also be left as is.
995 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
995 */ 996 */
996static void sock_copy(struct sock *nsk, const struct sock *osk) 997static void sock_copy(struct sock *nsk, const struct sock *osk)
997{ 998{
998#ifdef CONFIG_SECURITY_NETWORK 999#ifdef CONFIG_SECURITY_NETWORK
999 void *sptr = nsk->sk_security; 1000 void *sptr = nsk->sk_security;
1000#endif 1001#endif
1001 BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) != 1002 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1002 sizeof(osk->sk_node) + sizeof(osk->sk_refcnt) + 1003
1003 sizeof(osk->sk_tx_queue_mapping)); 1004 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1004 memcpy(&nsk->sk_copy_start, &osk->sk_copy_start, 1005 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1005 osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start)); 1006
1006#ifdef CONFIG_SECURITY_NETWORK 1007#ifdef CONFIG_SECURITY_NETWORK
1007 nsk->sk_security = sptr; 1008 nsk->sk_security = sptr;
1008 security_sk_clone(osk, nsk); 1009 security_sk_clone(osk, nsk);
1009#endif 1010#endif
1010} 1011}
1011 1012
1013/*
1014 * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes
1015 * un-modified. Special care is taken when initializing object to zero.
1016 */
1017static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1018{
1019 if (offsetof(struct sock, sk_node.next) != 0)
1020 memset(sk, 0, offsetof(struct sock, sk_node.next));
1021 memset(&sk->sk_node.pprev, 0,
1022 size - offsetof(struct sock, sk_node.pprev));
1023}
1024
1025void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1026{
1027 unsigned long nulls1, nulls2;
1028
1029 nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1030 nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1031 if (nulls1 > nulls2)
1032 swap(nulls1, nulls2);
1033
1034 if (nulls1 != 0)
1035 memset((char *)sk, 0, nulls1);
1036 memset((char *)sk + nulls1 + sizeof(void *), 0,
1037 nulls2 - nulls1 - sizeof(void *));
1038 memset((char *)sk + nulls2 + sizeof(void *), 0,
1039 size - nulls2 - sizeof(void *));
1040}
1041EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1042
1012static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 1043static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1013 int family) 1044 int family)
1014{ 1045{
@@ -1021,19 +1052,12 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1021 if (!sk) 1052 if (!sk)
1022 return sk; 1053 return sk;
1023 if (priority & __GFP_ZERO) { 1054 if (priority & __GFP_ZERO) {
1024 /* 1055 if (prot->clear_sk)
1025 * caches using SLAB_DESTROY_BY_RCU should let 1056 prot->clear_sk(sk, prot->obj_size);
1026 * sk_node.next un-modified. Special care is taken 1057 else
1027 * when initializing object to zero. 1058 sk_prot_clear_nulls(sk, prot->obj_size);
1028 */
1029 if (offsetof(struct sock, sk_node.next) != 0)
1030 memset(sk, 0, offsetof(struct sock, sk_node.next));
1031 memset(&sk->sk_node.pprev, 0,
1032 prot->obj_size - offsetof(struct sock,
1033 sk_node.pprev));
1034 } 1059 }
1035 } 1060 } else
1036 else
1037 sk = kmalloc(prot->obj_size, priority); 1061 sk = kmalloc(prot->obj_size, priority);
1038 1062
1039 if (sk != NULL) { 1063 if (sk != NULL) {
@@ -1078,8 +1102,11 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
1078#ifdef CONFIG_CGROUPS 1102#ifdef CONFIG_CGROUPS
1079void sock_update_classid(struct sock *sk) 1103void sock_update_classid(struct sock *sk)
1080{ 1104{
1081 u32 classid = task_cls_classid(current); 1105 u32 classid;
1082 1106
1107 rcu_read_lock(); /* doing current task, which cannot vanish. */
1108 classid = task_cls_classid(current);
1109 rcu_read_unlock();
1083 if (classid && classid != sk->sk_classid) 1110 if (classid && classid != sk->sk_classid)
1084 sk->sk_classid = classid; 1111 sk->sk_classid = classid;
1085} 1112}
@@ -1148,7 +1175,7 @@ static void __sk_free(struct sock *sk)
1148void sk_free(struct sock *sk) 1175void sk_free(struct sock *sk)
1149{ 1176{
1150 /* 1177 /*
1151 * We substract one from sk_wmem_alloc and can know if 1178 * We subtract one from sk_wmem_alloc and can know if
1152 * some packets are still in some tx queue. 1179 * some packets are still in some tx queue.
1153 * If not null, sock_wfree() will call __sk_free(sk) later 1180 * If not null, sock_wfree() will call __sk_free(sk) later
1154 */ 1181 */
@@ -1158,10 +1185,10 @@ void sk_free(struct sock *sk)
1158EXPORT_SYMBOL(sk_free); 1185EXPORT_SYMBOL(sk_free);
1159 1186
1160/* 1187/*
1161 * Last sock_put should drop referrence to sk->sk_net. It has already 1188 * Last sock_put should drop reference to sk->sk_net. It has already
1162 * been dropped in sk_change_net. Taking referrence to stopping namespace 1189 * been dropped in sk_change_net. Taking reference to stopping namespace
1163 * is not an option. 1190 * is not an option.
1164 * Take referrence to a socket to remove it from hash _alive_ and after that 1191 * Take reference to a socket to remove it from hash _alive_ and after that
1165 * destroy it in the context of init_net. 1192 * destroy it in the context of init_net.
1166 */ 1193 */
1167void sk_release_kernel(struct sock *sk) 1194void sk_release_kernel(struct sock *sk)
@@ -1222,7 +1249,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1222 sock_reset_flag(newsk, SOCK_DONE); 1249 sock_reset_flag(newsk, SOCK_DONE);
1223 skb_queue_head_init(&newsk->sk_error_queue); 1250 skb_queue_head_init(&newsk->sk_error_queue);
1224 1251
1225 filter = newsk->sk_filter; 1252 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1226 if (filter != NULL) 1253 if (filter != NULL)
1227 sk_filter_charge(newsk, filter); 1254 sk_filter_charge(newsk, filter);
1228 1255
@@ -1557,6 +1584,8 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1557EXPORT_SYMBOL(sock_alloc_send_skb); 1584EXPORT_SYMBOL(sock_alloc_send_skb);
1558 1585
1559static void __lock_sock(struct sock *sk) 1586static void __lock_sock(struct sock *sk)
1587 __releases(&sk->sk_lock.slock)
1588 __acquires(&sk->sk_lock.slock)
1560{ 1589{
1561 DEFINE_WAIT(wait); 1590 DEFINE_WAIT(wait);
1562 1591
@@ -1573,6 +1602,8 @@ static void __lock_sock(struct sock *sk)
1573} 1602}
1574 1603
1575static void __release_sock(struct sock *sk) 1604static void __release_sock(struct sock *sk)
1605 __releases(&sk->sk_lock.slock)
1606 __acquires(&sk->sk_lock.slock)
1576{ 1607{
1577 struct sk_buff *skb = sk->sk_backlog.head; 1608 struct sk_buff *skb = sk->sk_backlog.head;
1578 1609
@@ -1646,10 +1677,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
1646{ 1677{
1647 struct proto *prot = sk->sk_prot; 1678 struct proto *prot = sk->sk_prot;
1648 int amt = sk_mem_pages(size); 1679 int amt = sk_mem_pages(size);
1649 int allocated; 1680 long allocated;
1650 1681
1651 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; 1682 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1652 allocated = atomic_add_return(amt, prot->memory_allocated); 1683 allocated = atomic_long_add_return(amt, prot->memory_allocated);
1653 1684
1654 /* Under limit. */ 1685 /* Under limit. */
1655 if (allocated <= prot->sysctl_mem[0]) { 1686 if (allocated <= prot->sysctl_mem[0]) {
@@ -1707,7 +1738,7 @@ suppress_allocation:
1707 1738
1708 /* Alas. Undo changes. */ 1739 /* Alas. Undo changes. */
1709 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; 1740 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1710 atomic_sub(amt, prot->memory_allocated); 1741 atomic_long_sub(amt, prot->memory_allocated);
1711 return 0; 1742 return 0;
1712} 1743}
1713EXPORT_SYMBOL(__sk_mem_schedule); 1744EXPORT_SYMBOL(__sk_mem_schedule);
@@ -1720,12 +1751,12 @@ void __sk_mem_reclaim(struct sock *sk)
1720{ 1751{
1721 struct proto *prot = sk->sk_prot; 1752 struct proto *prot = sk->sk_prot;
1722 1753
1723 atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, 1754 atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1724 prot->memory_allocated); 1755 prot->memory_allocated);
1725 sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; 1756 sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1726 1757
1727 if (prot->memory_pressure && *prot->memory_pressure && 1758 if (prot->memory_pressure && *prot->memory_pressure &&
1728 (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0])) 1759 (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1729 *prot->memory_pressure = 0; 1760 *prot->memory_pressure = 0;
1730} 1761}
1731EXPORT_SYMBOL(__sk_mem_reclaim); 1762EXPORT_SYMBOL(__sk_mem_reclaim);
@@ -1877,7 +1908,7 @@ static void sock_def_readable(struct sock *sk, int len)
1877 rcu_read_lock(); 1908 rcu_read_lock();
1878 wq = rcu_dereference(sk->sk_wq); 1909 wq = rcu_dereference(sk->sk_wq);
1879 if (wq_has_sleeper(wq)) 1910 if (wq_has_sleeper(wq))
1880 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | 1911 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
1881 POLLRDNORM | POLLRDBAND); 1912 POLLRDNORM | POLLRDBAND);
1882 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 1913 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1883 rcu_read_unlock(); 1914 rcu_read_unlock();
@@ -2445,12 +2476,12 @@ static char proto_method_implemented(const void *method)
2445 2476
2446static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 2477static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2447{ 2478{
2448 seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s " 2479 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
2449 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 2480 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2450 proto->name, 2481 proto->name,
2451 proto->obj_size, 2482 proto->obj_size,
2452 sock_prot_inuse_get(seq_file_net(seq), proto), 2483 sock_prot_inuse_get(seq_file_net(seq), proto),
2453 proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1, 2484 proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
2454 proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI", 2485 proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2455 proto->max_header, 2486 proto->max_header,
2456 proto->slab == NULL ? "no" : "yes", 2487 proto->slab == NULL ? "no" : "yes",
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 01eee5d984be..77a65f031488 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -17,6 +17,7 @@
17 17
18#include <net/ip.h> 18#include <net/ip.h>
19#include <net/sock.h> 19#include <net/sock.h>
20#include <net/net_ratelimit.h>
20 21
21#ifdef CONFIG_RPS 22#ifdef CONFIG_RPS
22static int rps_sock_flow_sysctl(ctl_table *table, int write, 23static int rps_sock_flow_sysctl(ctl_table *table, int write,
@@ -34,7 +35,8 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
34 35
35 mutex_lock(&sock_flow_mutex); 36 mutex_lock(&sock_flow_mutex);
36 37
37 orig_sock_table = rps_sock_flow_table; 38 orig_sock_table = rcu_dereference_protected(rps_sock_flow_table,
39 lockdep_is_held(&sock_flow_mutex));
38 size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; 40 size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
39 41
40 ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); 42 ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
@@ -121,6 +123,15 @@ static struct ctl_table net_core_table[] = {
121 .mode = 0644, 123 .mode = 0644,
122 .proc_handler = proc_dointvec 124 .proc_handler = proc_dointvec
123 }, 125 },
126#ifdef CONFIG_BPF_JIT
127 {
128 .procname = "bpf_jit_enable",
129 .data = &bpf_jit_enable,
130 .maxlen = sizeof(int),
131 .mode = 0644,
132 .proc_handler = proc_dointvec
133 },
134#endif
124 { 135 {
125 .procname = "netdev_tstamp_prequeue", 136 .procname = "netdev_tstamp_prequeue",
126 .data = &netdev_tstamp_prequeue, 137 .data = &netdev_tstamp_prequeue,
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index 0ae6c22da85b..7e7ca375d431 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -26,12 +26,12 @@ static struct sock_filter ptp_filter[] = {
26 PTP_FILTER 26 PTP_FILTER
27}; 27};
28 28
29static unsigned int classify(struct sk_buff *skb) 29static unsigned int classify(const struct sk_buff *skb)
30{ 30{
31 if (likely(skb->dev && 31 if (likely(skb->dev &&
32 skb->dev->phydev && 32 skb->dev->phydev &&
33 skb->dev->phydev->drv)) 33 skb->dev->phydev->drv))
34 return sk_run_filter(skb, ptp_filter, ARRAY_SIZE(ptp_filter)); 34 return sk_run_filter(skb, ptp_filter);
35 else 35 else
36 return PTP_CLASS_NONE; 36 return PTP_CLASS_NONE;
37} 37}
@@ -96,11 +96,13 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)
96 struct phy_device *phydev; 96 struct phy_device *phydev;
97 unsigned int type; 97 unsigned int type;
98 98
99 skb_push(skb, ETH_HLEN); 99 if (skb_headroom(skb) < ETH_HLEN)
100 return false;
101 __skb_push(skb, ETH_HLEN);
100 102
101 type = classify(skb); 103 type = classify(skb);
102 104
103 skb_pull(skb, ETH_HLEN); 105 __skb_pull(skb, ETH_HLEN);
104 106
105 switch (type) { 107 switch (type) {
106 case PTP_CLASS_V1_IPV4: 108 case PTP_CLASS_V1_IPV4:
diff --git a/net/core/utils.c b/net/core/utils.c
index f41854470539..386e263f6066 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -27,6 +27,7 @@
27#include <linux/ratelimit.h> 27#include <linux/ratelimit.h>
28 28
29#include <net/sock.h> 29#include <net/sock.h>
30#include <net/net_ratelimit.h>
30 31
31#include <asm/byteorder.h> 32#include <asm/byteorder.h>
32#include <asm/system.h> 33#include <asm/system.h>
@@ -75,7 +76,7 @@ __be32 in_aton(const char *str)
75 str++; 76 str++;
76 } 77 }
77 } 78 }
78 return(htonl(l)); 79 return htonl(l);
79} 80}
80EXPORT_SYMBOL(in_aton); 81EXPORT_SYMBOL(in_aton);
81 82
@@ -92,18 +93,19 @@ EXPORT_SYMBOL(in_aton);
92 93
93static inline int xdigit2bin(char c, int delim) 94static inline int xdigit2bin(char c, int delim)
94{ 95{
96 int val;
97
95 if (c == delim || c == '\0') 98 if (c == delim || c == '\0')
96 return IN6PTON_DELIM; 99 return IN6PTON_DELIM;
97 if (c == ':') 100 if (c == ':')
98 return IN6PTON_COLON_MASK; 101 return IN6PTON_COLON_MASK;
99 if (c == '.') 102 if (c == '.')
100 return IN6PTON_DOT; 103 return IN6PTON_DOT;
101 if (c >= '0' && c <= '9') 104
102 return (IN6PTON_XDIGIT | IN6PTON_DIGIT| (c - '0')); 105 val = hex_to_bin(c);
103 if (c >= 'a' && c <= 'f') 106 if (val >= 0)
104 return (IN6PTON_XDIGIT | (c - 'a' + 10)); 107 return val | IN6PTON_XDIGIT | (val < 10 ? IN6PTON_DIGIT : 0);
105 if (c >= 'A' && c <= 'F') 108
106 return (IN6PTON_XDIGIT | (c - 'A' + 10));
107 if (delim == -1) 109 if (delim == -1)
108 return IN6PTON_DELIM; 110 return IN6PTON_DELIM;
109 return IN6PTON_UNKNOWN; 111 return IN6PTON_UNKNOWN;
@@ -295,3 +297,27 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
295 csum_unfold(*sum))); 297 csum_unfold(*sum)));
296} 298}
297EXPORT_SYMBOL(inet_proto_csum_replace4); 299EXPORT_SYMBOL(inet_proto_csum_replace4);
300
301int mac_pton(const char *s, u8 *mac)
302{
303 int i;
304
305 /* XX:XX:XX:XX:XX:XX */
306 if (strlen(s) < 3 * ETH_ALEN - 1)
307 return 0;
308
309 /* Don't dirty result unless string is valid MAC. */
310 for (i = 0; i < ETH_ALEN; i++) {
311 if (!strchr("0123456789abcdefABCDEF", s[i * 3]))
312 return 0;
313 if (!strchr("0123456789abcdefABCDEF", s[i * 3 + 1]))
314 return 0;
315 if (i != ETH_ALEN - 1 && s[i * 3 + 2] != ':')
316 return 0;
317 }
318 for (i = 0; i < ETH_ALEN; i++) {
319 mac[i] = (hex_to_bin(s[i * 3]) << 4) | hex_to_bin(s[i * 3 + 1]);
320 }
321 return 1;
322}
323EXPORT_SYMBOL(mac_pton);