Diffstat (limited to 'net/core/dev.c')
-rw-r--r--  net/core/dev.c | 1756
1 file changed, 585 insertions(+), 1171 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index d0cbc93fcf32..a06a7a58dd11 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -97,8 +97,6 @@
97#include <net/net_namespace.h> 97#include <net/net_namespace.h>
98#include <net/sock.h> 98#include <net/sock.h>
99#include <linux/rtnetlink.h> 99#include <linux/rtnetlink.h>
100#include <linux/proc_fs.h>
101#include <linux/seq_file.h>
102#include <linux/stat.h> 100#include <linux/stat.h>
103#include <net/dst.h> 101#include <net/dst.h>
104#include <net/pkt_sched.h> 102#include <net/pkt_sched.h>
@@ -106,12 +104,10 @@
106#include <net/xfrm.h> 104#include <net/xfrm.h>
107#include <linux/highmem.h> 105#include <linux/highmem.h>
108#include <linux/init.h> 106#include <linux/init.h>
109#include <linux/kmod.h>
110#include <linux/module.h> 107#include <linux/module.h>
111#include <linux/netpoll.h> 108#include <linux/netpoll.h>
112#include <linux/rcupdate.h> 109#include <linux/rcupdate.h>
113#include <linux/delay.h> 110#include <linux/delay.h>
114#include <net/wext.h>
115#include <net/iw_handler.h> 111#include <net/iw_handler.h>
116#include <asm/current.h> 112#include <asm/current.h>
117#include <linux/audit.h> 113#include <linux/audit.h>
@@ -132,9 +128,7 @@
132#include <linux/pci.h> 128#include <linux/pci.h>
133#include <linux/inetdevice.h> 129#include <linux/inetdevice.h>
134#include <linux/cpu_rmap.h> 130#include <linux/cpu_rmap.h>
135#include <linux/net_tstamp.h>
136#include <linux/static_key.h> 131#include <linux/static_key.h>
137#include <net/flow_keys.h>
138 132
139#include "net-sysfs.h" 133#include "net-sysfs.h"
140 134
@@ -144,41 +138,10 @@
144/* This should be increased if a protocol with a bigger head is added. */ 138/* This should be increased if a protocol with a bigger head is added. */
145#define GRO_MAX_HEAD (MAX_HEADER + 128) 139#define GRO_MAX_HEAD (MAX_HEADER + 128)
146 140
147/*
148 * The list of packet types we will receive (as opposed to discard)
149 * and the routines to invoke.
150 *
151 * Why 16. Because with 16 the only overlap we get on a hash of the
152 * low nibble of the protocol value is RARP/SNAP/X.25.
153 *
154 * NOTE: That is no longer true with the addition of VLAN tags. Not
155 * sure which should go first, but I bet it won't make much
156 * difference if we are running VLANs. The good news is that
157 * this protocol won't be in the list unless compiled in, so
158 * the average user (w/out VLANs) will not be adversely affected.
159 * --BLG
160 *
161 * 0800 IP
162 * 8100 802.1Q VLAN
163 * 0001 802.3
164 * 0002 AX.25
165 * 0004 802.2
166 * 8035 RARP
167 * 0005 SNAP
168 * 0805 X.25
169 * 0806 ARP
170 * 8137 IPX
171 * 0009 Localtalk
172 * 86DD IPv6
173 */
174
175#define PTYPE_HASH_SIZE (16)
176#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
177
178static DEFINE_SPINLOCK(ptype_lock); 141static DEFINE_SPINLOCK(ptype_lock);
179static DEFINE_SPINLOCK(offload_lock); 142static DEFINE_SPINLOCK(offload_lock);
180static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; 143struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
181static struct list_head ptype_all __read_mostly; /* Taps */ 144struct list_head ptype_all __read_mostly; /* Taps */
182static struct list_head offload_base __read_mostly; 145static struct list_head offload_base __read_mostly;
183 146
184/* 147/*
@@ -203,7 +166,7 @@ static struct list_head offload_base __read_mostly;
203DEFINE_RWLOCK(dev_base_lock); 166DEFINE_RWLOCK(dev_base_lock);
204EXPORT_SYMBOL(dev_base_lock); 167EXPORT_SYMBOL(dev_base_lock);
205 168
206DEFINE_SEQLOCK(devnet_rename_seq); 169seqcount_t devnet_rename_seq;
207 170
208static inline void dev_base_seq_inc(struct net *net) 171static inline void dev_base_seq_inc(struct net *net)
209{ 172{
@@ -695,11 +658,10 @@ __setup("netdev=", netdev_boot_setup);
695 658
696struct net_device *__dev_get_by_name(struct net *net, const char *name) 659struct net_device *__dev_get_by_name(struct net *net, const char *name)
697{ 660{
698 struct hlist_node *p;
699 struct net_device *dev; 661 struct net_device *dev;
700 struct hlist_head *head = dev_name_hash(net, name); 662 struct hlist_head *head = dev_name_hash(net, name);
701 663
702 hlist_for_each_entry(dev, p, head, name_hlist) 664 hlist_for_each_entry(dev, head, name_hlist)
703 if (!strncmp(dev->name, name, IFNAMSIZ)) 665 if (!strncmp(dev->name, name, IFNAMSIZ))
704 return dev; 666 return dev;
705 667
@@ -721,11 +683,10 @@ EXPORT_SYMBOL(__dev_get_by_name);
721 683
722struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) 684struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
723{ 685{
724 struct hlist_node *p;
725 struct net_device *dev; 686 struct net_device *dev;
726 struct hlist_head *head = dev_name_hash(net, name); 687 struct hlist_head *head = dev_name_hash(net, name);
727 688
728 hlist_for_each_entry_rcu(dev, p, head, name_hlist) 689 hlist_for_each_entry_rcu(dev, head, name_hlist)
729 if (!strncmp(dev->name, name, IFNAMSIZ)) 690 if (!strncmp(dev->name, name, IFNAMSIZ))
730 return dev; 691 return dev;
731 692
@@ -772,11 +733,10 @@ EXPORT_SYMBOL(dev_get_by_name);
772 733
773struct net_device *__dev_get_by_index(struct net *net, int ifindex) 734struct net_device *__dev_get_by_index(struct net *net, int ifindex)
774{ 735{
775 struct hlist_node *p;
776 struct net_device *dev; 736 struct net_device *dev;
777 struct hlist_head *head = dev_index_hash(net, ifindex); 737 struct hlist_head *head = dev_index_hash(net, ifindex);
778 738
779 hlist_for_each_entry(dev, p, head, index_hlist) 739 hlist_for_each_entry(dev, head, index_hlist)
780 if (dev->ifindex == ifindex) 740 if (dev->ifindex == ifindex)
781 return dev; 741 return dev;
782 742
@@ -797,11 +757,10 @@ EXPORT_SYMBOL(__dev_get_by_index);
797 757
798struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) 758struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
799{ 759{
800 struct hlist_node *p;
801 struct net_device *dev; 760 struct net_device *dev;
802 struct hlist_head *head = dev_index_hash(net, ifindex); 761 struct hlist_head *head = dev_index_hash(net, ifindex);
803 762
804 hlist_for_each_entry_rcu(dev, p, head, index_hlist) 763 hlist_for_each_entry_rcu(dev, head, index_hlist)
805 if (dev->ifindex == ifindex) 764 if (dev->ifindex == ifindex)
806 return dev; 765 return dev;
807 766
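For reference, the hunks above only adapt to the new hlist_for_each_entry*() calling convention, which derives its cursor from the entry and drops the separate struct hlist_node argument; callers of the lookup helpers are unchanged. A minimal caller-side sketch (not part of this patch):

	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);	/* take a reference before leaving the RCU section */
	rcu_read_unlock();
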
@@ -1093,10 +1052,10 @@ int dev_change_name(struct net_device *dev, const char *newname)
1093 if (dev->flags & IFF_UP) 1052 if (dev->flags & IFF_UP)
1094 return -EBUSY; 1053 return -EBUSY;
1095 1054
1096 write_seqlock(&devnet_rename_seq); 1055 write_seqcount_begin(&devnet_rename_seq);
1097 1056
1098 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { 1057 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1099 write_sequnlock(&devnet_rename_seq); 1058 write_seqcount_end(&devnet_rename_seq);
1100 return 0; 1059 return 0;
1101 } 1060 }
1102 1061
@@ -1104,7 +1063,7 @@ int dev_change_name(struct net_device *dev, const char *newname)
1104 1063
1105 err = dev_get_valid_name(net, dev, newname); 1064 err = dev_get_valid_name(net, dev, newname);
1106 if (err < 0) { 1065 if (err < 0) {
1107 write_sequnlock(&devnet_rename_seq); 1066 write_seqcount_end(&devnet_rename_seq);
1108 return err; 1067 return err;
1109 } 1068 }
1110 1069
@@ -1112,11 +1071,11 @@ rollback:
1112 ret = device_rename(&dev->dev, dev->name); 1071 ret = device_rename(&dev->dev, dev->name);
1113 if (ret) { 1072 if (ret) {
1114 memcpy(dev->name, oldname, IFNAMSIZ); 1073 memcpy(dev->name, oldname, IFNAMSIZ);
1115 write_sequnlock(&devnet_rename_seq); 1074 write_seqcount_end(&devnet_rename_seq);
1116 return ret; 1075 return ret;
1117 } 1076 }
1118 1077
1119 write_sequnlock(&devnet_rename_seq); 1078 write_seqcount_end(&devnet_rename_seq);
1120 1079
1121 write_lock_bh(&dev_base_lock); 1080 write_lock_bh(&dev_base_lock);
1122 hlist_del_rcu(&dev->name_hlist); 1081 hlist_del_rcu(&dev->name_hlist);
@@ -1135,7 +1094,7 @@ rollback:
1135 /* err >= 0 after dev_alloc_name() or stores the first errno */ 1094 /* err >= 0 after dev_alloc_name() or stores the first errno */
1136 if (err >= 0) { 1095 if (err >= 0) {
1137 err = ret; 1096 err = ret;
1138 write_seqlock(&devnet_rename_seq); 1097 write_seqcount_begin(&devnet_rename_seq);
1139 memcpy(dev->name, oldname, IFNAMSIZ); 1098 memcpy(dev->name, oldname, IFNAMSIZ);
1140 goto rollback; 1099 goto rollback;
1141 } else { 1100 } else {
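The write_seqcount_begin()/write_seqcount_end() calls above rely on the writer being serialized by RTNL; readers pair them with a retry loop. A minimal reader sketch (assuming the standard seqcount read API, not shown in this hunk):

	char name[IFNAMSIZ];
	struct net_device *dev;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&devnet_rename_seq);
		rcu_read_lock();
		dev = dev_get_by_index_rcu(net, ifindex);
		if (dev)
			strcpy(name, dev->name);	/* dev->name stays NUL-terminated within IFNAMSIZ */
		rcu_read_unlock();
	} while (dev && read_seqcount_retry(&devnet_rename_seq, seq));
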
@@ -1227,36 +1186,6 @@ void netdev_notify_peers(struct net_device *dev)
1227} 1186}
1228EXPORT_SYMBOL(netdev_notify_peers); 1187EXPORT_SYMBOL(netdev_notify_peers);
1229 1188
1230/**
1231 * dev_load - load a network module
1232 * @net: the applicable net namespace
1233 * @name: name of interface
1234 *
1235 * If a network interface is not present and the process has suitable
1236 * privileges this function loads the module. If module loading is not
1237 * available in this kernel then it becomes a nop.
1238 */
1239
1240void dev_load(struct net *net, const char *name)
1241{
1242 struct net_device *dev;
1243 int no_module;
1244
1245 rcu_read_lock();
1246 dev = dev_get_by_name_rcu(net, name);
1247 rcu_read_unlock();
1248
1249 no_module = !dev;
1250 if (no_module && capable(CAP_NET_ADMIN))
1251 no_module = request_module("netdev-%s", name);
1252 if (no_module && capable(CAP_SYS_MODULE)) {
1253 if (!request_module("%s", name))
1254 pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1255 name);
1256 }
1257}
1258EXPORT_SYMBOL(dev_load);
1259
1260static int __dev_open(struct net_device *dev) 1189static int __dev_open(struct net_device *dev)
1261{ 1190{
1262 const struct net_device_ops *ops = dev->netdev_ops; 1191 const struct net_device_ops *ops = dev->netdev_ops;
@@ -1267,6 +1196,14 @@ static int __dev_open(struct net_device *dev)
1267 if (!netif_device_present(dev)) 1196 if (!netif_device_present(dev))
1268 return -ENODEV; 1197 return -ENODEV;
1269 1198
1199 /* Block netpoll from trying to do any rx path servicing.
1200 * If we don't do this there is a chance ndo_poll_controller
1201 * or ndo_poll may be running while we open the device
1202 */
1203 ret = netpoll_rx_disable(dev);
1204 if (ret)
1205 return ret;
1206
1270 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); 1207 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1271 ret = notifier_to_errno(ret); 1208 ret = notifier_to_errno(ret);
1272 if (ret) 1209 if (ret)
@@ -1280,6 +1217,8 @@ static int __dev_open(struct net_device *dev)
1280 if (!ret && ops->ndo_open) 1217 if (!ret && ops->ndo_open)
1281 ret = ops->ndo_open(dev); 1218 ret = ops->ndo_open(dev);
1282 1219
1220 netpoll_rx_enable(dev);
1221
1283 if (ret) 1222 if (ret)
1284 clear_bit(__LINK_STATE_START, &dev->state); 1223 clear_bit(__LINK_STATE_START, &dev->state);
1285 else { 1224 else {
@@ -1371,9 +1310,16 @@ static int __dev_close(struct net_device *dev)
1371 int retval; 1310 int retval;
1372 LIST_HEAD(single); 1311 LIST_HEAD(single);
1373 1312
1313 /* Temporarily disable netpoll until the interface is down */
1314 retval = netpoll_rx_disable(dev);
1315 if (retval)
1316 return retval;
1317
1374 list_add(&dev->unreg_list, &single); 1318 list_add(&dev->unreg_list, &single);
1375 retval = __dev_close_many(&single); 1319 retval = __dev_close_many(&single);
1376 list_del(&single); 1320 list_del(&single);
1321
1322 netpoll_rx_enable(dev);
1377 return retval; 1323 return retval;
1378} 1324}
1379 1325
@@ -1409,14 +1355,22 @@ static int dev_close_many(struct list_head *head)
1409 */ 1355 */
1410int dev_close(struct net_device *dev) 1356int dev_close(struct net_device *dev)
1411{ 1357{
1358 int ret = 0;
1412 if (dev->flags & IFF_UP) { 1359 if (dev->flags & IFF_UP) {
1413 LIST_HEAD(single); 1360 LIST_HEAD(single);
1414 1361
1362 /* Block netpoll rx while the interface is going down */
1363 ret = netpoll_rx_disable(dev);
1364 if (ret)
1365 return ret;
1366
1415 list_add(&dev->unreg_list, &single); 1367 list_add(&dev->unreg_list, &single);
1416 dev_close_many(&single); 1368 dev_close_many(&single);
1417 list_del(&single); 1369 list_del(&single);
1370
1371 netpoll_rx_enable(dev);
1418 } 1372 }
1419 return 0; 1373 return ret;
1420} 1374}
1421EXPORT_SYMBOL(dev_close); 1375EXPORT_SYMBOL(dev_close);
1422 1376
@@ -1621,57 +1575,6 @@ static inline void net_timestamp_set(struct sk_buff *skb)
1621 __net_timestamp(SKB); \ 1575 __net_timestamp(SKB); \
1622 } \ 1576 } \
1623 1577
1624static int net_hwtstamp_validate(struct ifreq *ifr)
1625{
1626 struct hwtstamp_config cfg;
1627 enum hwtstamp_tx_types tx_type;
1628 enum hwtstamp_rx_filters rx_filter;
1629 int tx_type_valid = 0;
1630 int rx_filter_valid = 0;
1631
1632 if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1633 return -EFAULT;
1634
1635 if (cfg.flags) /* reserved for future extensions */
1636 return -EINVAL;
1637
1638 tx_type = cfg.tx_type;
1639 rx_filter = cfg.rx_filter;
1640
1641 switch (tx_type) {
1642 case HWTSTAMP_TX_OFF:
1643 case HWTSTAMP_TX_ON:
1644 case HWTSTAMP_TX_ONESTEP_SYNC:
1645 tx_type_valid = 1;
1646 break;
1647 }
1648
1649 switch (rx_filter) {
1650 case HWTSTAMP_FILTER_NONE:
1651 case HWTSTAMP_FILTER_ALL:
1652 case HWTSTAMP_FILTER_SOME:
1653 case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1654 case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1655 case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1656 case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1657 case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1658 case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1659 case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1660 case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1661 case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1662 case HWTSTAMP_FILTER_PTP_V2_EVENT:
1663 case HWTSTAMP_FILTER_PTP_V2_SYNC:
1664 case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1665 rx_filter_valid = 1;
1666 break;
1667 }
1668
1669 if (!tx_type_valid || !rx_filter_valid)
1670 return -ERANGE;
1671
1672 return 0;
1673}
1674
1675static inline bool is_skb_forwardable(struct net_device *dev, 1578static inline bool is_skb_forwardable(struct net_device *dev,
1676 struct sk_buff *skb) 1579 struct sk_buff *skb)
1677{ 1580{
@@ -1857,6 +1760,230 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1857 } 1760 }
1858} 1761}
1859 1762
1763#ifdef CONFIG_XPS
1764static DEFINE_MUTEX(xps_map_mutex);
1765#define xmap_dereference(P) \
1766 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1767
1768static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1769 int cpu, u16 index)
1770{
1771 struct xps_map *map = NULL;
1772 int pos;
1773
1774 if (dev_maps)
1775 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1776
1777 for (pos = 0; map && pos < map->len; pos++) {
1778 if (map->queues[pos] == index) {
1779 if (map->len > 1) {
1780 map->queues[pos] = map->queues[--map->len];
1781 } else {
1782 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1783 kfree_rcu(map, rcu);
1784 map = NULL;
1785 }
1786 break;
1787 }
1788 }
1789
1790 return map;
1791}
1792
1793static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1794{
1795 struct xps_dev_maps *dev_maps;
1796 int cpu, i;
1797 bool active = false;
1798
1799 mutex_lock(&xps_map_mutex);
1800 dev_maps = xmap_dereference(dev->xps_maps);
1801
1802 if (!dev_maps)
1803 goto out_no_maps;
1804
1805 for_each_possible_cpu(cpu) {
1806 for (i = index; i < dev->num_tx_queues; i++) {
1807 if (!remove_xps_queue(dev_maps, cpu, i))
1808 break;
1809 }
1810 if (i == dev->num_tx_queues)
1811 active = true;
1812 }
1813
1814 if (!active) {
1815 RCU_INIT_POINTER(dev->xps_maps, NULL);
1816 kfree_rcu(dev_maps, rcu);
1817 }
1818
1819 for (i = index; i < dev->num_tx_queues; i++)
1820 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1821 NUMA_NO_NODE);
1822
1823out_no_maps:
1824 mutex_unlock(&xps_map_mutex);
1825}
1826
1827static struct xps_map *expand_xps_map(struct xps_map *map,
1828 int cpu, u16 index)
1829{
1830 struct xps_map *new_map;
1831 int alloc_len = XPS_MIN_MAP_ALLOC;
1832 int i, pos;
1833
1834 for (pos = 0; map && pos < map->len; pos++) {
1835 if (map->queues[pos] != index)
1836 continue;
1837 return map;
1838 }
1839
1840 /* Need to add queue to this CPU's existing map */
1841 if (map) {
1842 if (pos < map->alloc_len)
1843 return map;
1844
1845 alloc_len = map->alloc_len * 2;
1846 }
1847
1848 /* Need to allocate new map to store queue on this CPU's map */
1849 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1850 cpu_to_node(cpu));
1851 if (!new_map)
1852 return NULL;
1853
1854 for (i = 0; i < pos; i++)
1855 new_map->queues[i] = map->queues[i];
1856 new_map->alloc_len = alloc_len;
1857 new_map->len = pos;
1858
1859 return new_map;
1860}
1861
1862int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1863{
1864 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1865 struct xps_map *map, *new_map;
1866 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1867 int cpu, numa_node_id = -2;
1868 bool active = false;
1869
1870 mutex_lock(&xps_map_mutex);
1871
1872 dev_maps = xmap_dereference(dev->xps_maps);
1873
1874 /* allocate memory for queue storage */
1875 for_each_online_cpu(cpu) {
1876 if (!cpumask_test_cpu(cpu, mask))
1877 continue;
1878
1879 if (!new_dev_maps)
1880 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1881 if (!new_dev_maps) {
1882 mutex_unlock(&xps_map_mutex);
1883 return -ENOMEM;
1884 }
1885
1886 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1887 NULL;
1888
1889 map = expand_xps_map(map, cpu, index);
1890 if (!map)
1891 goto error;
1892
1893 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1894 }
1895
1896 if (!new_dev_maps)
1897 goto out_no_new_maps;
1898
1899 for_each_possible_cpu(cpu) {
1900 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1901 /* add queue to CPU maps */
1902 int pos = 0;
1903
1904 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1905 while ((pos < map->len) && (map->queues[pos] != index))
1906 pos++;
1907
1908 if (pos == map->len)
1909 map->queues[map->len++] = index;
1910#ifdef CONFIG_NUMA
1911 if (numa_node_id == -2)
1912 numa_node_id = cpu_to_node(cpu);
1913 else if (numa_node_id != cpu_to_node(cpu))
1914 numa_node_id = -1;
1915#endif
1916 } else if (dev_maps) {
1917 /* fill in the new device map from the old device map */
1918 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1919 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1920 }
1921
1922 }
1923
1924 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1925
1926 /* Cleanup old maps */
1927 if (dev_maps) {
1928 for_each_possible_cpu(cpu) {
1929 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1930 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1931 if (map && map != new_map)
1932 kfree_rcu(map, rcu);
1933 }
1934
1935 kfree_rcu(dev_maps, rcu);
1936 }
1937
1938 dev_maps = new_dev_maps;
1939 active = true;
1940
1941out_no_new_maps:
1942 /* update Tx queue numa node */
1943 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1944 (numa_node_id >= 0) ? numa_node_id :
1945 NUMA_NO_NODE);
1946
1947 if (!dev_maps)
1948 goto out_no_maps;
1949
1950 /* removes queue from unused CPUs */
1951 for_each_possible_cpu(cpu) {
1952 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1953 continue;
1954
1955 if (remove_xps_queue(dev_maps, cpu, index))
1956 active = true;
1957 }
1958
1959 /* free map if not active */
1960 if (!active) {
1961 RCU_INIT_POINTER(dev->xps_maps, NULL);
1962 kfree_rcu(dev_maps, rcu);
1963 }
1964
1965out_no_maps:
1966 mutex_unlock(&xps_map_mutex);
1967
1968 return 0;
1969error:
1970 /* remove any maps that we added */
1971 for_each_possible_cpu(cpu) {
1972 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1973 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1974 NULL;
1975 if (new_map && new_map != map)
1976 kfree(new_map);
1977 }
1978
1979 mutex_unlock(&xps_map_mutex);
1980
1981 kfree(new_dev_maps);
1982 return -ENOMEM;
1983}
1984EXPORT_SYMBOL(netif_set_xps_queue);
1985
1986#endif
1860/* 1987/*
1861 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues 1988 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1862 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. 1989 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
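A hypothetical driver-side use of the newly exported netif_set_xps_queue() (example_setup_xps() and its one-queue-per-CPU policy are illustrative only, not part of this patch):

	static int example_setup_xps(struct net_device *dev)
	{
		cpumask_var_t mask;
		int i, err = 0;

		if (!alloc_cpumask_var(&mask, GFP_KERNEL))
			return -ENOMEM;

		for (i = 0; i < dev->real_num_tx_queues && !err; i++) {
			cpumask_clear(mask);
			cpumask_set_cpu(i % num_online_cpus(), mask);
			/* prefer transmitting queue i from this CPU */
			err = netif_set_xps_queue(dev, mask, i);
		}

		free_cpumask_var(mask);
		return err;
	}
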
@@ -1880,8 +2007,12 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1880 if (dev->num_tc) 2007 if (dev->num_tc)
1881 netif_setup_tc(dev, txq); 2008 netif_setup_tc(dev, txq);
1882 2009
1883 if (txq < dev->real_num_tx_queues) 2010 if (txq < dev->real_num_tx_queues) {
1884 qdisc_reset_all_tx_gt(dev, txq); 2011 qdisc_reset_all_tx_gt(dev, txq);
2012#ifdef CONFIG_XPS
2013 netif_reset_xps_queues_gt(dev, txq);
2014#endif
2015 }
1885 } 2016 }
1886 2017
1887 dev->real_num_tx_queues = txq; 2018 dev->real_num_tx_queues = txq;
@@ -2046,6 +2177,15 @@ int skb_checksum_help(struct sk_buff *skb)
2046 return -EINVAL; 2177 return -EINVAL;
2047 } 2178 }
2048 2179
2180 /* Before computing a checksum, we should make sure no frag could
2181 * be modified by an external entity : checksum could be wrong.
2182 */
2183 if (skb_has_shared_frag(skb)) {
2184 ret = __skb_linearize(skb);
2185 if (ret)
2186 goto out;
2187 }
2188
2049 offset = skb_checksum_start_offset(skb); 2189 offset = skb_checksum_start_offset(skb);
2050 BUG_ON(offset >= skb_headlen(skb)); 2190 BUG_ON(offset >= skb_headlen(skb));
2051 csum = skb_checksum(skb, offset, skb->len - offset, 0); 2191 csum = skb_checksum(skb, offset, skb->len - offset, 0);
@@ -2069,25 +2209,19 @@ out:
2069EXPORT_SYMBOL(skb_checksum_help); 2209EXPORT_SYMBOL(skb_checksum_help);
2070 2210
2071/** 2211/**
2072 * skb_gso_segment - Perform segmentation on skb. 2212 * skb_mac_gso_segment - mac layer segmentation handler.
2073 * @skb: buffer to segment 2213 * @skb: buffer to segment
2074 * @features: features for the output path (see dev->features) 2214 * @features: features for the output path (see dev->features)
2075 *
2076 * This function segments the given skb and returns a list of segments.
2077 *
2078 * It may return NULL if the skb requires no segmentation. This is
2079 * only possible when GSO is used for verifying header integrity.
2080 */ 2215 */
2081struct sk_buff *skb_gso_segment(struct sk_buff *skb, 2216struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2082 netdev_features_t features) 2217 netdev_features_t features)
2083{ 2218{
2084 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); 2219 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2085 struct packet_offload *ptype; 2220 struct packet_offload *ptype;
2086 __be16 type = skb->protocol; 2221 __be16 type = skb->protocol;
2087 int vlan_depth = ETH_HLEN;
2088 int err;
2089 2222
2090 while (type == htons(ETH_P_8021Q)) { 2223 while (type == htons(ETH_P_8021Q)) {
2224 int vlan_depth = ETH_HLEN;
2091 struct vlan_hdr *vh; 2225 struct vlan_hdr *vh;
2092 2226
2093 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) 2227 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
@@ -2098,22 +2232,14 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb,
2098 vlan_depth += VLAN_HLEN; 2232 vlan_depth += VLAN_HLEN;
2099 } 2233 }
2100 2234
2101 skb_reset_mac_header(skb);
2102 skb->mac_len = skb->network_header - skb->mac_header;
2103 __skb_pull(skb, skb->mac_len); 2235 __skb_pull(skb, skb->mac_len);
2104 2236
2105 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2106 skb_warn_bad_offload(skb);
2107
2108 if (skb_header_cloned(skb) &&
2109 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2110 return ERR_PTR(err);
2111 }
2112
2113 rcu_read_lock(); 2237 rcu_read_lock();
2114 list_for_each_entry_rcu(ptype, &offload_base, list) { 2238 list_for_each_entry_rcu(ptype, &offload_base, list) {
2115 if (ptype->type == type && ptype->callbacks.gso_segment) { 2239 if (ptype->type == type && ptype->callbacks.gso_segment) {
2116 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { 2240 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2241 int err;
2242
2117 err = ptype->callbacks.gso_send_check(skb); 2243 err = ptype->callbacks.gso_send_check(skb);
2118 segs = ERR_PTR(err); 2244 segs = ERR_PTR(err);
2119 if (err || skb_gso_ok(skb, features)) 2245 if (err || skb_gso_ok(skb, features))
@@ -2131,7 +2257,50 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb,
2131 2257
2132 return segs; 2258 return segs;
2133} 2259}
2134EXPORT_SYMBOL(skb_gso_segment); 2260EXPORT_SYMBOL(skb_mac_gso_segment);
2261
2262
2263/* openvswitch calls this on rx path, so we need a different check.
2264 */
2265static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2266{
2267 if (tx_path)
2268 return skb->ip_summed != CHECKSUM_PARTIAL;
2269 else
2270 return skb->ip_summed == CHECKSUM_NONE;
2271}
2272
2273/**
2274 * __skb_gso_segment - Perform segmentation on skb.
2275 * @skb: buffer to segment
2276 * @features: features for the output path (see dev->features)
2277 * @tx_path: whether it is called in TX path
2278 *
2279 * This function segments the given skb and returns a list of segments.
2280 *
2281 * It may return NULL if the skb requires no segmentation. This is
2282 * only possible when GSO is used for verifying header integrity.
2283 */
2284struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2285 netdev_features_t features, bool tx_path)
2286{
2287 if (unlikely(skb_needs_check(skb, tx_path))) {
2288 int err;
2289
2290 skb_warn_bad_offload(skb);
2291
2292 if (skb_header_cloned(skb) &&
2293 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2294 return ERR_PTR(err);
2295 }
2296
2297 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2298 skb_reset_mac_header(skb);
2299 skb_reset_mac_len(skb);
2300
2301 return skb_mac_gso_segment(skb, features);
2302}
2303EXPORT_SYMBOL(__skb_gso_segment);
2135 2304
2136/* Take action when hardware reception checksum errors are detected. */ 2305/* Take action when hardware reception checksum errors are detected. */
2137#ifdef CONFIG_BUG 2306#ifdef CONFIG_BUG
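Existing TX-path callers keep calling skb_gso_segment(); the wrapper assumed to accompany this change (added in netdevice.h, not shown in this diff) is along the lines of:

	static inline struct sk_buff *skb_gso_segment(struct sk_buff *skb,
						      netdev_features_t features)
	{
		return __skb_gso_segment(skb, features, true);
	}
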
@@ -2410,126 +2579,28 @@ out:
2410 return rc; 2579 return rc;
2411} 2580}
2412 2581
2413static u32 hashrnd __read_mostly; 2582static void qdisc_pkt_len_init(struct sk_buff *skb)
2414
2415/*
2416 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2417 * to be used as a distribution range.
2418 */
2419u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2420 unsigned int num_tx_queues)
2421{
2422 u32 hash;
2423 u16 qoffset = 0;
2424 u16 qcount = num_tx_queues;
2425
2426 if (skb_rx_queue_recorded(skb)) {
2427 hash = skb_get_rx_queue(skb);
2428 while (unlikely(hash >= num_tx_queues))
2429 hash -= num_tx_queues;
2430 return hash;
2431 }
2432
2433 if (dev->num_tc) {
2434 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2435 qoffset = dev->tc_to_txq[tc].offset;
2436 qcount = dev->tc_to_txq[tc].count;
2437 }
2438
2439 if (skb->sk && skb->sk->sk_hash)
2440 hash = skb->sk->sk_hash;
2441 else
2442 hash = (__force u16) skb->protocol;
2443 hash = jhash_1word(hash, hashrnd);
2444
2445 return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2446}
2447EXPORT_SYMBOL(__skb_tx_hash);
2448
2449static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2450{
2451 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2452 net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2453 dev->name, queue_index,
2454 dev->real_num_tx_queues);
2455 return 0;
2456 }
2457 return queue_index;
2458}
2459
2460static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2461{ 2583{
2462#ifdef CONFIG_XPS 2584 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2463 struct xps_dev_maps *dev_maps;
2464 struct xps_map *map;
2465 int queue_index = -1;
2466 2585
2467 rcu_read_lock(); 2586 qdisc_skb_cb(skb)->pkt_len = skb->len;
2468 dev_maps = rcu_dereference(dev->xps_maps);
2469 if (dev_maps) {
2470 map = rcu_dereference(
2471 dev_maps->cpu_map[raw_smp_processor_id()]);
2472 if (map) {
2473 if (map->len == 1)
2474 queue_index = map->queues[0];
2475 else {
2476 u32 hash;
2477 if (skb->sk && skb->sk->sk_hash)
2478 hash = skb->sk->sk_hash;
2479 else
2480 hash = (__force u16) skb->protocol ^
2481 skb->rxhash;
2482 hash = jhash_1word(hash, hashrnd);
2483 queue_index = map->queues[
2484 ((u64)hash * map->len) >> 32];
2485 }
2486 if (unlikely(queue_index >= dev->real_num_tx_queues))
2487 queue_index = -1;
2488 }
2489 }
2490 rcu_read_unlock();
2491
2492 return queue_index;
2493#else
2494 return -1;
2495#endif
2496}
2497
2498struct netdev_queue *netdev_pick_tx(struct net_device *dev,
2499 struct sk_buff *skb)
2500{
2501 int queue_index;
2502 const struct net_device_ops *ops = dev->netdev_ops;
2503
2504 if (dev->real_num_tx_queues == 1)
2505 queue_index = 0;
2506 else if (ops->ndo_select_queue) {
2507 queue_index = ops->ndo_select_queue(dev, skb);
2508 queue_index = dev_cap_txqueue(dev, queue_index);
2509 } else {
2510 struct sock *sk = skb->sk;
2511 queue_index = sk_tx_queue_get(sk);
2512
2513 if (queue_index < 0 || skb->ooo_okay ||
2514 queue_index >= dev->real_num_tx_queues) {
2515 int old_index = queue_index;
2516 2587
2517 queue_index = get_xps_queue(dev, skb); 2588 /* To get more precise estimation of bytes sent on wire,
2518 if (queue_index < 0) 2589 * we add to pkt_len the headers size of all segments
2519 queue_index = skb_tx_hash(dev, skb); 2590 */
2591 if (shinfo->gso_size) {
2592 unsigned int hdr_len;
2520 2593
2521 if (queue_index != old_index && sk) { 2594 /* mac layer + network layer */
2522 struct dst_entry *dst = 2595 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2523 rcu_dereference_check(sk->sk_dst_cache, 1);
2524 2596
2525 if (dst && skb_dst(skb) == dst) 2597 /* + transport layer */
2526 sk_tx_queue_set(sk, queue_index); 2598 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2527 } 2599 hdr_len += tcp_hdrlen(skb);
2528 } 2600 else
2601 hdr_len += sizeof(struct udphdr);
2602 qdisc_skb_cb(skb)->pkt_len += (shinfo->gso_segs - 1) * hdr_len;
2529 } 2603 }
2530
2531 skb_set_queue_mapping(skb, queue_index);
2532 return netdev_get_tx_queue(dev, queue_index);
2533} 2604}
2534 2605
2535static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, 2606static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
@@ -2540,7 +2611,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2540 bool contended; 2611 bool contended;
2541 int rc; 2612 int rc;
2542 2613
2543 qdisc_skb_cb(skb)->pkt_len = skb->len; 2614 qdisc_pkt_len_init(skb);
2544 qdisc_calculate_pkt_len(skb, q); 2615 qdisc_calculate_pkt_len(skb, q);
2545 /* 2616 /*
2546 * Heuristic to force contended enqueues to serialize on a 2617 * Heuristic to force contended enqueues to serialize on a
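As a worked example of qdisc_pkt_len_init() (numbers are illustrative): a TCPv4 GSO skb with skb->len = 5890, gso_segs = 4 and 66 bytes of mac + IP + TCP headers is accounted as pkt_len = 5890 + (4 - 1) * 66 = 6088, which is much closer to the bytes that will actually hit the wire than the old skb->len accounting.
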
@@ -2663,6 +2734,8 @@ int dev_queue_xmit(struct sk_buff *skb)
2663 struct Qdisc *q; 2734 struct Qdisc *q;
2664 int rc = -ENOMEM; 2735 int rc = -ENOMEM;
2665 2736
2737 skb_reset_mac_header(skb);
2738
2666 /* Disable soft irqs for various locks below. Also 2739 /* Disable soft irqs for various locks below. Also
2667 * stops preemption for RCU. 2740 * stops preemption for RCU.
2668 */ 2741 */
@@ -2757,41 +2830,6 @@ static inline void ____napi_schedule(struct softnet_data *sd,
2757 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 2830 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2758} 2831}
2759 2832
2760/*
2761 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2762 * and src/dst port numbers. Sets rxhash in skb to non-zero hash value
2763 * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb
2764 * if hash is a canonical 4-tuple hash over transport ports.
2765 */
2766void __skb_get_rxhash(struct sk_buff *skb)
2767{
2768 struct flow_keys keys;
2769 u32 hash;
2770
2771 if (!skb_flow_dissect(skb, &keys))
2772 return;
2773
2774 if (keys.ports)
2775 skb->l4_rxhash = 1;
2776
2777 /* get a consistent hash (same value on both flow directions) */
2778 if (((__force u32)keys.dst < (__force u32)keys.src) ||
2779 (((__force u32)keys.dst == (__force u32)keys.src) &&
2780 ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) {
2781 swap(keys.dst, keys.src);
2782 swap(keys.port16[0], keys.port16[1]);
2783 }
2784
2785 hash = jhash_3words((__force u32)keys.dst,
2786 (__force u32)keys.src,
2787 (__force u32)keys.ports, hashrnd);
2788 if (!hash)
2789 hash = 1;
2790
2791 skb->rxhash = hash;
2792}
2793EXPORT_SYMBOL(__skb_get_rxhash);
2794
2795#ifdef CONFIG_RPS 2833#ifdef CONFIG_RPS
2796 2834
2797/* One global table that all flow-based protocols share. */ 2835/* One global table that all flow-based protocols share. */
@@ -3318,7 +3356,7 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3318 } 3356 }
3319} 3357}
3320 3358
3321static int __netif_receive_skb(struct sk_buff *skb) 3359static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3322{ 3360{
3323 struct packet_type *ptype, *pt_prev; 3361 struct packet_type *ptype, *pt_prev;
3324 rx_handler_func_t *rx_handler; 3362 rx_handler_func_t *rx_handler;
@@ -3327,24 +3365,11 @@ static int __netif_receive_skb(struct sk_buff *skb)
3327 bool deliver_exact = false; 3365 bool deliver_exact = false;
3328 int ret = NET_RX_DROP; 3366 int ret = NET_RX_DROP;
3329 __be16 type; 3367 __be16 type;
3330 unsigned long pflags = current->flags;
3331 3368
3332 net_timestamp_check(!netdev_tstamp_prequeue, skb); 3369 net_timestamp_check(!netdev_tstamp_prequeue, skb);
3333 3370
3334 trace_netif_receive_skb(skb); 3371 trace_netif_receive_skb(skb);
3335 3372
3336 /*
3337 * PFMEMALLOC skbs are special, they should
3338 * - be delivered to SOCK_MEMALLOC sockets only
3339 * - stay away from userspace
3340 * - have bounded memory usage
3341 *
3342 * Use PF_MEMALLOC as this saves us from propagating the allocation
3343 * context down to all allocation sites.
3344 */
3345 if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3346 current->flags |= PF_MEMALLOC;
3347
3348 /* if we've gotten here through NAPI, check netpoll */ 3373 /* if we've gotten here through NAPI, check netpoll */
3349 if (netpoll_receive_skb(skb)) 3374 if (netpoll_receive_skb(skb))
3350 goto out; 3375 goto out;
@@ -3352,7 +3377,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
3352 orig_dev = skb->dev; 3377 orig_dev = skb->dev;
3353 3378
3354 skb_reset_network_header(skb); 3379 skb_reset_network_header(skb);
3355 skb_reset_transport_header(skb); 3380 if (!skb_transport_header_was_set(skb))
3381 skb_reset_transport_header(skb);
3356 skb_reset_mac_len(skb); 3382 skb_reset_mac_len(skb);
3357 3383
3358 pt_prev = NULL; 3384 pt_prev = NULL;
@@ -3377,7 +3403,7 @@ another_round:
3377 } 3403 }
3378#endif 3404#endif
3379 3405
3380 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) 3406 if (pfmemalloc)
3381 goto skip_taps; 3407 goto skip_taps;
3382 3408
3383 list_for_each_entry_rcu(ptype, &ptype_all, list) { 3409 list_for_each_entry_rcu(ptype, &ptype_all, list) {
@@ -3396,8 +3422,7 @@ skip_taps:
3396ncls: 3422ncls:
3397#endif 3423#endif
3398 3424
3399 if (sk_memalloc_socks() && skb_pfmemalloc(skb) 3425 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3400 && !skb_pfmemalloc_protocol(skb))
3401 goto drop; 3426 goto drop;
3402 3427
3403 if (vlan_tx_tag_present(skb)) { 3428 if (vlan_tx_tag_present(skb)) {
@@ -3467,7 +3492,31 @@ drop:
3467unlock: 3492unlock:
3468 rcu_read_unlock(); 3493 rcu_read_unlock();
3469out: 3494out:
3470 tsk_restore_flags(current, pflags, PF_MEMALLOC); 3495 return ret;
3496}
3497
3498static int __netif_receive_skb(struct sk_buff *skb)
3499{
3500 int ret;
3501
3502 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3503 unsigned long pflags = current->flags;
3504
3505 /*
3506 * PFMEMALLOC skbs are special, they should
3507 * - be delivered to SOCK_MEMALLOC sockets only
3508 * - stay away from userspace
3509 * - have bounded memory usage
3510 *
3511 * Use PF_MEMALLOC as this saves us from propagating the allocation
3512 * context down to all allocation sites.
3513 */
3514 current->flags |= PF_MEMALLOC;
3515 ret = __netif_receive_skb_core(skb, true);
3516 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3517 } else
3518 ret = __netif_receive_skb_core(skb, false);
3519
3471 return ret; 3520 return ret;
3472} 3521}
3473 3522
@@ -3634,7 +3683,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
3634 __be16 type = skb->protocol; 3683 __be16 type = skb->protocol;
3635 struct list_head *head = &offload_base; 3684 struct list_head *head = &offload_base;
3636 int same_flow; 3685 int same_flow;
3637 int mac_len;
3638 enum gro_result ret; 3686 enum gro_result ret;
3639 3687
3640 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) 3688 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
@@ -3651,8 +3699,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
3651 continue; 3699 continue;
3652 3700
3653 skb_set_network_header(skb, skb_gro_offset(skb)); 3701 skb_set_network_header(skb, skb_gro_offset(skb));
3654 mac_len = skb->network_header - skb->mac_header; 3702 skb_reset_mac_len(skb);
3655 skb->mac_len = mac_len;
3656 NAPI_GRO_CB(skb)->same_flow = 0; 3703 NAPI_GRO_CB(skb)->same_flow = 0;
3657 NAPI_GRO_CB(skb)->flush = 0; 3704 NAPI_GRO_CB(skb)->flush = 0;
3658 NAPI_GRO_CB(skb)->free = 0; 3705 NAPI_GRO_CB(skb)->free = 0;
@@ -4134,530 +4181,231 @@ softnet_break:
4134 goto out; 4181 goto out;
4135} 4182}
4136 4183
4137static gifconf_func_t *gifconf_list[NPROTO]; 4184struct netdev_upper {
4138
4139/**
4140 * register_gifconf - register a SIOCGIF handler
4141 * @family: Address family
4142 * @gifconf: Function handler
4143 *
4144 * Register protocol dependent address dumping routines. The handler
4145 * that is passed must not be freed or reused until it has been replaced
4146 * by another handler.
4147 */
4148int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
4149{
4150 if (family >= NPROTO)
4151 return -EINVAL;
4152 gifconf_list[family] = gifconf;
4153 return 0;
4154}
4155EXPORT_SYMBOL(register_gifconf);
4156
4157
4158/*
4159 * Map an interface index to its name (SIOCGIFNAME)
4160 */
4161
4162/*
4163 * We need this ioctl for efficient implementation of the
4164 * if_indextoname() function required by the IPv6 API. Without
4165 * it, we would have to search all the interfaces to find a
4166 * match. --pb
4167 */
4168
4169static int dev_ifname(struct net *net, struct ifreq __user *arg)
4170{
4171 struct net_device *dev; 4185 struct net_device *dev;
4172 struct ifreq ifr; 4186 bool master;
4173 unsigned seq; 4187 struct list_head list;
4174 4188 struct rcu_head rcu;
4175 /* 4189 struct list_head search_list;
4176 * Fetch the caller's info block. 4190};
4177 */
4178 4191
4179 if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) 4192static void __append_search_uppers(struct list_head *search_list,
4180 return -EFAULT; 4193 struct net_device *dev)
4194{
4195 struct netdev_upper *upper;
4181 4196
4182retry: 4197 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4183 seq = read_seqbegin(&devnet_rename_seq); 4198 /* check if this upper is not already in search list */
4184 rcu_read_lock(); 4199 if (list_empty(&upper->search_list))
4185 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex); 4200 list_add_tail(&upper->search_list, search_list);
4186 if (!dev) {
4187 rcu_read_unlock();
4188 return -ENODEV;
4189 } 4201 }
4190
4191 strcpy(ifr.ifr_name, dev->name);
4192 rcu_read_unlock();
4193 if (read_seqretry(&devnet_rename_seq, seq))
4194 goto retry;
4195
4196 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4197 return -EFAULT;
4198 return 0;
4199} 4202}
4200 4203
4201/* 4204static bool __netdev_search_upper_dev(struct net_device *dev,
4202 * Perform a SIOCGIFCONF call. This structure will change 4205 struct net_device *upper_dev)
4203 * size eventually, and there is nothing I can do about it.
4204 * Thus we will need a 'compatibility mode'.
4205 */
4206
4207static int dev_ifconf(struct net *net, char __user *arg)
4208{ 4206{
4209 struct ifconf ifc; 4207 LIST_HEAD(search_list);
4210 struct net_device *dev; 4208 struct netdev_upper *upper;
4211 char __user *pos; 4209 struct netdev_upper *tmp;
4212 int len; 4210 bool ret = false;
4213 int total;
4214 int i;
4215 4211
4216 /* 4212 __append_search_uppers(&search_list, dev);
4217 * Fetch the caller's info block. 4213 list_for_each_entry(upper, &search_list, search_list) {
4218 */ 4214 if (upper->dev == upper_dev) {
4219 4215 ret = true;
4220 if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) 4216 break;
4221 return -EFAULT;
4222
4223 pos = ifc.ifc_buf;
4224 len = ifc.ifc_len;
4225
4226 /*
4227 * Loop over the interfaces, and write an info block for each.
4228 */
4229
4230 total = 0;
4231 for_each_netdev(net, dev) {
4232 for (i = 0; i < NPROTO; i++) {
4233 if (gifconf_list[i]) {
4234 int done;
4235 if (!pos)
4236 done = gifconf_list[i](dev, NULL, 0);
4237 else
4238 done = gifconf_list[i](dev, pos + total,
4239 len - total);
4240 if (done < 0)
4241 return -EFAULT;
4242 total += done;
4243 }
4244 } 4217 }
4218 __append_search_uppers(&search_list, upper->dev);
4245 } 4219 }
4246 4220 list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4247 /* 4221 INIT_LIST_HEAD(&upper->search_list);
4248 * All done. Write the updated control block back to the caller. 4222 return ret;
4249 */
4250 ifc.ifc_len = total;
4251
4252 /*
4253 * Both BSD and Solaris return 0 here, so we do too.
4254 */
4255 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4256} 4223}
4257 4224
4258#ifdef CONFIG_PROC_FS 4225static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4259 4226 struct net_device *upper_dev)
4260#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4261
4262#define get_bucket(x) ((x) >> BUCKET_SPACE)
4263#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4264#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4265
4266static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4267{ 4227{
4268 struct net *net = seq_file_net(seq); 4228 struct netdev_upper *upper;
4269 struct net_device *dev;
4270 struct hlist_node *p;
4271 struct hlist_head *h;
4272 unsigned int count = 0, offset = get_offset(*pos);
4273 4229
4274 h = &net->dev_name_head[get_bucket(*pos)]; 4230 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4275 hlist_for_each_entry_rcu(dev, p, h, name_hlist) { 4231 if (upper->dev == upper_dev)
4276 if (++count == offset) 4232 return upper;
4277 return dev;
4278 } 4233 }
4279
4280 return NULL; 4234 return NULL;
4281} 4235}
4282 4236
4283static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos) 4237/**
4284{ 4238 * netdev_has_upper_dev - Check if device is linked to an upper device
4285 struct net_device *dev; 4239 * @dev: device
4286 unsigned int bucket; 4240 * @upper_dev: upper device to check
4287 4241 *
4288 do { 4242 * Find out if a device is linked to specified upper device and return true
4289 dev = dev_from_same_bucket(seq, pos); 4243 * in case it is. Note that this checks only immediate upper device,
4290 if (dev) 4244 * not through a complete stack of devices. The caller must hold the RTNL lock.
4291 return dev;
4292
4293 bucket = get_bucket(*pos) + 1;
4294 *pos = set_bucket_offset(bucket, 1);
4295 } while (bucket < NETDEV_HASHENTRIES);
4296
4297 return NULL;
4298}
4299
4300/*
4301 * This is invoked by the /proc filesystem handler to display a device
4302 * in detail.
4303 */ 4245 */
4304void *dev_seq_start(struct seq_file *seq, loff_t *pos) 4246bool netdev_has_upper_dev(struct net_device *dev,
4305 __acquires(RCU) 4247 struct net_device *upper_dev)
4306{
4307 rcu_read_lock();
4308 if (!*pos)
4309 return SEQ_START_TOKEN;
4310
4311 if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4312 return NULL;
4313
4314 return dev_from_bucket(seq, pos);
4315}
4316
4317void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4318{
4319 ++*pos;
4320 return dev_from_bucket(seq, pos);
4321}
4322
4323void dev_seq_stop(struct seq_file *seq, void *v)
4324 __releases(RCU)
4325{
4326 rcu_read_unlock();
4327}
4328
4329static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4330{ 4248{
4331 struct rtnl_link_stats64 temp; 4249 ASSERT_RTNL();
4332 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4333 4250
4334 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " 4251 return __netdev_find_upper(dev, upper_dev);
4335 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4336 dev->name, stats->rx_bytes, stats->rx_packets,
4337 stats->rx_errors,
4338 stats->rx_dropped + stats->rx_missed_errors,
4339 stats->rx_fifo_errors,
4340 stats->rx_length_errors + stats->rx_over_errors +
4341 stats->rx_crc_errors + stats->rx_frame_errors,
4342 stats->rx_compressed, stats->multicast,
4343 stats->tx_bytes, stats->tx_packets,
4344 stats->tx_errors, stats->tx_dropped,
4345 stats->tx_fifo_errors, stats->collisions,
4346 stats->tx_carrier_errors +
4347 stats->tx_aborted_errors +
4348 stats->tx_window_errors +
4349 stats->tx_heartbeat_errors,
4350 stats->tx_compressed);
4351} 4252}
4253EXPORT_SYMBOL(netdev_has_upper_dev);
4352 4254
4353/* 4255/**
4354 * Called from the PROCfs module. This now uses the new arbitrary sized 4256 * netdev_has_any_upper_dev - Check if device is linked to some device
4355 * /proc/net interface to create /proc/net/dev 4257 * @dev: device
4258 *
4259 * Find out if a device is linked to an upper device and return true in case
4260 * it is. The caller must hold the RTNL lock.
4356 */ 4261 */
4357static int dev_seq_show(struct seq_file *seq, void *v) 4262bool netdev_has_any_upper_dev(struct net_device *dev)
4358{
4359 if (v == SEQ_START_TOKEN)
4360 seq_puts(seq, "Inter-| Receive "
4361 " | Transmit\n"
4362 " face |bytes packets errs drop fifo frame "
4363 "compressed multicast|bytes packets errs "
4364 "drop fifo colls carrier compressed\n");
4365 else
4366 dev_seq_printf_stats(seq, v);
4367 return 0;
4368}
4369
4370static struct softnet_data *softnet_get_online(loff_t *pos)
4371{
4372 struct softnet_data *sd = NULL;
4373
4374 while (*pos < nr_cpu_ids)
4375 if (cpu_online(*pos)) {
4376 sd = &per_cpu(softnet_data, *pos);
4377 break;
4378 } else
4379 ++*pos;
4380 return sd;
4381}
4382
4383static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4384{ 4263{
4385 return softnet_get_online(pos); 4264 ASSERT_RTNL();
4386}
4387
4388static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4389{
4390 ++*pos;
4391 return softnet_get_online(pos);
4392}
4393 4265
4394static void softnet_seq_stop(struct seq_file *seq, void *v) 4266 return !list_empty(&dev->upper_dev_list);
4395{
4396} 4267}
4268EXPORT_SYMBOL(netdev_has_any_upper_dev);
4397 4269
4398static int softnet_seq_show(struct seq_file *seq, void *v) 4270/**
4399{ 4271 * netdev_master_upper_dev_get - Get master upper device
4400 struct softnet_data *sd = v; 4272 * @dev: device
4401 4273 *
4402 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", 4274 * Find a master upper device and return pointer to it or NULL in case
4403 sd->processed, sd->dropped, sd->time_squeeze, 0, 4275 * it's not there. The caller must hold the RTNL lock.
4404 0, 0, 0, 0, /* was fastroute */ 4276 */
4405 sd->cpu_collision, sd->received_rps); 4277struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4406 return 0;
4407}
4408
4409static const struct seq_operations dev_seq_ops = {
4410 .start = dev_seq_start,
4411 .next = dev_seq_next,
4412 .stop = dev_seq_stop,
4413 .show = dev_seq_show,
4414};
4415
4416static int dev_seq_open(struct inode *inode, struct file *file)
4417{ 4278{
4418 return seq_open_net(inode, file, &dev_seq_ops, 4279 struct netdev_upper *upper;
4419 sizeof(struct seq_net_private));
4420}
4421 4280
4422static const struct file_operations dev_seq_fops = { 4281 ASSERT_RTNL();
4423 .owner = THIS_MODULE,
4424 .open = dev_seq_open,
4425 .read = seq_read,
4426 .llseek = seq_lseek,
4427 .release = seq_release_net,
4428};
4429 4282
4430static const struct seq_operations softnet_seq_ops = { 4283 if (list_empty(&dev->upper_dev_list))
4431 .start = softnet_seq_start, 4284 return NULL;
4432 .next = softnet_seq_next,
4433 .stop = softnet_seq_stop,
4434 .show = softnet_seq_show,
4435};
4436 4285
4437static int softnet_seq_open(struct inode *inode, struct file *file) 4286 upper = list_first_entry(&dev->upper_dev_list,
4438{ 4287 struct netdev_upper, list);
4439 return seq_open(file, &softnet_seq_ops); 4288 if (likely(upper->master))
4289 return upper->dev;
4290 return NULL;
4440} 4291}
4292EXPORT_SYMBOL(netdev_master_upper_dev_get);
4441 4293
4442static const struct file_operations softnet_seq_fops = { 4294/**
4443 .owner = THIS_MODULE, 4295 * netdev_master_upper_dev_get_rcu - Get master upper device
4444 .open = softnet_seq_open, 4296 * @dev: device
4445 .read = seq_read, 4297 *
4446 .llseek = seq_lseek, 4298 * Find a master upper device and return pointer to it or NULL in case
4447 .release = seq_release, 4299 * it's not there. The caller must hold the RCU read lock.
4448}; 4300 */
4449 4301struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4450static void *ptype_get_idx(loff_t pos)
4451{ 4302{
4452 struct packet_type *pt = NULL; 4303 struct netdev_upper *upper;
4453 loff_t i = 0;
4454 int t;
4455
4456 list_for_each_entry_rcu(pt, &ptype_all, list) {
4457 if (i == pos)
4458 return pt;
4459 ++i;
4460 }
4461 4304
4462 for (t = 0; t < PTYPE_HASH_SIZE; t++) { 4305 upper = list_first_or_null_rcu(&dev->upper_dev_list,
4463 list_for_each_entry_rcu(pt, &ptype_base[t], list) { 4306 struct netdev_upper, list);
4464 if (i == pos) 4307 if (upper && likely(upper->master))
4465 return pt; 4308 return upper->dev;
4466 ++i;
4467 }
4468 }
4469 return NULL; 4309 return NULL;
4470} 4310}
4311EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4471 4312
4472static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) 4313static int __netdev_upper_dev_link(struct net_device *dev,
4473 __acquires(RCU) 4314 struct net_device *upper_dev, bool master)
4474{ 4315{
4475 rcu_read_lock(); 4316 struct netdev_upper *upper;
4476 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4477}
4478 4317
4479static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) 4318 ASSERT_RTNL();
4480{
4481 struct packet_type *pt;
4482 struct list_head *nxt;
4483 int hash;
4484 4319
4485 ++*pos; 4320 if (dev == upper_dev)
4486 if (v == SEQ_START_TOKEN) 4321 return -EBUSY;
4487 return ptype_get_idx(0);
4488 4322
4489 pt = v; 4323 /* To prevent loops, check if dev is not upper device to upper_dev. */
4490 nxt = pt->list.next; 4324 if (__netdev_search_upper_dev(upper_dev, dev))
4491 if (pt->type == htons(ETH_P_ALL)) { 4325 return -EBUSY;
4492 if (nxt != &ptype_all)
4493 goto found;
4494 hash = 0;
4495 nxt = ptype_base[0].next;
4496 } else
4497 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4498 4326
4499 while (nxt == &ptype_base[hash]) { 4327 if (__netdev_find_upper(dev, upper_dev))
4500 if (++hash >= PTYPE_HASH_SIZE) 4328 return -EEXIST;
4501 return NULL;
4502 nxt = ptype_base[hash].next;
4503 }
4504found:
4505 return list_entry(nxt, struct packet_type, list);
4506}
4507 4329
4508static void ptype_seq_stop(struct seq_file *seq, void *v) 4330 if (master && netdev_master_upper_dev_get(dev))
4509 __releases(RCU) 4331 return -EBUSY;
4510{
4511 rcu_read_unlock();
4512}
4513 4332
4514static int ptype_seq_show(struct seq_file *seq, void *v) 4333 upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4515{ 4334 if (!upper)
4516 struct packet_type *pt = v; 4335 return -ENOMEM;
4517 4336
4518 if (v == SEQ_START_TOKEN) 4337 upper->dev = upper_dev;
4519 seq_puts(seq, "Type Device Function\n"); 4338 upper->master = master;
4520 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { 4339 INIT_LIST_HEAD(&upper->search_list);
4521 if (pt->type == htons(ETH_P_ALL))
4522 seq_puts(seq, "ALL ");
4523 else
4524 seq_printf(seq, "%04x", ntohs(pt->type));
4525 4340
4526 seq_printf(seq, " %-8s %pF\n", 4341 /* Ensure that master upper link is always the first item in list. */
4527 pt->dev ? pt->dev->name : "", pt->func); 4342 if (master)
4528 } 4343 list_add_rcu(&upper->list, &dev->upper_dev_list);
4344 else
4345 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4346 dev_hold(upper_dev);
4529 4347
4530 return 0; 4348 return 0;
4531} 4349}
4532 4350
4533static const struct seq_operations ptype_seq_ops = { 4351/**
4534 .start = ptype_seq_start, 4352 * netdev_upper_dev_link - Add a link to the upper device
4535 .next = ptype_seq_next, 4353 * @dev: device
4536 .stop = ptype_seq_stop, 4354 * @upper_dev: new upper device
4537 .show = ptype_seq_show, 4355 *
4538}; 4356 * Adds a link to device which is upper to this one. The caller must hold
4539 4357 * the RTNL lock. On a failure a negative errno code is returned.
4540static int ptype_seq_open(struct inode *inode, struct file *file) 4358 * On success the reference counts are adjusted and the function
4541{ 4359 * returns zero.
4542 return seq_open_net(inode, file, &ptype_seq_ops, 4360 */
4543 sizeof(struct seq_net_private)); 4361int netdev_upper_dev_link(struct net_device *dev,
4544} 4362 struct net_device *upper_dev)
4545
4546static const struct file_operations ptype_seq_fops = {
4547 .owner = THIS_MODULE,
4548 .open = ptype_seq_open,
4549 .read = seq_read,
4550 .llseek = seq_lseek,
4551 .release = seq_release_net,
4552};
4553
4554
4555static int __net_init dev_proc_net_init(struct net *net)
4556{
4557 int rc = -ENOMEM;
4558
4559 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4560 goto out;
4561 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4562 goto out_dev;
4563 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4564 goto out_softnet;
4565
4566 if (wext_proc_init(net))
4567 goto out_ptype;
4568 rc = 0;
4569out:
4570 return rc;
4571out_ptype:
4572 proc_net_remove(net, "ptype");
4573out_softnet:
4574 proc_net_remove(net, "softnet_stat");
4575out_dev:
4576 proc_net_remove(net, "dev");
4577 goto out;
4578}
4579
4580static void __net_exit dev_proc_net_exit(struct net *net)
4581{
4582 wext_proc_exit(net);
4583
4584 proc_net_remove(net, "ptype");
4585 proc_net_remove(net, "softnet_stat");
4586 proc_net_remove(net, "dev");
4587}
4588
4589static struct pernet_operations __net_initdata dev_proc_ops = {
4590 .init = dev_proc_net_init,
4591 .exit = dev_proc_net_exit,
4592};
4593
4594static int __init dev_proc_init(void)
4595{ 4363{
4596 return register_pernet_subsys(&dev_proc_ops); 4364 return __netdev_upper_dev_link(dev, upper_dev, false);
4597} 4365}
4598#else 4366EXPORT_SYMBOL(netdev_upper_dev_link);
4599#define dev_proc_init() 0
4600#endif /* CONFIG_PROC_FS */
4601
4602 4367
4603/** 4368/**
4604 * netdev_set_master - set up master pointer 4369 * netdev_master_upper_dev_link - Add a master link to the upper device
4605 * @slave: slave device 4370 * @dev: device
4606 * @master: new master device 4371 * @upper_dev: new upper device
4607 * 4372 *
4608 * Changes the master device of the slave. Pass %NULL to break the 4373 * Adds a link to device which is upper to this one. In this case, only
4609 * bonding. The caller must hold the RTNL semaphore. On a failure 4374 * one master upper device can be linked, although other non-master devices
4610 * a negative errno code is returned. On success the reference counts 4375 * might be linked as well. The caller must hold the RTNL lock.
4611 * are adjusted and the function returns zero. 4376 * On a failure a negative errno code is returned. On success the reference
4377 * counts are adjusted and the function returns zero.
4612 */ 4378 */
4613int netdev_set_master(struct net_device *slave, struct net_device *master) 4379int netdev_master_upper_dev_link(struct net_device *dev,
4380 struct net_device *upper_dev)
4614{ 4381{
4615 struct net_device *old = slave->master; 4382 return __netdev_upper_dev_link(dev, upper_dev, true);
4616
4617 ASSERT_RTNL();
4618
4619 if (master) {
4620 if (old)
4621 return -EBUSY;
4622 dev_hold(master);
4623 }
4624
4625 slave->master = master;
4626
4627 if (old)
4628 dev_put(old);
4629 return 0;
4630} 4383}
4631EXPORT_SYMBOL(netdev_set_master); 4384EXPORT_SYMBOL(netdev_master_upper_dev_link);
4632 4385
4633/** 4386/**
4634 * netdev_set_bond_master - set up bonding master/slave pair 4387 * netdev_upper_dev_unlink - Removes a link to upper device
4635 * @slave: slave device 4388 * @dev: device
4636 * @master: new master device 4389 * @upper_dev: new upper device
4637 * 4390 *
4638 * Changes the master device of the slave. Pass %NULL to break the 4391 * Removes a link to device which is upper to this one. The caller must hold
4639 * bonding. The caller must hold the RTNL semaphore. On a failure 4392 * the RTNL lock.
4640 * a negative errno code is returned. On success %RTM_NEWLINK is sent
4641 * to the routing socket and the function returns zero.
4642 */ 4393 */
4643int netdev_set_bond_master(struct net_device *slave, struct net_device *master) 4394void netdev_upper_dev_unlink(struct net_device *dev,
4395 struct net_device *upper_dev)
4644{ 4396{
4645 int err; 4397 struct netdev_upper *upper;
4646 4398
4647 ASSERT_RTNL(); 4399 ASSERT_RTNL();
4648 4400
4649 err = netdev_set_master(slave, master); 4401 upper = __netdev_find_upper(dev, upper_dev);
4650 if (err) 4402 if (!upper)
4651 return err; 4403 return;
4652 if (master) 4404 list_del_rcu(&upper->list);
4653 slave->flags |= IFF_SLAVE; 4405 dev_put(upper_dev);
4654 else 4406 kfree_rcu(upper, rcu);
4655 slave->flags &= ~IFF_SLAVE;
4656
4657 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4658 return 0;
4659} 4407}
4660EXPORT_SYMBOL(netdev_set_bond_master); 4408EXPORT_SYMBOL(netdev_upper_dev_unlink);
4661 4409
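Editor's note: a minimal sketch of how a bonding-style driver might use the new upper-device API in place of the removed netdev_set_master()/netdev_set_bond_master() helpers. The "foo" names are hypothetical; the RTNL and IFF_SLAVE handling mirrors the removed helper above, and per the kerneldoc the link call fails if the slave already has a master upper device.

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/if.h>

/* Hypothetical "foo" aggregation driver: enslave/release built on the
 * upper-device API.  Caller context: RTNL held. */
static int foo_enslave(struct net_device *master, struct net_device *slave)
{
	int err;

	ASSERT_RTNL();

	/* Link "master" as the single master upper device of "slave";
	 * fails if "slave" already has a master upper device. */
	err = netdev_master_upper_dev_link(slave, master);
	if (err)
		return err;

	slave->flags |= IFF_SLAVE;
	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
	return 0;
}

static void foo_release(struct net_device *master, struct net_device *slave)
{
	ASSERT_RTNL();

	/* Drops the upper link and the device reference taken at link time. */
	netdev_upper_dev_unlink(slave, master);

	slave->flags &= ~IFF_SLAVE;
	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
}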
4662static void dev_change_rx_flags(struct net_device *dev, int flags) 4410static void dev_change_rx_flags(struct net_device *dev, int flags)
4663{ 4411{
@@ -5020,381 +4768,33 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5020 if (!netif_device_present(dev)) 4768 if (!netif_device_present(dev))
5021 return -ENODEV; 4769 return -ENODEV;
5022 err = ops->ndo_set_mac_address(dev, sa); 4770 err = ops->ndo_set_mac_address(dev, sa);
5023 if (!err) 4771 if (err)
5024 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 4772 return err;
4773 dev->addr_assign_type = NET_ADDR_SET;
4774 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5025 add_device_randomness(dev->dev_addr, dev->addr_len); 4775 add_device_randomness(dev->dev_addr, dev->addr_len);
5026 return err; 4776 return 0;
5027} 4777}
5028EXPORT_SYMBOL(dev_set_mac_address); 4778EXPORT_SYMBOL(dev_set_mac_address);
5029 4779
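Editor's note: a small sketch of the changed dev_set_mac_address() path, assuming an Ethernet-type device and a hypothetical caller foo_set_mac() that already holds RTNL. After a successful call the address is marked NET_ADDR_SET and NETDEV_CHANGEADDR has been raised, as in the hunk above.

#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/socket.h>
#include <linux/string.h>

/* Hypothetical helper: set a new MAC on an Ethernet device, RTNL held. */
static int foo_set_mac(struct net_device *dev, const u8 *new_mac)
{
	struct sockaddr sa;

	sa.sa_family = dev->type;		/* ARPHRD_ETHER for Ethernet */
	memcpy(sa.sa_data, new_mac, ETH_ALEN);

	/* Same path SIOCSIFHWADDR takes; on success addr_assign_type is
	 * NET_ADDR_SET and the NETDEV_CHANGEADDR notifier has fired. */
	return dev_set_mac_address(dev, &sa);
}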
5030/*
5031 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
5032 */
5033static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
5034{
5035 int err;
5036 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
5037
5038 if (!dev)
5039 return -ENODEV;
5040
5041 switch (cmd) {
5042 case SIOCGIFFLAGS: /* Get interface flags */
5043 ifr->ifr_flags = (short) dev_get_flags(dev);
5044 return 0;
5045
5046 case SIOCGIFMETRIC: /* Get the metric on the interface
5047 (currently unused) */
5048 ifr->ifr_metric = 0;
5049 return 0;
5050
5051 case SIOCGIFMTU: /* Get the MTU of a device */
5052 ifr->ifr_mtu = dev->mtu;
5053 return 0;
5054
5055 case SIOCGIFHWADDR:
5056 if (!dev->addr_len)
5057 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
5058 else
5059 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
5060 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5061 ifr->ifr_hwaddr.sa_family = dev->type;
5062 return 0;
5063
5064 case SIOCGIFSLAVE:
5065 err = -EINVAL;
5066 break;
5067
5068 case SIOCGIFMAP:
5069 ifr->ifr_map.mem_start = dev->mem_start;
5070 ifr->ifr_map.mem_end = dev->mem_end;
5071 ifr->ifr_map.base_addr = dev->base_addr;
5072 ifr->ifr_map.irq = dev->irq;
5073 ifr->ifr_map.dma = dev->dma;
5074 ifr->ifr_map.port = dev->if_port;
5075 return 0;
5076
5077 case SIOCGIFINDEX:
5078 ifr->ifr_ifindex = dev->ifindex;
5079 return 0;
5080
5081 case SIOCGIFTXQLEN:
5082 ifr->ifr_qlen = dev->tx_queue_len;
5083 return 0;
5084
5085 default:
5086 /* dev_ioctl() should ensure this case
5087 * is never reached
5088 */
5089 WARN_ON(1);
5090 err = -ENOTTY;
5091 break;
5092
5093 }
5094 return err;
5095}
5096
5097/*
5098 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
5099 */
5100static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
5101{
5102 int err;
5103 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5104 const struct net_device_ops *ops;
5105
5106 if (!dev)
5107 return -ENODEV;
5108
5109 ops = dev->netdev_ops;
5110
5111 switch (cmd) {
5112 case SIOCSIFFLAGS: /* Set interface flags */
5113 return dev_change_flags(dev, ifr->ifr_flags);
5114
5115 case SIOCSIFMETRIC: /* Set the metric on the interface
5116 (currently unused) */
5117 return -EOPNOTSUPP;
5118
5119 case SIOCSIFMTU: /* Set the MTU of a device */
5120 return dev_set_mtu(dev, ifr->ifr_mtu);
5121
5122 case SIOCSIFHWADDR:
5123 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
5124
5125 case SIOCSIFHWBROADCAST:
5126 if (ifr->ifr_hwaddr.sa_family != dev->type)
5127 return -EINVAL;
5128 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
5129 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5130 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5131 return 0;
5132
5133 case SIOCSIFMAP:
5134 if (ops->ndo_set_config) {
5135 if (!netif_device_present(dev))
5136 return -ENODEV;
5137 return ops->ndo_set_config(dev, &ifr->ifr_map);
5138 }
5139 return -EOPNOTSUPP;
5140
5141 case SIOCADDMULTI:
5142 if (!ops->ndo_set_rx_mode ||
5143 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5144 return -EINVAL;
5145 if (!netif_device_present(dev))
5146 return -ENODEV;
5147 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5148
5149 case SIOCDELMULTI:
5150 if (!ops->ndo_set_rx_mode ||
5151 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5152 return -EINVAL;
5153 if (!netif_device_present(dev))
5154 return -ENODEV;
5155 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5156
5157 case SIOCSIFTXQLEN:
5158 if (ifr->ifr_qlen < 0)
5159 return -EINVAL;
5160 dev->tx_queue_len = ifr->ifr_qlen;
5161 return 0;
5162
5163 case SIOCSIFNAME:
5164 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5165 return dev_change_name(dev, ifr->ifr_newname);
5166
5167 case SIOCSHWTSTAMP:
5168 err = net_hwtstamp_validate(ifr);
5169 if (err)
5170 return err;
5171 /* fall through */
5172
5173 /*
5174 * Unknown or private ioctl
5175 */
5176 default:
5177 if ((cmd >= SIOCDEVPRIVATE &&
5178 cmd <= SIOCDEVPRIVATE + 15) ||
5179 cmd == SIOCBONDENSLAVE ||
5180 cmd == SIOCBONDRELEASE ||
5181 cmd == SIOCBONDSETHWADDR ||
5182 cmd == SIOCBONDSLAVEINFOQUERY ||
5183 cmd == SIOCBONDINFOQUERY ||
5184 cmd == SIOCBONDCHANGEACTIVE ||
5185 cmd == SIOCGMIIPHY ||
5186 cmd == SIOCGMIIREG ||
5187 cmd == SIOCSMIIREG ||
5188 cmd == SIOCBRADDIF ||
5189 cmd == SIOCBRDELIF ||
5190 cmd == SIOCSHWTSTAMP ||
5191 cmd == SIOCWANDEV) {
5192 err = -EOPNOTSUPP;
5193 if (ops->ndo_do_ioctl) {
5194 if (netif_device_present(dev))
5195 err = ops->ndo_do_ioctl(dev, ifr, cmd);
5196 else
5197 err = -ENODEV;
5198 }
5199 } else
5200 err = -EINVAL;
5201
5202 }
5203 return err;
5204}
5205
5206/*
5207 * This function handles all "interface"-type I/O control requests. The actual
5208 * 'doing' part of this is dev_ifsioc above.
5209 */
5210
5211/** 4780/**
5212 * dev_ioctl - network device ioctl 4781 * dev_change_carrier - Change device carrier
5213 * @net: the applicable net namespace 4782 * @dev: device
5214 * @cmd: command to issue 4783 * @new_carrier: new value
5215 * @arg: pointer to a struct ifreq in user space
5216 * 4784 *
5217 * Issue ioctl functions to devices. This is normally called by the 4785 * Change device carrier
5218 * user space syscall interfaces but can sometimes be useful for
5219 * other purposes. The return value is the return from the syscall if
5220 * positive or a negative errno code on error.
5221 */ 4786 */
5222 4787int dev_change_carrier(struct net_device *dev, bool new_carrier)
5223int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5224{ 4788{
5225 struct ifreq ifr; 4789 const struct net_device_ops *ops = dev->netdev_ops;
5226 int ret;
5227 char *colon;
5228
5229 /* One special case: SIOCGIFCONF takes ifconf argument
5230 and requires shared lock, because it sleeps writing
5231 to user space.
5232 */
5233
5234 if (cmd == SIOCGIFCONF) {
5235 rtnl_lock();
5236 ret = dev_ifconf(net, (char __user *) arg);
5237 rtnl_unlock();
5238 return ret;
5239 }
5240 if (cmd == SIOCGIFNAME)
5241 return dev_ifname(net, (struct ifreq __user *)arg);
5242
5243 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5244 return -EFAULT;
5245
5246 ifr.ifr_name[IFNAMSIZ-1] = 0;
5247
5248 colon = strchr(ifr.ifr_name, ':');
5249 if (colon)
5250 *colon = 0;
5251
5252 /*
5253 * See which interface the caller is talking about.
5254 */
5255
5256 switch (cmd) {
5257 /*
5258 * These ioctl calls:
5259 * - can be done by all.
5260 * - atomic and do not require locking.
5261 * - return a value
5262 */
5263 case SIOCGIFFLAGS:
5264 case SIOCGIFMETRIC:
5265 case SIOCGIFMTU:
5266 case SIOCGIFHWADDR:
5267 case SIOCGIFSLAVE:
5268 case SIOCGIFMAP:
5269 case SIOCGIFINDEX:
5270 case SIOCGIFTXQLEN:
5271 dev_load(net, ifr.ifr_name);
5272 rcu_read_lock();
5273 ret = dev_ifsioc_locked(net, &ifr, cmd);
5274 rcu_read_unlock();
5275 if (!ret) {
5276 if (colon)
5277 *colon = ':';
5278 if (copy_to_user(arg, &ifr,
5279 sizeof(struct ifreq)))
5280 ret = -EFAULT;
5281 }
5282 return ret;
5283
5284 case SIOCETHTOOL:
5285 dev_load(net, ifr.ifr_name);
5286 rtnl_lock();
5287 ret = dev_ethtool(net, &ifr);
5288 rtnl_unlock();
5289 if (!ret) {
5290 if (colon)
5291 *colon = ':';
5292 if (copy_to_user(arg, &ifr,
5293 sizeof(struct ifreq)))
5294 ret = -EFAULT;
5295 }
5296 return ret;
5297
5298 /*
5299 * These ioctl calls:
5300 * - require superuser power.
5301 * - require strict serialization.
5302 * - return a value
5303 */
5304 case SIOCGMIIPHY:
5305 case SIOCGMIIREG:
5306 case SIOCSIFNAME:
5307 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
5308 return -EPERM;
5309 dev_load(net, ifr.ifr_name);
5310 rtnl_lock();
5311 ret = dev_ifsioc(net, &ifr, cmd);
5312 rtnl_unlock();
5313 if (!ret) {
5314 if (colon)
5315 *colon = ':';
5316 if (copy_to_user(arg, &ifr,
5317 sizeof(struct ifreq)))
5318 ret = -EFAULT;
5319 }
5320 return ret;
5321
5322 /*
5323 * These ioctl calls:
5324 * - require superuser power.
5325 * - require strict serialization.
5326 * - do not return a value
5327 */
5328 case SIOCSIFMAP:
5329 case SIOCSIFTXQLEN:
5330 if (!capable(CAP_NET_ADMIN))
5331 return -EPERM;
5332 /* fall through */
5333 /*
5334 * These ioctl calls:
5335 * - require local superuser power.
5336 * - require strict serialization.
5337 * - do not return a value
5338 */
5339 case SIOCSIFFLAGS:
5340 case SIOCSIFMETRIC:
5341 case SIOCSIFMTU:
5342 case SIOCSIFHWADDR:
5343 case SIOCSIFSLAVE:
5344 case SIOCADDMULTI:
5345 case SIOCDELMULTI:
5346 case SIOCSIFHWBROADCAST:
5347 case SIOCSMIIREG:
5348 case SIOCBONDENSLAVE:
5349 case SIOCBONDRELEASE:
5350 case SIOCBONDSETHWADDR:
5351 case SIOCBONDCHANGEACTIVE:
5352 case SIOCBRADDIF:
5353 case SIOCBRDELIF:
5354 case SIOCSHWTSTAMP:
5355 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
5356 return -EPERM;
5357 /* fall through */
5358 case SIOCBONDSLAVEINFOQUERY:
5359 case SIOCBONDINFOQUERY:
5360 dev_load(net, ifr.ifr_name);
5361 rtnl_lock();
5362 ret = dev_ifsioc(net, &ifr, cmd);
5363 rtnl_unlock();
5364 return ret;
5365
5366 case SIOCGIFMEM:
5367 /* Get the per device memory space. We can add this but
5368 * currently do not support it */
5369 case SIOCSIFMEM:
5370 /* Set the per device memory buffer space.
5371 * Not applicable in our case */
5372 case SIOCSIFLINK:
5373 return -ENOTTY;
5374 4790
5375 /* 4791 if (!ops->ndo_change_carrier)
5376 * Unknown or private ioctl. 4792 return -EOPNOTSUPP;
5377 */ 4793 if (!netif_device_present(dev))
5378 default: 4794 return -ENODEV;
5379 if (cmd == SIOCWANDEV || 4795 return ops->ndo_change_carrier(dev, new_carrier);
5380 (cmd >= SIOCDEVPRIVATE &&
5381 cmd <= SIOCDEVPRIVATE + 15)) {
5382 dev_load(net, ifr.ifr_name);
5383 rtnl_lock();
5384 ret = dev_ifsioc(net, &ifr, cmd);
5385 rtnl_unlock();
5386 if (!ret && copy_to_user(arg, &ifr,
5387 sizeof(struct ifreq)))
5388 ret = -EFAULT;
5389 return ret;
5390 }
5391 /* Take care of Wireless Extensions */
5392 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5393 return wext_handle_ioctl(net, &ifr, cmd, arg);
5394 return -ENOTTY;
5395 }
5396} 4796}
5397 4797EXPORT_SYMBOL(dev_change_carrier);
5398 4798
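Editor's note: a sketch of the driver side of the new dev_change_carrier() call, loosely modelled on what a purely software device might do; the "foo" names are hypothetical. As shown above, dev_change_carrier() only reaches ndo_change_carrier after checking that the hook exists and the device is present.

#include <linux/netdevice.h>

/* Hypothetical software device: toggle carrier purely in software. */
static int foo_change_carrier(struct net_device *dev, bool new_carrier)
{
	if (new_carrier)
		netif_carrier_on(dev);		/* link up: restart queues/watchdog */
	else
		netif_carrier_off(dev);		/* link down */
	return 0;
}

static const struct net_device_ops foo_netdev_ops = {
	.ndo_change_carrier	= foo_change_carrier,
	/* ... the driver's other ndo_* hooks ... */
};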
5399/** 4799/**
5400 * dev_new_index - allocate an ifindex 4800 * dev_new_index - allocate an ifindex
@@ -5482,11 +4882,15 @@ static void rollback_registered_many(struct list_head *head)
5482 if (dev->netdev_ops->ndo_uninit) 4882 if (dev->netdev_ops->ndo_uninit)
5483 dev->netdev_ops->ndo_uninit(dev); 4883 dev->netdev_ops->ndo_uninit(dev);
5484 4884
5485 /* Notifier chain MUST detach us from master device. */ 4885 /* Notifier chain MUST detach us from all upper devices. */
5486 WARN_ON(dev->master); 4886 WARN_ON(netdev_has_any_upper_dev(dev));
5487 4887
5488 /* Remove entries from kobject tree */ 4888 /* Remove entries from kobject tree */
5489 netdev_unregister_kobject(dev); 4889 netdev_unregister_kobject(dev);
4890#ifdef CONFIG_XPS
4891 /* Remove XPS queueing entries */
4892 netif_reset_xps_queues_gt(dev, 0);
4893#endif
5490 } 4894 }
5491 4895
5492 synchronize_net(); 4896 synchronize_net();
@@ -5664,10 +5068,9 @@ static int netif_alloc_rx_queues(struct net_device *dev)
5664 BUG_ON(count < 1); 5068 BUG_ON(count < 1);
5665 5069
5666 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); 5070 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5667 if (!rx) { 5071 if (!rx)
5668 pr_err("netdev: Unable to allocate %u rx queues\n", count);
5669 return -ENOMEM; 5072 return -ENOMEM;
5670 } 5073
5671 dev->_rx = rx; 5074 dev->_rx = rx;
5672 5075
5673 for (i = 0; i < count; i++) 5076 for (i = 0; i < count; i++)
@@ -5698,10 +5101,9 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
5698 BUG_ON(count < 1); 5101 BUG_ON(count < 1);
5699 5102
5700 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL); 5103 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5701 if (!tx) { 5104 if (!tx)
5702 pr_err("netdev: Unable to allocate %u tx queues\n", count);
5703 return -ENOMEM; 5105 return -ENOMEM;
5704 } 5106
5705 dev->_tx = tx; 5107 dev->_tx = tx;
5706 5108
5707 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); 5109 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
@@ -5760,6 +5162,14 @@ int register_netdevice(struct net_device *dev)
5760 } 5162 }
5761 } 5163 }
5762 5164
5165 if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_FILTER) &&
5166 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5167 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5168 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5169 ret = -EINVAL;
5170 goto err_uninit;
5171 }
5172
5763 ret = -EBUSY; 5173 ret = -EBUSY;
5764 if (!dev->ifindex) 5174 if (!dev->ifindex)
5765 dev->ifindex = dev_new_index(net); 5175 dev->ifindex = dev_new_index(net);
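Editor's note: the new VLAN sanity check in this hunk can be read as the predicate below. This is an illustrative restatement only, not a helper that exists in dev.c: a driver may advertise NETIF_F_HW_VLAN_FILTER only if it also wires up both VLAN-filter callbacks.

#include <linux/netdevice.h>

/* Illustrative restatement of the check; register_netdevice() rejects
 * the device with -EINVAL when this would return false. */
static bool foo_vlan_filter_ops_consistent(const struct net_device *dev)
{
	if (!((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_FILTER))
		return true;	/* no hardware VLAN filtering claimed */

	return dev->netdev_ops->ndo_vlan_rx_add_vid &&
	       dev->netdev_ops->ndo_vlan_rx_kill_vid;
}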
@@ -5815,6 +5225,13 @@ int register_netdevice(struct net_device *dev)
5815 list_netdevice(dev); 5225 list_netdevice(dev);
5816 add_device_randomness(dev->dev_addr, dev->addr_len); 5226 add_device_randomness(dev->dev_addr, dev->addr_len);
5817 5227
5228 /* If the device has a permanent device address, the driver should
5229 * set dev_addr and leave addr_assign_type at NET_ADDR_PERM
5230 * (the default value).
5231 */
5232 if (dev->addr_assign_type == NET_ADDR_PERM)
5233 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5234
5818 /* Notify protocols, that a new device appeared. */ 5235 /* Notify protocols, that a new device appeared. */
5819 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); 5236 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5820 ret = notifier_to_errno(ret); 5237 ret = notifier_to_errno(ret);
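Editor's note: a sketch of the driver-side contract behind the perm_addr copy added in the hunk above. Everything prefixed "foo" is hypothetical, including the EEPROM read helper, which is only declared here.

#include <linux/netdevice.h>
#include <linux/etherdevice.h>

/* Hypothetical EEPROM read helper assumed to exist elsewhere in the driver. */
extern void foo_read_hw_mac(struct net_device *dev, u8 *mac);

/* Install the factory MAC in dev->dev_addr and leave addr_assign_type at
 * its NET_ADDR_PERM default; register_netdevice() then mirrors the
 * address into dev->perm_addr. */
static void foo_set_permanent_addr(struct net_device *dev)
{
	u8 hw_mac[ETH_ALEN];

	foo_read_hw_mac(dev, hw_mac);
	memcpy(dev->dev_addr, hw_mac, ETH_ALEN);
	/* do NOT touch dev->addr_assign_type here */
}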
@@ -6121,6 +5538,14 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6121 5538
6122static const struct ethtool_ops default_ethtool_ops; 5539static const struct ethtool_ops default_ethtool_ops;
6123 5540
5541void netdev_set_default_ethtool_ops(struct net_device *dev,
5542 const struct ethtool_ops *ops)
5543{
5544 if (dev->ethtool_ops == &default_ethtool_ops)
5545 dev->ethtool_ops = ops;
5546}
5547EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5548
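Editor's note: netdev_set_default_ethtool_ops() above only installs the given ops when the driver left dev->ethtool_ops at the kernel's built-in default. A sketch of a stacking layer (hypothetical "foo") using it as a fallback:

#include <linux/netdevice.h>
#include <linux/ethtool.h>

/* Hypothetical fallback ops for devices whose driver set none. */
static const struct ethtool_ops foo_fallback_ethtool_ops = {
	.get_link	= ethtool_op_get_link,
};

static void foo_setup(struct net_device *dev)
{
	/* No-op if the driver already installed its own ethtool_ops. */
	netdev_set_default_ethtool_ops(dev, &foo_fallback_ethtool_ops);
}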
6124/** 5549/**
6125 * alloc_netdev_mqs - allocate network device 5550 * alloc_netdev_mqs - allocate network device
6126 * @sizeof_priv: size of private data to allocate space for 5551 * @sizeof_priv: size of private data to allocate space for
@@ -6165,10 +5590,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6165 alloc_size += NETDEV_ALIGN - 1; 5590 alloc_size += NETDEV_ALIGN - 1;
6166 5591
6167 p = kzalloc(alloc_size, GFP_KERNEL); 5592 p = kzalloc(alloc_size, GFP_KERNEL);
6168 if (!p) { 5593 if (!p)
6169 pr_err("alloc_netdev: Unable to allocate device\n");
6170 return NULL; 5594 return NULL;
6171 }
6172 5595
6173 dev = PTR_ALIGN(p, NETDEV_ALIGN); 5596 dev = PTR_ALIGN(p, NETDEV_ALIGN);
6174 dev->padded = (char *)dev - (char *)p; 5597 dev->padded = (char *)dev - (char *)p;
@@ -6191,6 +5614,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6191 INIT_LIST_HEAD(&dev->napi_list); 5614 INIT_LIST_HEAD(&dev->napi_list);
6192 INIT_LIST_HEAD(&dev->unreg_list); 5615 INIT_LIST_HEAD(&dev->unreg_list);
6193 INIT_LIST_HEAD(&dev->link_watch_list); 5616 INIT_LIST_HEAD(&dev->link_watch_list);
5617 INIT_LIST_HEAD(&dev->upper_dev_list);
6194 dev->priv_flags = IFF_XMIT_DST_RELEASE; 5618 dev->priv_flags = IFF_XMIT_DST_RELEASE;
6195 setup(dev); 5619 setup(dev);
6196 5620
@@ -6834,19 +6258,9 @@ static int __init net_dev_init(void)
6834 6258
6835 hotcpu_notifier(dev_cpu_callback, 0); 6259 hotcpu_notifier(dev_cpu_callback, 0);
6836 dst_init(); 6260 dst_init();
6837 dev_mcast_init();
6838 rc = 0; 6261 rc = 0;
6839out: 6262out:
6840 return rc; 6263 return rc;
6841} 6264}
6842 6265
6843subsys_initcall(net_dev_init); 6266subsys_initcall(net_dev_init);
6844
6845static int __init initialize_hashrnd(void)
6846{
6847 get_random_bytes(&hashrnd, sizeof(hashrnd));
6848 return 0;
6849}
6850
6851late_initcall_sync(initialize_hashrnd);
6852