path: root/net/core
author    Dmitry Torokhov <dmitry.torokhov@gmail.com>  2012-03-19 20:02:01 -0400
committer Dmitry Torokhov <dmitry.torokhov@gmail.com>  2012-03-19 20:02:01 -0400
commit    10ce3cc919f50c2043b41ca968b43c26a3672600
tree      ea409366a5208aced495bc0516a08b81fd43222e /net/core
parent    24e3e5ae1e4c2a3a32f5b1f96b4e3fd721806acd
parent    5c6a7a62c130afef3d61c1dee153012231ff5cd9
Merge branch 'next' into for-linus
Diffstat (limited to 'net/core')
-rw-r--r--  net/core/Makefile              6
-rw-r--r--  net/core/dev.c               352
-rw-r--r--  net/core/dev_addr_lists.c     19
-rw-r--r--  net/core/dst.c                 2
-rw-r--r--  net/core/ethtool.c           715
-rw-r--r--  net/core/flow.c               12
-rw-r--r--  net/core/flow_dissector.c    144
-rw-r--r--  net/core/neighbour.c         229
-rw-r--r--  net/core/net-sysfs.c         330
-rw-r--r--  net/core/net_namespace.c      31
-rw-r--r--  net/core/netpoll.c            14
-rw-r--r--  net/core/netprio_cgroup.c    339
-rw-r--r--  net/core/pktgen.c             25
-rw-r--r--  net/core/request_sock.c        7
-rw-r--r--  net/core/rtnetlink.c         104
-rw-r--r--  net/core/secure_seq.c         10
-rw-r--r--  net/core/skbuff.c             91
-rw-r--r--  net/core/sock.c              209
-rw-r--r--  net/core/sock_diag.c         192
-rw-r--r--  net/core/sysctl_net_core.c     9
20 files changed, 1784 insertions, 1056 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 0d357b1c4e57..674641b13aea 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -3,12 +3,13 @@
3# 3#
4 4
5obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ 5obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
6 gen_stats.o gen_estimator.o net_namespace.o secure_seq.o 6 gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o
7 7
8obj-$(CONFIG_SYSCTL) += sysctl_net_core.o 8obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
9 9
10obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ 10obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
11 neighbour.o rtnetlink.o utils.o link_watch.o filter.o 11 neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
12 sock_diag.o
12 13
13obj-$(CONFIG_XFRM) += flow.o 14obj-$(CONFIG_XFRM) += flow.o
14obj-y += net-sysfs.o 15obj-y += net-sysfs.o
@@ -19,3 +20,4 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o
19obj-$(CONFIG_TRACEPOINTS) += net-traces.o 20obj-$(CONFIG_TRACEPOINTS) += net-traces.o
20obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o 21obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
21obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o 22obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
23obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o
diff --git a/net/core/dev.c b/net/core/dev.c
index 6ba50a1e404c..6ca32f6b3105 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -133,10 +133,9 @@
133#include <linux/pci.h> 133#include <linux/pci.h>
134#include <linux/inetdevice.h> 134#include <linux/inetdevice.h>
135#include <linux/cpu_rmap.h> 135#include <linux/cpu_rmap.h>
136#include <linux/if_tunnel.h>
137#include <linux/if_pppox.h>
138#include <linux/ppp_defs.h>
139#include <linux/net_tstamp.h> 136#include <linux/net_tstamp.h>
137#include <linux/jump_label.h>
138#include <net/flow_keys.h>
140 139
141#include "net-sysfs.h" 140#include "net-sysfs.h"
142 141
@@ -1320,8 +1319,6 @@ EXPORT_SYMBOL(dev_close);
1320 */ 1319 */
1321void dev_disable_lro(struct net_device *dev) 1320void dev_disable_lro(struct net_device *dev)
1322{ 1321{
1323 u32 flags;
1324
1325 /* 1322 /*
1326 * If we're trying to disable lro on a vlan device 1323 * If we're trying to disable lro on a vlan device
1327 * use the underlying physical device instead 1324 * use the underlying physical device instead
@@ -1329,15 +1326,9 @@ void dev_disable_lro(struct net_device *dev)
1329 if (is_vlan_dev(dev)) 1326 if (is_vlan_dev(dev))
1330 dev = vlan_dev_real_dev(dev); 1327 dev = vlan_dev_real_dev(dev);
1331 1328
1332 if (dev->ethtool_ops && dev->ethtool_ops->get_flags) 1329 dev->wanted_features &= ~NETIF_F_LRO;
1333 flags = dev->ethtool_ops->get_flags(dev); 1330 netdev_update_features(dev);
1334 else
1335 flags = ethtool_op_get_flags(dev);
1336
1337 if (!(flags & ETH_FLAG_LRO))
1338 return;
1339 1331
1340 __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1341 if (unlikely(dev->features & NETIF_F_LRO)) 1332 if (unlikely(dev->features & NETIF_F_LRO))
1342 netdev_WARN(dev, "failed to disable LRO!\n"); 1333 netdev_WARN(dev, "failed to disable LRO!\n");
1343} 1334}
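
The LRO hunk above replaces the ethtool get_flags/set_flags round trip with the unified feature machinery: dev_disable_lro() clears NETIF_F_LRO from wanted_features and lets netdev_update_features() renegotiate with the driver, warning if LRO survives. A sketch of the driver hooks that negotiation calls into, assuming the post-conversion ndo_fix_features/ndo_set_features signatures; the mydrv_* names and the constraints shown are hypothetical:

	static netdev_features_t mydrv_fix_features(struct net_device *dev,
						    netdev_features_t features)
	{
		/* hypothetical constraint: this hardware needs RX csum for LRO */
		if (!(features & NETIF_F_RXCSUM))
			features &= ~NETIF_F_LRO;
		return features;
	}

	static int mydrv_set_features(struct net_device *dev,
				      netdev_features_t features)
	{
		netdev_features_t changed = dev->features ^ features;

		if (changed & NETIF_F_LRO)
			mydrv_toggle_lro(dev, !!(features & NETIF_F_LRO));	/* hypothetical */
		return 0;
	}
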
@@ -1396,7 +1387,7 @@ rollback:
1396 for_each_net(net) { 1387 for_each_net(net) {
1397 for_each_netdev(net, dev) { 1388 for_each_netdev(net, dev) {
1398 if (dev == last) 1389 if (dev == last)
1399 break; 1390 goto outroll;
1400 1391
1401 if (dev->flags & IFF_UP) { 1392 if (dev->flags & IFF_UP) {
1402 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); 1393 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
@@ -1407,6 +1398,7 @@ rollback:
1407 } 1398 }
1408 } 1399 }
1409 1400
1401outroll:
1410 raw_notifier_chain_unregister(&netdev_chain, nb); 1402 raw_notifier_chain_unregister(&netdev_chain, nb);
1411 goto unlock; 1403 goto unlock;
1412} 1404}
@@ -1449,34 +1441,55 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1449} 1441}
1450EXPORT_SYMBOL(call_netdevice_notifiers); 1442EXPORT_SYMBOL(call_netdevice_notifiers);
1451 1443
1452/* When > 0 there are consumers of rx skb time stamps */ 1444static struct jump_label_key netstamp_needed __read_mostly;
1453static atomic_t netstamp_needed = ATOMIC_INIT(0); 1445#ifdef HAVE_JUMP_LABEL
1446/* We are not allowed to call jump_label_dec() from irq context
1447 * If net_disable_timestamp() is called from irq context, defer the
1448 * jump_label_dec() calls.
1449 */
1450static atomic_t netstamp_needed_deferred;
1451#endif
1454 1452
1455void net_enable_timestamp(void) 1453void net_enable_timestamp(void)
1456{ 1454{
1457 atomic_inc(&netstamp_needed); 1455#ifdef HAVE_JUMP_LABEL
1456 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1457
1458 if (deferred) {
1459 while (--deferred)
1460 jump_label_dec(&netstamp_needed);
1461 return;
1462 }
1463#endif
1464 WARN_ON(in_interrupt());
1465 jump_label_inc(&netstamp_needed);
1458} 1466}
1459EXPORT_SYMBOL(net_enable_timestamp); 1467EXPORT_SYMBOL(net_enable_timestamp);
1460 1468
1461void net_disable_timestamp(void) 1469void net_disable_timestamp(void)
1462{ 1470{
1463 atomic_dec(&netstamp_needed); 1471#ifdef HAVE_JUMP_LABEL
1472 if (in_interrupt()) {
1473 atomic_inc(&netstamp_needed_deferred);
1474 return;
1475 }
1476#endif
1477 jump_label_dec(&netstamp_needed);
1464} 1478}
1465EXPORT_SYMBOL(net_disable_timestamp); 1479EXPORT_SYMBOL(net_disable_timestamp);
1466 1480
1467static inline void net_timestamp_set(struct sk_buff *skb) 1481static inline void net_timestamp_set(struct sk_buff *skb)
1468{ 1482{
1469 if (atomic_read(&netstamp_needed)) 1483 skb->tstamp.tv64 = 0;
1484 if (static_branch(&netstamp_needed))
1470 __net_timestamp(skb); 1485 __net_timestamp(skb);
1471 else
1472 skb->tstamp.tv64 = 0;
1473} 1486}
1474 1487
1475static inline void net_timestamp_check(struct sk_buff *skb) 1488#define net_timestamp_check(COND, SKB) \
1476{ 1489 if (static_branch(&netstamp_needed)) { \
1477 if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed)) 1490 if ((COND) && !(SKB)->tstamp.tv64) \
1478 __net_timestamp(skb); 1491 __net_timestamp(SKB); \
1479} 1492 } \
1480 1493
1481static int net_hwtstamp_validate(struct ifreq *ifr) 1494static int net_hwtstamp_validate(struct ifreq *ifr)
1482{ 1495{
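
The hunk above swaps the atomic reference count for a jump label (static branch): when no consumer needs RX timestamps, the check on the hot path is a patched-out no-op, and jump_label_inc()/jump_label_dec() repatch the branch as the count moves through zero. Because jump_label_dec() may not be called from irq context, disables arriving there are only counted and folded into the next enable. A minimal sketch of that pattern, assuming HAVE_JUMP_LABEL; the feature_* names and do_slow_path() are illustrative, not part of the patch:

	static struct jump_label_key feature_needed __read_mostly;
	static atomic_t feature_needed_deferred;	/* disables seen in irq context */

	void feature_enable(void)
	{
		int deferred = atomic_xchg(&feature_needed_deferred, 0);

		if (deferred) {
			/* one pending disable cancels this enable */
			while (--deferred)
				jump_label_dec(&feature_needed);
			return;
		}
		jump_label_inc(&feature_needed);
	}

	void feature_disable(void)
	{
		if (in_interrupt()) {		/* jump_label_dec() not allowed here */
			atomic_inc(&feature_needed_deferred);
			return;
		}
		jump_label_dec(&feature_needed);
	}

	static inline void feature_hot_path(void)
	{
		if (static_branch(&feature_needed))	/* no-op while count is 0 */
			do_slow_path();			/* hypothetical slow path */
	}
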
@@ -1874,6 +1887,23 @@ void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1874EXPORT_SYMBOL(skb_set_dev); 1887EXPORT_SYMBOL(skb_set_dev);
1875#endif /* CONFIG_NET_NS */ 1888#endif /* CONFIG_NET_NS */
1876 1889
1890static void skb_warn_bad_offload(const struct sk_buff *skb)
1891{
1892 static const netdev_features_t null_features = 0;
1893 struct net_device *dev = skb->dev;
1894 const char *driver = "";
1895
1896 if (dev && dev->dev.parent)
1897 driver = dev_driver_string(dev->dev.parent);
1898
1899 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1900 "gso_type=%d ip_summed=%d\n",
1901 driver, dev ? &dev->features : &null_features,
1902 skb->sk ? &skb->sk->sk_route_caps : &null_features,
1903 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1904 skb_shinfo(skb)->gso_type, skb->ip_summed);
1905}
1906
1877/* 1907/*
1878 * Invalidate hardware checksum when packet is to be mangled, and 1908 * Invalidate hardware checksum when packet is to be mangled, and
1879 * complete checksum manually on outgoing path. 1909 * complete checksum manually on outgoing path.
@@ -1887,8 +1917,8 @@ int skb_checksum_help(struct sk_buff *skb)
1887 goto out_set_summed; 1917 goto out_set_summed;
1888 1918
1889 if (unlikely(skb_shinfo(skb)->gso_size)) { 1919 if (unlikely(skb_shinfo(skb)->gso_size)) {
1890 /* Let GSO fix up the checksum. */ 1920 skb_warn_bad_offload(skb);
1891 goto out_set_summed; 1921 return -EINVAL;
1892 } 1922 }
1893 1923
1894 offset = skb_checksum_start_offset(skb); 1924 offset = skb_checksum_start_offset(skb);
@@ -1923,7 +1953,8 @@ EXPORT_SYMBOL(skb_checksum_help);
1923 * It may return NULL if the skb requires no segmentation. This is 1953 * It may return NULL if the skb requires no segmentation. This is
1924 * only possible when GSO is used for verifying header integrity. 1954 * only possible when GSO is used for verifying header integrity.
1925 */ 1955 */
1926struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features) 1956struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1957 netdev_features_t features)
1927{ 1958{
1928 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); 1959 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1929 struct packet_type *ptype; 1960 struct packet_type *ptype;
@@ -1947,16 +1978,7 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1947 __skb_pull(skb, skb->mac_len); 1978 __skb_pull(skb, skb->mac_len);
1948 1979
1949 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { 1980 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1950 struct net_device *dev = skb->dev; 1981 skb_warn_bad_offload(skb);
1951 struct ethtool_drvinfo info = {};
1952
1953 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1954 dev->ethtool_ops->get_drvinfo(dev, &info);
1955
1956 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1957 info.driver, dev ? dev->features : 0L,
1958 skb->sk ? skb->sk->sk_route_caps : 0L,
1959 skb->len, skb->data_len, skb->ip_summed);
1960 1982
1961 if (skb_header_cloned(skb) && 1983 if (skb_header_cloned(skb) &&
1962 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) 1984 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
@@ -2064,7 +2086,7 @@ static void dev_gso_skb_destructor(struct sk_buff *skb)
2064 * This function segments the given skb and stores the list of segments 2086 * This function segments the given skb and stores the list of segments
2065 * in skb->next. 2087 * in skb->next.
2066 */ 2088 */
2067static int dev_gso_segment(struct sk_buff *skb, int features) 2089static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2068{ 2090{
2069 struct sk_buff *segs; 2091 struct sk_buff *segs;
2070 2092
@@ -2103,7 +2125,7 @@ static inline void skb_orphan_try(struct sk_buff *skb)
2103 } 2125 }
2104} 2126}
2105 2127
2106static bool can_checksum_protocol(unsigned long features, __be16 protocol) 2128static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2107{ 2129{
2108 return ((features & NETIF_F_GEN_CSUM) || 2130 return ((features & NETIF_F_GEN_CSUM) ||
2109 ((features & NETIF_F_V4_CSUM) && 2131 ((features & NETIF_F_V4_CSUM) &&
@@ -2114,7 +2136,8 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2114 protocol == htons(ETH_P_FCOE))); 2136 protocol == htons(ETH_P_FCOE)));
2115} 2137}
2116 2138
2117static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features) 2139static netdev_features_t harmonize_features(struct sk_buff *skb,
2140 __be16 protocol, netdev_features_t features)
2118{ 2141{
2119 if (!can_checksum_protocol(features, protocol)) { 2142 if (!can_checksum_protocol(features, protocol)) {
2120 features &= ~NETIF_F_ALL_CSUM; 2143 features &= ~NETIF_F_ALL_CSUM;
@@ -2126,10 +2149,10 @@ static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features
2126 return features; 2149 return features;
2127} 2150}
2128 2151
2129u32 netif_skb_features(struct sk_buff *skb) 2152netdev_features_t netif_skb_features(struct sk_buff *skb)
2130{ 2153{
2131 __be16 protocol = skb->protocol; 2154 __be16 protocol = skb->protocol;
2132 u32 features = skb->dev->features; 2155 netdev_features_t features = skb->dev->features;
2133 2156
2134 if (protocol == htons(ETH_P_8021Q)) { 2157 if (protocol == htons(ETH_P_8021Q)) {
2135 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; 2158 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
@@ -2175,7 +2198,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2175 unsigned int skb_len; 2198 unsigned int skb_len;
2176 2199
2177 if (likely(!skb->next)) { 2200 if (likely(!skb->next)) {
2178 u32 features; 2201 netdev_features_t features;
2179 2202
2180 /* 2203 /*
2181 * If device doesn't need skb->dst, release it right now while 2204 * If device doesn't need skb->dst, release it right now while
@@ -2256,7 +2279,7 @@ gso:
2256 return rc; 2279 return rc;
2257 } 2280 }
2258 txq_trans_update(txq); 2281 txq_trans_update(txq);
2259 if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) 2282 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2260 return NETDEV_TX_BUSY; 2283 return NETDEV_TX_BUSY;
2261 } while (skb->next); 2284 } while (skb->next);
2262 2285
@@ -2456,6 +2479,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2456 return rc; 2479 return rc;
2457} 2480}
2458 2481
2482#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2483static void skb_update_prio(struct sk_buff *skb)
2484{
2485 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2486
2487 if ((!skb->priority) && (skb->sk) && map)
2488 skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx];
2489}
2490#else
2491#define skb_update_prio(skb)
2492#endif
2493
2459static DEFINE_PER_CPU(int, xmit_recursion); 2494static DEFINE_PER_CPU(int, xmit_recursion);
2460#define RECURSION_LIMIT 10 2495#define RECURSION_LIMIT 10
2461 2496
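
The new skb_update_prio() above is the consumer side of the net_prio cgroup: each device carries a priomap indexed by the socket's cgroup priority index, and dev_queue_xmit() (next hunk) fills in skb->priority from it only when nothing has set a priority yet, so an explicit SO_PRIORITY on the socket still wins. A condensed, illustrative form of that precedence rule, using only the fields visible in the hunk; effective_priority() itself is not in the patch:

	static u32 effective_priority(const struct sk_buff *skb,
				      const struct netprio_map *map)
	{
		if (skb->priority)		/* e.g. set via SO_PRIORITY */
			return skb->priority;
		if (skb->sk && map)		/* cgroup-provided default */
			return map->priomap[skb->sk->sk_cgrp_prioidx];
		return 0;
	}
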
@@ -2496,6 +2531,8 @@ int dev_queue_xmit(struct sk_buff *skb)
2496 */ 2531 */
2497 rcu_read_lock_bh(); 2532 rcu_read_lock_bh();
2498 2533
2534 skb_update_prio(skb);
2535
2499 txq = dev_pick_tx(dev, skb); 2536 txq = dev_pick_tx(dev, skb);
2500 q = rcu_dereference_bh(txq->qdisc); 2537 q = rcu_dereference_bh(txq->qdisc);
2501 2538
@@ -2530,7 +2567,7 @@ int dev_queue_xmit(struct sk_buff *skb)
2530 2567
2531 HARD_TX_LOCK(dev, txq, cpu); 2568 HARD_TX_LOCK(dev, txq, cpu);
2532 2569
2533 if (!netif_tx_queue_stopped(txq)) { 2570 if (!netif_xmit_stopped(txq)) {
2534 __this_cpu_inc(xmit_recursion); 2571 __this_cpu_inc(xmit_recursion);
2535 rc = dev_hard_start_xmit(skb, dev, txq); 2572 rc = dev_hard_start_xmit(skb, dev, txq);
2536 __this_cpu_dec(xmit_recursion); 2573 __this_cpu_dec(xmit_recursion);
@@ -2591,123 +2628,28 @@ static inline void ____napi_schedule(struct softnet_data *sd,
2591 */ 2628 */
2592void __skb_get_rxhash(struct sk_buff *skb) 2629void __skb_get_rxhash(struct sk_buff *skb)
2593{ 2630{
2594 int nhoff, hash = 0, poff; 2631 struct flow_keys keys;
2595 const struct ipv6hdr *ip6; 2632 u32 hash;
2596 const struct iphdr *ip;
2597 const struct vlan_hdr *vlan;
2598 u8 ip_proto;
2599 u32 addr1, addr2;
2600 u16 proto;
2601 union {
2602 u32 v32;
2603 u16 v16[2];
2604 } ports;
2605
2606 nhoff = skb_network_offset(skb);
2607 proto = skb->protocol;
2608
2609again:
2610 switch (proto) {
2611 case __constant_htons(ETH_P_IP):
2612ip:
2613 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2614 goto done;
2615
2616 ip = (const struct iphdr *) (skb->data + nhoff);
2617 if (ip_is_fragment(ip))
2618 ip_proto = 0;
2619 else
2620 ip_proto = ip->protocol;
2621 addr1 = (__force u32) ip->saddr;
2622 addr2 = (__force u32) ip->daddr;
2623 nhoff += ip->ihl * 4;
2624 break;
2625 case __constant_htons(ETH_P_IPV6):
2626ipv6:
2627 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2628 goto done;
2629
2630 ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2631 ip_proto = ip6->nexthdr;
2632 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2633 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2634 nhoff += 40;
2635 break;
2636 case __constant_htons(ETH_P_8021Q):
2637 if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff))
2638 goto done;
2639 vlan = (const struct vlan_hdr *) (skb->data + nhoff);
2640 proto = vlan->h_vlan_encapsulated_proto;
2641 nhoff += sizeof(*vlan);
2642 goto again;
2643 case __constant_htons(ETH_P_PPP_SES):
2644 if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff))
2645 goto done;
2646 proto = *((__be16 *) (skb->data + nhoff +
2647 sizeof(struct pppoe_hdr)));
2648 nhoff += PPPOE_SES_HLEN;
2649 switch (proto) {
2650 case __constant_htons(PPP_IP):
2651 goto ip;
2652 case __constant_htons(PPP_IPV6):
2653 goto ipv6;
2654 default:
2655 goto done;
2656 }
2657 default:
2658 goto done;
2659 }
2660
2661 switch (ip_proto) {
2662 case IPPROTO_GRE:
2663 if (pskb_may_pull(skb, nhoff + 16)) {
2664 u8 *h = skb->data + nhoff;
2665 __be16 flags = *(__be16 *)h;
2666 2633
2667 /* 2634 if (!skb_flow_dissect(skb, &keys))
2668 * Only look inside GRE if version zero and no 2635 return;
2669 * routing
2670 */
2671 if (!(flags & (GRE_VERSION|GRE_ROUTING))) {
2672 proto = *(__be16 *)(h + 2);
2673 nhoff += 4;
2674 if (flags & GRE_CSUM)
2675 nhoff += 4;
2676 if (flags & GRE_KEY)
2677 nhoff += 4;
2678 if (flags & GRE_SEQ)
2679 nhoff += 4;
2680 goto again;
2681 }
2682 }
2683 break;
2684 case IPPROTO_IPIP:
2685 goto again;
2686 default:
2687 break;
2688 }
2689 2636
2690 ports.v32 = 0; 2637 if (keys.ports) {
2691 poff = proto_ports_offset(ip_proto); 2638 if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
2692 if (poff >= 0) { 2639 swap(keys.port16[0], keys.port16[1]);
2693 nhoff += poff; 2640 skb->l4_rxhash = 1;
2694 if (pskb_may_pull(skb, nhoff + 4)) {
2695 ports.v32 = * (__force u32 *) (skb->data + nhoff);
2696 if (ports.v16[1] < ports.v16[0])
2697 swap(ports.v16[0], ports.v16[1]);
2698 skb->l4_rxhash = 1;
2699 }
2700 } 2641 }
2701 2642
2702 /* get a consistent hash (same value on both flow directions) */ 2643 /* get a consistent hash (same value on both flow directions) */
2703 if (addr2 < addr1) 2644 if ((__force u32)keys.dst < (__force u32)keys.src)
2704 swap(addr1, addr2); 2645 swap(keys.dst, keys.src);
2705 2646
2706 hash = jhash_3words(addr1, addr2, ports.v32, hashrnd); 2647 hash = jhash_3words((__force u32)keys.dst,
2648 (__force u32)keys.src,
2649 (__force u32)keys.ports, hashrnd);
2707 if (!hash) 2650 if (!hash)
2708 hash = 1; 2651 hash = 1;
2709 2652
2710done:
2711 skb->rxhash = hash; 2653 skb->rxhash = hash;
2712} 2654}
2713EXPORT_SYMBOL(__skb_get_rxhash); 2655EXPORT_SYMBOL(__skb_get_rxhash);
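
With this hunk the protocol walking (IPv4/IPv6, VLAN, PPPoE, GRE) moves into the shared flow dissector, and __skb_get_rxhash() only hashes the extracted addresses and ports. A sketch of the resulting scheme, using only names visible above (skb_flow_dissect(), keys.src/dst/ports, the hashrnd seed); flow_hash_example() is illustrative, not part of the patch:

	static u32 flow_hash_example(struct sk_buff *skb)
	{
		struct flow_keys keys;

		if (!skb_flow_dissect(skb, &keys))
			return 0;		/* not a dissectable flow */

		/* canonical ordering so both flow directions hash identically */
		if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
			swap(keys.port16[0], keys.port16[1]);
		if ((__force u32)keys.dst < (__force u32)keys.src)
			swap(keys.dst, keys.src);

		return jhash_3words((__force u32)keys.dst,
				    (__force u32)keys.src,
				    (__force u32)keys.ports, hashrnd);
	}
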
@@ -2718,6 +2660,8 @@ EXPORT_SYMBOL(__skb_get_rxhash);
2718struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; 2660struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2719EXPORT_SYMBOL(rps_sock_flow_table); 2661EXPORT_SYMBOL(rps_sock_flow_table);
2720 2662
2663struct jump_label_key rps_needed __read_mostly;
2664
2721static struct rps_dev_flow * 2665static struct rps_dev_flow *
2722set_rps_cpu(struct net_device *dev, struct sk_buff *skb, 2666set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2723 struct rps_dev_flow *rflow, u16 next_cpu) 2667 struct rps_dev_flow *rflow, u16 next_cpu)
@@ -2997,12 +2941,11 @@ int netif_rx(struct sk_buff *skb)
2997 if (netpoll_rx(skb)) 2941 if (netpoll_rx(skb))
2998 return NET_RX_DROP; 2942 return NET_RX_DROP;
2999 2943
3000 if (netdev_tstamp_prequeue) 2944 net_timestamp_check(netdev_tstamp_prequeue, skb);
3001 net_timestamp_check(skb);
3002 2945
3003 trace_netif_rx(skb); 2946 trace_netif_rx(skb);
3004#ifdef CONFIG_RPS 2947#ifdef CONFIG_RPS
3005 { 2948 if (static_branch(&rps_needed)) {
3006 struct rps_dev_flow voidflow, *rflow = &voidflow; 2949 struct rps_dev_flow voidflow, *rflow = &voidflow;
3007 int cpu; 2950 int cpu;
3008 2951
@@ -3017,14 +2960,13 @@ int netif_rx(struct sk_buff *skb)
3017 2960
3018 rcu_read_unlock(); 2961 rcu_read_unlock();
3019 preempt_enable(); 2962 preempt_enable();
3020 } 2963 } else
3021#else 2964#endif
3022 { 2965 {
3023 unsigned int qtail; 2966 unsigned int qtail;
3024 ret = enqueue_to_backlog(skb, get_cpu(), &qtail); 2967 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3025 put_cpu(); 2968 put_cpu();
3026 } 2969 }
3027#endif
3028 return ret; 2970 return ret;
3029} 2971}
3030EXPORT_SYMBOL(netif_rx); 2972EXPORT_SYMBOL(netif_rx);
@@ -3230,8 +3172,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
3230 int ret = NET_RX_DROP; 3172 int ret = NET_RX_DROP;
3231 __be16 type; 3173 __be16 type;
3232 3174
3233 if (!netdev_tstamp_prequeue) 3175 net_timestamp_check(!netdev_tstamp_prequeue, skb);
3234 net_timestamp_check(skb);
3235 3176
3236 trace_netif_receive_skb(skb); 3177 trace_netif_receive_skb(skb);
3237 3178
@@ -3362,14 +3303,13 @@ out:
3362 */ 3303 */
3363int netif_receive_skb(struct sk_buff *skb) 3304int netif_receive_skb(struct sk_buff *skb)
3364{ 3305{
3365 if (netdev_tstamp_prequeue) 3306 net_timestamp_check(netdev_tstamp_prequeue, skb);
3366 net_timestamp_check(skb);
3367 3307
3368 if (skb_defer_rx_timestamp(skb)) 3308 if (skb_defer_rx_timestamp(skb))
3369 return NET_RX_SUCCESS; 3309 return NET_RX_SUCCESS;
3370 3310
3371#ifdef CONFIG_RPS 3311#ifdef CONFIG_RPS
3372 { 3312 if (static_branch(&rps_needed)) {
3373 struct rps_dev_flow voidflow, *rflow = &voidflow; 3313 struct rps_dev_flow voidflow, *rflow = &voidflow;
3374 int cpu, ret; 3314 int cpu, ret;
3375 3315
@@ -3380,16 +3320,12 @@ int netif_receive_skb(struct sk_buff *skb)
3380 if (cpu >= 0) { 3320 if (cpu >= 0) {
3381 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 3321 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3382 rcu_read_unlock(); 3322 rcu_read_unlock();
3383 } else { 3323 return ret;
3384 rcu_read_unlock();
3385 ret = __netif_receive_skb(skb);
3386 } 3324 }
3387 3325 rcu_read_unlock();
3388 return ret;
3389 } 3326 }
3390#else
3391 return __netif_receive_skb(skb);
3392#endif 3327#endif
3328 return __netif_receive_skb(skb);
3393} 3329}
3394EXPORT_SYMBOL(netif_receive_skb); 3330EXPORT_SYMBOL(netif_receive_skb);
3395 3331
@@ -3564,14 +3500,20 @@ static inline gro_result_t
3564__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 3500__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3565{ 3501{
3566 struct sk_buff *p; 3502 struct sk_buff *p;
3503 unsigned int maclen = skb->dev->hard_header_len;
3567 3504
3568 for (p = napi->gro_list; p; p = p->next) { 3505 for (p = napi->gro_list; p; p = p->next) {
3569 unsigned long diffs; 3506 unsigned long diffs;
3570 3507
3571 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; 3508 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3572 diffs |= p->vlan_tci ^ skb->vlan_tci; 3509 diffs |= p->vlan_tci ^ skb->vlan_tci;
3573 diffs |= compare_ether_header(skb_mac_header(p), 3510 if (maclen == ETH_HLEN)
3574 skb_gro_mac_header(skb)); 3511 diffs |= compare_ether_header(skb_mac_header(p),
3512 skb_gro_mac_header(skb));
3513 else if (!diffs)
3514 diffs = memcmp(skb_mac_header(p),
3515 skb_gro_mac_header(skb),
3516 maclen);
3575 NAPI_GRO_CB(p)->same_flow = !diffs; 3517 NAPI_GRO_CB(p)->same_flow = !diffs;
3576 NAPI_GRO_CB(p)->flush = 0; 3518 NAPI_GRO_CB(p)->flush = 0;
3577 } 3519 }
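
The GRO change above stops assuming an Ethernet-sized link-layer header when deciding whether a held packet belongs to the same flow: for the common ETH_HLEN case it keeps the cheap compare_ether_header(), otherwise it memcmp()s the full hard_header_len. A roughly equivalent helper, shown only for clarity; the patch open-codes this inside the gro_list loop and skips the memcmp when earlier fields already differ:

	static bool gro_mac_headers_match(const struct sk_buff *p,
					  struct sk_buff *skb,
					  unsigned int maclen)
	{
		if (maclen == ETH_HLEN)		/* fast path for Ethernet */
			return !compare_ether_header(skb_mac_header(p),
						     skb_gro_mac_header(skb));
		return !memcmp(skb_mac_header(p), skb_gro_mac_header(skb),
			       maclen);
	}
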
@@ -4282,6 +4224,12 @@ static int dev_seq_open(struct inode *inode, struct file *file)
4282 sizeof(struct dev_iter_state)); 4224 sizeof(struct dev_iter_state));
4283} 4225}
4284 4226
4227int dev_seq_open_ops(struct inode *inode, struct file *file,
4228 const struct seq_operations *ops)
4229{
4230 return seq_open_net(inode, file, ops, sizeof(struct dev_iter_state));
4231}
4232
4285static const struct file_operations dev_seq_fops = { 4233static const struct file_operations dev_seq_fops = {
4286 .owner = THIS_MODULE, 4234 .owner = THIS_MODULE,
4287 .open = dev_seq_open, 4235 .open = dev_seq_open,
@@ -4532,7 +4480,7 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)
4532 4480
4533static int __dev_set_promiscuity(struct net_device *dev, int inc) 4481static int __dev_set_promiscuity(struct net_device *dev, int inc)
4534{ 4482{
4535 unsigned short old_flags = dev->flags; 4483 unsigned int old_flags = dev->flags;
4536 uid_t uid; 4484 uid_t uid;
4537 gid_t gid; 4485 gid_t gid;
4538 4486
@@ -4589,7 +4537,7 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc)
4589 */ 4537 */
4590int dev_set_promiscuity(struct net_device *dev, int inc) 4538int dev_set_promiscuity(struct net_device *dev, int inc)
4591{ 4539{
4592 unsigned short old_flags = dev->flags; 4540 unsigned int old_flags = dev->flags;
4593 int err; 4541 int err;
4594 4542
4595 err = __dev_set_promiscuity(dev, inc); 4543 err = __dev_set_promiscuity(dev, inc);
@@ -4616,7 +4564,7 @@ EXPORT_SYMBOL(dev_set_promiscuity);
4616 4564
4617int dev_set_allmulti(struct net_device *dev, int inc) 4565int dev_set_allmulti(struct net_device *dev, int inc)
4618{ 4566{
4619 unsigned short old_flags = dev->flags; 4567 unsigned int old_flags = dev->flags;
4620 4568
4621 ASSERT_RTNL(); 4569 ASSERT_RTNL();
4622 4570
@@ -4719,7 +4667,7 @@ EXPORT_SYMBOL(dev_get_flags);
4719 4667
4720int __dev_change_flags(struct net_device *dev, unsigned int flags) 4668int __dev_change_flags(struct net_device *dev, unsigned int flags)
4721{ 4669{
4722 int old_flags = dev->flags; 4670 unsigned int old_flags = dev->flags;
4723 int ret; 4671 int ret;
4724 4672
4725 ASSERT_RTNL(); 4673 ASSERT_RTNL();
@@ -4802,10 +4750,10 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4802 * Change settings on device based state flags. The flags are 4750 * Change settings on device based state flags. The flags are
4803 * in the userspace exported format. 4751 * in the userspace exported format.
4804 */ 4752 */
4805int dev_change_flags(struct net_device *dev, unsigned flags) 4753int dev_change_flags(struct net_device *dev, unsigned int flags)
4806{ 4754{
4807 int ret, changes; 4755 int ret;
4808 int old_flags = dev->flags; 4756 unsigned int changes, old_flags = dev->flags;
4809 4757
4810 ret = __dev_change_flags(dev, flags); 4758 ret = __dev_change_flags(dev, flags);
4811 if (ret < 0) 4759 if (ret < 0)
@@ -5362,7 +5310,8 @@ static void rollback_registered(struct net_device *dev)
5362 list_del(&single); 5310 list_del(&single);
5363} 5311}
5364 5312
5365static u32 netdev_fix_features(struct net_device *dev, u32 features) 5313static netdev_features_t netdev_fix_features(struct net_device *dev,
5314 netdev_features_t features)
5366{ 5315{
5367 /* Fix illegal checksum combinations */ 5316 /* Fix illegal checksum combinations */
5368 if ((features & NETIF_F_HW_CSUM) && 5317 if ((features & NETIF_F_HW_CSUM) &&
@@ -5371,12 +5320,6 @@ static u32 netdev_fix_features(struct net_device *dev, u32 features)
5371 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); 5320 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5372 } 5321 }
5373 5322
5374 if ((features & NETIF_F_NO_CSUM) &&
5375 (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5376 netdev_warn(dev, "mixed no checksumming and other settings.\n");
5377 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5378 }
5379
5380 /* Fix illegal SG+CSUM combinations. */ 5323 /* Fix illegal SG+CSUM combinations. */
5381 if ((features & NETIF_F_SG) && 5324 if ((features & NETIF_F_SG) &&
5382 !(features & NETIF_F_ALL_CSUM)) { 5325 !(features & NETIF_F_ALL_CSUM)) {
@@ -5424,7 +5367,7 @@ static u32 netdev_fix_features(struct net_device *dev, u32 features)
5424 5367
5425int __netdev_update_features(struct net_device *dev) 5368int __netdev_update_features(struct net_device *dev)
5426{ 5369{
5427 u32 features; 5370 netdev_features_t features;
5428 int err = 0; 5371 int err = 0;
5429 5372
5430 ASSERT_RTNL(); 5373 ASSERT_RTNL();
@@ -5440,16 +5383,16 @@ int __netdev_update_features(struct net_device *dev)
5440 if (dev->features == features) 5383 if (dev->features == features)
5441 return 0; 5384 return 0;
5442 5385
5443 netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n", 5386 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5444 dev->features, features); 5387 &dev->features, &features);
5445 5388
5446 if (dev->netdev_ops->ndo_set_features) 5389 if (dev->netdev_ops->ndo_set_features)
5447 err = dev->netdev_ops->ndo_set_features(dev, features); 5390 err = dev->netdev_ops->ndo_set_features(dev, features);
5448 5391
5449 if (unlikely(err < 0)) { 5392 if (unlikely(err < 0)) {
5450 netdev_err(dev, 5393 netdev_err(dev,
5451 "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n", 5394 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5452 err, features, dev->features); 5395 err, &features, &dev->features);
5453 return -1; 5396 return -1;
5454 } 5397 }
5455 5398
@@ -5548,6 +5491,9 @@ static void netdev_init_one_queue(struct net_device *dev,
5548 queue->xmit_lock_owner = -1; 5491 queue->xmit_lock_owner = -1;
5549 netdev_queue_numa_node_write(queue, NUMA_NO_NODE); 5492 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5550 queue->dev = dev; 5493 queue->dev = dev;
5494#ifdef CONFIG_BQL
5495 dql_init(&queue->dql, HZ);
5496#endif
5551} 5497}
5552 5498
5553static int netif_alloc_netdev_queues(struct net_device *dev) 5499static int netif_alloc_netdev_queues(struct net_device *dev)
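
dql_init() above wires each TX queue up for byte queue limits (BQL): the dynamic queue limit tracks how many bytes are in flight per queue so the stack can keep hardware rings short. Drivers opt in by reporting bytes at transmit and completion time. A hedged sketch of that driver side, assuming netdev_sent_queue()/netdev_completed_queue() as the accounting entry points; the mydrv_* functions are hypothetical:

	static netdev_tx_t mydrv_start_xmit(struct sk_buff *skb,
					    struct net_device *dev)
	{
		unsigned int len = skb->len;

		/* ... post skb to the hardware TX ring ... */
		netdev_sent_queue(dev, len);		/* feeds queue->dql */
		return NETDEV_TX_OK;
	}

	static void mydrv_tx_clean(struct net_device *dev,
				   unsigned int pkts, unsigned int bytes)
	{
		/* ... reclaim completed TX descriptors ... */
		netdev_completed_queue(dev, pkts, bytes);
	}
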
@@ -5633,11 +5579,12 @@ int register_netdevice(struct net_device *dev)
5633 dev->wanted_features = dev->features & dev->hw_features; 5579 dev->wanted_features = dev->features & dev->hw_features;
5634 5580
5635 /* Turn on no cache copy if HW is doing checksum */ 5581 /* Turn on no cache copy if HW is doing checksum */
5636 dev->hw_features |= NETIF_F_NOCACHE_COPY; 5582 if (!(dev->flags & IFF_LOOPBACK)) {
5637 if ((dev->features & NETIF_F_ALL_CSUM) && 5583 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5638 !(dev->features & NETIF_F_NO_CSUM)) { 5584 if (dev->features & NETIF_F_ALL_CSUM) {
5639 dev->wanted_features |= NETIF_F_NOCACHE_COPY; 5585 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5640 dev->features |= NETIF_F_NOCACHE_COPY; 5586 dev->features |= NETIF_F_NOCACHE_COPY;
5587 }
5641 } 5588 }
5642 5589
5643 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. 5590 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
@@ -6373,7 +6320,8 @@ static int dev_cpu_callback(struct notifier_block *nfb,
6373 * @one to the master device with current feature set @all. Will not 6320 * @one to the master device with current feature set @all. Will not
6374 * enable anything that is off in @mask. Returns the new feature set. 6321 * enable anything that is off in @mask. Returns the new feature set.
6375 */ 6322 */
6376u32 netdev_increment_features(u32 all, u32 one, u32 mask) 6323netdev_features_t netdev_increment_features(netdev_features_t all,
6324 netdev_features_t one, netdev_features_t mask)
6377{ 6325{
6378 if (mask & NETIF_F_GEN_CSUM) 6326 if (mask & NETIF_F_GEN_CSUM)
6379 mask |= NETIF_F_ALL_CSUM; 6327 mask |= NETIF_F_ALL_CSUM;
@@ -6382,10 +6330,6 @@ u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6382 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; 6330 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6383 all &= one | ~NETIF_F_ALL_FOR_ALL; 6331 all &= one | ~NETIF_F_ALL_FOR_ALL;
6384 6332
6385 /* If device needs checksumming, downgrade to it. */
6386 if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6387 all &= ~NETIF_F_NO_CSUM;
6388
6389 /* If one device supports hw checksumming, set for all. */ 6333 /* If one device supports hw checksumming, set for all. */
6390 if (all & NETIF_F_GEN_CSUM) 6334 if (all & NETIF_F_GEN_CSUM)
6391 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); 6335 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 277faef9148d..29c07fef9228 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -427,7 +427,7 @@ EXPORT_SYMBOL(dev_uc_del);
427 * 427 *
428 * Add newly added addresses to the destination device and release 428 * Add newly added addresses to the destination device and release
429 * addresses that have no users left. The source device must be 429 * addresses that have no users left. The source device must be
430 * locked by netif_tx_lock_bh. 430 * locked by netif_addr_lock_bh.
431 * 431 *
432 * This function is intended to be called from the dev->set_rx_mode 432 * This function is intended to be called from the dev->set_rx_mode
433 * function of layered software devices. 433 * function of layered software devices.
@@ -439,11 +439,11 @@ int dev_uc_sync(struct net_device *to, struct net_device *from)
439 if (to->addr_len != from->addr_len) 439 if (to->addr_len != from->addr_len)
440 return -EINVAL; 440 return -EINVAL;
441 441
442 netif_addr_lock_bh(to); 442 netif_addr_lock_nested(to);
443 err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); 443 err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
444 if (!err) 444 if (!err)
445 __dev_set_rx_mode(to); 445 __dev_set_rx_mode(to);
446 netif_addr_unlock_bh(to); 446 netif_addr_unlock(to);
447 return err; 447 return err;
448} 448}
449EXPORT_SYMBOL(dev_uc_sync); 449EXPORT_SYMBOL(dev_uc_sync);
@@ -463,7 +463,7 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from)
463 return; 463 return;
464 464
465 netif_addr_lock_bh(from); 465 netif_addr_lock_bh(from);
466 netif_addr_lock(to); 466 netif_addr_lock_nested(to);
467 __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); 467 __hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
468 __dev_set_rx_mode(to); 468 __dev_set_rx_mode(to);
469 netif_addr_unlock(to); 469 netif_addr_unlock(to);
@@ -590,7 +590,7 @@ EXPORT_SYMBOL(dev_mc_del_global);
590 * 590 *
591 * Add newly added addresses to the destination device and release 591 * Add newly added addresses to the destination device and release
592 * addresses that have no users left. The source device must be 592 * addresses that have no users left. The source device must be
593 * locked by netif_tx_lock_bh. 593 * locked by netif_addr_lock_bh.
594 * 594 *
595 * This function is intended to be called from the ndo_set_rx_mode 595 * This function is intended to be called from the ndo_set_rx_mode
596 * function of layered software devices. 596 * function of layered software devices.
@@ -602,11 +602,11 @@ int dev_mc_sync(struct net_device *to, struct net_device *from)
602 if (to->addr_len != from->addr_len) 602 if (to->addr_len != from->addr_len)
603 return -EINVAL; 603 return -EINVAL;
604 604
605 netif_addr_lock_bh(to); 605 netif_addr_lock_nested(to);
606 err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); 606 err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len);
607 if (!err) 607 if (!err)
608 __dev_set_rx_mode(to); 608 __dev_set_rx_mode(to);
609 netif_addr_unlock_bh(to); 609 netif_addr_unlock(to);
610 return err; 610 return err;
611} 611}
612EXPORT_SYMBOL(dev_mc_sync); 612EXPORT_SYMBOL(dev_mc_sync);
@@ -626,7 +626,7 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from)
626 return; 626 return;
627 627
628 netif_addr_lock_bh(from); 628 netif_addr_lock_bh(from);
629 netif_addr_lock(to); 629 netif_addr_lock_nested(to);
630 __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); 630 __hw_addr_unsync(&to->mc, &from->mc, to->addr_len);
631 __dev_set_rx_mode(to); 631 __dev_set_rx_mode(to);
632 netif_addr_unlock(to); 632 netif_addr_unlock(to);
@@ -696,8 +696,7 @@ static const struct seq_operations dev_mc_seq_ops = {
696 696
697static int dev_mc_seq_open(struct inode *inode, struct file *file) 697static int dev_mc_seq_open(struct inode *inode, struct file *file)
698{ 698{
699 return seq_open_net(inode, file, &dev_mc_seq_ops, 699 return dev_seq_open_ops(inode, file, &dev_mc_seq_ops);
700 sizeof(struct seq_net_private));
701} 700}
702 701
703static const struct file_operations dev_mc_seq_fops = { 702static const struct file_operations dev_mc_seq_fops = {
diff --git a/net/core/dst.c b/net/core/dst.c
index d5e2c4c09107..43d94cedbf7c 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -366,7 +366,7 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
366 dev_hold(dst->dev); 366 dev_hold(dst->dev);
367 dev_put(dev); 367 dev_put(dev);
368 rcu_read_lock(); 368 rcu_read_lock();
369 neigh = dst_get_neighbour(dst); 369 neigh = dst_get_neighbour_noref(dst);
370 if (neigh && neigh->dev == dev) { 370 if (neigh && neigh->dev == dev) {
371 neigh->dev = dst->dev; 371 neigh->dev = dst->dev;
372 dev_hold(dst->dev); 372 dev_hold(dst->dev);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index f44481707124..3f79db1b612a 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -36,235 +36,44 @@ u32 ethtool_op_get_link(struct net_device *dev)
36} 36}
37EXPORT_SYMBOL(ethtool_op_get_link); 37EXPORT_SYMBOL(ethtool_op_get_link);
38 38
39u32 ethtool_op_get_tx_csum(struct net_device *dev)
40{
41 return (dev->features & NETIF_F_ALL_CSUM) != 0;
42}
43EXPORT_SYMBOL(ethtool_op_get_tx_csum);
44
45int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
46{
47 if (data)
48 dev->features |= NETIF_F_IP_CSUM;
49 else
50 dev->features &= ~NETIF_F_IP_CSUM;
51
52 return 0;
53}
54EXPORT_SYMBOL(ethtool_op_set_tx_csum);
55
56int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data)
57{
58 if (data)
59 dev->features |= NETIF_F_HW_CSUM;
60 else
61 dev->features &= ~NETIF_F_HW_CSUM;
62
63 return 0;
64}
65EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum);
66
67int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data)
68{
69 if (data)
70 dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
71 else
72 dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
73
74 return 0;
75}
76EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum);
77
78u32 ethtool_op_get_sg(struct net_device *dev)
79{
80 return (dev->features & NETIF_F_SG) != 0;
81}
82EXPORT_SYMBOL(ethtool_op_get_sg);
83
84int ethtool_op_set_sg(struct net_device *dev, u32 data)
85{
86 if (data)
87 dev->features |= NETIF_F_SG;
88 else
89 dev->features &= ~NETIF_F_SG;
90
91 return 0;
92}
93EXPORT_SYMBOL(ethtool_op_set_sg);
94
95u32 ethtool_op_get_tso(struct net_device *dev)
96{
97 return (dev->features & NETIF_F_TSO) != 0;
98}
99EXPORT_SYMBOL(ethtool_op_get_tso);
100
101int ethtool_op_set_tso(struct net_device *dev, u32 data)
102{
103 if (data)
104 dev->features |= NETIF_F_TSO;
105 else
106 dev->features &= ~NETIF_F_TSO;
107
108 return 0;
109}
110EXPORT_SYMBOL(ethtool_op_set_tso);
111
112u32 ethtool_op_get_ufo(struct net_device *dev)
113{
114 return (dev->features & NETIF_F_UFO) != 0;
115}
116EXPORT_SYMBOL(ethtool_op_get_ufo);
117
118int ethtool_op_set_ufo(struct net_device *dev, u32 data)
119{
120 if (data)
121 dev->features |= NETIF_F_UFO;
122 else
123 dev->features &= ~NETIF_F_UFO;
124 return 0;
125}
126EXPORT_SYMBOL(ethtool_op_set_ufo);
127
128/* the following list of flags are the same as their associated
129 * NETIF_F_xxx values in include/linux/netdevice.h
130 */
131static const u32 flags_dup_features =
132 (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE |
133 ETH_FLAG_RXHASH);
134
135u32 ethtool_op_get_flags(struct net_device *dev)
136{
137 /* in the future, this function will probably contain additional
138 * handling for flags which are not so easily handled
139 * by a simple masking operation
140 */
141
142 return dev->features & flags_dup_features;
143}
144EXPORT_SYMBOL(ethtool_op_get_flags);
145
146/* Check if device can enable (or disable) particular feature coded in "data"
147 * argument. Flags "supported" describe features that can be toggled by device.
148 * If feature can not be toggled, it state (enabled or disabled) must match
149 * hardcoded device features state, otherwise flags are marked as invalid.
150 */
151bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported)
152{
153 u32 features = dev->features & flags_dup_features;
154 /* "data" can contain only flags_dup_features bits,
155 * see __ethtool_set_flags */
156
157 return (features & ~supported) != (data & ~supported);
158}
159EXPORT_SYMBOL(ethtool_invalid_flags);
160
161int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported)
162{
163 if (ethtool_invalid_flags(dev, data, supported))
164 return -EINVAL;
165
166 dev->features = ((dev->features & ~flags_dup_features) |
167 (data & flags_dup_features));
168 return 0;
169}
170EXPORT_SYMBOL(ethtool_op_set_flags);
171
172/* Handlers for each ethtool command */ 39/* Handlers for each ethtool command */
173 40
174#define ETHTOOL_DEV_FEATURE_WORDS 1 41#define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32)
175 42
176static void ethtool_get_features_compat(struct net_device *dev, 43static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
177 struct ethtool_get_features_block *features) 44 [NETIF_F_SG_BIT] = "tx-scatter-gather",
178{ 45 [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4",
179 if (!dev->ethtool_ops) 46 [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic",
180 return; 47 [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6",
181 48 [NETIF_F_HIGHDMA_BIT] = "highdma",
182 /* getting RX checksum */ 49 [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist",
183 if (dev->ethtool_ops->get_rx_csum) 50 [NETIF_F_HW_VLAN_TX_BIT] = "tx-vlan-hw-insert",
184 if (dev->ethtool_ops->get_rx_csum(dev)) 51
185 features[0].active |= NETIF_F_RXCSUM; 52 [NETIF_F_HW_VLAN_RX_BIT] = "rx-vlan-hw-parse",
186 53 [NETIF_F_HW_VLAN_FILTER_BIT] = "rx-vlan-filter",
187 /* mark legacy-changeable features */ 54 [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged",
188 if (dev->ethtool_ops->set_sg) 55 [NETIF_F_GSO_BIT] = "tx-generic-segmentation",
189 features[0].available |= NETIF_F_SG; 56 [NETIF_F_LLTX_BIT] = "tx-lockless",
190 if (dev->ethtool_ops->set_tx_csum) 57 [NETIF_F_NETNS_LOCAL_BIT] = "netns-local",
191 features[0].available |= NETIF_F_ALL_CSUM; 58 [NETIF_F_GRO_BIT] = "rx-gro",
192 if (dev->ethtool_ops->set_tso) 59 [NETIF_F_LRO_BIT] = "rx-lro",
193 features[0].available |= NETIF_F_ALL_TSO; 60
194 if (dev->ethtool_ops->set_rx_csum) 61 [NETIF_F_TSO_BIT] = "tx-tcp-segmentation",
195 features[0].available |= NETIF_F_RXCSUM; 62 [NETIF_F_UFO_BIT] = "tx-udp-fragmentation",
196 if (dev->ethtool_ops->set_flags) 63 [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust",
197 features[0].available |= flags_dup_features; 64 [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation",
198} 65 [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
199 66 [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation",
200static int ethtool_set_feature_compat(struct net_device *dev, 67
201 int (*legacy_set)(struct net_device *, u32), 68 [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
202 struct ethtool_set_features_block *features, u32 mask) 69 [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp",
203{ 70 [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu",
204 u32 do_set; 71 [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter",
205 72 [NETIF_F_RXHASH_BIT] = "rx-hashing",
206 if (!legacy_set) 73 [NETIF_F_RXCSUM_BIT] = "rx-checksum",
207 return 0; 74 [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy",
208 75 [NETIF_F_LOOPBACK_BIT] = "loopback",
209 if (!(features[0].valid & mask)) 76};
210 return 0;
211
212 features[0].valid &= ~mask;
213
214 do_set = !!(features[0].requested & mask);
215
216 if (legacy_set(dev, do_set) < 0)
217 netdev_info(dev,
218 "Legacy feature change (%s) failed for 0x%08x\n",
219 do_set ? "set" : "clear", mask);
220
221 return 1;
222}
223
224static int ethtool_set_flags_compat(struct net_device *dev,
225 int (*legacy_set)(struct net_device *, u32),
226 struct ethtool_set_features_block *features, u32 mask)
227{
228 u32 value;
229
230 if (!legacy_set)
231 return 0;
232
233 if (!(features[0].valid & mask))
234 return 0;
235
236 value = dev->features & ~features[0].valid;
237 value |= features[0].requested;
238
239 features[0].valid &= ~mask;
240
241 if (legacy_set(dev, value & mask) < 0)
242 netdev_info(dev, "Legacy flags change failed\n");
243
244 return 1;
245}
246
247static int ethtool_set_features_compat(struct net_device *dev,
248 struct ethtool_set_features_block *features)
249{
250 int compat;
251
252 if (!dev->ethtool_ops)
253 return 0;
254
255 compat = ethtool_set_feature_compat(dev, dev->ethtool_ops->set_sg,
256 features, NETIF_F_SG);
257 compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tx_csum,
258 features, NETIF_F_ALL_CSUM);
259 compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tso,
260 features, NETIF_F_ALL_TSO);
261 compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_rx_csum,
262 features, NETIF_F_RXCSUM);
263 compat |= ethtool_set_flags_compat(dev, dev->ethtool_ops->set_flags,
264 features, flags_dup_features);
265
266 return compat;
267}
268 77
269static int ethtool_get_features(struct net_device *dev, void __user *useraddr) 78static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
270{ 79{
@@ -272,18 +81,21 @@ static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
272 .cmd = ETHTOOL_GFEATURES, 81 .cmd = ETHTOOL_GFEATURES,
273 .size = ETHTOOL_DEV_FEATURE_WORDS, 82 .size = ETHTOOL_DEV_FEATURE_WORDS,
274 }; 83 };
275 struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS] = { 84 struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
276 {
277 .available = dev->hw_features,
278 .requested = dev->wanted_features,
279 .active = dev->features,
280 .never_changed = NETIF_F_NEVER_CHANGE,
281 },
282 };
283 u32 __user *sizeaddr; 85 u32 __user *sizeaddr;
284 u32 copy_size; 86 u32 copy_size;
87 int i;
285 88
286 ethtool_get_features_compat(dev, features); 89 /* in case feature bits run out again */
90 BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t));
91
92 for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) {
93 features[i].available = (u32)(dev->hw_features >> (32 * i));
94 features[i].requested = (u32)(dev->wanted_features >> (32 * i));
95 features[i].active = (u32)(dev->features >> (32 * i));
96 features[i].never_changed =
97 (u32)(NETIF_F_NEVER_CHANGE >> (32 * i));
98 }
287 99
288 sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size); 100 sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size);
289 if (get_user(copy_size, sizeaddr)) 101 if (get_user(copy_size, sizeaddr))
@@ -305,7 +117,8 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
305{ 117{
306 struct ethtool_sfeatures cmd; 118 struct ethtool_sfeatures cmd;
307 struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; 119 struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
308 int ret = 0; 120 netdev_features_t wanted = 0, valid = 0;
121 int i, ret = 0;
309 122
310 if (copy_from_user(&cmd, useraddr, sizeof(cmd))) 123 if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
311 return -EFAULT; 124 return -EFAULT;
@@ -317,65 +130,29 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
317 if (copy_from_user(features, useraddr, sizeof(features))) 130 if (copy_from_user(features, useraddr, sizeof(features)))
318 return -EFAULT; 131 return -EFAULT;
319 132
320 if (features[0].valid & ~NETIF_F_ETHTOOL_BITS) 133 for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) {
321 return -EINVAL; 134 valid |= (netdev_features_t)features[i].valid << (32 * i);
135 wanted |= (netdev_features_t)features[i].requested << (32 * i);
136 }
322 137
323 if (ethtool_set_features_compat(dev, features)) 138 if (valid & ~NETIF_F_ETHTOOL_BITS)
324 ret |= ETHTOOL_F_COMPAT; 139 return -EINVAL;
325 140
326 if (features[0].valid & ~dev->hw_features) { 141 if (valid & ~dev->hw_features) {
327 features[0].valid &= dev->hw_features; 142 valid &= dev->hw_features;
328 ret |= ETHTOOL_F_UNSUPPORTED; 143 ret |= ETHTOOL_F_UNSUPPORTED;
329 } 144 }
330 145
331 dev->wanted_features &= ~features[0].valid; 146 dev->wanted_features &= ~valid;
332 dev->wanted_features |= features[0].valid & features[0].requested; 147 dev->wanted_features |= wanted & valid;
333 __netdev_update_features(dev); 148 __netdev_update_features(dev);
334 149
335 if ((dev->wanted_features ^ dev->features) & features[0].valid) 150 if ((dev->wanted_features ^ dev->features) & valid)
336 ret |= ETHTOOL_F_WISH; 151 ret |= ETHTOOL_F_WISH;
337 152
338 return ret; 153 return ret;
339} 154}
340 155
341static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GSTRING_LEN] = {
342 /* NETIF_F_SG */ "tx-scatter-gather",
343 /* NETIF_F_IP_CSUM */ "tx-checksum-ipv4",
344 /* NETIF_F_NO_CSUM */ "tx-checksum-unneeded",
345 /* NETIF_F_HW_CSUM */ "tx-checksum-ip-generic",
346 /* NETIF_F_IPV6_CSUM */ "tx-checksum-ipv6",
347 /* NETIF_F_HIGHDMA */ "highdma",
348 /* NETIF_F_FRAGLIST */ "tx-scatter-gather-fraglist",
349 /* NETIF_F_HW_VLAN_TX */ "tx-vlan-hw-insert",
350
351 /* NETIF_F_HW_VLAN_RX */ "rx-vlan-hw-parse",
352 /* NETIF_F_HW_VLAN_FILTER */ "rx-vlan-filter",
353 /* NETIF_F_VLAN_CHALLENGED */ "vlan-challenged",
354 /* NETIF_F_GSO */ "tx-generic-segmentation",
355 /* NETIF_F_LLTX */ "tx-lockless",
356 /* NETIF_F_NETNS_LOCAL */ "netns-local",
357 /* NETIF_F_GRO */ "rx-gro",
358 /* NETIF_F_LRO */ "rx-lro",
359
360 /* NETIF_F_TSO */ "tx-tcp-segmentation",
361 /* NETIF_F_UFO */ "tx-udp-fragmentation",
362 /* NETIF_F_GSO_ROBUST */ "tx-gso-robust",
363 /* NETIF_F_TSO_ECN */ "tx-tcp-ecn-segmentation",
364 /* NETIF_F_TSO6 */ "tx-tcp6-segmentation",
365 /* NETIF_F_FSO */ "tx-fcoe-segmentation",
366 "",
367 "",
368
369 /* NETIF_F_FCOE_CRC */ "tx-checksum-fcoe-crc",
370 /* NETIF_F_SCTP_CSUM */ "tx-checksum-sctp",
371 /* NETIF_F_FCOE_MTU */ "fcoe-mtu",
372 /* NETIF_F_NTUPLE */ "rx-ntuple-filter",
373 /* NETIF_F_RXHASH */ "rx-hashing",
374 /* NETIF_F_RXCSUM */ "rx-checksum",
375 /* NETIF_F_NOCACHE_COPY */ "tx-nocache-copy",
376 /* NETIF_F_LOOPBACK */ "loopback",
377};
378
379static int __ethtool_get_sset_count(struct net_device *dev, int sset) 156static int __ethtool_get_sset_count(struct net_device *dev, int sset)
380{ 157{
381 const struct ethtool_ops *ops = dev->ethtool_ops; 158 const struct ethtool_ops *ops = dev->ethtool_ops;
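
The hunks above are the ABI side of the netdev_features_t widening: ETHTOOL_DEV_FEATURE_WORDS is now derived from NETDEV_FEATURE_COUNT, the get path exposes the feature masks as an array of 32-bit blocks, and the set path folds the blocks back into one netdev_features_t. A compact sketch of that split/rebuild, matching the shifts used above; the helper names are illustrative only:

	static void features_to_blocks(netdev_features_t f, u32 *blk, int nwords)
	{
		int i;

		for (i = 0; i < nwords; i++)
			blk[i] = (u32)(f >> (32 * i));	/* block 0 = bits 0..31 */
	}

	static netdev_features_t blocks_to_features(const u32 *blk, int nwords)
	{
		netdev_features_t f = 0;
		int i;

		for (i = 0; i < nwords; i++)
			f |= (netdev_features_t)blk[i] << (32 * i);
		return f;
	}
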
@@ -402,7 +179,7 @@ static void __ethtool_get_strings(struct net_device *dev,
402 ops->get_strings(dev, stringset, data); 179 ops->get_strings(dev, stringset, data);
403} 180}
404 181
405static u32 ethtool_get_feature_mask(u32 eth_cmd) 182static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd)
406{ 183{
407 /* feature masks of legacy discrete ethtool ops */ 184 /* feature masks of legacy discrete ethtool ops */
408 185
@@ -433,136 +210,82 @@ static u32 ethtool_get_feature_mask(u32 eth_cmd)
433 } 210 }
434} 211}
435 212
436static void *__ethtool_get_one_feature_actor(struct net_device *dev, u32 ethcmd)
437{
438 const struct ethtool_ops *ops = dev->ethtool_ops;
439
440 if (!ops)
441 return NULL;
442
443 switch (ethcmd) {
444 case ETHTOOL_GTXCSUM:
445 return ops->get_tx_csum;
446 case ETHTOOL_GRXCSUM:
447 return ops->get_rx_csum;
448 case ETHTOOL_SSG:
449 return ops->get_sg;
450 case ETHTOOL_STSO:
451 return ops->get_tso;
452 case ETHTOOL_SUFO:
453 return ops->get_ufo;
454 default:
455 return NULL;
456 }
457}
458
459static u32 __ethtool_get_rx_csum_oldbug(struct net_device *dev)
460{
461 return !!(dev->features & NETIF_F_ALL_CSUM);
462}
463
464static int ethtool_get_one_feature(struct net_device *dev, 213static int ethtool_get_one_feature(struct net_device *dev,
465 char __user *useraddr, u32 ethcmd) 214 char __user *useraddr, u32 ethcmd)
466{ 215{
467 u32 mask = ethtool_get_feature_mask(ethcmd); 216 netdev_features_t mask = ethtool_get_feature_mask(ethcmd);
468 struct ethtool_value edata = { 217 struct ethtool_value edata = {
469 .cmd = ethcmd, 218 .cmd = ethcmd,
470 .data = !!(dev->features & mask), 219 .data = !!(dev->features & mask),
471 }; 220 };
472 221
473 /* compatibility with discrete get_ ops */
474 if (!(dev->hw_features & mask)) {
475 u32 (*actor)(struct net_device *);
476
477 actor = __ethtool_get_one_feature_actor(dev, ethcmd);
478
479 /* bug compatibility with old get_rx_csum */
480 if (ethcmd == ETHTOOL_GRXCSUM && !actor)
481 actor = __ethtool_get_rx_csum_oldbug;
482
483 if (actor)
484 edata.data = actor(dev);
485 }
486
487 if (copy_to_user(useraddr, &edata, sizeof(edata))) 222 if (copy_to_user(useraddr, &edata, sizeof(edata)))
488 return -EFAULT; 223 return -EFAULT;
489 return 0; 224 return 0;
490} 225}
491 226
492static int __ethtool_set_tx_csum(struct net_device *dev, u32 data);
493static int __ethtool_set_rx_csum(struct net_device *dev, u32 data);
494static int __ethtool_set_sg(struct net_device *dev, u32 data);
495static int __ethtool_set_tso(struct net_device *dev, u32 data);
496static int __ethtool_set_ufo(struct net_device *dev, u32 data);
497
498static int ethtool_set_one_feature(struct net_device *dev, 227static int ethtool_set_one_feature(struct net_device *dev,
499 void __user *useraddr, u32 ethcmd) 228 void __user *useraddr, u32 ethcmd)
500{ 229{
501 struct ethtool_value edata; 230 struct ethtool_value edata;
502 u32 mask; 231 netdev_features_t mask;
503 232
504 if (copy_from_user(&edata, useraddr, sizeof(edata))) 233 if (copy_from_user(&edata, useraddr, sizeof(edata)))
505 return -EFAULT; 234 return -EFAULT;
506 235
507 mask = ethtool_get_feature_mask(ethcmd); 236 mask = ethtool_get_feature_mask(ethcmd);
508 mask &= dev->hw_features; 237 mask &= dev->hw_features;
509 if (mask) { 238 if (!mask)
510 if (edata.data) 239 return -EOPNOTSUPP;
511 dev->wanted_features |= mask;
512 else
513 dev->wanted_features &= ~mask;
514 240
515 __netdev_update_features(dev); 241 if (edata.data)
516 return 0; 242 dev->wanted_features |= mask;
517 } 243 else
244 dev->wanted_features &= ~mask;
518 245
519 /* Driver is not converted to ndo_fix_features or does not 246 __netdev_update_features(dev);
520 * support changing this offload. In the latter case it won't
521 * have corresponding ethtool_ops field set.
522 *
523 * Following part is to be removed after all drivers advertise
524 * their changeable features in netdev->hw_features and stop
525 * using discrete offload setting ops.
526 */
527 247
528 switch (ethcmd) { 248 return 0;
529 case ETHTOOL_STXCSUM: 249}
530 return __ethtool_set_tx_csum(dev, edata.data); 250
531 case ETHTOOL_SRXCSUM: 251#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \
532 return __ethtool_set_rx_csum(dev, edata.data); 252 ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH)
533 case ETHTOOL_SSG: 253#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_RX | \
534 return __ethtool_set_sg(dev, edata.data); 254 NETIF_F_HW_VLAN_TX | NETIF_F_NTUPLE | NETIF_F_RXHASH)
535 case ETHTOOL_STSO: 255
536 return __ethtool_set_tso(dev, edata.data); 256static u32 __ethtool_get_flags(struct net_device *dev)
537 case ETHTOOL_SUFO: 257{
538 return __ethtool_set_ufo(dev, edata.data); 258 u32 flags = 0;
539 default: 259
540 return -EOPNOTSUPP; 260 if (dev->features & NETIF_F_LRO) flags |= ETH_FLAG_LRO;
541 } 261 if (dev->features & NETIF_F_HW_VLAN_RX) flags |= ETH_FLAG_RXVLAN;
262 if (dev->features & NETIF_F_HW_VLAN_TX) flags |= ETH_FLAG_TXVLAN;
263 if (dev->features & NETIF_F_NTUPLE) flags |= ETH_FLAG_NTUPLE;
264 if (dev->features & NETIF_F_RXHASH) flags |= ETH_FLAG_RXHASH;
265
266 return flags;
542} 267}
543 268
544int __ethtool_set_flags(struct net_device *dev, u32 data) 269static int __ethtool_set_flags(struct net_device *dev, u32 data)
545{ 270{
546 u32 changed; 271 netdev_features_t features = 0, changed;
547 272
548 if (data & ~flags_dup_features) 273 if (data & ~ETH_ALL_FLAGS)
549 return -EINVAL; 274 return -EINVAL;
550 275
551 /* legacy set_flags() op */ 276 if (data & ETH_FLAG_LRO) features |= NETIF_F_LRO;
552 if (dev->ethtool_ops->set_flags) { 277 if (data & ETH_FLAG_RXVLAN) features |= NETIF_F_HW_VLAN_RX;
553 if (unlikely(dev->hw_features & flags_dup_features)) 278 if (data & ETH_FLAG_TXVLAN) features |= NETIF_F_HW_VLAN_TX;
554 netdev_warn(dev, 279 if (data & ETH_FLAG_NTUPLE) features |= NETIF_F_NTUPLE;
555 "driver BUG: mixed hw_features and set_flags()\n"); 280 if (data & ETH_FLAG_RXHASH) features |= NETIF_F_RXHASH;
556 return dev->ethtool_ops->set_flags(dev, data);
557 }
558 281
559 /* allow changing only bits set in hw_features */ 282 /* allow changing only bits set in hw_features */
560 changed = (data ^ dev->features) & flags_dup_features; 283 changed = (features ^ dev->features) & ETH_ALL_FEATURES;
561 if (changed & ~dev->hw_features) 284 if (changed & ~dev->hw_features)
562 return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP; 285 return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP;
563 286
564 dev->wanted_features = 287 dev->wanted_features =
565 (dev->wanted_features & ~changed) | (data & dev->hw_features); 288 (dev->wanted_features & ~changed) | (features & changed);
566 289
567 __netdev_update_features(dev); 290 __netdev_update_features(dev);
568 291
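With the legacy get_flags()/set_flags() driver ops removed, ETHTOOL_SFLAGS can only toggle bits the driver has marked as changeable in netdev->hw_features. A minimal driver-side sketch of that advertisement (not part of this commit; foo_setup_features() is an illustrative name):

	static void foo_setup_features(struct net_device *dev)
	{
		/* Bits present in hw_features become toggleable both through
		 * the netdev feature interface and, via the ETH_FLAG_* to
		 * NETIF_F_* translation above, through ETHTOOL_SFLAGS.
		 */
		dev->hw_features = NETIF_F_LRO | NETIF_F_RXHASH |
				   NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_TX;
		dev->features |= dev->hw_features;
	}
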
@@ -716,6 +439,7 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
716{ 439{
717 struct ethtool_rxnfc info; 440 struct ethtool_rxnfc info;
718 size_t info_size = sizeof(info); 441 size_t info_size = sizeof(info);
442 int rc;
719 443
720 if (!dev->ethtool_ops->set_rxnfc) 444 if (!dev->ethtool_ops->set_rxnfc)
721 return -EOPNOTSUPP; 445 return -EOPNOTSUPP;
@@ -731,7 +455,15 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
731 if (copy_from_user(&info, useraddr, info_size)) 455 if (copy_from_user(&info, useraddr, info_size))
732 return -EFAULT; 456 return -EFAULT;
733 457
734 return dev->ethtool_ops->set_rxnfc(dev, &info); 458 rc = dev->ethtool_ops->set_rxnfc(dev, &info);
459 if (rc)
460 return rc;
461
462 if (cmd == ETHTOOL_SRXCLSRLINS &&
463 copy_to_user(useraddr, &info, info_size))
464 return -EFAULT;
465
466 return 0;
735} 467}
736 468
737static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, 469static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
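The copy_to_user() added to ethtool_set_rxnfc() means that for ETHTOOL_SRXCLSRLINS the (possibly driver-assigned) rule location is written back to the caller. A hedged userspace sketch of picking that value up (insert_cls_rule() is illustrative; error handling trimmed):

	#include <string.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <net/if.h>
	#include <linux/ethtool.h>
	#include <linux/sockios.h>

	/* fd is any AF_INET/SOCK_DGRAM socket */
	static int insert_cls_rule(int fd, const char *ifname,
				   const struct ethtool_rx_flow_spec *fs)
	{
		struct ethtool_rxnfc nfc;
		struct ifreq ifr;

		memset(&nfc, 0, sizeof(nfc));
		nfc.cmd = ETHTOOL_SRXCLSRLINS;
		nfc.fs = *fs;

		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
		ifr.ifr_data = (void *)&nfc;

		if (ioctl(fd, SIOCETHTOOL, &ifr) < 0)
			return -1;

		/* The kernel now copies the structure back, so the location
		 * the driver actually used is visible here. */
		printf("rule inserted at location %u\n", nfc.fs.location);
		return 0;
	}
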
@@ -792,34 +524,44 @@ err_out:
792static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, 524static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
793 void __user *useraddr) 525 void __user *useraddr)
794{ 526{
795 struct ethtool_rxfh_indir *indir; 527 u32 user_size, dev_size;
796 u32 table_size; 528 u32 *indir;
797 size_t full_size;
798 int ret; 529 int ret;
799 530
800 if (!dev->ethtool_ops->get_rxfh_indir) 531 if (!dev->ethtool_ops->get_rxfh_indir_size ||
532 !dev->ethtool_ops->get_rxfh_indir)
533 return -EOPNOTSUPP;
534 dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
535 if (dev_size == 0)
801 return -EOPNOTSUPP; 536 return -EOPNOTSUPP;
802 537
803 if (copy_from_user(&table_size, 538 if (copy_from_user(&user_size,
804 useraddr + offsetof(struct ethtool_rxfh_indir, size), 539 useraddr + offsetof(struct ethtool_rxfh_indir, size),
805 sizeof(table_size))) 540 sizeof(user_size)))
806 return -EFAULT; 541 return -EFAULT;
807 542
808 if (table_size > 543 if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size),
809 (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index)) 544 &dev_size, sizeof(dev_size)))
810 return -ENOMEM; 545 return -EFAULT;
811 full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size; 546
812 indir = kzalloc(full_size, GFP_USER); 547 /* If the user buffer size is 0, this is just a query for the
548 * device table size. Otherwise, if it's smaller than the
549 * device table size it's an error.
550 */
551 if (user_size < dev_size)
552 return user_size == 0 ? 0 : -EINVAL;
553
554 indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
813 if (!indir) 555 if (!indir)
814 return -ENOMEM; 556 return -ENOMEM;
815 557
816 indir->cmd = ETHTOOL_GRXFHINDIR;
817 indir->size = table_size;
818 ret = dev->ethtool_ops->get_rxfh_indir(dev, indir); 558 ret = dev->ethtool_ops->get_rxfh_indir(dev, indir);
819 if (ret) 559 if (ret)
820 goto out; 560 goto out;
821 561
822 if (copy_to_user(useraddr, indir, full_size)) 562 if (copy_to_user(useraddr +
563 offsetof(struct ethtool_rxfh_indir, ring_index[0]),
564 indir, dev_size * sizeof(indir[0])))
823 ret = -EFAULT; 565 ret = -EFAULT;
824 566
825out: 567out:
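As the comment above spells out, ETHTOOL_GRXFHINDIR is now a two-step protocol: a call with size == 0 only reports the device table size, and a follow-up call with a buffer of exactly that many entries retrieves the table. A hedged userspace sketch (get_indir_table() is illustrative; error handling trimmed):

	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <net/if.h>
	#include <linux/ethtool.h>
	#include <linux/sockios.h>

	/* ifr->ifr_name must already name the target device */
	static struct ethtool_rxfh_indir *get_indir_table(int fd, struct ifreq *ifr)
	{
		struct ethtool_rxfh_indir query = {
			.cmd  = ETHTOOL_GRXFHINDIR,
			.size = 0,		/* step 1: ask for the size only */
		};
		struct ethtool_rxfh_indir *table;

		ifr->ifr_data = (void *)&query;
		if (ioctl(fd, SIOCETHTOOL, ifr) < 0)
			return NULL;

		/* step 2: the kernel wrote the real size back into query.size */
		table = malloc(sizeof(*table) +
			       query.size * sizeof(table->ring_index[0]));
		if (!table)
			return NULL;
		table->cmd  = ETHTOOL_GRXFHINDIR;
		table->size = query.size;

		ifr->ifr_data = (void *)table;
		if (ioctl(fd, SIOCETHTOOL, ifr) < 0) {
			free(table);
			return NULL;
		}
		return table;
	}
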
@@ -830,30 +572,56 @@ out:
830static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, 572static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
831 void __user *useraddr) 573 void __user *useraddr)
832{ 574{
833 struct ethtool_rxfh_indir *indir; 575 struct ethtool_rxnfc rx_rings;
834 u32 table_size; 576 u32 user_size, dev_size, i;
835 size_t full_size; 577 u32 *indir;
836 int ret; 578 int ret;
837 579
838 if (!dev->ethtool_ops->set_rxfh_indir) 580 if (!dev->ethtool_ops->get_rxfh_indir_size ||
581 !dev->ethtool_ops->set_rxfh_indir ||
582 !dev->ethtool_ops->get_rxnfc)
583 return -EOPNOTSUPP;
584 dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
585 if (dev_size == 0)
839 return -EOPNOTSUPP; 586 return -EOPNOTSUPP;
840 587
841 if (copy_from_user(&table_size, 588 if (copy_from_user(&user_size,
842 useraddr + offsetof(struct ethtool_rxfh_indir, size), 589 useraddr + offsetof(struct ethtool_rxfh_indir, size),
843 sizeof(table_size))) 590 sizeof(user_size)))
844 return -EFAULT; 591 return -EFAULT;
845 592
846 if (table_size > 593 if (user_size != 0 && user_size != dev_size)
847 (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index)) 594 return -EINVAL;
848 return -ENOMEM; 595
849 full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size; 596 indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
850 indir = kmalloc(full_size, GFP_USER);
851 if (!indir) 597 if (!indir)
852 return -ENOMEM; 598 return -ENOMEM;
853 599
854 if (copy_from_user(indir, useraddr, full_size)) { 600 rx_rings.cmd = ETHTOOL_GRXRINGS;
855 ret = -EFAULT; 601 ret = dev->ethtool_ops->get_rxnfc(dev, &rx_rings, NULL);
602 if (ret)
856 goto out; 603 goto out;
604
605 if (user_size == 0) {
606 for (i = 0; i < dev_size; i++)
607 indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
608 } else {
609 if (copy_from_user(indir,
610 useraddr +
611 offsetof(struct ethtool_rxfh_indir,
612 ring_index[0]),
613 dev_size * sizeof(indir[0]))) {
614 ret = -EFAULT;
615 goto out;
616 }
617
618 /* Validate ring indices */
619 for (i = 0; i < dev_size; i++) {
620 if (indir[i] >= rx_rings.data) {
621 ret = -EINVAL;
622 goto out;
623 }
624 }
857 } 625 }
858 626
859 ret = dev->ethtool_ops->set_rxfh_indir(dev, indir); 627 ret = dev->ethtool_ops->set_rxfh_indir(dev, indir);
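Writing the table now relies on three driver hooks: get_rxfh_indir_size() for the table length, get_rxnfc(ETHTOOL_GRXRINGS) for the ring count that bounds each entry (and that seeds the default i % n_rings spread via ethtool_rxfh_indir_default() when userspace passes size 0), and set_rxfh_indir() for the write itself. A hedged sketch of the two RSS-specific hooks in a driver (the foo_* names, FOO_RSS_INDIR_SIZE and foo_write_rss_table() are illustrative):

	static u32 foo_get_rxfh_indir_size(struct net_device *dev)
	{
		return FOO_RSS_INDIR_SIZE;	/* e.g. 128 entries */
	}

	static int foo_set_rxfh_indir(struct net_device *dev, const u32 *indir)
	{
		struct foo_priv *priv = netdev_priv(dev);
		unsigned int i;

		/* Each indir[i] has already been checked by the core against
		 * the ring count reported through ETHTOOL_GRXRINGS.
		 */
		for (i = 0; i < FOO_RSS_INDIR_SIZE; i++)
			priv->rss_indir[i] = indir[i];

		return foo_write_rss_table(priv);	/* push to hardware */
	}
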
@@ -863,58 +631,6 @@ out:
863 return ret; 631 return ret;
864} 632}
865 633
866/*
867 * ethtool does not (or did not) set masks for flow parameters that are
868 * not specified, so if both value and mask are 0 then this must be
869 * treated as equivalent to a mask with all bits set. Implement that
870 * here rather than in drivers.
871 */
872static void rx_ntuple_fix_masks(struct ethtool_rx_ntuple_flow_spec *fs)
873{
874 struct ethtool_tcpip4_spec *entry = &fs->h_u.tcp_ip4_spec;
875 struct ethtool_tcpip4_spec *mask = &fs->m_u.tcp_ip4_spec;
876
877 if (fs->flow_type != TCP_V4_FLOW &&
878 fs->flow_type != UDP_V4_FLOW &&
879 fs->flow_type != SCTP_V4_FLOW)
880 return;
881
882 if (!(entry->ip4src | mask->ip4src))
883 mask->ip4src = htonl(0xffffffff);
884 if (!(entry->ip4dst | mask->ip4dst))
885 mask->ip4dst = htonl(0xffffffff);
886 if (!(entry->psrc | mask->psrc))
887 mask->psrc = htons(0xffff);
888 if (!(entry->pdst | mask->pdst))
889 mask->pdst = htons(0xffff);
890 if (!(entry->tos | mask->tos))
891 mask->tos = 0xff;
892 if (!(fs->vlan_tag | fs->vlan_tag_mask))
893 fs->vlan_tag_mask = 0xffff;
894 if (!(fs->data | fs->data_mask))
895 fs->data_mask = 0xffffffffffffffffULL;
896}
897
898static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
899 void __user *useraddr)
900{
901 struct ethtool_rx_ntuple cmd;
902 const struct ethtool_ops *ops = dev->ethtool_ops;
903
904 if (!ops->set_rx_ntuple)
905 return -EOPNOTSUPP;
906
907 if (!(dev->features & NETIF_F_NTUPLE))
908 return -EINVAL;
909
910 if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
911 return -EFAULT;
912
913 rx_ntuple_fix_masks(&cmd.fs);
914
915 return ops->set_rx_ntuple(dev, &cmd);
916}
917
918static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) 634static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
919{ 635{
920 struct ethtool_regs regs; 636 struct ethtool_regs regs;
@@ -1231,81 +947,6 @@ static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
1231 return dev->ethtool_ops->set_pauseparam(dev, &pauseparam); 947 return dev->ethtool_ops->set_pauseparam(dev, &pauseparam);
1232} 948}
1233 949
1234static int __ethtool_set_sg(struct net_device *dev, u32 data)
1235{
1236 int err;
1237
1238 if (!dev->ethtool_ops->set_sg)
1239 return -EOPNOTSUPP;
1240
1241 if (data && !(dev->features & NETIF_F_ALL_CSUM))
1242 return -EINVAL;
1243
1244 if (!data && dev->ethtool_ops->set_tso) {
1245 err = dev->ethtool_ops->set_tso(dev, 0);
1246 if (err)
1247 return err;
1248 }
1249
1250 if (!data && dev->ethtool_ops->set_ufo) {
1251 err = dev->ethtool_ops->set_ufo(dev, 0);
1252 if (err)
1253 return err;
1254 }
1255 return dev->ethtool_ops->set_sg(dev, data);
1256}
1257
1258static int __ethtool_set_tx_csum(struct net_device *dev, u32 data)
1259{
1260 int err;
1261
1262 if (!dev->ethtool_ops->set_tx_csum)
1263 return -EOPNOTSUPP;
1264
1265 if (!data && dev->ethtool_ops->set_sg) {
1266 err = __ethtool_set_sg(dev, 0);
1267 if (err)
1268 return err;
1269 }
1270
1271 return dev->ethtool_ops->set_tx_csum(dev, data);
1272}
1273
1274static int __ethtool_set_rx_csum(struct net_device *dev, u32 data)
1275{
1276 if (!dev->ethtool_ops->set_rx_csum)
1277 return -EOPNOTSUPP;
1278
1279 if (!data)
1280 dev->features &= ~NETIF_F_GRO;
1281
1282 return dev->ethtool_ops->set_rx_csum(dev, data);
1283}
1284
1285static int __ethtool_set_tso(struct net_device *dev, u32 data)
1286{
1287 if (!dev->ethtool_ops->set_tso)
1288 return -EOPNOTSUPP;
1289
1290 if (data && !(dev->features & NETIF_F_SG))
1291 return -EINVAL;
1292
1293 return dev->ethtool_ops->set_tso(dev, data);
1294}
1295
1296static int __ethtool_set_ufo(struct net_device *dev, u32 data)
1297{
1298 if (!dev->ethtool_ops->set_ufo)
1299 return -EOPNOTSUPP;
1300 if (data && !(dev->features & NETIF_F_SG))
1301 return -EINVAL;
1302 if (data && !((dev->features & NETIF_F_GEN_CSUM) ||
1303 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
1304 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)))
1305 return -EINVAL;
1306 return dev->ethtool_ops->set_ufo(dev, data);
1307}
1308
1309static int ethtool_self_test(struct net_device *dev, char __user *useraddr) 950static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
1310{ 951{
1311 struct ethtool_test test; 952 struct ethtool_test test;
@@ -1549,6 +1190,8 @@ static noinline_for_stack int ethtool_flash_device(struct net_device *dev,
1549 if (!dev->ethtool_ops->flash_device) 1190 if (!dev->ethtool_ops->flash_device)
1550 return -EOPNOTSUPP; 1191 return -EOPNOTSUPP;
1551 1192
1193 efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0;
1194
1552 return dev->ethtool_ops->flash_device(dev, &efl); 1195 return dev->ethtool_ops->flash_device(dev, &efl);
1553} 1196}
1554 1197
@@ -1670,6 +1313,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
1670 case ETHTOOL_GRXCSUM: 1313 case ETHTOOL_GRXCSUM:
1671 case ETHTOOL_GTXCSUM: 1314 case ETHTOOL_GTXCSUM:
1672 case ETHTOOL_GSG: 1315 case ETHTOOL_GSG:
1316 case ETHTOOL_GSSET_INFO:
1673 case ETHTOOL_GSTRINGS: 1317 case ETHTOOL_GSTRINGS:
1674 case ETHTOOL_GTSO: 1318 case ETHTOOL_GTSO:
1675 case ETHTOOL_GPERMADDR: 1319 case ETHTOOL_GPERMADDR:
@@ -1771,9 +1415,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
1771 break; 1415 break;
1772 case ETHTOOL_GFLAGS: 1416 case ETHTOOL_GFLAGS:
1773 rc = ethtool_get_value(dev, useraddr, ethcmd, 1417 rc = ethtool_get_value(dev, useraddr, ethcmd,
1774 (dev->ethtool_ops->get_flags ? 1418 __ethtool_get_flags);
1775 dev->ethtool_ops->get_flags :
1776 ethtool_op_get_flags));
1777 break; 1419 break;
1778 case ETHTOOL_SFLAGS: 1420 case ETHTOOL_SFLAGS:
1779 rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); 1421 rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags);
@@ -1804,9 +1446,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
1804 case ETHTOOL_RESET: 1446 case ETHTOOL_RESET:
1805 rc = ethtool_reset(dev, useraddr); 1447 rc = ethtool_reset(dev, useraddr);
1806 break; 1448 break;
1807 case ETHTOOL_SRXNTUPLE:
1808 rc = ethtool_set_rx_ntuple(dev, useraddr);
1809 break;
1810 case ETHTOOL_GSSET_INFO: 1449 case ETHTOOL_GSSET_INFO:
1811 rc = ethtool_get_sset_info(dev, useraddr); 1450 rc = ethtool_get_sset_info(dev, useraddr);
1812 break; 1451 break;
diff --git a/net/core/flow.c b/net/core/flow.c
index 8ae42de9c79e..e318c7e98042 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -358,6 +358,18 @@ void flow_cache_flush(void)
358 put_online_cpus(); 358 put_online_cpus();
359} 359}
360 360
361static void flow_cache_flush_task(struct work_struct *work)
362{
363 flow_cache_flush();
364}
365
366static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task);
367
368void flow_cache_flush_deferred(void)
369{
370 schedule_work(&flow_cache_flush_work);
371}
372
361static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) 373static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
362{ 374{
363 struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); 375 struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
new file mode 100644
index 000000000000..a225089df5b6
--- /dev/null
+++ b/net/core/flow_dissector.c
@@ -0,0 +1,144 @@
1#include <linux/skbuff.h>
2#include <linux/export.h>
3#include <linux/ip.h>
4#include <linux/ipv6.h>
5#include <linux/if_vlan.h>
6#include <net/ip.h>
7#include <linux/if_tunnel.h>
8#include <linux/if_pppox.h>
9#include <linux/ppp_defs.h>
10#include <net/flow_keys.h>
11
12/* copy saddr & daddr, possibly using 64bit load/store
13 * Equivalent to : flow->src = iph->saddr;
14 * flow->dst = iph->daddr;
15 */
16static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph)
17{
18 BUILD_BUG_ON(offsetof(typeof(*flow), dst) !=
19 offsetof(typeof(*flow), src) + sizeof(flow->src));
20 memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst));
21}
22
23bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow)
24{
25 int poff, nhoff = skb_network_offset(skb);
26 u8 ip_proto;
27 __be16 proto = skb->protocol;
28
29 memset(flow, 0, sizeof(*flow));
30
31again:
32 switch (proto) {
33 case __constant_htons(ETH_P_IP): {
34 const struct iphdr *iph;
35 struct iphdr _iph;
36ip:
37 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
38 if (!iph)
39 return false;
40
41 if (ip_is_fragment(iph))
42 ip_proto = 0;
43 else
44 ip_proto = iph->protocol;
45 iph_to_flow_copy_addrs(flow, iph);
46 nhoff += iph->ihl * 4;
47 break;
48 }
49 case __constant_htons(ETH_P_IPV6): {
50 const struct ipv6hdr *iph;
51 struct ipv6hdr _iph;
52ipv6:
53 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
54 if (!iph)
55 return false;
56
57 ip_proto = iph->nexthdr;
58 flow->src = iph->saddr.s6_addr32[3];
59 flow->dst = iph->daddr.s6_addr32[3];
60 nhoff += sizeof(struct ipv6hdr);
61 break;
62 }
63 case __constant_htons(ETH_P_8021Q): {
64 const struct vlan_hdr *vlan;
65 struct vlan_hdr _vlan;
66
67 vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan);
68 if (!vlan)
69 return false;
70
71 proto = vlan->h_vlan_encapsulated_proto;
72 nhoff += sizeof(*vlan);
73 goto again;
74 }
75 case __constant_htons(ETH_P_PPP_SES): {
76 struct {
77 struct pppoe_hdr hdr;
78 __be16 proto;
79 } *hdr, _hdr;
80 hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr);
81 if (!hdr)
82 return false;
83 proto = hdr->proto;
84 nhoff += PPPOE_SES_HLEN;
85 switch (proto) {
86 case __constant_htons(PPP_IP):
87 goto ip;
88 case __constant_htons(PPP_IPV6):
89 goto ipv6;
90 default:
91 return false;
92 }
93 }
94 default:
95 return false;
96 }
97
98 switch (ip_proto) {
99 case IPPROTO_GRE: {
100 struct gre_hdr {
101 __be16 flags;
102 __be16 proto;
103 } *hdr, _hdr;
104
105 hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr);
106 if (!hdr)
107 return false;
108 /*
109 * Only look inside GRE if version zero and no
110 * routing
111 */
112 if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) {
113 proto = hdr->proto;
114 nhoff += 4;
115 if (hdr->flags & GRE_CSUM)
116 nhoff += 4;
117 if (hdr->flags & GRE_KEY)
118 nhoff += 4;
119 if (hdr->flags & GRE_SEQ)
120 nhoff += 4;
121 goto again;
122 }
123 break;
124 }
125 case IPPROTO_IPIP:
126 goto again;
127 default:
128 break;
129 }
130
131 flow->ip_proto = ip_proto;
132 poff = proto_ports_offset(ip_proto);
133 if (poff >= 0) {
134 __be32 *ports, _ports;
135
136 nhoff += poff;
137 ports = skb_header_pointer(skb, nhoff, sizeof(_ports), &_ports);
138 if (ports)
139 flow->ports = *ports;
140 }
141
142 return true;
143}
144EXPORT_SYMBOL(skb_flow_dissect);
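The new skb_flow_dissect() helper factors out the flow-key extraction that dev.c previously did inline, so RPS/RFS and other users can share it. A hedged sketch of a typical consumer turning the keys into a 32-bit flow hash (example_flow_hash() and hashrnd are illustrative; hashrnd would be seeded once at init):

	#include <linux/skbuff.h>
	#include <linux/jhash.h>
	#include <net/flow_keys.h>

	static u32 hashrnd __read_mostly;	/* get_random_bytes() at init */

	static u32 example_flow_hash(const struct sk_buff *skb)
	{
		struct flow_keys keys;
		u32 hash;

		if (!skb_flow_dissect(skb, &keys))
			return 0;

		/* hash addresses, ports and seed; avoid returning 0 so callers
		 * can treat 0 as "no hash available" */
		hash = jhash_3words((__force u32)keys.dst,
				    (__force u32)keys.src,
				    (__force u32)keys.ports, hashrnd);
		if (!hash)
			hash = 1;
		return hash;
	}
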
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 039d51e6c284..2a83914b0277 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -238,6 +238,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
238 it to safe state. 238 it to safe state.
239 */ 239 */
240 skb_queue_purge(&n->arp_queue); 240 skb_queue_purge(&n->arp_queue);
241 n->arp_queue_len_bytes = 0;
241 n->output = neigh_blackhole; 242 n->output = neigh_blackhole;
242 if (n->nud_state & NUD_VALID) 243 if (n->nud_state & NUD_VALID)
243 n->nud_state = NUD_NOARP; 244 n->nud_state = NUD_NOARP;
@@ -272,7 +273,7 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
272} 273}
273EXPORT_SYMBOL(neigh_ifdown); 274EXPORT_SYMBOL(neigh_ifdown);
274 275
275static struct neighbour *neigh_alloc(struct neigh_table *tbl) 276static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
276{ 277{
277 struct neighbour *n = NULL; 278 struct neighbour *n = NULL;
278 unsigned long now = jiffies; 279 unsigned long now = jiffies;
@@ -287,7 +288,15 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
287 goto out_entries; 288 goto out_entries;
288 } 289 }
289 290
290 n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC); 291 if (tbl->entry_size)
292 n = kzalloc(tbl->entry_size, GFP_ATOMIC);
293 else {
294 int sz = sizeof(*n) + tbl->key_len;
295
296 sz = ALIGN(sz, NEIGH_PRIV_ALIGN);
297 sz += dev->neigh_priv_len;
298 n = kzalloc(sz, GFP_ATOMIC);
299 }
291 if (!n) 300 if (!n)
292 goto out_entries; 301 goto out_entries;
293 302
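Dropping the per-table kmem_cache is what allows the allocation to grow by dev->neigh_priv_len bytes of per-device private storage after the key, initialised through the new ndo_neigh_construct() hook called from neigh_create() below. A hedged sketch of a driver using it (foo_* names are illustrative, and neighbour_priv() is assumed to be the accessor for the trailing area):

	struct foo_neigh_priv {
		unsigned long	last_refresh;
		u16		hw_index;
	};

	static int foo_neigh_construct(struct neighbour *n)
	{
		struct foo_neigh_priv *priv = neighbour_priv(n);

		priv->last_refresh = jiffies;
		priv->hw_index = 0;
		return 0;		/* < 0 aborts neigh_create() */
	}

	static void foo_setup(struct net_device *dev)
	{
		/* Reserve room after the neighbour key; foo_neigh_construct
		 * is hooked up as .ndo_neigh_construct in the netdev_ops. */
		dev->neigh_priv_len = sizeof(struct foo_neigh_priv);
	}
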
@@ -313,11 +322,18 @@ out_entries:
313 goto out; 322 goto out;
314} 323}
315 324
325static void neigh_get_hash_rnd(u32 *x)
326{
327 get_random_bytes(x, sizeof(*x));
328 *x |= 1;
329}
330
316static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) 331static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
317{ 332{
318 size_t size = (1 << shift) * sizeof(struct neighbour *); 333 size_t size = (1 << shift) * sizeof(struct neighbour *);
319 struct neigh_hash_table *ret; 334 struct neigh_hash_table *ret;
320 struct neighbour __rcu **buckets; 335 struct neighbour __rcu **buckets;
336 int i;
321 337
322 ret = kmalloc(sizeof(*ret), GFP_ATOMIC); 338 ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
323 if (!ret) 339 if (!ret)
@@ -334,8 +350,8 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
334 } 350 }
335 ret->hash_buckets = buckets; 351 ret->hash_buckets = buckets;
336 ret->hash_shift = shift; 352 ret->hash_shift = shift;
337 get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd)); 353 for (i = 0; i < NEIGH_NUM_HASH_RND; i++)
338 ret->hash_rnd |= 1; 354 neigh_get_hash_rnd(&ret->hash_rnd[i]);
339 return ret; 355 return ret;
340} 356}
341 357
@@ -462,7 +478,7 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
462 u32 hash_val; 478 u32 hash_val;
463 int key_len = tbl->key_len; 479 int key_len = tbl->key_len;
464 int error; 480 int error;
465 struct neighbour *n1, *rc, *n = neigh_alloc(tbl); 481 struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
466 struct neigh_hash_table *nht; 482 struct neigh_hash_table *nht;
467 483
468 if (!n) { 484 if (!n) {
@@ -480,6 +496,14 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
480 goto out_neigh_release; 496 goto out_neigh_release;
481 } 497 }
482 498
499 if (dev->netdev_ops->ndo_neigh_construct) {
500 error = dev->netdev_ops->ndo_neigh_construct(n);
501 if (error < 0) {
502 rc = ERR_PTR(error);
503 goto out_neigh_release;
504 }
505 }
506
483 /* Device specific setup. */ 507 /* Device specific setup. */
484 if (n->parms->neigh_setup && 508 if (n->parms->neigh_setup &&
485 (error = n->parms->neigh_setup(n)) < 0) { 509 (error = n->parms->neigh_setup(n)) < 0) {
@@ -677,18 +701,14 @@ static inline void neigh_parms_put(struct neigh_parms *parms)
677 neigh_parms_destroy(parms); 701 neigh_parms_destroy(parms);
678} 702}
679 703
680static void neigh_destroy_rcu(struct rcu_head *head)
681{
682 struct neighbour *neigh = container_of(head, struct neighbour, rcu);
683
684 kmem_cache_free(neigh->tbl->kmem_cachep, neigh);
685}
686/* 704/*
687 * neighbour must already be out of the table; 705 * neighbour must already be out of the table;
688 * 706 *
689 */ 707 */
690void neigh_destroy(struct neighbour *neigh) 708void neigh_destroy(struct neighbour *neigh)
691{ 709{
710 struct net_device *dev = neigh->dev;
711
692 NEIGH_CACHE_STAT_INC(neigh->tbl, destroys); 712 NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);
693 713
694 if (!neigh->dead) { 714 if (!neigh->dead) {
@@ -702,14 +722,18 @@ void neigh_destroy(struct neighbour *neigh)
702 printk(KERN_WARNING "Impossible event.\n"); 722 printk(KERN_WARNING "Impossible event.\n");
703 723
704 skb_queue_purge(&neigh->arp_queue); 724 skb_queue_purge(&neigh->arp_queue);
725 neigh->arp_queue_len_bytes = 0;
726
727 if (dev->netdev_ops->ndo_neigh_destroy)
728 dev->netdev_ops->ndo_neigh_destroy(neigh);
705 729
706 dev_put(neigh->dev); 730 dev_put(dev);
707 neigh_parms_put(neigh->parms); 731 neigh_parms_put(neigh->parms);
708 732
709 NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); 733 NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);
710 734
711 atomic_dec(&neigh->tbl->entries); 735 atomic_dec(&neigh->tbl->entries);
712 call_rcu(&neigh->rcu, neigh_destroy_rcu); 736 kfree_rcu(neigh, rcu);
713} 737}
714EXPORT_SYMBOL(neigh_destroy); 738EXPORT_SYMBOL(neigh_destroy);
715 739
@@ -802,6 +826,8 @@ next_elt:
802 write_unlock_bh(&tbl->lock); 826 write_unlock_bh(&tbl->lock);
803 cond_resched(); 827 cond_resched();
804 write_lock_bh(&tbl->lock); 828 write_lock_bh(&tbl->lock);
829 nht = rcu_dereference_protected(tbl->nht,
830 lockdep_is_held(&tbl->lock));
805 } 831 }
806 /* Cycle through all hash buckets every base_reachable_time/2 ticks. 832 /* Cycle through all hash buckets every base_reachable_time/2 ticks.
807 * ARP entry timeouts range from 1/2 base_reachable_time to 3/2 833 * ARP entry timeouts range from 1/2 base_reachable_time to 3/2
@@ -842,6 +868,7 @@ static void neigh_invalidate(struct neighbour *neigh)
842 write_lock(&neigh->lock); 868 write_lock(&neigh->lock);
843 } 869 }
844 skb_queue_purge(&neigh->arp_queue); 870 skb_queue_purge(&neigh->arp_queue);
871 neigh->arp_queue_len_bytes = 0;
845} 872}
846 873
847static void neigh_probe(struct neighbour *neigh) 874static void neigh_probe(struct neighbour *neigh)
@@ -980,15 +1007,20 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
980 1007
981 if (neigh->nud_state == NUD_INCOMPLETE) { 1008 if (neigh->nud_state == NUD_INCOMPLETE) {
982 if (skb) { 1009 if (skb) {
983 if (skb_queue_len(&neigh->arp_queue) >= 1010 while (neigh->arp_queue_len_bytes + skb->truesize >
984 neigh->parms->queue_len) { 1011 neigh->parms->queue_len_bytes) {
985 struct sk_buff *buff; 1012 struct sk_buff *buff;
1013
986 buff = __skb_dequeue(&neigh->arp_queue); 1014 buff = __skb_dequeue(&neigh->arp_queue);
1015 if (!buff)
1016 break;
1017 neigh->arp_queue_len_bytes -= buff->truesize;
987 kfree_skb(buff); 1018 kfree_skb(buff);
988 NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); 1019 NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
989 } 1020 }
990 skb_dst_force(skb); 1021 skb_dst_force(skb);
991 __skb_queue_tail(&neigh->arp_queue, skb); 1022 __skb_queue_tail(&neigh->arp_queue, skb);
1023 neigh->arp_queue_len_bytes += skb->truesize;
992 } 1024 }
993 rc = 1; 1025 rc = 1;
994 } 1026 }
@@ -1167,7 +1199,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
1167 1199
1168 rcu_read_lock(); 1200 rcu_read_lock();
1169 /* On shaper/eql skb->dst->neighbour != neigh :( */ 1201 /* On shaper/eql skb->dst->neighbour != neigh :( */
1170 if (dst && (n2 = dst_get_neighbour(dst)) != NULL) 1202 if (dst && (n2 = dst_get_neighbour_noref(dst)) != NULL)
1171 n1 = n2; 1203 n1 = n2;
1172 n1->output(n1, skb); 1204 n1->output(n1, skb);
1173 rcu_read_unlock(); 1205 rcu_read_unlock();
@@ -1175,6 +1207,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
1175 write_lock_bh(&neigh->lock); 1207 write_lock_bh(&neigh->lock);
1176 } 1208 }
1177 skb_queue_purge(&neigh->arp_queue); 1209 skb_queue_purge(&neigh->arp_queue);
1210 neigh->arp_queue_len_bytes = 0;
1178 } 1211 }
1179out: 1212out:
1180 if (update_isrouter) { 1213 if (update_isrouter) {
@@ -1477,11 +1510,6 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
1477 tbl->parms.reachable_time = 1510 tbl->parms.reachable_time =
1478 neigh_rand_reach_time(tbl->parms.base_reachable_time); 1511 neigh_rand_reach_time(tbl->parms.base_reachable_time);
1479 1512
1480 if (!tbl->kmem_cachep)
1481 tbl->kmem_cachep =
1482 kmem_cache_create(tbl->id, tbl->entry_size, 0,
1483 SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1484 NULL);
1485 tbl->stats = alloc_percpu(struct neigh_statistics); 1513 tbl->stats = alloc_percpu(struct neigh_statistics);
1486 if (!tbl->stats) 1514 if (!tbl->stats)
1487 panic("cannot create neighbour cache statistics"); 1515 panic("cannot create neighbour cache statistics");
@@ -1566,9 +1594,6 @@ int neigh_table_clear(struct neigh_table *tbl)
1566 free_percpu(tbl->stats); 1594 free_percpu(tbl->stats);
1567 tbl->stats = NULL; 1595 tbl->stats = NULL;
1568 1596
1569 kmem_cache_destroy(tbl->kmem_cachep);
1570 tbl->kmem_cachep = NULL;
1571
1572 return 0; 1597 return 0;
1573} 1598}
1574EXPORT_SYMBOL(neigh_table_clear); 1599EXPORT_SYMBOL(neigh_table_clear);
@@ -1747,7 +1772,11 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
1747 NLA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex); 1772 NLA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex);
1748 1773
1749 NLA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)); 1774 NLA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt));
1750 NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len); 1775 NLA_PUT_U32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes);
1776 /* approximative value for deprecated QUEUE_LEN (in packets) */
1777 NLA_PUT_U32(skb, NDTPA_QUEUE_LEN,
1778 DIV_ROUND_UP(parms->queue_len_bytes,
1779 SKB_TRUESIZE(ETH_FRAME_LEN)));
1751 NLA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen); 1780 NLA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen);
1752 NLA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes); 1781 NLA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes);
1753 NLA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes); 1782 NLA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes);
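The unresolved queue is now accounted in bytes (queue_len_bytes); the packet-based NDTPA_QUEUE_LEN attribute is kept as an approximation using SKB_TRUESIZE(ETH_FRAME_LEN) as the assumed cost of one queued packet. A hedged illustration of the round trip (helper names are illustrative; the proc_unres_qlen() sysctl handler further down does the same conversion):

	static int unres_qlen_to_bytes(int pkts)
	{
		return pkts * SKB_TRUESIZE(ETH_FRAME_LEN);
	}

	static int unres_qlen_from_bytes(int bytes)
	{
		/* rounds up, so a value written in packets reads back unchanged */
		return DIV_ROUND_UP(bytes, SKB_TRUESIZE(ETH_FRAME_LEN));
	}
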
@@ -1808,7 +1837,7 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
1808 1837
1809 rcu_read_lock_bh(); 1838 rcu_read_lock_bh();
1810 nht = rcu_dereference_bh(tbl->nht); 1839 nht = rcu_dereference_bh(tbl->nht);
1811 ndc.ndtc_hash_rnd = nht->hash_rnd; 1840 ndc.ndtc_hash_rnd = nht->hash_rnd[0];
1812 ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1); 1841 ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1);
1813 rcu_read_unlock_bh(); 1842 rcu_read_unlock_bh();
1814 1843
@@ -1974,7 +2003,11 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1974 2003
1975 switch (i) { 2004 switch (i) {
1976 case NDTPA_QUEUE_LEN: 2005 case NDTPA_QUEUE_LEN:
1977 p->queue_len = nla_get_u32(tbp[i]); 2006 p->queue_len_bytes = nla_get_u32(tbp[i]) *
2007 SKB_TRUESIZE(ETH_FRAME_LEN);
2008 break;
2009 case NDTPA_QUEUE_LENBYTES:
2010 p->queue_len_bytes = nla_get_u32(tbp[i]);
1978 break; 2011 break;
1979 case NDTPA_PROXY_QLEN: 2012 case NDTPA_PROXY_QLEN:
1980 p->proxy_qlen = nla_get_u32(tbp[i]); 2013 p->proxy_qlen = nla_get_u32(tbp[i]);
@@ -2397,7 +2430,10 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
2397 struct net *net = seq_file_net(seq); 2430 struct net *net = seq_file_net(seq);
2398 struct neigh_table *tbl = state->tbl; 2431 struct neigh_table *tbl = state->tbl;
2399 2432
2400 pn = pn->next; 2433 do {
2434 pn = pn->next;
2435 } while (pn && !net_eq(pneigh_net(pn), net));
2436
2401 while (!pn) { 2437 while (!pn) {
2402 if (++state->bucket > PNEIGH_HASHMASK) 2438 if (++state->bucket > PNEIGH_HASHMASK)
2403 break; 2439 break;
@@ -2635,117 +2671,158 @@ EXPORT_SYMBOL(neigh_app_ns);
2635 2671
2636#ifdef CONFIG_SYSCTL 2672#ifdef CONFIG_SYSCTL
2637 2673
2638#define NEIGH_VARS_MAX 19 2674static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer,
2675 size_t *lenp, loff_t *ppos)
2676{
2677 int size, ret;
2678 ctl_table tmp = *ctl;
2679
2680 tmp.data = &size;
2681 size = DIV_ROUND_UP(*(int *)ctl->data, SKB_TRUESIZE(ETH_FRAME_LEN));
2682 ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
2683 if (write && !ret)
2684 *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN);
2685 return ret;
2686}
2687
2688enum {
2689 NEIGH_VAR_MCAST_PROBE,
2690 NEIGH_VAR_UCAST_PROBE,
2691 NEIGH_VAR_APP_PROBE,
2692 NEIGH_VAR_RETRANS_TIME,
2693 NEIGH_VAR_BASE_REACHABLE_TIME,
2694 NEIGH_VAR_DELAY_PROBE_TIME,
2695 NEIGH_VAR_GC_STALETIME,
2696 NEIGH_VAR_QUEUE_LEN,
2697 NEIGH_VAR_QUEUE_LEN_BYTES,
2698 NEIGH_VAR_PROXY_QLEN,
2699 NEIGH_VAR_ANYCAST_DELAY,
2700 NEIGH_VAR_PROXY_DELAY,
2701 NEIGH_VAR_LOCKTIME,
2702 NEIGH_VAR_RETRANS_TIME_MS,
2703 NEIGH_VAR_BASE_REACHABLE_TIME_MS,
2704 NEIGH_VAR_GC_INTERVAL,
2705 NEIGH_VAR_GC_THRESH1,
2706 NEIGH_VAR_GC_THRESH2,
2707 NEIGH_VAR_GC_THRESH3,
2708 NEIGH_VAR_MAX
2709};
2639 2710
2640static struct neigh_sysctl_table { 2711static struct neigh_sysctl_table {
2641 struct ctl_table_header *sysctl_header; 2712 struct ctl_table_header *sysctl_header;
2642 struct ctl_table neigh_vars[NEIGH_VARS_MAX]; 2713 struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];
2643 char *dev_name; 2714 char *dev_name;
2644} neigh_sysctl_template __read_mostly = { 2715} neigh_sysctl_template __read_mostly = {
2645 .neigh_vars = { 2716 .neigh_vars = {
2646 { 2717 [NEIGH_VAR_MCAST_PROBE] = {
2647 .procname = "mcast_solicit", 2718 .procname = "mcast_solicit",
2648 .maxlen = sizeof(int), 2719 .maxlen = sizeof(int),
2649 .mode = 0644, 2720 .mode = 0644,
2650 .proc_handler = proc_dointvec, 2721 .proc_handler = proc_dointvec,
2651 }, 2722 },
2652 { 2723 [NEIGH_VAR_UCAST_PROBE] = {
2653 .procname = "ucast_solicit", 2724 .procname = "ucast_solicit",
2654 .maxlen = sizeof(int), 2725 .maxlen = sizeof(int),
2655 .mode = 0644, 2726 .mode = 0644,
2656 .proc_handler = proc_dointvec, 2727 .proc_handler = proc_dointvec,
2657 }, 2728 },
2658 { 2729 [NEIGH_VAR_APP_PROBE] = {
2659 .procname = "app_solicit", 2730 .procname = "app_solicit",
2660 .maxlen = sizeof(int), 2731 .maxlen = sizeof(int),
2661 .mode = 0644, 2732 .mode = 0644,
2662 .proc_handler = proc_dointvec, 2733 .proc_handler = proc_dointvec,
2663 }, 2734 },
2664 { 2735 [NEIGH_VAR_RETRANS_TIME] = {
2665 .procname = "retrans_time", 2736 .procname = "retrans_time",
2666 .maxlen = sizeof(int), 2737 .maxlen = sizeof(int),
2667 .mode = 0644, 2738 .mode = 0644,
2668 .proc_handler = proc_dointvec_userhz_jiffies, 2739 .proc_handler = proc_dointvec_userhz_jiffies,
2669 }, 2740 },
2670 { 2741 [NEIGH_VAR_BASE_REACHABLE_TIME] = {
2671 .procname = "base_reachable_time", 2742 .procname = "base_reachable_time",
2672 .maxlen = sizeof(int), 2743 .maxlen = sizeof(int),
2673 .mode = 0644, 2744 .mode = 0644,
2674 .proc_handler = proc_dointvec_jiffies, 2745 .proc_handler = proc_dointvec_jiffies,
2675 }, 2746 },
2676 { 2747 [NEIGH_VAR_DELAY_PROBE_TIME] = {
2677 .procname = "delay_first_probe_time", 2748 .procname = "delay_first_probe_time",
2678 .maxlen = sizeof(int), 2749 .maxlen = sizeof(int),
2679 .mode = 0644, 2750 .mode = 0644,
2680 .proc_handler = proc_dointvec_jiffies, 2751 .proc_handler = proc_dointvec_jiffies,
2681 }, 2752 },
2682 { 2753 [NEIGH_VAR_GC_STALETIME] = {
2683 .procname = "gc_stale_time", 2754 .procname = "gc_stale_time",
2684 .maxlen = sizeof(int), 2755 .maxlen = sizeof(int),
2685 .mode = 0644, 2756 .mode = 0644,
2686 .proc_handler = proc_dointvec_jiffies, 2757 .proc_handler = proc_dointvec_jiffies,
2687 }, 2758 },
2688 { 2759 [NEIGH_VAR_QUEUE_LEN] = {
2689 .procname = "unres_qlen", 2760 .procname = "unres_qlen",
2690 .maxlen = sizeof(int), 2761 .maxlen = sizeof(int),
2691 .mode = 0644, 2762 .mode = 0644,
2763 .proc_handler = proc_unres_qlen,
2764 },
2765 [NEIGH_VAR_QUEUE_LEN_BYTES] = {
2766 .procname = "unres_qlen_bytes",
2767 .maxlen = sizeof(int),
2768 .mode = 0644,
2692 .proc_handler = proc_dointvec, 2769 .proc_handler = proc_dointvec,
2693 }, 2770 },
2694 { 2771 [NEIGH_VAR_PROXY_QLEN] = {
2695 .procname = "proxy_qlen", 2772 .procname = "proxy_qlen",
2696 .maxlen = sizeof(int), 2773 .maxlen = sizeof(int),
2697 .mode = 0644, 2774 .mode = 0644,
2698 .proc_handler = proc_dointvec, 2775 .proc_handler = proc_dointvec,
2699 }, 2776 },
2700 { 2777 [NEIGH_VAR_ANYCAST_DELAY] = {
2701 .procname = "anycast_delay", 2778 .procname = "anycast_delay",
2702 .maxlen = sizeof(int), 2779 .maxlen = sizeof(int),
2703 .mode = 0644, 2780 .mode = 0644,
2704 .proc_handler = proc_dointvec_userhz_jiffies, 2781 .proc_handler = proc_dointvec_userhz_jiffies,
2705 }, 2782 },
2706 { 2783 [NEIGH_VAR_PROXY_DELAY] = {
2707 .procname = "proxy_delay", 2784 .procname = "proxy_delay",
2708 .maxlen = sizeof(int), 2785 .maxlen = sizeof(int),
2709 .mode = 0644, 2786 .mode = 0644,
2710 .proc_handler = proc_dointvec_userhz_jiffies, 2787 .proc_handler = proc_dointvec_userhz_jiffies,
2711 }, 2788 },
2712 { 2789 [NEIGH_VAR_LOCKTIME] = {
2713 .procname = "locktime", 2790 .procname = "locktime",
2714 .maxlen = sizeof(int), 2791 .maxlen = sizeof(int),
2715 .mode = 0644, 2792 .mode = 0644,
2716 .proc_handler = proc_dointvec_userhz_jiffies, 2793 .proc_handler = proc_dointvec_userhz_jiffies,
2717 }, 2794 },
2718 { 2795 [NEIGH_VAR_RETRANS_TIME_MS] = {
2719 .procname = "retrans_time_ms", 2796 .procname = "retrans_time_ms",
2720 .maxlen = sizeof(int), 2797 .maxlen = sizeof(int),
2721 .mode = 0644, 2798 .mode = 0644,
2722 .proc_handler = proc_dointvec_ms_jiffies, 2799 .proc_handler = proc_dointvec_ms_jiffies,
2723 }, 2800 },
2724 { 2801 [NEIGH_VAR_BASE_REACHABLE_TIME_MS] = {
2725 .procname = "base_reachable_time_ms", 2802 .procname = "base_reachable_time_ms",
2726 .maxlen = sizeof(int), 2803 .maxlen = sizeof(int),
2727 .mode = 0644, 2804 .mode = 0644,
2728 .proc_handler = proc_dointvec_ms_jiffies, 2805 .proc_handler = proc_dointvec_ms_jiffies,
2729 }, 2806 },
2730 { 2807 [NEIGH_VAR_GC_INTERVAL] = {
2731 .procname = "gc_interval", 2808 .procname = "gc_interval",
2732 .maxlen = sizeof(int), 2809 .maxlen = sizeof(int),
2733 .mode = 0644, 2810 .mode = 0644,
2734 .proc_handler = proc_dointvec_jiffies, 2811 .proc_handler = proc_dointvec_jiffies,
2735 }, 2812 },
2736 { 2813 [NEIGH_VAR_GC_THRESH1] = {
2737 .procname = "gc_thresh1", 2814 .procname = "gc_thresh1",
2738 .maxlen = sizeof(int), 2815 .maxlen = sizeof(int),
2739 .mode = 0644, 2816 .mode = 0644,
2740 .proc_handler = proc_dointvec, 2817 .proc_handler = proc_dointvec,
2741 }, 2818 },
2742 { 2819 [NEIGH_VAR_GC_THRESH2] = {
2743 .procname = "gc_thresh2", 2820 .procname = "gc_thresh2",
2744 .maxlen = sizeof(int), 2821 .maxlen = sizeof(int),
2745 .mode = 0644, 2822 .mode = 0644,
2746 .proc_handler = proc_dointvec, 2823 .proc_handler = proc_dointvec,
2747 }, 2824 },
2748 { 2825 [NEIGH_VAR_GC_THRESH3] = {
2749 .procname = "gc_thresh3", 2826 .procname = "gc_thresh3",
2750 .maxlen = sizeof(int), 2827 .maxlen = sizeof(int),
2751 .mode = 0644, 2828 .mode = 0644,
@@ -2778,47 +2855,49 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
2778 if (!t) 2855 if (!t)
2779 goto err; 2856 goto err;
2780 2857
2781 t->neigh_vars[0].data = &p->mcast_probes; 2858 t->neigh_vars[NEIGH_VAR_MCAST_PROBE].data = &p->mcast_probes;
2782 t->neigh_vars[1].data = &p->ucast_probes; 2859 t->neigh_vars[NEIGH_VAR_UCAST_PROBE].data = &p->ucast_probes;
2783 t->neigh_vars[2].data = &p->app_probes; 2860 t->neigh_vars[NEIGH_VAR_APP_PROBE].data = &p->app_probes;
2784 t->neigh_vars[3].data = &p->retrans_time; 2861 t->neigh_vars[NEIGH_VAR_RETRANS_TIME].data = &p->retrans_time;
2785 t->neigh_vars[4].data = &p->base_reachable_time; 2862 t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].data = &p->base_reachable_time;
2786 t->neigh_vars[5].data = &p->delay_probe_time; 2863 t->neigh_vars[NEIGH_VAR_DELAY_PROBE_TIME].data = &p->delay_probe_time;
2787 t->neigh_vars[6].data = &p->gc_staletime; 2864 t->neigh_vars[NEIGH_VAR_GC_STALETIME].data = &p->gc_staletime;
2788 t->neigh_vars[7].data = &p->queue_len; 2865 t->neigh_vars[NEIGH_VAR_QUEUE_LEN].data = &p->queue_len_bytes;
2789 t->neigh_vars[8].data = &p->proxy_qlen; 2866 t->neigh_vars[NEIGH_VAR_QUEUE_LEN_BYTES].data = &p->queue_len_bytes;
2790 t->neigh_vars[9].data = &p->anycast_delay; 2867 t->neigh_vars[NEIGH_VAR_PROXY_QLEN].data = &p->proxy_qlen;
2791 t->neigh_vars[10].data = &p->proxy_delay; 2868 t->neigh_vars[NEIGH_VAR_ANYCAST_DELAY].data = &p->anycast_delay;
2792 t->neigh_vars[11].data = &p->locktime; 2869 t->neigh_vars[NEIGH_VAR_PROXY_DELAY].data = &p->proxy_delay;
2793 t->neigh_vars[12].data = &p->retrans_time; 2870 t->neigh_vars[NEIGH_VAR_LOCKTIME].data = &p->locktime;
2794 t->neigh_vars[13].data = &p->base_reachable_time; 2871 t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].data = &p->retrans_time;
2872 t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].data = &p->base_reachable_time;
2795 2873
2796 if (dev) { 2874 if (dev) {
2797 dev_name_source = dev->name; 2875 dev_name_source = dev->name;
2798 /* Terminate the table early */ 2876 /* Terminate the table early */
2799 memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14])); 2877 memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,
2878 sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));
2800 } else { 2879 } else {
2801 dev_name_source = neigh_path[NEIGH_CTL_PATH_DEV].procname; 2880 dev_name_source = neigh_path[NEIGH_CTL_PATH_DEV].procname;
2802 t->neigh_vars[14].data = (int *)(p + 1); 2881 t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1);
2803 t->neigh_vars[15].data = (int *)(p + 1) + 1; 2882 t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1;
2804 t->neigh_vars[16].data = (int *)(p + 1) + 2; 2883 t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2;
2805 t->neigh_vars[17].data = (int *)(p + 1) + 3; 2884 t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3;
2806 } 2885 }
2807 2886
2808 2887
2809 if (handler) { 2888 if (handler) {
2810 /* RetransTime */ 2889 /* RetransTime */
2811 t->neigh_vars[3].proc_handler = handler; 2890 t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler;
2812 t->neigh_vars[3].extra1 = dev; 2891 t->neigh_vars[NEIGH_VAR_RETRANS_TIME].extra1 = dev;
2813 /* ReachableTime */ 2892 /* ReachableTime */
2814 t->neigh_vars[4].proc_handler = handler; 2893 t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler;
2815 t->neigh_vars[4].extra1 = dev; 2894 t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].extra1 = dev;
2816 /* RetransTime (in milliseconds)*/ 2895 /* RetransTime (in milliseconds)*/
2817 t->neigh_vars[12].proc_handler = handler; 2896 t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler;
2818 t->neigh_vars[12].extra1 = dev; 2897 t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].extra1 = dev;
2819 /* ReachableTime (in milliseconds) */ 2898 /* ReachableTime (in milliseconds) */
2820 t->neigh_vars[13].proc_handler = handler; 2899 t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler;
2821 t->neigh_vars[13].extra1 = dev; 2900 t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].extra1 = dev;
2822 } 2901 }
2823 2902
2824 t->dev_name = kstrdup(dev_name_source, GFP_KERNEL); 2903 t->dev_name = kstrdup(dev_name_source, GFP_KERNEL);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index c71c434a4c05..a1727cda03d7 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -21,6 +21,7 @@
21#include <linux/wireless.h> 21#include <linux/wireless.h>
22#include <linux/vmalloc.h> 22#include <linux/vmalloc.h>
23#include <linux/export.h> 23#include <linux/export.h>
24#include <linux/jiffies.h>
24#include <net/wext.h> 25#include <net/wext.h>
25 26
26#include "net-sysfs.h" 27#include "net-sysfs.h"
@@ -606,9 +607,12 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
606 rcu_assign_pointer(queue->rps_map, map); 607 rcu_assign_pointer(queue->rps_map, map);
607 spin_unlock(&rps_map_lock); 608 spin_unlock(&rps_map_lock);
608 609
609 if (old_map) 610 if (map)
611 jump_label_inc(&rps_needed);
612 if (old_map) {
610 kfree_rcu(old_map, rcu); 613 kfree_rcu(old_map, rcu);
611 614 jump_label_dec(&rps_needed);
615 }
612 free_cpumask_var(mask); 616 free_cpumask_var(mask);
613 return len; 617 return len;
614} 618}
@@ -618,15 +622,15 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
618 char *buf) 622 char *buf)
619{ 623{
620 struct rps_dev_flow_table *flow_table; 624 struct rps_dev_flow_table *flow_table;
621 unsigned int val = 0; 625 unsigned long val = 0;
622 626
623 rcu_read_lock(); 627 rcu_read_lock();
624 flow_table = rcu_dereference(queue->rps_flow_table); 628 flow_table = rcu_dereference(queue->rps_flow_table);
625 if (flow_table) 629 if (flow_table)
626 val = flow_table->mask + 1; 630 val = (unsigned long)flow_table->mask + 1;
627 rcu_read_unlock(); 631 rcu_read_unlock();
628 632
629 return sprintf(buf, "%u\n", val); 633 return sprintf(buf, "%lu\n", val);
630} 634}
631 635
632static void rps_dev_flow_table_release_work(struct work_struct *work) 636static void rps_dev_flow_table_release_work(struct work_struct *work)
@@ -650,33 +654,46 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
650 struct rx_queue_attribute *attr, 654 struct rx_queue_attribute *attr,
651 const char *buf, size_t len) 655 const char *buf, size_t len)
652{ 656{
653 unsigned int count; 657 unsigned long mask, count;
654 char *endp;
655 struct rps_dev_flow_table *table, *old_table; 658 struct rps_dev_flow_table *table, *old_table;
656 static DEFINE_SPINLOCK(rps_dev_flow_lock); 659 static DEFINE_SPINLOCK(rps_dev_flow_lock);
660 int rc;
657 661
658 if (!capable(CAP_NET_ADMIN)) 662 if (!capable(CAP_NET_ADMIN))
659 return -EPERM; 663 return -EPERM;
660 664
661 count = simple_strtoul(buf, &endp, 0); 665 rc = kstrtoul(buf, 0, &count);
662 if (endp == buf) 666 if (rc < 0)
663 return -EINVAL; 667 return rc;
664 668
665 if (count) { 669 if (count) {
666 int i; 670 mask = count - 1;
667 671 /* mask = roundup_pow_of_two(count) - 1;
668 if (count > 1<<30) { 672 * without overflows...
673 */
674 while ((mask | (mask >> 1)) != mask)
675 mask |= (mask >> 1);
676 /* On 64 bit arches, must check mask fits in table->mask (u32),
677 * and on 32bit arches, must check RPS_DEV_FLOW_TABLE_SIZE(mask + 1)
 678 * doesn't overflow.
679 */
680#if BITS_PER_LONG > 32
681 if (mask > (unsigned long)(u32)mask)
682 return -EINVAL;
683#else
684 if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1))
685 / sizeof(struct rps_dev_flow)) {
669 /* Enforce a limit to prevent overflow */ 686 /* Enforce a limit to prevent overflow */
670 return -EINVAL; 687 return -EINVAL;
671 } 688 }
672 count = roundup_pow_of_two(count); 689#endif
673 table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count)); 690 table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1));
674 if (!table) 691 if (!table)
675 return -ENOMEM; 692 return -ENOMEM;
676 693
677 table->mask = count - 1; 694 table->mask = mask;
678 for (i = 0; i < count; i++) 695 for (count = 0; count <= mask; count++)
679 table->flows[i].cpu = RPS_NO_CPU; 696 table->flows[count].cpu = RPS_NO_CPU;
680 } else 697 } else
681 table = NULL; 698 table = NULL;
682 699
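The loop above computes roundup_pow_of_two(count) - 1 without the intermediate overflow that rounding count itself up could hit: it smears the highest set bit of count - 1 downwards. A self-contained illustration (flow_table_mask() is not part of this commit):

	/* flow_table_mask(1000) == 1023, flow_table_mask(1024) == 1023,
	 * flow_table_mask(1025) == 2047. count must be non-zero, which the
	 * caller above guarantees.
	 */
	static unsigned long flow_table_mask(unsigned long count)
	{
		unsigned long mask = count - 1;

		while ((mask | (mask >> 1)) != mask)
			mask |= mask >> 1;
		return mask;
	}
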
@@ -780,7 +797,7 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
780#endif 797#endif
781} 798}
782 799
783#ifdef CONFIG_XPS 800#ifdef CONFIG_SYSFS
784/* 801/*
785 * netdev_queue sysfs structures and functions. 802 * netdev_queue sysfs structures and functions.
786 */ 803 */
@@ -826,6 +843,133 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = {
826 .store = netdev_queue_attr_store, 843 .store = netdev_queue_attr_store,
827}; 844};
828 845
846static ssize_t show_trans_timeout(struct netdev_queue *queue,
847 struct netdev_queue_attribute *attribute,
848 char *buf)
849{
850 unsigned long trans_timeout;
851
852 spin_lock_irq(&queue->_xmit_lock);
853 trans_timeout = queue->trans_timeout;
854 spin_unlock_irq(&queue->_xmit_lock);
855
856 return sprintf(buf, "%lu", trans_timeout);
857}
858
859static struct netdev_queue_attribute queue_trans_timeout =
860 __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL);
861
862#ifdef CONFIG_BQL
863/*
864 * Byte queue limits sysfs structures and functions.
865 */
866static ssize_t bql_show(char *buf, unsigned int value)
867{
868 return sprintf(buf, "%u\n", value);
869}
870
871static ssize_t bql_set(const char *buf, const size_t count,
872 unsigned int *pvalue)
873{
874 unsigned int value;
875 int err;
876
877 if (!strcmp(buf, "max") || !strcmp(buf, "max\n"))
878 value = DQL_MAX_LIMIT;
879 else {
880 err = kstrtouint(buf, 10, &value);
881 if (err < 0)
882 return err;
883 if (value > DQL_MAX_LIMIT)
884 return -EINVAL;
885 }
886
887 *pvalue = value;
888
889 return count;
890}
891
892static ssize_t bql_show_hold_time(struct netdev_queue *queue,
893 struct netdev_queue_attribute *attr,
894 char *buf)
895{
896 struct dql *dql = &queue->dql;
897
898 return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time));
899}
900
901static ssize_t bql_set_hold_time(struct netdev_queue *queue,
902 struct netdev_queue_attribute *attribute,
903 const char *buf, size_t len)
904{
905 struct dql *dql = &queue->dql;
906 unsigned value;
907 int err;
908
909 err = kstrtouint(buf, 10, &value);
910 if (err < 0)
911 return err;
912
913 dql->slack_hold_time = msecs_to_jiffies(value);
914
915 return len;
916}
917
918static struct netdev_queue_attribute bql_hold_time_attribute =
919 __ATTR(hold_time, S_IRUGO | S_IWUSR, bql_show_hold_time,
920 bql_set_hold_time);
921
922static ssize_t bql_show_inflight(struct netdev_queue *queue,
923 struct netdev_queue_attribute *attr,
924 char *buf)
925{
926 struct dql *dql = &queue->dql;
927
928 return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed);
929}
930
931static struct netdev_queue_attribute bql_inflight_attribute =
932 __ATTR(inflight, S_IRUGO, bql_show_inflight, NULL);
933
934#define BQL_ATTR(NAME, FIELD) \
935static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \
936 struct netdev_queue_attribute *attr, \
937 char *buf) \
938{ \
939 return bql_show(buf, queue->dql.FIELD); \
940} \
941 \
942static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \
943 struct netdev_queue_attribute *attr, \
944 const char *buf, size_t len) \
945{ \
946 return bql_set(buf, len, &queue->dql.FIELD); \
947} \
948 \
949static struct netdev_queue_attribute bql_ ## NAME ## _attribute = \
950 __ATTR(NAME, S_IRUGO | S_IWUSR, bql_show_ ## NAME, \
951 bql_set_ ## NAME);
952
953BQL_ATTR(limit, limit)
954BQL_ATTR(limit_max, max_limit)
955BQL_ATTR(limit_min, min_limit)
956
957static struct attribute *dql_attrs[] = {
958 &bql_limit_attribute.attr,
959 &bql_limit_max_attribute.attr,
960 &bql_limit_min_attribute.attr,
961 &bql_hold_time_attribute.attr,
962 &bql_inflight_attribute.attr,
963 NULL
964};
965
966static struct attribute_group dql_group = {
967 .name = "byte_queue_limits",
968 .attrs = dql_attrs,
969};
970#endif /* CONFIG_BQL */
971
972#ifdef CONFIG_XPS
829static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) 973static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
830{ 974{
831 struct net_device *dev = queue->dev; 975 struct net_device *dev = queue->dev;
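The new byte_queue_limits group exposes the dql state that a BQL-aware driver feeds from its transmit and completion paths; the files end up under /sys/class/net/<dev>/queues/tx-<n>/byte_queue_limits/. A hedged sketch of that driver side (foo_* names are illustrative; netdev_tx_sent_queue() and netdev_tx_completed_queue() are the BQL accounting helpers from the same series):

	static netdev_tx_t foo_start_xmit(struct sk_buff *skb, struct net_device *dev)
	{
		struct netdev_queue *txq =
			netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));

		/* ... post skb to the hardware ring ... */
		netdev_tx_sent_queue(txq, skb->len);	/* bytes handed to hw */
		return NETDEV_TX_OK;
	}

	static void foo_tx_complete(struct net_device *dev, unsigned int queue,
				    unsigned int pkts, unsigned int bytes)
	{
		struct netdev_queue *txq = netdev_get_tx_queue(dev, queue);

		/* bytes actually finished by the hw; dql adjusts the limit
		 * shown in the limit/inflight files from these two calls */
		netdev_tx_completed_queue(txq, pkts, bytes);
	}
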
@@ -890,6 +1034,52 @@ static DEFINE_MUTEX(xps_map_mutex);
890#define xmap_dereference(P) \ 1034#define xmap_dereference(P) \
891 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) 1035 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
892 1036
1037static void xps_queue_release(struct netdev_queue *queue)
1038{
1039 struct net_device *dev = queue->dev;
1040 struct xps_dev_maps *dev_maps;
1041 struct xps_map *map;
1042 unsigned long index;
1043 int i, pos, nonempty = 0;
1044
1045 index = get_netdev_queue_index(queue);
1046
1047 mutex_lock(&xps_map_mutex);
1048 dev_maps = xmap_dereference(dev->xps_maps);
1049
1050 if (dev_maps) {
1051 for_each_possible_cpu(i) {
1052 map = xmap_dereference(dev_maps->cpu_map[i]);
1053 if (!map)
1054 continue;
1055
1056 for (pos = 0; pos < map->len; pos++)
1057 if (map->queues[pos] == index)
1058 break;
1059
1060 if (pos < map->len) {
1061 if (map->len > 1)
1062 map->queues[pos] =
1063 map->queues[--map->len];
1064 else {
1065 RCU_INIT_POINTER(dev_maps->cpu_map[i],
1066 NULL);
1067 kfree_rcu(map, rcu);
1068 map = NULL;
1069 }
1070 }
1071 if (map)
1072 nonempty = 1;
1073 }
1074
1075 if (!nonempty) {
1076 RCU_INIT_POINTER(dev->xps_maps, NULL);
1077 kfree_rcu(dev_maps, rcu);
1078 }
1079 }
1080 mutex_unlock(&xps_map_mutex);
1081}
1082
893static ssize_t store_xps_map(struct netdev_queue *queue, 1083static ssize_t store_xps_map(struct netdev_queue *queue,
894 struct netdev_queue_attribute *attribute, 1084 struct netdev_queue_attribute *attribute,
895 const char *buf, size_t len) 1085 const char *buf, size_t len)
@@ -901,7 +1091,7 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
901 struct xps_map *map, *new_map; 1091 struct xps_map *map, *new_map;
902 struct xps_dev_maps *dev_maps, *new_dev_maps; 1092 struct xps_dev_maps *dev_maps, *new_dev_maps;
903 int nonempty = 0; 1093 int nonempty = 0;
904 int numa_node = -2; 1094 int numa_node_id = -2;
905 1095
906 if (!capable(CAP_NET_ADMIN)) 1096 if (!capable(CAP_NET_ADMIN))
907 return -EPERM; 1097 return -EPERM;
@@ -944,10 +1134,10 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
944 need_set = cpumask_test_cpu(cpu, mask) && cpu_online(cpu); 1134 need_set = cpumask_test_cpu(cpu, mask) && cpu_online(cpu);
945#ifdef CONFIG_NUMA 1135#ifdef CONFIG_NUMA
946 if (need_set) { 1136 if (need_set) {
947 if (numa_node == -2) 1137 if (numa_node_id == -2)
948 numa_node = cpu_to_node(cpu); 1138 numa_node_id = cpu_to_node(cpu);
949 else if (numa_node != cpu_to_node(cpu)) 1139 else if (numa_node_id != cpu_to_node(cpu))
950 numa_node = -1; 1140 numa_node_id = -1;
951 } 1141 }
952#endif 1142#endif
953 if (need_set && pos >= map_len) { 1143 if (need_set && pos >= map_len) {
@@ -987,9 +1177,9 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
987 nonempty = 1; 1177 nonempty = 1;
988 } 1178 }
989 1179
990 if (nonempty) 1180 if (nonempty) {
991 RCU_INIT_POINTER(dev->xps_maps, new_dev_maps); 1181 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
992 else { 1182 } else {
993 kfree(new_dev_maps); 1183 kfree(new_dev_maps);
994 RCU_INIT_POINTER(dev->xps_maps, NULL); 1184 RCU_INIT_POINTER(dev->xps_maps, NULL);
995 } 1185 }
@@ -997,7 +1187,7 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
997 if (dev_maps) 1187 if (dev_maps)
998 kfree_rcu(dev_maps, rcu); 1188 kfree_rcu(dev_maps, rcu);
999 1189
1000 netdev_queue_numa_node_write(queue, (numa_node >= 0) ? numa_node : 1190 netdev_queue_numa_node_write(queue, (numa_node_id >= 0) ? numa_node_id :
1001 NUMA_NO_NODE); 1191 NUMA_NO_NODE);
1002 1192
1003 mutex_unlock(&xps_map_mutex); 1193 mutex_unlock(&xps_map_mutex);
@@ -1020,58 +1210,23 @@ error:
1020 1210
1021static struct netdev_queue_attribute xps_cpus_attribute = 1211static struct netdev_queue_attribute xps_cpus_attribute =
1022 __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map); 1212 __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map);
1213#endif /* CONFIG_XPS */
1023 1214
1024static struct attribute *netdev_queue_default_attrs[] = { 1215static struct attribute *netdev_queue_default_attrs[] = {
1216 &queue_trans_timeout.attr,
1217#ifdef CONFIG_XPS
1025 &xps_cpus_attribute.attr, 1218 &xps_cpus_attribute.attr,
1219#endif
1026 NULL 1220 NULL
1027}; 1221};
1028 1222
1029static void netdev_queue_release(struct kobject *kobj) 1223static void netdev_queue_release(struct kobject *kobj)
1030{ 1224{
1031 struct netdev_queue *queue = to_netdev_queue(kobj); 1225 struct netdev_queue *queue = to_netdev_queue(kobj);
1032 struct net_device *dev = queue->dev;
1033 struct xps_dev_maps *dev_maps;
1034 struct xps_map *map;
1035 unsigned long index;
1036 int i, pos, nonempty = 0;
1037
1038 index = get_netdev_queue_index(queue);
1039
1040 mutex_lock(&xps_map_mutex);
1041 dev_maps = xmap_dereference(dev->xps_maps);
1042
1043 if (dev_maps) {
1044 for_each_possible_cpu(i) {
1045 map = xmap_dereference(dev_maps->cpu_map[i]);
1046 if (!map)
1047 continue;
1048 1226
1049 for (pos = 0; pos < map->len; pos++) 1227#ifdef CONFIG_XPS
1050 if (map->queues[pos] == index) 1228 xps_queue_release(queue);
1051 break; 1229#endif
1052
1053 if (pos < map->len) {
1054 if (map->len > 1)
1055 map->queues[pos] =
1056 map->queues[--map->len];
1057 else {
1058 RCU_INIT_POINTER(dev_maps->cpu_map[i],
1059 NULL);
1060 kfree_rcu(map, rcu);
1061 map = NULL;
1062 }
1063 }
1064 if (map)
1065 nonempty = 1;
1066 }
1067
1068 if (!nonempty) {
1069 RCU_INIT_POINTER(dev->xps_maps, NULL);
1070 kfree_rcu(dev_maps, rcu);
1071 }
1072 }
1073
1074 mutex_unlock(&xps_map_mutex);
1075 1230
1076 memset(kobj, 0, sizeof(*kobj)); 1231 memset(kobj, 0, sizeof(*kobj));
1077 dev_put(queue->dev); 1232 dev_put(queue->dev);
@@ -1092,22 +1247,29 @@ static int netdev_queue_add_kobject(struct net_device *net, int index)
1092 kobj->kset = net->queues_kset; 1247 kobj->kset = net->queues_kset;
1093 error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, 1248 error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
1094 "tx-%u", index); 1249 "tx-%u", index);
1095 if (error) { 1250 if (error)
1096 kobject_put(kobj); 1251 goto exit;
1097 return error; 1252
1098 } 1253#ifdef CONFIG_BQL
1254 error = sysfs_create_group(kobj, &dql_group);
1255 if (error)
1256 goto exit;
1257#endif
1099 1258
1100 kobject_uevent(kobj, KOBJ_ADD); 1259 kobject_uevent(kobj, KOBJ_ADD);
1101 dev_hold(queue->dev); 1260 dev_hold(queue->dev);
1102 1261
1262 return 0;
1263exit:
1264 kobject_put(kobj);
1103 return error; 1265 return error;
1104} 1266}
1105#endif /* CONFIG_XPS */ 1267#endif /* CONFIG_SYSFS */
1106 1268
1107int 1269int
1108netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) 1270netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
1109{ 1271{
1110#ifdef CONFIG_XPS 1272#ifdef CONFIG_SYSFS
1111 int i; 1273 int i;
1112 int error = 0; 1274 int error = 0;
1113 1275
@@ -1119,20 +1281,26 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
1119 } 1281 }
1120 } 1282 }
1121 1283
1122 while (--i >= new_num) 1284 while (--i >= new_num) {
1123 kobject_put(&net->_tx[i].kobj); 1285 struct netdev_queue *queue = net->_tx + i;
1286
1287#ifdef CONFIG_BQL
1288 sysfs_remove_group(&queue->kobj, &dql_group);
1289#endif
1290 kobject_put(&queue->kobj);
1291 }
1124 1292
1125 return error; 1293 return error;
1126#else 1294#else
1127 return 0; 1295 return 0;
1128#endif 1296#endif /* CONFIG_SYSFS */
1129} 1297}
1130 1298
1131static int register_queue_kobjects(struct net_device *net) 1299static int register_queue_kobjects(struct net_device *net)
1132{ 1300{
1133 int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; 1301 int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;
1134 1302
1135#if defined(CONFIG_RPS) || defined(CONFIG_XPS) 1303#ifdef CONFIG_SYSFS
1136 net->queues_kset = kset_create_and_add("queues", 1304 net->queues_kset = kset_create_and_add("queues",
1137 NULL, &net->dev.kobj); 1305 NULL, &net->dev.kobj);
1138 if (!net->queues_kset) 1306 if (!net->queues_kset)
@@ -1173,7 +1341,7 @@ static void remove_queue_kobjects(struct net_device *net)
1173 1341
1174 net_rx_queue_update_kobjects(net, real_rx, 0); 1342 net_rx_queue_update_kobjects(net, real_rx, 0);
1175 netdev_queue_update_kobjects(net, real_tx, 0); 1343 netdev_queue_update_kobjects(net, real_tx, 0);
1176#if defined(CONFIG_RPS) || defined(CONFIG_XPS) 1344#ifdef CONFIG_SYSFS
1177 kset_unregister(net->queues_kset); 1345 kset_unregister(net->queues_kset);
1178#endif 1346#endif
1179} 1347}
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index aefcd7acbffa..0e950fda9a0a 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -30,6 +30,20 @@ EXPORT_SYMBOL(init_net);
30 30
31#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ 31#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
32 32
33static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
34
35static struct net_generic *net_alloc_generic(void)
36{
37 struct net_generic *ng;
38 size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
39
40 ng = kzalloc(generic_size, GFP_KERNEL);
41 if (ng)
42 ng->len = max_gen_ptrs;
43
44 return ng;
45}
46
33static int net_assign_generic(struct net *net, int id, void *data) 47static int net_assign_generic(struct net *net, int id, void *data)
34{ 48{
35 struct net_generic *ng, *old_ng; 49 struct net_generic *ng, *old_ng;
@@ -43,8 +57,7 @@ static int net_assign_generic(struct net *net, int id, void *data)
43 if (old_ng->len >= id) 57 if (old_ng->len >= id)
44 goto assign; 58 goto assign;
45 59
46 ng = kzalloc(sizeof(struct net_generic) + 60 ng = net_alloc_generic();
47 id * sizeof(void *), GFP_KERNEL);
48 if (ng == NULL) 61 if (ng == NULL)
49 return -ENOMEM; 62 return -ENOMEM;
50 63
@@ -59,7 +72,6 @@ static int net_assign_generic(struct net *net, int id, void *data)
59 * the old copy for kfree after a grace period. 72 * the old copy for kfree after a grace period.
60 */ 73 */
61 74
62 ng->len = id;
63 memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*)); 75 memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
64 76
65 rcu_assign_pointer(net->gen, ng); 77 rcu_assign_pointer(net->gen, ng);
@@ -161,18 +173,6 @@ out_undo:
161 goto out; 173 goto out;
162} 174}
163 175
164static struct net_generic *net_alloc_generic(void)
165{
166 struct net_generic *ng;
167 size_t generic_size = sizeof(struct net_generic) +
168 INITIAL_NET_GEN_PTRS * sizeof(void *);
169
170 ng = kzalloc(generic_size, GFP_KERNEL);
171 if (ng)
172 ng->len = INITIAL_NET_GEN_PTRS;
173
174 return ng;
175}
176 176
177#ifdef CONFIG_NET_NS 177#ifdef CONFIG_NET_NS
178static struct kmem_cache *net_cachep; 178static struct kmem_cache *net_cachep;
@@ -483,6 +483,7 @@ again:
483 } 483 }
484 return error; 484 return error;
485 } 485 }
486 max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id);
486 } 487 }
487 error = __register_pernet_operations(list, ops); 488 error = __register_pernet_operations(list, ops);
488 if (error) { 489 if (error) {
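
net_alloc_generic() is now shared by the boot-time and grow paths, and it sizes the allocation from max_gen_ptrs (raised in register_pernet_operations() above) using offsetof() into the flexible ptr[] array rather than a fixed INITIAL_NET_GEN_PTRS. A small userspace-style sketch of the same sizing idiom; struct tbl and make_tbl() are illustrative names, and the runtime array index in offsetof() is a GCC/clang extension, as in the kernel code:

/* Hedged sketch of sizing a flexible-array struct with offsetof(). */
#include <stdlib.h>
#include <stddef.h>

struct tbl {
	size_t len;
	void *ptr[];			/* flexible array member */
};

static struct tbl *make_tbl(size_t nents)
{
	/* offsetof(struct tbl, ptr[nents]) == header size + nents * sizeof(void *) */
	struct tbl *t = calloc(1, offsetof(struct tbl, ptr[nents]));

	if (t)
		t->len = nents;
	return t;
}
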
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index cf64c1ffa4cd..ddefc513b44a 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -76,7 +76,7 @@ static void queue_process(struct work_struct *work)
76 76
77 local_irq_save(flags); 77 local_irq_save(flags);
78 __netif_tx_lock(txq, smp_processor_id()); 78 __netif_tx_lock(txq, smp_processor_id());
79 if (netif_tx_queue_frozen_or_stopped(txq) || 79 if (netif_xmit_frozen_or_stopped(txq) ||
80 ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) { 80 ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) {
81 skb_queue_head(&npinfo->txq, skb); 81 skb_queue_head(&npinfo->txq, skb);
82 __netif_tx_unlock(txq); 82 __netif_tx_unlock(txq);
@@ -194,7 +194,7 @@ static void netpoll_poll_dev(struct net_device *dev)
194 194
195 poll_napi(dev); 195 poll_napi(dev);
196 196
197 if (dev->priv_flags & IFF_SLAVE) { 197 if (dev->flags & IFF_SLAVE) {
198 if (dev->npinfo) { 198 if (dev->npinfo) {
199 struct net_device *bond_dev = dev->master; 199 struct net_device *bond_dev = dev->master;
200 struct sk_buff *skb; 200 struct sk_buff *skb;
@@ -317,7 +317,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
317 for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; 317 for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
318 tries > 0; --tries) { 318 tries > 0; --tries) {
319 if (__netif_tx_trylock(txq)) { 319 if (__netif_tx_trylock(txq)) {
320 if (!netif_tx_queue_stopped(txq)) { 320 if (!netif_xmit_stopped(txq)) {
321 status = ops->ndo_start_xmit(skb, dev); 321 status = ops->ndo_start_xmit(skb, dev);
322 if (status == NETDEV_TX_OK) 322 if (status == NETDEV_TX_OK)
323 txq_trans_update(txq); 323 txq_trans_update(txq);
@@ -422,6 +422,7 @@ static void arp_reply(struct sk_buff *skb)
422 struct sk_buff *send_skb; 422 struct sk_buff *send_skb;
423 struct netpoll *np, *tmp; 423 struct netpoll *np, *tmp;
424 unsigned long flags; 424 unsigned long flags;
425 int hlen, tlen;
425 int hits = 0; 426 int hits = 0;
426 427
427 if (list_empty(&npinfo->rx_np)) 428 if (list_empty(&npinfo->rx_np))
@@ -479,8 +480,9 @@ static void arp_reply(struct sk_buff *skb)
479 if (tip != np->local_ip) 480 if (tip != np->local_ip)
480 continue; 481 continue;
481 482
482 send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev), 483 hlen = LL_RESERVED_SPACE(np->dev);
483 LL_RESERVED_SPACE(np->dev)); 484 tlen = np->dev->needed_tailroom;
485 send_skb = find_skb(np, size + hlen + tlen, hlen);
484 if (!send_skb) 486 if (!send_skb)
485 continue; 487 continue;
486 488
@@ -763,7 +765,7 @@ int __netpoll_setup(struct netpoll *np)
763 } 765 }
764 766
765 /* last thing to do is link it to the net device structure */ 767 /* last thing to do is link it to the net device structure */
766 RCU_INIT_POINTER(ndev->npinfo, npinfo); 768 rcu_assign_pointer(ndev->npinfo, npinfo);
767 769
768 return 0; 770 return 0;
769 771
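
The arp_reply() change sizes the reply skb from an explicit headroom/tailroom split — LL_RESERVED_SPACE() for the link-layer header and dev->needed_tailroom for any trailer — instead of the removed LL_ALLOCATED_SPACE() macro. A hedged sketch of the same allocation pattern for an arbitrary payload; alloc_tx_skb() is illustrative, not code from the patch:

/* Hedged sketch of the explicit headroom/tailroom sizing used above. */
#include <linux/skbuff.h>
#include <linux/netdevice.h>

static struct sk_buff *alloc_tx_skb(struct net_device *dev, unsigned int payload)
{
	int hlen = LL_RESERVED_SPACE(dev);	/* hard header length + alignment pad */
	int tlen = dev->needed_tailroom;	/* trailer space the device wants */
	struct sk_buff *skb;

	skb = alloc_skb(payload + hlen + tlen, GFP_ATOMIC);
	if (!skb)
		return NULL;

	skb_reserve(skb, hlen);	/* headroom for the link-layer header */
	skb_put(skb, payload);	/* payload; tailroom stays available at the end */
	return skb;
}
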
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
new file mode 100644
index 000000000000..4dacc44637ef
--- /dev/null
+++ b/net/core/netprio_cgroup.c
@@ -0,0 +1,339 @@
1/*
2 * net/core/netprio_cgroup.c Priority Control Group
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Neil Horman <nhorman@tuxdriver.com>
10 */
11
12#include <linux/module.h>
13#include <linux/slab.h>
14#include <linux/types.h>
15#include <linux/string.h>
16#include <linux/errno.h>
17#include <linux/skbuff.h>
18#include <linux/cgroup.h>
19#include <linux/rcupdate.h>
20#include <linux/atomic.h>
21#include <net/rtnetlink.h>
22#include <net/pkt_cls.h>
23#include <net/sock.h>
24#include <net/netprio_cgroup.h>
25
26static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
27 struct cgroup *cgrp);
28static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
29static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
30
31struct cgroup_subsys net_prio_subsys = {
32 .name = "net_prio",
33 .create = cgrp_create,
34 .destroy = cgrp_destroy,
35 .populate = cgrp_populate,
36#ifdef CONFIG_NETPRIO_CGROUP
37 .subsys_id = net_prio_subsys_id,
38#endif
39 .module = THIS_MODULE
40};
41
42#define PRIOIDX_SZ 128
43
44static unsigned long prioidx_map[PRIOIDX_SZ];
45static DEFINE_SPINLOCK(prioidx_map_lock);
46static atomic_t max_prioidx = ATOMIC_INIT(0);
47
48static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp)
49{
50 return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id),
51 struct cgroup_netprio_state, css);
52}
53
54static int get_prioidx(u32 *prio)
55{
56 unsigned long flags;
57 u32 prioidx;
58
59 spin_lock_irqsave(&prioidx_map_lock, flags);
60 prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ);
61 if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) {
62 spin_unlock_irqrestore(&prioidx_map_lock, flags);
63 return -ENOSPC;
64 }
65 set_bit(prioidx, prioidx_map);
66 spin_unlock_irqrestore(&prioidx_map_lock, flags);
67 atomic_set(&max_prioidx, prioidx);
68 *prio = prioidx;
69 return 0;
70}
71
72static void put_prioidx(u32 idx)
73{
74 unsigned long flags;
75
76 spin_lock_irqsave(&prioidx_map_lock, flags);
77 clear_bit(idx, prioidx_map);
78 spin_unlock_irqrestore(&prioidx_map_lock, flags);
79}
80
81static void extend_netdev_table(struct net_device *dev, u32 new_len)
82{
83 size_t new_size = sizeof(struct netprio_map) +
84 ((sizeof(u32) * new_len));
85 struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL);
86 struct netprio_map *old_priomap;
87 int i;
88
89 old_priomap = rtnl_dereference(dev->priomap);
90
91 if (!new_priomap) {
92 printk(KERN_WARNING "Unable to alloc new priomap!\n");
93 return;
94 }
95
96 for (i = 0;
97 old_priomap && (i < old_priomap->priomap_len);
98 i++)
99 new_priomap->priomap[i] = old_priomap->priomap[i];
100
101 new_priomap->priomap_len = new_len;
102
103 rcu_assign_pointer(dev->priomap, new_priomap);
104 if (old_priomap)
105 kfree_rcu(old_priomap, rcu);
106}
107
108static void update_netdev_tables(void)
109{
110 struct net_device *dev;
111 u32 max_len = atomic_read(&max_prioidx) + 1;
112 struct netprio_map *map;
113
114 rtnl_lock();
115 for_each_netdev(&init_net, dev) {
116 map = rtnl_dereference(dev->priomap);
117 if ((!map) ||
118 (map->priomap_len < max_len))
119 extend_netdev_table(dev, max_len);
120 }
121 rtnl_unlock();
122}
123
124static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
125 struct cgroup *cgrp)
126{
127 struct cgroup_netprio_state *cs;
128 int ret;
129
130 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
131 if (!cs)
132 return ERR_PTR(-ENOMEM);
133
134 if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx) {
135 kfree(cs);
136 return ERR_PTR(-EINVAL);
137 }
138
139 ret = get_prioidx(&cs->prioidx);
140 if (ret != 0) {
141 printk(KERN_WARNING "No space in priority index array\n");
142 kfree(cs);
143 return ERR_PTR(ret);
144 }
145
146 return &cs->css;
147}
148
149static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
150{
151 struct cgroup_netprio_state *cs;
152 struct net_device *dev;
153 struct netprio_map *map;
154
155 cs = cgrp_netprio_state(cgrp);
156 rtnl_lock();
157 for_each_netdev(&init_net, dev) {
158 map = rtnl_dereference(dev->priomap);
159 if (map)
160 map->priomap[cs->prioidx] = 0;
161 }
162 rtnl_unlock();
163 put_prioidx(cs->prioidx);
164 kfree(cs);
165}
166
167static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft)
168{
169 return (u64)cgrp_netprio_state(cgrp)->prioidx;
170}
171
172static int read_priomap(struct cgroup *cont, struct cftype *cft,
173 struct cgroup_map_cb *cb)
174{
175 struct net_device *dev;
176 u32 prioidx = cgrp_netprio_state(cont)->prioidx;
177 u32 priority;
178 struct netprio_map *map;
179
180 rcu_read_lock();
181 for_each_netdev_rcu(&init_net, dev) {
182 map = rcu_dereference(dev->priomap);
183 priority = map ? map->priomap[prioidx] : 0;
184 cb->fill(cb, dev->name, priority);
185 }
186 rcu_read_unlock();
187 return 0;
188}
189
190static int write_priomap(struct cgroup *cgrp, struct cftype *cft,
191 const char *buffer)
192{
193 char *devname = kstrdup(buffer, GFP_KERNEL);
194 int ret = -EINVAL;
195 u32 prioidx = cgrp_netprio_state(cgrp)->prioidx;
196 unsigned long priority;
197 char *priostr;
198 struct net_device *dev;
199 struct netprio_map *map;
200
201 if (!devname)
202 return -ENOMEM;
203
204 /*
205 * Minimally sized valid priomap string
206 */
207 if (strlen(devname) < 3)
208 goto out_free_devname;
209
210 priostr = strstr(devname, " ");
211 if (!priostr)
212 goto out_free_devname;
213
214 /*
215 *Separate the devname from the associated priority
216 *and advance the priostr pointer to the priority value
217 */
218 *priostr = '\0';
219 priostr++;
220
221 /*
222 * If the priostr points to NULL, we're at the end of the passed
223 * in string, and it's not a valid write
224 */
225 if (*priostr == '\0')
226 goto out_free_devname;
227
228 ret = kstrtoul(priostr, 10, &priority);
229 if (ret < 0)
230 goto out_free_devname;
231
232 ret = -ENODEV;
233
234 dev = dev_get_by_name(&init_net, devname);
235 if (!dev)
236 goto out_free_devname;
237
238 update_netdev_tables();
239 ret = 0;
240 rcu_read_lock();
241 map = rcu_dereference(dev->priomap);
242 if (map)
243 map->priomap[prioidx] = priority;
244 rcu_read_unlock();
245 dev_put(dev);
246
247out_free_devname:
248 kfree(devname);
249 return ret;
250}
251
252static struct cftype ss_files[] = {
253 {
254 .name = "prioidx",
255 .read_u64 = read_prioidx,
256 },
257 {
258 .name = "ifpriomap",
259 .read_map = read_priomap,
260 .write_string = write_priomap,
261 },
262};
263
264static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
265{
266 return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
267}
268
269static int netprio_device_event(struct notifier_block *unused,
270 unsigned long event, void *ptr)
271{
272 struct net_device *dev = ptr;
273 struct netprio_map *old;
274
275 /*
276 * Note this is called with rtnl_lock held so we have update side
277 * protection on our rcu assignments
278 */
279
280 switch (event) {
281 case NETDEV_UNREGISTER:
282 old = rtnl_dereference(dev->priomap);
283 RCU_INIT_POINTER(dev->priomap, NULL);
284 if (old)
285 kfree_rcu(old, rcu);
286 break;
287 }
288 return NOTIFY_DONE;
289}
290
291static struct notifier_block netprio_device_notifier = {
292 .notifier_call = netprio_device_event
293};
294
295static int __init init_cgroup_netprio(void)
296{
297 int ret;
298
299 ret = cgroup_load_subsys(&net_prio_subsys);
300 if (ret)
301 goto out;
302#ifndef CONFIG_NETPRIO_CGROUP
303 smp_wmb();
304 net_prio_subsys_id = net_prio_subsys.subsys_id;
305#endif
306
307 register_netdevice_notifier(&netprio_device_notifier);
308
309out:
310 return ret;
311}
312
313static void __exit exit_cgroup_netprio(void)
314{
315 struct netprio_map *old;
316 struct net_device *dev;
317
318 unregister_netdevice_notifier(&netprio_device_notifier);
319
320 cgroup_unload_subsys(&net_prio_subsys);
321
322#ifndef CONFIG_NETPRIO_CGROUP
323 net_prio_subsys_id = -1;
324 synchronize_rcu();
325#endif
326
327 rtnl_lock();
328 for_each_netdev(&init_net, dev) {
329 old = rtnl_dereference(dev->priomap);
330 RCU_INIT_POINTER(dev->priomap, NULL);
331 if (old)
332 kfree_rcu(old, rcu);
333 }
334 rtnl_unlock();
335}
336
337module_init(init_cgroup_netprio);
338module_exit(exit_cgroup_netprio);
339MODULE_LICENSE("GPL v2");
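
The new net_prio controller keeps one priomap per device, indexed by a cgroup's prioidx; write_priomap() parses lines of the form "<ifname> <prio>" written to net_prio.ifpriomap, and the transmit path is expected to look up the socket's sk_cgrp_prioidx (set in the sock.c hunks below) in dev->priomap to choose skb->priority. A hedged sketch of such a lookup — purely illustrative, not code from this patch, and it assumes CONFIG_NETPRIO_CGROUP so dev->priomap exists:

/* Hedged sketch of mapping a socket's prioidx to a priority via dev->priomap. */
#include <linux/netdevice.h>
#include <net/netprio_cgroup.h>

static u32 example_netprio_lookup(struct net_device *dev, u32 prioidx)
{
	struct netprio_map *map;
	u32 prio = 0;

	rcu_read_lock();
	map = rcu_dereference(dev->priomap);
	if (map && prioidx < map->priomap_len)
		prio = map->priomap[prioidx];
	rcu_read_unlock();

	return prio;
}
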
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 0001c243b35c..4d8ce93cd503 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -767,8 +767,8 @@ done:
767 return i; 767 return i;
768} 768}
769 769
770static unsigned long num_arg(const char __user * user_buffer, 770static long num_arg(const char __user *user_buffer, unsigned long maxlen,
771 unsigned long maxlen, unsigned long *num) 771 unsigned long *num)
772{ 772{
773 int i; 773 int i;
774 *num = 0; 774 *num = 0;
@@ -1304,7 +1304,7 @@ static ssize_t pktgen_if_write(struct file *file,
1304 scan_ip6(buf, pkt_dev->in6_daddr.s6_addr); 1304 scan_ip6(buf, pkt_dev->in6_daddr.s6_addr);
1305 snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr); 1305 snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr);
1306 1306
1307 ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr); 1307 pkt_dev->cur_in6_daddr = pkt_dev->in6_daddr;
1308 1308
1309 if (debug) 1309 if (debug)
1310 printk(KERN_DEBUG "pktgen: dst6 set to: %s\n", buf); 1310 printk(KERN_DEBUG "pktgen: dst6 set to: %s\n", buf);
@@ -1327,8 +1327,7 @@ static ssize_t pktgen_if_write(struct file *file,
1327 scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr); 1327 scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr);
1328 snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr); 1328 snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr);
1329 1329
1330 ipv6_addr_copy(&pkt_dev->cur_in6_daddr, 1330 pkt_dev->cur_in6_daddr = pkt_dev->min_in6_daddr;
1331 &pkt_dev->min_in6_daddr);
1332 if (debug) 1331 if (debug)
1333 printk(KERN_DEBUG "pktgen: dst6_min set to: %s\n", buf); 1332 printk(KERN_DEBUG "pktgen: dst6_min set to: %s\n", buf);
1334 1333
@@ -1371,7 +1370,7 @@ static ssize_t pktgen_if_write(struct file *file,
1371 scan_ip6(buf, pkt_dev->in6_saddr.s6_addr); 1370 scan_ip6(buf, pkt_dev->in6_saddr.s6_addr);
1372 snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr); 1371 snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr);
1373 1372
1374 ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr); 1373 pkt_dev->cur_in6_saddr = pkt_dev->in6_saddr;
1375 1374
1376 if (debug) 1375 if (debug)
1377 printk(KERN_DEBUG "pktgen: src6 set to: %s\n", buf); 1376 printk(KERN_DEBUG "pktgen: src6 set to: %s\n", buf);
@@ -2025,13 +2024,13 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
2025 pr_warning("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", 2024 pr_warning("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
2026 pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq, 2025 pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq,
2027 pkt_dev->odevname); 2026 pkt_dev->odevname);
2028 pkt_dev->queue_map_min = ntxq - 1; 2027 pkt_dev->queue_map_min = (ntxq ?: 1) - 1;
2029 } 2028 }
2030 if (pkt_dev->queue_map_max >= ntxq) { 2029 if (pkt_dev->queue_map_max >= ntxq) {
2031 pr_warning("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", 2030 pr_warning("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
2032 pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq, 2031 pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq,
2033 pkt_dev->odevname); 2032 pkt_dev->odevname);
2034 pkt_dev->queue_map_max = ntxq - 1; 2033 pkt_dev->queue_map_max = (ntxq ?: 1) - 1;
2035 } 2034 }
2036 2035
2037 /* Default to the interface's mac if not explicitly set. */ 2036 /* Default to the interface's mac if not explicitly set. */
@@ -2079,9 +2078,7 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
2079 ifp = ifp->if_next) { 2078 ifp = ifp->if_next) {
2080 if (ifp->scope == IFA_LINK && 2079 if (ifp->scope == IFA_LINK &&
2081 !(ifp->flags & IFA_F_TENTATIVE)) { 2080 !(ifp->flags & IFA_F_TENTATIVE)) {
2082 ipv6_addr_copy(&pkt_dev-> 2081 pkt_dev->cur_in6_saddr = ifp->addr;
2083 cur_in6_saddr,
2084 &ifp->addr);
2085 err = 0; 2082 err = 0;
2086 break; 2083 break;
2087 } 2084 }
@@ -2958,8 +2955,8 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
2958 iph->payload_len = htons(sizeof(struct udphdr) + datalen); 2955 iph->payload_len = htons(sizeof(struct udphdr) + datalen);
2959 iph->nexthdr = IPPROTO_UDP; 2956 iph->nexthdr = IPPROTO_UDP;
2960 2957
2961 ipv6_addr_copy(&iph->daddr, &pkt_dev->cur_in6_daddr); 2958 iph->daddr = pkt_dev->cur_in6_daddr;
2962 ipv6_addr_copy(&iph->saddr, &pkt_dev->cur_in6_saddr); 2959 iph->saddr = pkt_dev->cur_in6_saddr;
2963 2960
2964 skb->mac_header = (skb->network_header - ETH_HLEN - 2961 skb->mac_header = (skb->network_header - ETH_HLEN -
2965 pkt_dev->pkt_overhead); 2962 pkt_dev->pkt_overhead);
@@ -3345,7 +3342,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
3345 3342
3346 __netif_tx_lock_bh(txq); 3343 __netif_tx_lock_bh(txq);
3347 3344
3348 if (unlikely(netif_tx_queue_frozen_or_stopped(txq))) { 3345 if (unlikely(netif_xmit_frozen_or_stopped(txq))) {
3349 ret = NETDEV_TX_BUSY; 3346 ret = NETDEV_TX_BUSY;
3350 pkt_dev->last_ok = 0; 3347 pkt_dev->last_ok = 0;
3351 goto unlock; 3348 goto unlock;
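
The remaining pktgen hunks simply replace ipv6_addr_copy() with plain struct assignment, which is equivalent because struct in6_addr is an ordinary aggregate. A trivial sketch of the idiom; copy_addr() is illustrative:

/* Hedged sketch: struct in6_addr is copied by plain assignment, which is
 * all ipv6_addr_copy() did. */
#include <linux/in6.h>

static void copy_addr(struct in6_addr *dst, const struct in6_addr *src)
{
	*dst = *src;		/* replaces ipv6_addr_copy(dst, src) */
}
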
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 182236b2510a..9b570a6a33c5 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -26,10 +26,11 @@
26 * but then some measure against one socket starving all other sockets 26 * but then some measure against one socket starving all other sockets
27 * would be needed. 27 * would be needed.
28 * 28 *
29 * It was 128 by default. Experiments with real servers show, that 29 * The minimum value of it is 128. Experiments with real servers show that
30 * it is absolutely not enough even at 100conn/sec. 256 cures most 30 * it is absolutely not enough even at 100conn/sec. 256 cures most
31 * of problems. This value is adjusted to 128 for very small machines 31 * of problems.
32 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb). 32 * This value is adjusted to 128 for low memory machines,
33 * and it will increase in proportion to the memory of machine.
33 * Note : Don't forget somaxconn that may limit backlog too. 34
34 */ 35 */
35int sysctl_max_syn_backlog = 256; 36int sysctl_max_syn_backlog = 256;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9083e82bdae5..606a6e8f3671 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -60,7 +60,6 @@ struct rtnl_link {
60}; 60};
61 61
62static DEFINE_MUTEX(rtnl_mutex); 62static DEFINE_MUTEX(rtnl_mutex);
63static u16 min_ifinfo_dump_size;
64 63
65void rtnl_lock(void) 64void rtnl_lock(void)
66{ 65{
@@ -273,6 +272,17 @@ EXPORT_SYMBOL_GPL(rtnl_unregister_all);
273 272
274static LIST_HEAD(link_ops); 273static LIST_HEAD(link_ops);
275 274
275static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
276{
277 const struct rtnl_link_ops *ops;
278
279 list_for_each_entry(ops, &link_ops, list) {
280 if (!strcmp(ops->kind, kind))
281 return ops;
282 }
283 return NULL;
284}
285
276/** 286/**
277 * __rtnl_link_register - Register rtnl_link_ops with rtnetlink. 287 * __rtnl_link_register - Register rtnl_link_ops with rtnetlink.
278 * @ops: struct rtnl_link_ops * to register 288 * @ops: struct rtnl_link_ops * to register
@@ -285,6 +295,9 @@ static LIST_HEAD(link_ops);
285 */ 295 */
286int __rtnl_link_register(struct rtnl_link_ops *ops) 296int __rtnl_link_register(struct rtnl_link_ops *ops)
287{ 297{
298 if (rtnl_link_ops_get(ops->kind))
299 return -EEXIST;
300
288 if (!ops->dellink) 301 if (!ops->dellink)
289 ops->dellink = unregister_netdevice_queue; 302 ops->dellink = unregister_netdevice_queue;
290 303
@@ -351,17 +364,6 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops)
351} 364}
352EXPORT_SYMBOL_GPL(rtnl_link_unregister); 365EXPORT_SYMBOL_GPL(rtnl_link_unregister);
353 366
354static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
355{
356 const struct rtnl_link_ops *ops;
357
358 list_for_each_entry(ops, &link_ops, list) {
359 if (!strcmp(ops->kind, kind))
360 return ops;
361 }
362 return NULL;
363}
364
365static size_t rtnl_link_get_size(const struct net_device *dev) 367static size_t rtnl_link_get_size(const struct net_device *dev)
366{ 368{
367 const struct rtnl_link_ops *ops = dev->rtnl_link_ops; 369 const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
@@ -721,10 +723,11 @@ static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
721} 723}
722 724
723/* All VF info */ 725/* All VF info */
724static inline int rtnl_vfinfo_size(const struct net_device *dev) 726static inline int rtnl_vfinfo_size(const struct net_device *dev,
727 u32 ext_filter_mask)
725{ 728{
726 if (dev->dev.parent && dev_is_pci(dev->dev.parent)) { 729 if (dev->dev.parent && dev_is_pci(dev->dev.parent) &&
727 730 (ext_filter_mask & RTEXT_FILTER_VF)) {
728 int num_vfs = dev_num_vf(dev->dev.parent); 731 int num_vfs = dev_num_vf(dev->dev.parent);
729 size_t size = nla_total_size(sizeof(struct nlattr)); 732 size_t size = nla_total_size(sizeof(struct nlattr));
730 size += nla_total_size(num_vfs * sizeof(struct nlattr)); 733 size += nla_total_size(num_vfs * sizeof(struct nlattr));
@@ -763,7 +766,8 @@ static size_t rtnl_port_size(const struct net_device *dev)
763 return port_self_size; 766 return port_self_size;
764} 767}
765 768
766static noinline size_t if_nlmsg_size(const struct net_device *dev) 769static noinline size_t if_nlmsg_size(const struct net_device *dev,
770 u32 ext_filter_mask)
767{ 771{
768 return NLMSG_ALIGN(sizeof(struct ifinfomsg)) 772 return NLMSG_ALIGN(sizeof(struct ifinfomsg))
769 + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ 773 + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
@@ -781,8 +785,9 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev)
781 + nla_total_size(4) /* IFLA_MASTER */ 785 + nla_total_size(4) /* IFLA_MASTER */
782 + nla_total_size(1) /* IFLA_OPERSTATE */ 786 + nla_total_size(1) /* IFLA_OPERSTATE */
783 + nla_total_size(1) /* IFLA_LINKMODE */ 787 + nla_total_size(1) /* IFLA_LINKMODE */
784 + nla_total_size(4) /* IFLA_NUM_VF */ 788 + nla_total_size(ext_filter_mask
785 + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */ 789 & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */
790 + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
786 + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ 791 + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
787 + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ 792 + rtnl_link_get_size(dev) /* IFLA_LINKINFO */
788 + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */ 793 + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */
@@ -865,7 +870,7 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev)
865 870
866static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, 871static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
867 int type, u32 pid, u32 seq, u32 change, 872 int type, u32 pid, u32 seq, u32 change,
868 unsigned int flags) 873 unsigned int flags, u32 ext_filter_mask)
869{ 874{
870 struct ifinfomsg *ifm; 875 struct ifinfomsg *ifm;
871 struct nlmsghdr *nlh; 876 struct nlmsghdr *nlh;
@@ -938,10 +943,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
938 goto nla_put_failure; 943 goto nla_put_failure;
939 copy_rtnl_link_stats64(nla_data(attr), stats); 944 copy_rtnl_link_stats64(nla_data(attr), stats);
940 945
941 if (dev->dev.parent) 946 if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF))
942 NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)); 947 NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent));
943 948
944 if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) { 949 if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent
950 && (ext_filter_mask & RTEXT_FILTER_VF)) {
945 int i; 951 int i;
946 952
947 struct nlattr *vfinfo, *vf; 953 struct nlattr *vfinfo, *vf;
@@ -1045,6 +1051,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
1045 struct net_device *dev; 1051 struct net_device *dev;
1046 struct hlist_head *head; 1052 struct hlist_head *head;
1047 struct hlist_node *node; 1053 struct hlist_node *node;
1054 struct nlattr *tb[IFLA_MAX+1];
1055 u32 ext_filter_mask = 0;
1048 1056
1049 s_h = cb->args[0]; 1057 s_h = cb->args[0];
1050 s_idx = cb->args[1]; 1058 s_idx = cb->args[1];
@@ -1052,6 +1060,12 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
1052 rcu_read_lock(); 1060 rcu_read_lock();
1053 cb->seq = net->dev_base_seq; 1061 cb->seq = net->dev_base_seq;
1054 1062
1063 nlmsg_parse(cb->nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX,
1064 ifla_policy);
1065
1066 if (tb[IFLA_EXT_MASK])
1067 ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
1068
1055 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { 1069 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
1056 idx = 0; 1070 idx = 0;
1057 head = &net->dev_index_head[h]; 1071 head = &net->dev_index_head[h];
@@ -1061,7 +1075,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
1061 if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, 1075 if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
1062 NETLINK_CB(cb->skb).pid, 1076 NETLINK_CB(cb->skb).pid,
1063 cb->nlh->nlmsg_seq, 0, 1077 cb->nlh->nlmsg_seq, 0,
1064 NLM_F_MULTI) <= 0) 1078 NLM_F_MULTI,
1079 ext_filter_mask) <= 0)
1065 goto out; 1080 goto out;
1066 1081
1067 nl_dump_check_consistent(cb, nlmsg_hdr(skb)); 1082 nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -1097,6 +1112,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
1097 [IFLA_VF_PORTS] = { .type = NLA_NESTED }, 1112 [IFLA_VF_PORTS] = { .type = NLA_NESTED },
1098 [IFLA_PORT_SELF] = { .type = NLA_NESTED }, 1113 [IFLA_PORT_SELF] = { .type = NLA_NESTED },
1099 [IFLA_AF_SPEC] = { .type = NLA_NESTED }, 1114 [IFLA_AF_SPEC] = { .type = NLA_NESTED },
1115 [IFLA_EXT_MASK] = { .type = NLA_U32 },
1100}; 1116};
1101EXPORT_SYMBOL(ifla_policy); 1117EXPORT_SYMBOL(ifla_policy);
1102 1118
@@ -1506,6 +1522,7 @@ errout:
1506 1522
1507 if (send_addr_notify) 1523 if (send_addr_notify)
1508 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 1524 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
1525
1509 return err; 1526 return err;
1510} 1527}
1511 1528
@@ -1836,6 +1853,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1836 struct net_device *dev = NULL; 1853 struct net_device *dev = NULL;
1837 struct sk_buff *nskb; 1854 struct sk_buff *nskb;
1838 int err; 1855 int err;
1856 u32 ext_filter_mask = 0;
1839 1857
1840 err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); 1858 err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
1841 if (err < 0) 1859 if (err < 0)
@@ -1844,6 +1862,9 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1844 if (tb[IFLA_IFNAME]) 1862 if (tb[IFLA_IFNAME])
1845 nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); 1863 nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
1846 1864
1865 if (tb[IFLA_EXT_MASK])
1866 ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
1867
1847 ifm = nlmsg_data(nlh); 1868 ifm = nlmsg_data(nlh);
1848 if (ifm->ifi_index > 0) 1869 if (ifm->ifi_index > 0)
1849 dev = __dev_get_by_index(net, ifm->ifi_index); 1870 dev = __dev_get_by_index(net, ifm->ifi_index);
@@ -1855,12 +1876,12 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1855 if (dev == NULL) 1876 if (dev == NULL)
1856 return -ENODEV; 1877 return -ENODEV;
1857 1878
1858 nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL); 1879 nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL);
1859 if (nskb == NULL) 1880 if (nskb == NULL)
1860 return -ENOBUFS; 1881 return -ENOBUFS;
1861 1882
1862 err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid, 1883 err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid,
1863 nlh->nlmsg_seq, 0, 0); 1884 nlh->nlmsg_seq, 0, 0, ext_filter_mask);
1864 if (err < 0) { 1885 if (err < 0) {
1865 /* -EMSGSIZE implies BUG in if_nlmsg_size */ 1886 /* -EMSGSIZE implies BUG in if_nlmsg_size */
1866 WARN_ON(err == -EMSGSIZE); 1887 WARN_ON(err == -EMSGSIZE);
@@ -1871,8 +1892,31 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1871 return err; 1892 return err;
1872} 1893}
1873 1894
1874static u16 rtnl_calcit(struct sk_buff *skb) 1895static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
1875{ 1896{
1897 struct net *net = sock_net(skb->sk);
1898 struct net_device *dev;
1899 struct nlattr *tb[IFLA_MAX+1];
1900 u32 ext_filter_mask = 0;
1901 u16 min_ifinfo_dump_size = 0;
1902
1903 nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX, ifla_policy);
1904
1905 if (tb[IFLA_EXT_MASK])
1906 ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
1907
1908 if (!ext_filter_mask)
1909 return NLMSG_GOODSIZE;
1910 /*
1911 * traverse the list of net devices and compute the minimum
1912 * buffer size based upon the filter mask.
1913 */
1914 list_for_each_entry(dev, &net->dev_base_head, dev_list) {
1915 min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size,
1916 if_nlmsg_size(dev,
1917 ext_filter_mask));
1918 }
1919
1876 return min_ifinfo_dump_size; 1920 return min_ifinfo_dump_size;
1877} 1921}
1878 1922
@@ -1907,13 +1951,11 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
1907 int err = -ENOBUFS; 1951 int err = -ENOBUFS;
1908 size_t if_info_size; 1952 size_t if_info_size;
1909 1953
1910 skb = nlmsg_new((if_info_size = if_nlmsg_size(dev)), GFP_KERNEL); 1954 skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), GFP_KERNEL);
1911 if (skb == NULL) 1955 if (skb == NULL)
1912 goto errout; 1956 goto errout;
1913 1957
1914 min_ifinfo_dump_size = max_t(u16, if_info_size, min_ifinfo_dump_size); 1958 err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0);
1915
1916 err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0);
1917 if (err < 0) { 1959 if (err < 0) {
1917 if (err < 0) { 1959 if (err < 0) {
1918 /* -EMSGSIZE implies BUG in if_nlmsg_size() */ 1960 /* -EMSGSIZE implies BUG in if_nlmsg_size() */
1919 WARN_ON(err == -EMSGSIZE); 1961 WARN_ON(err == -EMSGSIZE);
@@ -1957,7 +1999,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
1957 sz_idx = type>>2; 1999 sz_idx = type>>2;
1958 kind = type&3; 2000 kind = type&3;
1959 2001
1960 if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN)) 2002 if (kind != 2 && !capable(CAP_NET_ADMIN))
1961 return -EPERM; 2003 return -EPERM;
1962 2004
1963 if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { 2005 if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
@@ -1971,7 +2013,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
1971 return -EOPNOTSUPP; 2013 return -EOPNOTSUPP;
1972 calcit = rtnl_get_calcit(family, type); 2014 calcit = rtnl_get_calcit(family, type);
1973 if (calcit) 2015 if (calcit)
1974 min_dump_alloc = calcit(skb); 2016 min_dump_alloc = calcit(skb, nlh);
1975 2017
1976 __rtnl_unlock(); 2018 __rtnl_unlock();
1977 rtnl = net->rtnl; 2019 rtnl = net->rtnl;
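
The rtnetlink changes let a dump request opt in to expensive attributes: userspace passes IFLA_EXT_MASK with RTEXT_FILTER_VF, rtnl_calcit() then sizes the per-message buffer from if_nlmsg_size(), and rtnl_fill_ifinfo() only emits IFLA_NUM_VF/IFLA_VFINFO_LIST when the bit is set. A hedged userspace sketch of such a dump request, assuming headers that already define IFLA_EXT_MASK and RTEXT_FILTER_VF; the rtgenmsg header matches the sizeof(struct rtgenmsg) offset the kernel parses at above, and error handling is trimmed:

/* Hedged sketch: RTM_GETLINK dump request carrying IFLA_EXT_MASK. */
#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_link.h>

static int send_link_dump(int nl_fd)
{
	struct {
		struct nlmsghdr nlh;
		char body[48];		/* rtgenmsg + padding + one u32 attribute */
	} req;
	struct rtgenmsg *gen;
	struct rtattr *rta;
	__u32 ext_filter_mask = RTEXT_FILTER_VF;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(*gen));
	req.nlh.nlmsg_type = RTM_GETLINK;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	gen = NLMSG_DATA(&req.nlh);
	gen->rtgen_family = AF_UNSPEC;

	/* append IFLA_EXT_MASK after the (aligned) family header */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = IFLA_EXT_MASK;
	rta->rta_len = RTA_LENGTH(sizeof(ext_filter_mask));
	memcpy(RTA_DATA(rta), &ext_filter_mask, sizeof(ext_filter_mask));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) +
			    RTA_SPACE(sizeof(ext_filter_mask));

	return send(nl_fd, &req, req.nlh.nlmsg_len, 0) < 0 ? -1 : 0;
}
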
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 025233de25f9..99b2596531bb 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -19,6 +19,7 @@ static int __init net_secret_init(void)
19} 19}
20late_initcall(net_secret_init); 20late_initcall(net_secret_init);
21 21
22#ifdef CONFIG_INET
22static u32 seq_scale(u32 seq) 23static u32 seq_scale(u32 seq)
23{ 24{
24 /* 25 /*
@@ -33,8 +34,9 @@ static u32 seq_scale(u32 seq)
33 */ 34 */
34 return seq + (ktime_to_ns(ktime_get_real()) >> 6); 35 return seq + (ktime_to_ns(ktime_get_real()) >> 6);
35} 36}
37#endif
36 38
37#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 39#if IS_ENABLED(CONFIG_IPV6)
38__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, 40__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
39 __be16 sport, __be16 dport) 41 __be16 sport, __be16 dport)
40{ 42{
@@ -44,7 +46,7 @@ __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
44 46
45 memcpy(hash, saddr, 16); 47 memcpy(hash, saddr, 16);
46 for (i = 0; i < 4; i++) 48 for (i = 0; i < 4; i++)
47 secret[i] = net_secret[i] + daddr[i]; 49 secret[i] = net_secret[i] + (__force u32)daddr[i];
48 secret[4] = net_secret[4] + 50 secret[4] = net_secret[4] +
49 (((__force u16)sport << 16) + (__force u16)dport); 51 (((__force u16)sport << 16) + (__force u16)dport);
50 for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) 52 for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
@@ -132,7 +134,7 @@ u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
132EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); 134EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
133#endif 135#endif
134 136
135#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) 137#if IS_ENABLED(CONFIG_IP_DCCP)
136u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, 138u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
137 __be16 sport, __be16 dport) 139 __be16 sport, __be16 dport)
138{ 140{
@@ -154,7 +156,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
154} 156}
155EXPORT_SYMBOL(secure_dccp_sequence_number); 157EXPORT_SYMBOL(secure_dccp_sequence_number);
156 158
157#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 159#if IS_ENABLED(CONFIG_IPV6)
158u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, 160u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
159 __be16 sport, __be16 dport) 161 __be16 sport, __be16 dport)
160{ 162{
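
The secure_seq.c conversions use IS_ENABLED(), which evaluates true when the option is built-in or modular, replacing the paired defined(CONFIG_FOO) || defined(CONFIG_FOO_MODULE) tests. A minimal sketch of the idiom; CONFIG_EXAMPLE is illustrative:

/* Hedged sketch of the IS_ENABLED() idiom. */
#include <linux/kconfig.h>

#if IS_ENABLED(CONFIG_EXAMPLE)		/* true for CONFIG_EXAMPLE=y or =m */
void example_feature(void);
#else
static inline void example_feature(void) { }
#endif
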
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 18a3cebb753d..da0c97f2fab4 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -245,6 +245,55 @@ nodata:
245EXPORT_SYMBOL(__alloc_skb); 245EXPORT_SYMBOL(__alloc_skb);
246 246
247/** 247/**
248 * build_skb - build a network buffer
249 * @data: data buffer provided by caller
250 *
251 * Allocate a new &sk_buff. Caller provides space holding head and
252 * skb_shared_info. @data must have been allocated by kmalloc()
253 * The return is the new skb buffer.
254 * On a failure the return is %NULL, and @data is not freed.
255 * Notes :
256 * Before IO, driver allocates only data buffer where NIC put incoming frame
257 * Driver should add room at head (NET_SKB_PAD) and
258 * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
259 * After IO, driver calls build_skb(), to allocate sk_buff and populate it
260 * before giving packet to stack.
261 * RX rings only contains data buffers, not full skbs.
262 */
263struct sk_buff *build_skb(void *data)
264{
265 struct skb_shared_info *shinfo;
266 struct sk_buff *skb;
267 unsigned int size;
268
269 skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
270 if (!skb)
271 return NULL;
272
273 size = ksize(data) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
274
275 memset(skb, 0, offsetof(struct sk_buff, tail));
276 skb->truesize = SKB_TRUESIZE(size);
277 atomic_set(&skb->users, 1);
278 skb->head = data;
279 skb->data = data;
280 skb_reset_tail_pointer(skb);
281 skb->end = skb->tail + size;
282#ifdef NET_SKBUFF_DATA_USES_OFFSET
283 skb->mac_header = ~0U;
284#endif
285
286 /* make sure we initialize shinfo sequentially */
287 shinfo = skb_shinfo(skb);
288 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
289 atomic_set(&shinfo->dataref, 1);
290 kmemcheck_annotate_variable(shinfo->destructor_arg);
291
292 return skb;
293}
294EXPORT_SYMBOL(build_skb);
295
296/**
248 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device 297 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
249 * @dev: network device to receive on 298 * @dev: network device to receive on
250 * @length: length to allocate 299 * @length: length to allocate
@@ -403,7 +452,7 @@ static void skb_release_head_state(struct sk_buff *skb)
403 WARN_ON(in_irq()); 452 WARN_ON(in_irq());
404 skb->destructor(skb); 453 skb->destructor(skb);
405 } 454 }
406#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 455#if IS_ENABLED(CONFIG_NF_CONNTRACK)
407 nf_conntrack_put(skb->nfct); 456 nf_conntrack_put(skb->nfct);
408#endif 457#endif
409#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED 458#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
@@ -553,15 +602,14 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
553 new->ip_summed = old->ip_summed; 602 new->ip_summed = old->ip_summed;
554 skb_copy_queue_mapping(new, old); 603 skb_copy_queue_mapping(new, old);
555 new->priority = old->priority; 604 new->priority = old->priority;
556#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) 605#if IS_ENABLED(CONFIG_IP_VS)
557 new->ipvs_property = old->ipvs_property; 606 new->ipvs_property = old->ipvs_property;
558#endif 607#endif
559 new->protocol = old->protocol; 608 new->protocol = old->protocol;
560 new->mark = old->mark; 609 new->mark = old->mark;
561 new->skb_iif = old->skb_iif; 610 new->skb_iif = old->skb_iif;
562 __nf_copy(new, old); 611 __nf_copy(new, old);
563#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ 612#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
564 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
565 new->nf_trace = old->nf_trace; 613 new->nf_trace = old->nf_trace;
566#endif 614#endif
567#ifdef CONFIG_NET_SCHED 615#ifdef CONFIG_NET_SCHED
@@ -791,8 +839,9 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
791EXPORT_SYMBOL(skb_copy); 839EXPORT_SYMBOL(skb_copy);
792 840
793/** 841/**
794 * pskb_copy - create copy of an sk_buff with private head. 842 * __pskb_copy - create copy of an sk_buff with private head.
795 * @skb: buffer to copy 843 * @skb: buffer to copy
844 * @headroom: headroom of new skb
796 * @gfp_mask: allocation priority 845 * @gfp_mask: allocation priority
797 * 846 *
798 * Make a copy of both an &sk_buff and part of its data, located 847 * Make a copy of both an &sk_buff and part of its data, located
@@ -803,16 +852,16 @@ EXPORT_SYMBOL(skb_copy);
803 * The returned buffer has a reference count of 1. 852 * The returned buffer has a reference count of 1.
804 */ 853 */
805 854
806struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) 855struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
807{ 856{
808 unsigned int size = skb_end_pointer(skb) - skb->head; 857 unsigned int size = skb_headlen(skb) + headroom;
809 struct sk_buff *n = alloc_skb(size, gfp_mask); 858 struct sk_buff *n = alloc_skb(size, gfp_mask);
810 859
811 if (!n) 860 if (!n)
812 goto out; 861 goto out;
813 862
814 /* Set the data pointer */ 863 /* Set the data pointer */
815 skb_reserve(n, skb_headroom(skb)); 864 skb_reserve(n, headroom);
816 /* Set the tail pointer and length */ 865 /* Set the tail pointer and length */
817 skb_put(n, skb_headlen(skb)); 866 skb_put(n, skb_headlen(skb));
818 /* Copy the bytes */ 867 /* Copy the bytes */
@@ -848,7 +897,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
848out: 897out:
849 return n; 898 return n;
850} 899}
851EXPORT_SYMBOL(pskb_copy); 900EXPORT_SYMBOL(__pskb_copy);
852 901
853/** 902/**
854 * pskb_expand_head - reallocate header of &sk_buff 903 * pskb_expand_head - reallocate header of &sk_buff
@@ -2230,7 +2279,7 @@ static int skb_prepare_for_shift(struct sk_buff *skb)
2230 * @shiftlen: shift up to this many bytes 2279 * @shiftlen: shift up to this many bytes
2231 * 2280 *
2232 * Attempts to shift up to shiftlen worth of bytes, which may be less than 2281 * Attempts to shift up to shiftlen worth of bytes, which may be less than
2233 * the length of the skb, from tgt to skb. Returns number bytes shifted. 2282 * the length of the skb, from skb to tgt. Returns number bytes shifted.
2234 * It's up to caller to free skb if everything was shifted. 2283 * It's up to caller to free skb if everything was shifted.
2235 * 2284 *
2236 * If @tgt runs out of frags, the whole operation is aborted. 2285 * If @tgt runs out of frags, the whole operation is aborted.
@@ -2621,7 +2670,7 @@ EXPORT_SYMBOL_GPL(skb_pull_rcsum);
2621 * a pointer to the first in a list of new skbs for the segments. 2670 * a pointer to the first in a list of new skbs for the segments.
2622 * In case of error it returns ERR_PTR(err). 2671 * In case of error it returns ERR_PTR(err).
2623 */ 2672 */
2624struct sk_buff *skb_segment(struct sk_buff *skb, u32 features) 2673struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
2625{ 2674{
2626 struct sk_buff *segs = NULL; 2675 struct sk_buff *segs = NULL;
2627 struct sk_buff *tail = NULL; 2676 struct sk_buff *tail = NULL;
@@ -3169,6 +3218,26 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
3169} 3218}
3170EXPORT_SYMBOL_GPL(skb_tstamp_tx); 3219EXPORT_SYMBOL_GPL(skb_tstamp_tx);
3171 3220
3221void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
3222{
3223 struct sock *sk = skb->sk;
3224 struct sock_exterr_skb *serr;
3225 int err;
3226
3227 skb->wifi_acked_valid = 1;
3228 skb->wifi_acked = acked;
3229
3230 serr = SKB_EXT_ERR(skb);
3231 memset(serr, 0, sizeof(*serr));
3232 serr->ee.ee_errno = ENOMSG;
3233 serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
3234
3235 err = sock_queue_err_skb(sk, skb);
3236 if (err)
3237 kfree_skb(skb);
3238}
3239EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
3240
3172 3241
3173/** 3242/**
3174 * skb_partial_csum_set - set up and verify partial csum values for packet 3243 * skb_partial_csum_set - set up and verify partial csum values for packet
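
build_skb() (added above) lets a driver hand an already-filled kmalloc() buffer to the stack with no copy, provided the buffer reserves NET_SKB_PAD of headroom and SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) of tailroom, as the kernel-doc notes. A hedged driver-side sketch of that RX pattern; rx_buf_alloc()/rx_to_skb() are illustrative helpers and DMA handling is omitted:

/* Hedged sketch of the RX pattern build_skb() targets. */
#include <linux/slab.h>
#include <linux/skbuff.h>

static void *rx_buf_alloc(unsigned int frame_size)
{
	/* headroom for the stack + room for skb_shared_info at the tail */
	return kmalloc(NET_SKB_PAD + frame_size +
		       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
		       GFP_ATOMIC);
}

static struct sk_buff *rx_to_skb(void *data, unsigned int frame_len)
{
	struct sk_buff *skb = build_skb(data);

	if (!skb)
		return NULL;		/* caller still owns @data on failure */

	skb_reserve(skb, NET_SKB_PAD);	/* skip the headroom left before the frame */
	skb_put(skb, frame_len);	/* bytes the NIC actually wrote */
	return skb;
}
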
diff --git a/net/core/sock.c b/net/core/sock.c
index 4ed7b1d12f5e..02f8dfe320b7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -111,6 +111,8 @@
111#include <linux/init.h> 111#include <linux/init.h>
112#include <linux/highmem.h> 112#include <linux/highmem.h>
113#include <linux/user_namespace.h> 113#include <linux/user_namespace.h>
114#include <linux/jump_label.h>
115#include <linux/memcontrol.h>
114 116
115#include <asm/uaccess.h> 117#include <asm/uaccess.h>
116#include <asm/system.h> 118#include <asm/system.h>
@@ -125,6 +127,7 @@
125#include <net/xfrm.h> 127#include <net/xfrm.h>
126#include <linux/ipsec.h> 128#include <linux/ipsec.h>
127#include <net/cls_cgroup.h> 129#include <net/cls_cgroup.h>
130#include <net/netprio_cgroup.h>
128 131
129#include <linux/filter.h> 132#include <linux/filter.h>
130 133
@@ -134,6 +137,46 @@
134#include <net/tcp.h> 137#include <net/tcp.h>
135#endif 138#endif
136 139
140static DEFINE_MUTEX(proto_list_mutex);
141static LIST_HEAD(proto_list);
142
143#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
144int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
145{
146 struct proto *proto;
147 int ret = 0;
148
149 mutex_lock(&proto_list_mutex);
150 list_for_each_entry(proto, &proto_list, node) {
151 if (proto->init_cgroup) {
152 ret = proto->init_cgroup(cgrp, ss);
153 if (ret)
154 goto out;
155 }
156 }
157
158 mutex_unlock(&proto_list_mutex);
159 return ret;
160out:
161 list_for_each_entry_continue_reverse(proto, &proto_list, node)
162 if (proto->destroy_cgroup)
163 proto->destroy_cgroup(cgrp, ss);
164 mutex_unlock(&proto_list_mutex);
165 return ret;
166}
167
168void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss)
169{
170 struct proto *proto;
171
172 mutex_lock(&proto_list_mutex);
173 list_for_each_entry_reverse(proto, &proto_list, node)
174 if (proto->destroy_cgroup)
175 proto->destroy_cgroup(cgrp, ss);
176 mutex_unlock(&proto_list_mutex);
177}
178#endif
179
137/* 180/*
138 * Each address family might have different locking rules, so we have 181 * Each address family might have different locking rules, so we have
139 * one slock key per address family: 182 * one slock key per address family:
@@ -141,6 +184,9 @@
141static struct lock_class_key af_family_keys[AF_MAX]; 184static struct lock_class_key af_family_keys[AF_MAX];
142static struct lock_class_key af_family_slock_keys[AF_MAX]; 185static struct lock_class_key af_family_slock_keys[AF_MAX];
143 186
187struct jump_label_key memcg_socket_limit_enabled;
188EXPORT_SYMBOL(memcg_socket_limit_enabled);
189
144/* 190/*
145 * Make lock validator output more readable. (we pre-construct these 191 * Make lock validator output more readable. (we pre-construct these
146 * strings build-time, so that runtime initialization of socket 192 * strings build-time, so that runtime initialization of socket
@@ -221,10 +267,16 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
221int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 267int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
222EXPORT_SYMBOL(sysctl_optmem_max); 268EXPORT_SYMBOL(sysctl_optmem_max);
223 269
224#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP) 270#if defined(CONFIG_CGROUPS)
271#if !defined(CONFIG_NET_CLS_CGROUP)
225int net_cls_subsys_id = -1; 272int net_cls_subsys_id = -1;
226EXPORT_SYMBOL_GPL(net_cls_subsys_id); 273EXPORT_SYMBOL_GPL(net_cls_subsys_id);
227#endif 274#endif
275#if !defined(CONFIG_NETPRIO_CGROUP)
276int net_prio_subsys_id = -1;
277EXPORT_SYMBOL_GPL(net_prio_subsys_id);
278#endif
279#endif
228 280
229static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) 281static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
230{ 282{
@@ -269,14 +321,14 @@ static void sock_warn_obsolete_bsdism(const char *name)
269 } 321 }
270} 322}
271 323
272static void sock_disable_timestamp(struct sock *sk, int flag) 324#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
325
326static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
273{ 327{
274 if (sock_flag(sk, flag)) { 328 if (sk->sk_flags & flags) {
275 sock_reset_flag(sk, flag); 329 sk->sk_flags &= ~flags;
276 if (!sock_flag(sk, SOCK_TIMESTAMP) && 330 if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
277 !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
278 net_disable_timestamp(); 331 net_disable_timestamp();
279 }
280 } 332 }
281} 333}
282 334
@@ -288,11 +340,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
288 unsigned long flags; 340 unsigned long flags;
289 struct sk_buff_head *list = &sk->sk_receive_queue; 341 struct sk_buff_head *list = &sk->sk_receive_queue;
290 342
291 /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces 343 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
292 number of warnings when compiling with -W --ANK
293 */
294 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
295 (unsigned)sk->sk_rcvbuf) {
296 atomic_inc(&sk->sk_drops); 344 atomic_inc(&sk->sk_drops);
297 trace_sock_rcvqueue_full(sk, skb); 345 trace_sock_rcvqueue_full(sk, skb);
298 return -ENOMEM; 346 return -ENOMEM;
@@ -682,7 +730,7 @@ set_rcvbuf:
682 SOCK_TIMESTAMPING_RX_SOFTWARE); 730 SOCK_TIMESTAMPING_RX_SOFTWARE);
683 else 731 else
684 sock_disable_timestamp(sk, 732 sock_disable_timestamp(sk,
685 SOCK_TIMESTAMPING_RX_SOFTWARE); 733 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
686 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE, 734 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
687 val & SOF_TIMESTAMPING_SOFTWARE); 735 val & SOF_TIMESTAMPING_SOFTWARE);
688 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE, 736 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
@@ -740,6 +788,11 @@ set_rcvbuf:
740 case SO_RXQ_OVFL: 788 case SO_RXQ_OVFL:
741 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 789 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
742 break; 790 break;
791
792 case SO_WIFI_STATUS:
793 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
794 break;
795
743 default: 796 default:
744 ret = -ENOPROTOOPT; 797 ret = -ENOPROTOOPT;
745 break; 798 break;
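
SO_WIFI_STATUS is a plain boolean option; once set, skb_complete_wifi_ack() (see the skbuff.c hunk above) reports wifi TX-ACK status through the socket error queue with origin SO_EE_ORIGIN_TXSTATUS. A hedged userspace sketch of enabling it — the fallback define assumes the asm-generic value, which architecture-specific headers may override:

/* Hedged sketch: enable wifi TX-status reporting on a socket. */
#include <sys/socket.h>

#ifndef SO_WIFI_STATUS
#define SO_WIFI_STATUS 41	/* asm-generic value at this point; arch headers may differ */
#endif

static int enable_wifi_status(int fd)
{
	int one = 1;

	return setsockopt(fd, SOL_SOCKET, SO_WIFI_STATUS, &one, sizeof(one));
}
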
@@ -961,6 +1014,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
961 v.val = !!sock_flag(sk, SOCK_RXQ_OVFL); 1014 v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
962 break; 1015 break;
963 1016
1017 case SO_WIFI_STATUS:
1018 v.val = !!sock_flag(sk, SOCK_WIFI_STATUS);
1019 break;
1020
964 default: 1021 default:
965 return -ENOPROTOOPT; 1022 return -ENOPROTOOPT;
966 } 1023 }
@@ -1111,6 +1168,15 @@ void sock_update_classid(struct sock *sk)
1111 sk->sk_classid = classid; 1168 sk->sk_classid = classid;
1112} 1169}
1113EXPORT_SYMBOL(sock_update_classid); 1170EXPORT_SYMBOL(sock_update_classid);
1171
1172void sock_update_netprioidx(struct sock *sk)
1173{
1174 if (in_interrupt())
1175 return;
1176
1177 sk->sk_cgrp_prioidx = task_netprioidx(current);
1178}
1179EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1114#endif 1180#endif
1115 1181
1116/** 1182/**
@@ -1138,6 +1204,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1138 atomic_set(&sk->sk_wmem_alloc, 1); 1204 atomic_set(&sk->sk_wmem_alloc, 1);
1139 1205
1140 sock_update_classid(sk); 1206 sock_update_classid(sk);
1207 sock_update_netprioidx(sk);
1141 } 1208 }
1142 1209
1143 return sk; 1210 return sk;
@@ -1158,8 +1225,7 @@ static void __sk_free(struct sock *sk)
1158 RCU_INIT_POINTER(sk->sk_filter, NULL); 1225 RCU_INIT_POINTER(sk->sk_filter, NULL);
1159 } 1226 }
1160 1227
1161 sock_disable_timestamp(sk, SOCK_TIMESTAMP); 1228 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1162 sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
1163 1229
1164 if (atomic_read(&sk->sk_omem_alloc)) 1230 if (atomic_read(&sk->sk_omem_alloc))
1165 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n", 1231 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
@@ -1204,7 +1270,20 @@ void sk_release_kernel(struct sock *sk)
1204} 1270}
1205EXPORT_SYMBOL(sk_release_kernel); 1271EXPORT_SYMBOL(sk_release_kernel);
1206 1272
1207struct sock *sk_clone(const struct sock *sk, const gfp_t priority) 1273static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1274{
1275 if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1276 sock_update_memcg(newsk);
1277}
1278
1279/**
1280 * sk_clone_lock - clone a socket, and lock its clone
1281 * @sk: the socket to clone
1282 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1283 *
1284 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1285 */
1286struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1208{ 1287{
1209 struct sock *newsk; 1288 struct sock *newsk;
1210 1289
@@ -1287,17 +1366,18 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1287 sk_set_socket(newsk, NULL); 1366 sk_set_socket(newsk, NULL);
1288 newsk->sk_wq = NULL; 1367 newsk->sk_wq = NULL;
1289 1368
1369 sk_update_clone(sk, newsk);
1370
1290 if (newsk->sk_prot->sockets_allocated) 1371 if (newsk->sk_prot->sockets_allocated)
1291 percpu_counter_inc(newsk->sk_prot->sockets_allocated); 1372 sk_sockets_allocated_inc(newsk);
1292 1373
1293 if (sock_flag(newsk, SOCK_TIMESTAMP) || 1374 if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1294 sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1295 net_enable_timestamp(); 1375 net_enable_timestamp();
1296 } 1376 }
1297out: 1377out:
1298 return newsk; 1378 return newsk;
1299} 1379}
1300EXPORT_SYMBOL_GPL(sk_clone); 1380EXPORT_SYMBOL_GPL(sk_clone_lock);
1301 1381
1302void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 1382void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1303{ 1383{
@@ -1677,30 +1757,34 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
1677 struct proto *prot = sk->sk_prot; 1757 struct proto *prot = sk->sk_prot;
1678 int amt = sk_mem_pages(size); 1758 int amt = sk_mem_pages(size);
1679 long allocated; 1759 long allocated;
1760 int parent_status = UNDER_LIMIT;
1680 1761
1681 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; 1762 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1682 allocated = atomic_long_add_return(amt, prot->memory_allocated); 1763
1764 allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1683 1765
1684 /* Under limit. */ 1766 /* Under limit. */
1685 if (allocated <= prot->sysctl_mem[0]) { 1767 if (parent_status == UNDER_LIMIT &&
1686 if (prot->memory_pressure && *prot->memory_pressure) 1768 allocated <= sk_prot_mem_limits(sk, 0)) {
1687 *prot->memory_pressure = 0; 1769 sk_leave_memory_pressure(sk);
1688 return 1; 1770 return 1;
1689 } 1771 }
1690 1772
1691 /* Under pressure. */ 1773 /* Under pressure. (we or our parents) */
1692 if (allocated > prot->sysctl_mem[1]) 1774 if ((parent_status > SOFT_LIMIT) ||
1693 if (prot->enter_memory_pressure) 1775 allocated > sk_prot_mem_limits(sk, 1))
1694 prot->enter_memory_pressure(sk); 1776 sk_enter_memory_pressure(sk);
1695 1777
1696 /* Over hard limit. */ 1778 /* Over hard limit (we or our parents) */
1697 if (allocated > prot->sysctl_mem[2]) 1779 if ((parent_status == OVER_LIMIT) ||
1780 (allocated > sk_prot_mem_limits(sk, 2)))
1698 goto suppress_allocation; 1781 goto suppress_allocation;
1699 1782
1700 /* guarantee minimum buffer size under pressure */ 1783 /* guarantee minimum buffer size under pressure */
1701 if (kind == SK_MEM_RECV) { 1784 if (kind == SK_MEM_RECV) {
1702 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0]) 1785 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1703 return 1; 1786 return 1;
1787
1704 } else { /* SK_MEM_SEND */ 1788 } else { /* SK_MEM_SEND */
1705 if (sk->sk_type == SOCK_STREAM) { 1789 if (sk->sk_type == SOCK_STREAM) {
1706 if (sk->sk_wmem_queued < prot->sysctl_wmem[0]) 1790 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
@@ -1710,13 +1794,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
1710 return 1; 1794 return 1;
1711 } 1795 }
1712 1796
1713 if (prot->memory_pressure) { 1797 if (sk_has_memory_pressure(sk)) {
1714 int alloc; 1798 int alloc;
1715 1799
1716 if (!*prot->memory_pressure) 1800 if (!sk_under_memory_pressure(sk))
1717 return 1; 1801 return 1;
1718 alloc = percpu_counter_read_positive(prot->sockets_allocated); 1802 alloc = sk_sockets_allocated_read_positive(sk);
1719 if (prot->sysctl_mem[2] > alloc * 1803 if (sk_prot_mem_limits(sk, 2) > alloc *
1720 sk_mem_pages(sk->sk_wmem_queued + 1804 sk_mem_pages(sk->sk_wmem_queued +
1721 atomic_read(&sk->sk_rmem_alloc) + 1805 atomic_read(&sk->sk_rmem_alloc) +
1722 sk->sk_forward_alloc)) 1806 sk->sk_forward_alloc))
@@ -1739,7 +1823,9 @@ suppress_allocation:
1739 1823
1740 /* Alas. Undo changes. */ 1824 /* Alas. Undo changes. */
1741 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; 1825 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1742 atomic_long_sub(amt, prot->memory_allocated); 1826
1827 sk_memory_allocated_sub(sk, amt);
1828
1743 return 0; 1829 return 0;
1744} 1830}
1745EXPORT_SYMBOL(__sk_mem_schedule); 1831EXPORT_SYMBOL(__sk_mem_schedule);
@@ -1750,15 +1836,13 @@ EXPORT_SYMBOL(__sk_mem_schedule);
1750 */ 1836 */
1751void __sk_mem_reclaim(struct sock *sk) 1837void __sk_mem_reclaim(struct sock *sk)
1752{ 1838{
1753 struct proto *prot = sk->sk_prot; 1839 sk_memory_allocated_sub(sk,
1754 1840 sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
1755 atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1756 prot->memory_allocated);
1757 sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; 1841 sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1758 1842
1759 if (prot->memory_pressure && *prot->memory_pressure && 1843 if (sk_under_memory_pressure(sk) &&
1760 (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0])) 1844 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1761 *prot->memory_pressure = 0; 1845 sk_leave_memory_pressure(sk);
1762} 1846}
1763EXPORT_SYMBOL(__sk_mem_reclaim); 1847EXPORT_SYMBOL(__sk_mem_reclaim);
1764 1848
@@ -2129,16 +2213,15 @@ EXPORT_SYMBOL(sock_get_timestampns);
2129void sock_enable_timestamp(struct sock *sk, int flag) 2213void sock_enable_timestamp(struct sock *sk, int flag)
2130{ 2214{
2131 if (!sock_flag(sk, flag)) { 2215 if (!sock_flag(sk, flag)) {
2216 unsigned long previous_flags = sk->sk_flags;
2217
2132 sock_set_flag(sk, flag); 2218 sock_set_flag(sk, flag);
2133 /* 2219 /*
2134 * we just set one of the two flags which require net 2220 * we just set one of the two flags which require net
2135 * time stamping, but time stamping might have been on 2221 * time stamping, but time stamping might have been on
2136 * already because of the other one 2222 * already because of the other one
2137 */ 2223 */
2138 if (!sock_flag(sk, 2224 if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2139 flag == SOCK_TIMESTAMP ?
2140 SOCK_TIMESTAMPING_RX_SOFTWARE :
2141 SOCK_TIMESTAMP))
2142 net_enable_timestamp(); 2225 net_enable_timestamp();
2143 } 2226 }
2144} 2227}
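The rewritten sock_enable_timestamp() snapshots sk->sk_flags before setting the requested flag and calls net_enable_timestamp() only if neither timestamp flag was already set, which is exactly what the old per-flag ternary was checking. SK_FLAGS_TIMESTAMP itself is defined in include/net/sock.h, outside this diff; its assumed shape is simply a mask of the two flags that need the stamping machinery:

/* Assumed definition (not part of this diff): */
#define SK_FLAGS_TIMESTAMP \
	((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))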
@@ -2250,9 +2333,6 @@ void sk_common_release(struct sock *sk)
2250} 2333}
2251EXPORT_SYMBOL(sk_common_release); 2334EXPORT_SYMBOL(sk_common_release);
2252 2335
2253static DEFINE_RWLOCK(proto_list_lock);
2254static LIST_HEAD(proto_list);
2255
2256#ifdef CONFIG_PROC_FS 2336#ifdef CONFIG_PROC_FS
2257#define PROTO_INUSE_NR 64 /* should be enough for the first time */ 2337#define PROTO_INUSE_NR 64 /* should be enough for the first time */
2258struct prot_inuse { 2338struct prot_inuse {
@@ -2401,10 +2481,10 @@ int proto_register(struct proto *prot, int alloc_slab)
2401 } 2481 }
2402 } 2482 }
2403 2483
2404 write_lock(&proto_list_lock); 2484 mutex_lock(&proto_list_mutex);
2405 list_add(&prot->node, &proto_list); 2485 list_add(&prot->node, &proto_list);
2406 assign_proto_idx(prot); 2486 assign_proto_idx(prot);
2407 write_unlock(&proto_list_lock); 2487 mutex_unlock(&proto_list_mutex);
2408 return 0; 2488 return 0;
2409 2489
2410out_free_timewait_sock_slab_name: 2490out_free_timewait_sock_slab_name:
@@ -2427,10 +2507,10 @@ EXPORT_SYMBOL(proto_register);
2427 2507
2428void proto_unregister(struct proto *prot) 2508void proto_unregister(struct proto *prot)
2429{ 2509{
2430 write_lock(&proto_list_lock); 2510 mutex_lock(&proto_list_mutex);
2431 release_proto_idx(prot); 2511 release_proto_idx(prot);
2432 list_del(&prot->node); 2512 list_del(&prot->node);
2433 write_unlock(&proto_list_lock); 2513 mutex_unlock(&proto_list_mutex);
2434 2514
2435 if (prot->slab != NULL) { 2515 if (prot->slab != NULL) {
2436 kmem_cache_destroy(prot->slab); 2516 kmem_cache_destroy(prot->slab);
@@ -2453,9 +2533,9 @@ EXPORT_SYMBOL(proto_unregister);
2453 2533
2454#ifdef CONFIG_PROC_FS 2534#ifdef CONFIG_PROC_FS
2455static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 2535static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2456 __acquires(proto_list_lock) 2536 __acquires(proto_list_mutex)
2457{ 2537{
2458 read_lock(&proto_list_lock); 2538 mutex_lock(&proto_list_mutex);
2459 return seq_list_start_head(&proto_list, *pos); 2539 return seq_list_start_head(&proto_list, *pos);
2460} 2540}
2461 2541
@@ -2465,25 +2545,36 @@ static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2465} 2545}
2466 2546
2467static void proto_seq_stop(struct seq_file *seq, void *v) 2547static void proto_seq_stop(struct seq_file *seq, void *v)
2468 __releases(proto_list_lock) 2548 __releases(proto_list_mutex)
2469{ 2549{
2470 read_unlock(&proto_list_lock); 2550 mutex_unlock(&proto_list_mutex);
2471} 2551}
2472 2552
2473static char proto_method_implemented(const void *method) 2553static char proto_method_implemented(const void *method)
2474{ 2554{
2475 return method == NULL ? 'n' : 'y'; 2555 return method == NULL ? 'n' : 'y';
2476} 2556}
2557static long sock_prot_memory_allocated(struct proto *proto)
2558{
2559 return proto->memory_allocated != NULL ? proto_memory_allocated(proto): -1L;
2560}
2561
2562static char *sock_prot_memory_pressure(struct proto *proto)
2563{
2564 return proto->memory_pressure != NULL ?
2565 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2566}
2477 2567
2478static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 2568static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2479{ 2569{
2570
2480 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 2571 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
2481 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 2572 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2482 proto->name, 2573 proto->name,
2483 proto->obj_size, 2574 proto->obj_size,
2484 sock_prot_inuse_get(seq_file_net(seq), proto), 2575 sock_prot_inuse_get(seq_file_net(seq), proto),
2485 proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L, 2576 sock_prot_memory_allocated(proto),
2486 proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI", 2577 sock_prot_memory_pressure(proto),
2487 proto->max_header, 2578 proto->max_header,
2488 proto->slab == NULL ? "no" : "yes", 2579 proto->slab == NULL ? "no" : "yes",
2489 module_name(proto->owner), 2580 module_name(proto->owner),
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
new file mode 100644
index 000000000000..b9868e1fd62c
--- /dev/null
+++ b/net/core/sock_diag.c
@@ -0,0 +1,192 @@
1#include <linux/mutex.h>
2#include <linux/socket.h>
3#include <linux/skbuff.h>
4#include <net/netlink.h>
5#include <net/net_namespace.h>
6#include <linux/module.h>
7#include <linux/rtnetlink.h>
8#include <net/sock.h>
9
10#include <linux/inet_diag.h>
11#include <linux/sock_diag.h>
12
13static struct sock_diag_handler *sock_diag_handlers[AF_MAX];
14static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
15static DEFINE_MUTEX(sock_diag_table_mutex);
16
17int sock_diag_check_cookie(void *sk, __u32 *cookie)
18{
19 if ((cookie[0] != INET_DIAG_NOCOOKIE ||
20 cookie[1] != INET_DIAG_NOCOOKIE) &&
21 ((u32)(unsigned long)sk != cookie[0] ||
22 (u32)((((unsigned long)sk) >> 31) >> 1) != cookie[1]))
23 return -ESTALE;
24 else
25 return 0;
26}
27EXPORT_SYMBOL_GPL(sock_diag_check_cookie);
28
29void sock_diag_save_cookie(void *sk, __u32 *cookie)
30{
31 cookie[0] = (u32)(unsigned long)sk;
32 cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
33}
34EXPORT_SYMBOL_GPL(sock_diag_save_cookie);
35
36int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype)
37{
38 __u32 *mem;
39
40 mem = RTA_DATA(__RTA_PUT(skb, attrtype, SK_MEMINFO_VARS * sizeof(__u32)));
41
42 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
43 mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
44 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
45 mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
46 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
47 mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
48 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
49
50 return 0;
51
52rtattr_failure:
53 return -EMSGSIZE;
54}
55EXPORT_SYMBOL_GPL(sock_diag_put_meminfo);
56
57void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
58{
59 mutex_lock(&sock_diag_table_mutex);
60 inet_rcv_compat = fn;
61 mutex_unlock(&sock_diag_table_mutex);
62}
63EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat);
64
65void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
66{
67 mutex_lock(&sock_diag_table_mutex);
68 inet_rcv_compat = NULL;
69 mutex_unlock(&sock_diag_table_mutex);
70}
71EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat);
72
73int sock_diag_register(struct sock_diag_handler *hndl)
74{
75 int err = 0;
76
77 if (hndl->family >= AF_MAX)
78 return -EINVAL;
79
80 mutex_lock(&sock_diag_table_mutex);
81 if (sock_diag_handlers[hndl->family])
82 err = -EBUSY;
83 else
84 sock_diag_handlers[hndl->family] = hndl;
85 mutex_unlock(&sock_diag_table_mutex);
86
87 return err;
88}
89EXPORT_SYMBOL_GPL(sock_diag_register);
90
91void sock_diag_unregister(struct sock_diag_handler *hnld)
92{
93 int family = hnld->family;
94
95 if (family >= AF_MAX)
96 return;
97
98 mutex_lock(&sock_diag_table_mutex);
99 BUG_ON(sock_diag_handlers[family] != hnld);
100 sock_diag_handlers[family] = NULL;
101 mutex_unlock(&sock_diag_table_mutex);
102}
103EXPORT_SYMBOL_GPL(sock_diag_unregister);
104
105static inline struct sock_diag_handler *sock_diag_lock_handler(int family)
106{
107 if (sock_diag_handlers[family] == NULL)
108 request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
109 NETLINK_SOCK_DIAG, family);
110
111 mutex_lock(&sock_diag_table_mutex);
112 return sock_diag_handlers[family];
113}
114
115static inline void sock_diag_unlock_handler(struct sock_diag_handler *h)
116{
117 mutex_unlock(&sock_diag_table_mutex);
118}
119
120static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
121{
122 int err;
123 struct sock_diag_req *req = NLMSG_DATA(nlh);
124 struct sock_diag_handler *hndl;
125
126 if (nlmsg_len(nlh) < sizeof(*req))
127 return -EINVAL;
128
129 hndl = sock_diag_lock_handler(req->sdiag_family);
130 if (hndl == NULL)
131 err = -ENOENT;
132 else
133 err = hndl->dump(skb, nlh);
134 sock_diag_unlock_handler(hndl);
135
136 return err;
137}
138
139static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
140{
141 int ret;
142
143 switch (nlh->nlmsg_type) {
144 case TCPDIAG_GETSOCK:
145 case DCCPDIAG_GETSOCK:
146 if (inet_rcv_compat == NULL)
147 request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
148 NETLINK_SOCK_DIAG, AF_INET);
149
150 mutex_lock(&sock_diag_table_mutex);
151 if (inet_rcv_compat != NULL)
152 ret = inet_rcv_compat(skb, nlh);
153 else
154 ret = -EOPNOTSUPP;
155 mutex_unlock(&sock_diag_table_mutex);
156
157 return ret;
158 case SOCK_DIAG_BY_FAMILY:
159 return __sock_diag_rcv_msg(skb, nlh);
160 default:
161 return -EINVAL;
162 }
163}
164
165static DEFINE_MUTEX(sock_diag_mutex);
166
167static void sock_diag_rcv(struct sk_buff *skb)
168{
169 mutex_lock(&sock_diag_mutex);
170 netlink_rcv_skb(skb, &sock_diag_rcv_msg);
171 mutex_unlock(&sock_diag_mutex);
172}
173
174struct sock *sock_diag_nlsk;
175EXPORT_SYMBOL_GPL(sock_diag_nlsk);
176
177static int __init sock_diag_init(void)
178{
179 sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, 0,
180 sock_diag_rcv, NULL, THIS_MODULE);
181 return sock_diag_nlsk == NULL ? -ENOMEM : 0;
182}
183
184static void __exit sock_diag_exit(void)
185{
186 netlink_kernel_release(sock_diag_nlsk);
187}
188
189module_init(sock_diag_init);
190module_exit(sock_diag_exit);
191MODULE_LICENSE("GPL");
192MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_SOCK_DIAG);
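The new sock_diag.c only provides the NETLINK_SOCK_DIAG transport plus a per-family handler table; the actual dumpers (inet_diag and friends) attach themselves through sock_diag_register(). A minimal sketch of such a consumer module follows; the handler layout (a family field and a dump callback) is inferred from __sock_diag_rcv_msg() above, and every "example_" name is a placeholder, not a kernel symbol:

#include <linux/module.h>
#include <linux/sock_diag.h>
#include <net/netlink.h>

static int example_diag_dump(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	/* nlh is a SOCK_DIAG_BY_FAMILY request whose payload begins with a
	 * struct sock_diag_req; answer it over sock_diag_nlsk, e.g. by
	 * kicking off netlink_dump_start(). */
	return 0;
}

static struct sock_diag_handler example_diag_handler = {
	.family	= AF_INET,		/* one handler slot per address family */
	.dump	= example_diag_dump,
};

static int __init example_diag_init(void)
{
	return sock_diag_register(&example_diag_handler);
}

static void __exit example_diag_exit(void)
{
	sock_diag_unregister(&example_diag_handler);
}

module_init(example_diag_init);
module_exit(example_diag_exit);
MODULE_LICENSE("GPL");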
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 77a65f031488..d05559d4d9cd 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -68,8 +68,13 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
68 68
69 if (sock_table != orig_sock_table) { 69 if (sock_table != orig_sock_table) {
70 rcu_assign_pointer(rps_sock_flow_table, sock_table); 70 rcu_assign_pointer(rps_sock_flow_table, sock_table);
71 synchronize_rcu(); 71 if (sock_table)
72 vfree(orig_sock_table); 72 jump_label_inc(&rps_needed);
73 if (orig_sock_table) {
74 jump_label_dec(&rps_needed);
75 synchronize_rcu();
76 vfree(orig_sock_table);
77 }
73 } 78 }
74 } 79 }
75 80
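The change above also flips the rps_needed jump label when the global flow table is installed or torn down, so the hot receive path can skip RPS lookups entirely while no table exists. The consumer side lives in net/core/dev.c (not shown in this section); under the jump_label API of this kernel generation the gating pattern is roughly:

/* Sketch of the receive-path gate (illustrative; see net/core/dev.c):
 * the branch is patched to a no-op until jump_label_inc(&rps_needed)
 * has been called at least once. */
if (static_branch(&rps_needed)) {
	/* consult the RPS/RFS flow tables and steer the packet to a CPU */
}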