aboutsummaryrefslogtreecommitdiffstats
path: root/net/core
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-04-02 23:53:45 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2014-04-02 23:53:45 -0400
commitcd6362befe4cc7bf589a5236d2a780af2d47bcc9 (patch)
tree3bd4e13ec3f92a00dc4f6c3d65e820b54dbfe46e /net/core
parent0f1b1e6d73cb989ce2c071edc57deade3b084dfe (diff)
parentb1586f099ba897542ece36e8a23c1a62907261ef (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: "Here is my initial pull request for the networking subsystem during this merge window: 1) Support for ESN in AH (RFC 4302) from Fan Du. 2) Add full kernel doc for ethtool command structures, from Ben Hutchings. 3) Add BCM7xxx PHY driver, from Florian Fainelli. 4) Export computed TCP rate information in netlink socket dumps, from Eric Dumazet. 5) Allow IPSEC SA to be dumped partially using a filter, from Nicolas Dichtel. 6) Convert many drivers to pci_enable_msix_range(), from Alexander Gordeev. 7) Record SKB timestamps more efficiently, from Eric Dumazet. 8) Switch to microsecond resolution for TCP round trip times, also from Eric Dumazet. 9) Clean up and fix 6lowpan fragmentation handling by making use of the existing inet_frag api for its implementation. 10) Add TX grant mapping to xen-netback driver, from Zoltan Kiss. 11) Auto size SKB lengths when composing netlink messages based upon past message sizes used, from Eric Dumazet. 12) qdisc dumps can take a long time, add a cond_resched(), From Eric Dumazet. 13) Sanitize netpoll core and drivers wrt. SKB handling semantics. Get rid of never-used-in-tree netpoll RX handling. From Eric W Biederman. 14) Support inter-address-family and namespace changing in VTI tunnel driver(s). From Steffen Klassert. 15) Add Altera TSE driver, from Vince Bridgers. 16) Optimizing csum_replace2() so that it doesn't adjust the checksum by checksumming the entire header, from Eric Dumazet. 17) Expand BPF internal implementation for faster interpreting, more direct translations into JIT'd code, and much cleaner uses of BPF filtering in non-socket contexts. From Daniel Borkmann and Alexei Starovoitov" * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1976 commits) netpoll: Use skb_irq_freeable to make zap_completion_queue safe. 
net: Add a test to see if a skb is freeable in irq context qlcnic: Fix build failure due to undefined reference to `vxlan_get_rx_port' net: ptp: move PTP classifier in its own file net: sxgbe: make "core_ops" static net: sxgbe: fix logical vs bitwise operation net: sxgbe: sxgbe_mdio_register() frees the bus Call efx_set_channels() before efx->type->dimension_resources() xen-netback: disable rogue vif in kthread context net/mlx4: Set proper build dependancy with vxlan be2net: fix build dependency on VxLAN mac802154: make csma/cca parameters per-wpan mac802154: allow only one WPAN to be up at any given time net: filter: minor: fix kdoc in __sk_run_filter netlink: don't compare the nul-termination in nla_strcmp can: c_can: Avoid led toggling for every packet. can: c_can: Simplify TX interrupt cleanup can: c_can: Store dlc private can: c_can: Reduce register access can: c_can: Make the code readable ...
Diffstat (limited to 'net/core')
-rw-r--r--net/core/Makefile1
-rw-r--r--net/core/dev.c153
-rw-r--r--net/core/filter.c1567
-rw-r--r--net/core/flow.c132
-rw-r--r--net/core/flow_dissector.c24
-rw-r--r--net/core/neighbour.c9
-rw-r--r--net/core/net-sysfs.c22
-rw-r--r--net/core/netpoll.c587
-rw-r--r--net/core/pktgen.c32
-rw-r--r--net/core/ptp_classifier.c141
-rw-r--r--net/core/request_sock.c1
-rw-r--r--net/core/rtnetlink.c113
-rw-r--r--net/core/skbuff.c166
-rw-r--r--net/core/sock_diag.c23
-rw-r--r--net/core/timestamping.c19
15 files changed, 1813 insertions, 1177 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 9628c20acff6..826b925aa453 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -21,5 +21,6 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o
21obj-$(CONFIG_TRACEPOINTS) += net-traces.o 21obj-$(CONFIG_TRACEPOINTS) += net-traces.o
22obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o 22obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
23obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o 23obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
24obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
24obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o 25obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
25obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o 26obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
diff --git a/net/core/dev.c b/net/core/dev.c
index 4a91591b30a6..757063420ce0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1245,7 +1245,7 @@ static int __dev_open(struct net_device *dev)
1245 * If we don't do this there is a chance ndo_poll_controller 1245 * If we don't do this there is a chance ndo_poll_controller
1246 * or ndo_poll may be running while we open the device 1246 * or ndo_poll may be running while we open the device
1247 */ 1247 */
1248 netpoll_rx_disable(dev); 1248 netpoll_poll_disable(dev);
1249 1249
1250 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); 1250 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1251 ret = notifier_to_errno(ret); 1251 ret = notifier_to_errno(ret);
@@ -1260,7 +1260,7 @@ static int __dev_open(struct net_device *dev)
1260 if (!ret && ops->ndo_open) 1260 if (!ret && ops->ndo_open)
1261 ret = ops->ndo_open(dev); 1261 ret = ops->ndo_open(dev);
1262 1262
1263 netpoll_rx_enable(dev); 1263 netpoll_poll_enable(dev);
1264 1264
1265 if (ret) 1265 if (ret)
1266 clear_bit(__LINK_STATE_START, &dev->state); 1266 clear_bit(__LINK_STATE_START, &dev->state);
@@ -1313,6 +1313,9 @@ static int __dev_close_many(struct list_head *head)
1313 might_sleep(); 1313 might_sleep();
1314 1314
1315 list_for_each_entry(dev, head, close_list) { 1315 list_for_each_entry(dev, head, close_list) {
1316 /* Temporarily disable netpoll until the interface is down */
1317 netpoll_poll_disable(dev);
1318
1316 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); 1319 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1317 1320
1318 clear_bit(__LINK_STATE_START, &dev->state); 1321 clear_bit(__LINK_STATE_START, &dev->state);
@@ -1343,6 +1346,7 @@ static int __dev_close_many(struct list_head *head)
1343 1346
1344 dev->flags &= ~IFF_UP; 1347 dev->flags &= ~IFF_UP;
1345 net_dmaengine_put(); 1348 net_dmaengine_put();
1349 netpoll_poll_enable(dev);
1346 } 1350 }
1347 1351
1348 return 0; 1352 return 0;
@@ -1353,14 +1357,10 @@ static int __dev_close(struct net_device *dev)
1353 int retval; 1357 int retval;
1354 LIST_HEAD(single); 1358 LIST_HEAD(single);
1355 1359
1356 /* Temporarily disable netpoll until the interface is down */
1357 netpoll_rx_disable(dev);
1358
1359 list_add(&dev->close_list, &single); 1360 list_add(&dev->close_list, &single);
1360 retval = __dev_close_many(&single); 1361 retval = __dev_close_many(&single);
1361 list_del(&single); 1362 list_del(&single);
1362 1363
1363 netpoll_rx_enable(dev);
1364 return retval; 1364 return retval;
1365} 1365}
1366 1366
@@ -1398,14 +1398,9 @@ int dev_close(struct net_device *dev)
1398 if (dev->flags & IFF_UP) { 1398 if (dev->flags & IFF_UP) {
1399 LIST_HEAD(single); 1399 LIST_HEAD(single);
1400 1400
1401 /* Block netpoll rx while the interface is going down */
1402 netpoll_rx_disable(dev);
1403
1404 list_add(&dev->close_list, &single); 1401 list_add(&dev->close_list, &single);
1405 dev_close_many(&single); 1402 dev_close_many(&single);
1406 list_del(&single); 1403 list_del(&single);
1407
1408 netpoll_rx_enable(dev);
1409 } 1404 }
1410 return 0; 1405 return 0;
1411} 1406}
@@ -1645,8 +1640,7 @@ static inline void net_timestamp_set(struct sk_buff *skb)
1645 __net_timestamp(SKB); \ 1640 __net_timestamp(SKB); \
1646 } \ 1641 } \
1647 1642
1648static inline bool is_skb_forwardable(struct net_device *dev, 1643bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1649 struct sk_buff *skb)
1650{ 1644{
1651 unsigned int len; 1645 unsigned int len;
1652 1646
@@ -1665,6 +1659,7 @@ static inline bool is_skb_forwardable(struct net_device *dev,
1665 1659
1666 return false; 1660 return false;
1667} 1661}
1662EXPORT_SYMBOL_GPL(is_skb_forwardable);
1668 1663
1669/** 1664/**
1670 * dev_forward_skb - loopback an skb to another netif 1665 * dev_forward_skb - loopback an skb to another netif
@@ -2885,6 +2880,7 @@ recursion_alert:
2885 rc = -ENETDOWN; 2880 rc = -ENETDOWN;
2886 rcu_read_unlock_bh(); 2881 rcu_read_unlock_bh();
2887 2882
2883 atomic_long_inc(&dev->tx_dropped);
2888 kfree_skb(skb); 2884 kfree_skb(skb);
2889 return rc; 2885 return rc;
2890out: 2886out:
@@ -2957,7 +2953,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2957 flow_table = rcu_dereference(rxqueue->rps_flow_table); 2953 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2958 if (!flow_table) 2954 if (!flow_table)
2959 goto out; 2955 goto out;
2960 flow_id = skb->rxhash & flow_table->mask; 2956 flow_id = skb_get_hash(skb) & flow_table->mask;
2961 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, 2957 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2962 rxq_index, flow_id); 2958 rxq_index, flow_id);
2963 if (rc < 0) 2959 if (rc < 0)
@@ -2991,6 +2987,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2991 struct rps_sock_flow_table *sock_flow_table; 2987 struct rps_sock_flow_table *sock_flow_table;
2992 int cpu = -1; 2988 int cpu = -1;
2993 u16 tcpu; 2989 u16 tcpu;
2990 u32 hash;
2994 2991
2995 if (skb_rx_queue_recorded(skb)) { 2992 if (skb_rx_queue_recorded(skb)) {
2996 u16 index = skb_get_rx_queue(skb); 2993 u16 index = skb_get_rx_queue(skb);
@@ -3019,7 +3016,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3019 } 3016 }
3020 3017
3021 skb_reset_network_header(skb); 3018 skb_reset_network_header(skb);
3022 if (!skb_get_hash(skb)) 3019 hash = skb_get_hash(skb);
3020 if (!hash)
3023 goto done; 3021 goto done;
3024 3022
3025 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3023 flow_table = rcu_dereference(rxqueue->rps_flow_table);
@@ -3028,11 +3026,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3028 u16 next_cpu; 3026 u16 next_cpu;
3029 struct rps_dev_flow *rflow; 3027 struct rps_dev_flow *rflow;
3030 3028
3031 rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; 3029 rflow = &flow_table->flows[hash & flow_table->mask];
3032 tcpu = rflow->cpu; 3030 tcpu = rflow->cpu;
3033 3031
3034 next_cpu = sock_flow_table->ents[skb->rxhash & 3032 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3035 sock_flow_table->mask];
3036 3033
3037 /* 3034 /*
3038 * If the desired CPU (where last recvmsg was done) is 3035 * If the desired CPU (where last recvmsg was done) is
@@ -3061,7 +3058,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3061 } 3058 }
3062 3059
3063 if (map) { 3060 if (map) {
3064 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; 3061 tcpu = map->cpus[((u64) hash * map->len) >> 32];
3065 3062
3066 if (cpu_online(tcpu)) { 3063 if (cpu_online(tcpu)) {
3067 cpu = tcpu; 3064 cpu = tcpu;
@@ -3236,10 +3233,6 @@ static int netif_rx_internal(struct sk_buff *skb)
3236{ 3233{
3237 int ret; 3234 int ret;
3238 3235
3239 /* if netpoll wants it, pretend we never saw it */
3240 if (netpoll_rx(skb))
3241 return NET_RX_DROP;
3242
3243 net_timestamp_check(netdev_tstamp_prequeue, skb); 3236 net_timestamp_check(netdev_tstamp_prequeue, skb);
3244 3237
3245 trace_netif_rx(skb); 3238 trace_netif_rx(skb);
@@ -3500,11 +3493,11 @@ EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3500static bool skb_pfmemalloc_protocol(struct sk_buff *skb) 3493static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3501{ 3494{
3502 switch (skb->protocol) { 3495 switch (skb->protocol) {
3503 case __constant_htons(ETH_P_ARP): 3496 case htons(ETH_P_ARP):
3504 case __constant_htons(ETH_P_IP): 3497 case htons(ETH_P_IP):
3505 case __constant_htons(ETH_P_IPV6): 3498 case htons(ETH_P_IPV6):
3506 case __constant_htons(ETH_P_8021Q): 3499 case htons(ETH_P_8021Q):
3507 case __constant_htons(ETH_P_8021AD): 3500 case htons(ETH_P_8021AD):
3508 return true; 3501 return true;
3509 default: 3502 default:
3510 return false; 3503 return false;
@@ -3525,10 +3518,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3525 3518
3526 trace_netif_receive_skb(skb); 3519 trace_netif_receive_skb(skb);
3527 3520
3528 /* if we've gotten here through NAPI, check netpoll */
3529 if (netpoll_receive_skb(skb))
3530 goto out;
3531
3532 orig_dev = skb->dev; 3521 orig_dev = skb->dev;
3533 3522
3534 skb_reset_network_header(skb); 3523 skb_reset_network_header(skb);
@@ -3655,7 +3644,6 @@ drop:
3655 3644
3656unlock: 3645unlock:
3657 rcu_read_unlock(); 3646 rcu_read_unlock();
3658out:
3659 return ret; 3647 return ret;
3660} 3648}
3661 3649
@@ -3845,10 +3833,10 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3845 diffs |= p->vlan_tci ^ skb->vlan_tci; 3833 diffs |= p->vlan_tci ^ skb->vlan_tci;
3846 if (maclen == ETH_HLEN) 3834 if (maclen == ETH_HLEN)
3847 diffs |= compare_ether_header(skb_mac_header(p), 3835 diffs |= compare_ether_header(skb_mac_header(p),
3848 skb_gro_mac_header(skb)); 3836 skb_mac_header(skb));
3849 else if (!diffs) 3837 else if (!diffs)
3850 diffs = memcmp(skb_mac_header(p), 3838 diffs = memcmp(skb_mac_header(p),
3851 skb_gro_mac_header(skb), 3839 skb_mac_header(skb),
3852 maclen); 3840 maclen);
3853 NAPI_GRO_CB(p)->same_flow = !diffs; 3841 NAPI_GRO_CB(p)->same_flow = !diffs;
3854 } 3842 }
@@ -3871,6 +3859,27 @@ static void skb_gro_reset_offset(struct sk_buff *skb)
3871 } 3859 }
3872} 3860}
3873 3861
3862static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3863{
3864 struct skb_shared_info *pinfo = skb_shinfo(skb);
3865
3866 BUG_ON(skb->end - skb->tail < grow);
3867
3868 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3869
3870 skb->data_len -= grow;
3871 skb->tail += grow;
3872
3873 pinfo->frags[0].page_offset += grow;
3874 skb_frag_size_sub(&pinfo->frags[0], grow);
3875
3876 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3877 skb_frag_unref(skb, 0);
3878 memmove(pinfo->frags, pinfo->frags + 1,
3879 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3880 }
3881}
3882
3874static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 3883static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3875{ 3884{
3876 struct sk_buff **pp = NULL; 3885 struct sk_buff **pp = NULL;
@@ -3879,14 +3888,14 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
3879 struct list_head *head = &offload_base; 3888 struct list_head *head = &offload_base;
3880 int same_flow; 3889 int same_flow;
3881 enum gro_result ret; 3890 enum gro_result ret;
3891 int grow;
3882 3892
3883 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) 3893 if (!(skb->dev->features & NETIF_F_GRO))
3884 goto normal; 3894 goto normal;
3885 3895
3886 if (skb_is_gso(skb) || skb_has_frag_list(skb)) 3896 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3887 goto normal; 3897 goto normal;
3888 3898
3889 skb_gro_reset_offset(skb);
3890 gro_list_prepare(napi, skb); 3899 gro_list_prepare(napi, skb);
3891 NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */ 3900 NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
3892 3901
@@ -3950,27 +3959,9 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
3950 ret = GRO_HELD; 3959 ret = GRO_HELD;
3951 3960
3952pull: 3961pull:
3953 if (skb_headlen(skb) < skb_gro_offset(skb)) { 3962 grow = skb_gro_offset(skb) - skb_headlen(skb);
3954 int grow = skb_gro_offset(skb) - skb_headlen(skb); 3963 if (grow > 0)
3955 3964 gro_pull_from_frag0(skb, grow);
3956 BUG_ON(skb->end - skb->tail < grow);
3957
3958 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3959
3960 skb->tail += grow;
3961 skb->data_len -= grow;
3962
3963 skb_shinfo(skb)->frags[0].page_offset += grow;
3964 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3965
3966 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3967 skb_frag_unref(skb, 0);
3968 memmove(skb_shinfo(skb)->frags,
3969 skb_shinfo(skb)->frags + 1,
3970 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3971 }
3972 }
3973
3974ok: 3965ok:
3975 return ret; 3966 return ret;
3976 3967
@@ -4038,6 +4029,8 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4038{ 4029{
4039 trace_napi_gro_receive_entry(skb); 4030 trace_napi_gro_receive_entry(skb);
4040 4031
4032 skb_gro_reset_offset(skb);
4033
4041 return napi_skb_finish(dev_gro_receive(napi, skb), skb); 4034 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4042} 4035}
4043EXPORT_SYMBOL(napi_gro_receive); 4036EXPORT_SYMBOL(napi_gro_receive);
@@ -4066,12 +4059,16 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
4066} 4059}
4067EXPORT_SYMBOL(napi_get_frags); 4060EXPORT_SYMBOL(napi_get_frags);
4068 4061
4069static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, 4062static gro_result_t napi_frags_finish(struct napi_struct *napi,
4070 gro_result_t ret) 4063 struct sk_buff *skb,
4064 gro_result_t ret)
4071{ 4065{
4072 switch (ret) { 4066 switch (ret) {
4073 case GRO_NORMAL: 4067 case GRO_NORMAL:
4074 if (netif_receive_skb_internal(skb)) 4068 case GRO_HELD:
4069 __skb_push(skb, ETH_HLEN);
4070 skb->protocol = eth_type_trans(skb, skb->dev);
4071 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4075 ret = GRO_DROP; 4072 ret = GRO_DROP;
4076 break; 4073 break;
4077 4074
@@ -4080,7 +4077,6 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *
4080 napi_reuse_skb(napi, skb); 4077 napi_reuse_skb(napi, skb);
4081 break; 4078 break;
4082 4079
4083 case GRO_HELD:
4084 case GRO_MERGED: 4080 case GRO_MERGED:
4085 break; 4081 break;
4086 } 4082 }
@@ -4088,17 +4084,41 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *
4088 return ret; 4084 return ret;
4089} 4085}
4090 4086
4087/* Upper GRO stack assumes network header starts at gro_offset=0
4088 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4089 * We copy ethernet header into skb->data to have a common layout.
4090 */
4091static struct sk_buff *napi_frags_skb(struct napi_struct *napi) 4091static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4092{ 4092{
4093 struct sk_buff *skb = napi->skb; 4093 struct sk_buff *skb = napi->skb;
4094 const struct ethhdr *eth;
4095 unsigned int hlen = sizeof(*eth);
4094 4096
4095 napi->skb = NULL; 4097 napi->skb = NULL;
4096 4098
4097 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) { 4099 skb_reset_mac_header(skb);
4098 napi_reuse_skb(napi, skb); 4100 skb_gro_reset_offset(skb);
4099 return NULL; 4101
4102 eth = skb_gro_header_fast(skb, 0);
4103 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4104 eth = skb_gro_header_slow(skb, hlen, 0);
4105 if (unlikely(!eth)) {
4106 napi_reuse_skb(napi, skb);
4107 return NULL;
4108 }
4109 } else {
4110 gro_pull_from_frag0(skb, hlen);
4111 NAPI_GRO_CB(skb)->frag0 += hlen;
4112 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4100 } 4113 }
4101 skb->protocol = eth_type_trans(skb, skb->dev); 4114 __skb_pull(skb, hlen);
4115
4116 /*
4117 * This works because the only protocols we care about don't require
4118 * special handling.
4119 * We'll fix it up properly in napi_frags_finish()
4120 */
4121 skb->protocol = eth->h_proto;
4102 4122
4103 return skb; 4123 return skb;
4104} 4124}
@@ -6251,6 +6271,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6251 netdev_stats_to_stats64(storage, &dev->stats); 6271 netdev_stats_to_stats64(storage, &dev->stats);
6252 } 6272 }
6253 storage->rx_dropped += atomic_long_read(&dev->rx_dropped); 6273 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6274 storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6254 return storage; 6275 return storage;
6255} 6276}
6256EXPORT_SYMBOL(dev_get_stats); 6277EXPORT_SYMBOL(dev_get_stats);
diff --git a/net/core/filter.c b/net/core/filter.c
index ad30d626a5bd..765556ba32ef 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1,11 +1,16 @@
1/* 1/*
2 * Linux Socket Filter - Kernel level socket filtering 2 * Linux Socket Filter - Kernel level socket filtering
3 * 3 *
4 * Author: 4 * Based on the design of the Berkeley Packet Filter. The new
5 * Jay Schulist <jschlst@samba.org> 5 * internal format has been designed by PLUMgrid:
6 * 6 *
7 * Based on the design of: 7 * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
8 * - The Berkeley Packet Filter 8 *
9 * Authors:
10 *
11 * Jay Schulist <jschlst@samba.org>
12 * Alexei Starovoitov <ast@plumgrid.com>
13 * Daniel Borkmann <dborkman@redhat.com>
9 * 14 *
10 * This program is free software; you can redistribute it and/or 15 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License 16 * modify it under the terms of the GNU General Public License
@@ -108,304 +113,1045 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
108} 113}
109EXPORT_SYMBOL(sk_filter); 114EXPORT_SYMBOL(sk_filter);
110 115
116/* Base function for offset calculation. Needs to go into .text section,
117 * therefore keeping it non-static as well; will also be used by JITs
118 * anyway later on, so do not let the compiler omit it.
119 */
120noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
121{
122 return 0;
123}
124
111/** 125/**
112 * sk_run_filter - run a filter on a socket 126 * __sk_run_filter - run a filter on a given context
113 * @skb: buffer to run the filter on 127 * @ctx: buffer to run the filter on
114 * @fentry: filter to apply 128 * @insn: filter to apply
115 * 129 *
116 * Decode and apply filter instructions to the skb->data. 130 * Decode and apply filter instructions to the skb->data. Return length to
117 * Return length to keep, 0 for none. @skb is the data we are 131 * keep, 0 for none. @ctx is the data we are operating on, @insn is the
118 * filtering, @filter is the array of filter instructions. 132 * array of filter instructions.
119 * Because all jumps are guaranteed to be before last instruction,
120 * and last instruction guaranteed to be a RET, we dont need to check
121 * flen. (We used to pass to this function the length of filter)
122 */ 133 */
123unsigned int sk_run_filter(const struct sk_buff *skb, 134unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn)
124 const struct sock_filter *fentry)
125{ 135{
136 u64 stack[MAX_BPF_STACK / sizeof(u64)];
137 u64 regs[MAX_BPF_REG], tmp;
126 void *ptr; 138 void *ptr;
127 u32 A = 0; /* Accumulator */ 139 int off;
128 u32 X = 0; /* Index Register */
129 u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */
130 u32 tmp;
131 int k;
132 140
133 /* 141#define K insn->imm
134 * Process array of filter instructions. 142#define A regs[insn->a_reg]
135 */ 143#define X regs[insn->x_reg]
136 for (;; fentry++) { 144#define R0 regs[0]
137#if defined(CONFIG_X86_32) 145
138#define K (fentry->k) 146#define CONT ({insn++; goto select_insn; })
139#else 147#define CONT_JMP ({insn++; goto select_insn; })
140 const u32 K = fentry->k; 148
141#endif 149 static const void *jumptable[256] = {
142 150 [0 ... 255] = &&default_label,
143 switch (fentry->code) { 151 /* Now overwrite non-defaults ... */
144 case BPF_S_ALU_ADD_X: 152#define DL(A, B, C) [A|B|C] = &&A##_##B##_##C
145 A += X; 153 DL(BPF_ALU, BPF_ADD, BPF_X),
146 continue; 154 DL(BPF_ALU, BPF_ADD, BPF_K),
147 case BPF_S_ALU_ADD_K: 155 DL(BPF_ALU, BPF_SUB, BPF_X),
148 A += K; 156 DL(BPF_ALU, BPF_SUB, BPF_K),
149 continue; 157 DL(BPF_ALU, BPF_AND, BPF_X),
150 case BPF_S_ALU_SUB_X: 158 DL(BPF_ALU, BPF_AND, BPF_K),
151 A -= X; 159 DL(BPF_ALU, BPF_OR, BPF_X),
152 continue; 160 DL(BPF_ALU, BPF_OR, BPF_K),
153 case BPF_S_ALU_SUB_K: 161 DL(BPF_ALU, BPF_LSH, BPF_X),
154 A -= K; 162 DL(BPF_ALU, BPF_LSH, BPF_K),
155 continue; 163 DL(BPF_ALU, BPF_RSH, BPF_X),
156 case BPF_S_ALU_MUL_X: 164 DL(BPF_ALU, BPF_RSH, BPF_K),
157 A *= X; 165 DL(BPF_ALU, BPF_XOR, BPF_X),
158 continue; 166 DL(BPF_ALU, BPF_XOR, BPF_K),
159 case BPF_S_ALU_MUL_K: 167 DL(BPF_ALU, BPF_MUL, BPF_X),
160 A *= K; 168 DL(BPF_ALU, BPF_MUL, BPF_K),
161 continue; 169 DL(BPF_ALU, BPF_MOV, BPF_X),
162 case BPF_S_ALU_DIV_X: 170 DL(BPF_ALU, BPF_MOV, BPF_K),
163 if (X == 0) 171 DL(BPF_ALU, BPF_DIV, BPF_X),
164 return 0; 172 DL(BPF_ALU, BPF_DIV, BPF_K),
165 A /= X; 173 DL(BPF_ALU, BPF_MOD, BPF_X),
166 continue; 174 DL(BPF_ALU, BPF_MOD, BPF_K),
167 case BPF_S_ALU_DIV_K: 175 DL(BPF_ALU, BPF_NEG, 0),
168 A /= K; 176 DL(BPF_ALU, BPF_END, BPF_TO_BE),
169 continue; 177 DL(BPF_ALU, BPF_END, BPF_TO_LE),
170 case BPF_S_ALU_MOD_X: 178 DL(BPF_ALU64, BPF_ADD, BPF_X),
171 if (X == 0) 179 DL(BPF_ALU64, BPF_ADD, BPF_K),
172 return 0; 180 DL(BPF_ALU64, BPF_SUB, BPF_X),
173 A %= X; 181 DL(BPF_ALU64, BPF_SUB, BPF_K),
174 continue; 182 DL(BPF_ALU64, BPF_AND, BPF_X),
175 case BPF_S_ALU_MOD_K: 183 DL(BPF_ALU64, BPF_AND, BPF_K),
176 A %= K; 184 DL(BPF_ALU64, BPF_OR, BPF_X),
177 continue; 185 DL(BPF_ALU64, BPF_OR, BPF_K),
178 case BPF_S_ALU_AND_X: 186 DL(BPF_ALU64, BPF_LSH, BPF_X),
179 A &= X; 187 DL(BPF_ALU64, BPF_LSH, BPF_K),
180 continue; 188 DL(BPF_ALU64, BPF_RSH, BPF_X),
181 case BPF_S_ALU_AND_K: 189 DL(BPF_ALU64, BPF_RSH, BPF_K),
182 A &= K; 190 DL(BPF_ALU64, BPF_XOR, BPF_X),
183 continue; 191 DL(BPF_ALU64, BPF_XOR, BPF_K),
184 case BPF_S_ALU_OR_X: 192 DL(BPF_ALU64, BPF_MUL, BPF_X),
185 A |= X; 193 DL(BPF_ALU64, BPF_MUL, BPF_K),
186 continue; 194 DL(BPF_ALU64, BPF_MOV, BPF_X),
187 case BPF_S_ALU_OR_K: 195 DL(BPF_ALU64, BPF_MOV, BPF_K),
188 A |= K; 196 DL(BPF_ALU64, BPF_ARSH, BPF_X),
189 continue; 197 DL(BPF_ALU64, BPF_ARSH, BPF_K),
190 case BPF_S_ANC_ALU_XOR_X: 198 DL(BPF_ALU64, BPF_DIV, BPF_X),
191 case BPF_S_ALU_XOR_X: 199 DL(BPF_ALU64, BPF_DIV, BPF_K),
192 A ^= X; 200 DL(BPF_ALU64, BPF_MOD, BPF_X),
193 continue; 201 DL(BPF_ALU64, BPF_MOD, BPF_K),
194 case BPF_S_ALU_XOR_K: 202 DL(BPF_ALU64, BPF_NEG, 0),
195 A ^= K; 203 DL(BPF_JMP, BPF_CALL, 0),
196 continue; 204 DL(BPF_JMP, BPF_JA, 0),
197 case BPF_S_ALU_LSH_X: 205 DL(BPF_JMP, BPF_JEQ, BPF_X),
198 A <<= X; 206 DL(BPF_JMP, BPF_JEQ, BPF_K),
199 continue; 207 DL(BPF_JMP, BPF_JNE, BPF_X),
200 case BPF_S_ALU_LSH_K: 208 DL(BPF_JMP, BPF_JNE, BPF_K),
201 A <<= K; 209 DL(BPF_JMP, BPF_JGT, BPF_X),
202 continue; 210 DL(BPF_JMP, BPF_JGT, BPF_K),
203 case BPF_S_ALU_RSH_X: 211 DL(BPF_JMP, BPF_JGE, BPF_X),
204 A >>= X; 212 DL(BPF_JMP, BPF_JGE, BPF_K),
205 continue; 213 DL(BPF_JMP, BPF_JSGT, BPF_X),
206 case BPF_S_ALU_RSH_K: 214 DL(BPF_JMP, BPF_JSGT, BPF_K),
207 A >>= K; 215 DL(BPF_JMP, BPF_JSGE, BPF_X),
208 continue; 216 DL(BPF_JMP, BPF_JSGE, BPF_K),
209 case BPF_S_ALU_NEG: 217 DL(BPF_JMP, BPF_JSET, BPF_X),
210 A = -A; 218 DL(BPF_JMP, BPF_JSET, BPF_K),
211 continue; 219 DL(BPF_JMP, BPF_EXIT, 0),
212 case BPF_S_JMP_JA: 220 DL(BPF_STX, BPF_MEM, BPF_B),
213 fentry += K; 221 DL(BPF_STX, BPF_MEM, BPF_H),
214 continue; 222 DL(BPF_STX, BPF_MEM, BPF_W),
215 case BPF_S_JMP_JGT_K: 223 DL(BPF_STX, BPF_MEM, BPF_DW),
216 fentry += (A > K) ? fentry->jt : fentry->jf; 224 DL(BPF_STX, BPF_XADD, BPF_W),
217 continue; 225 DL(BPF_STX, BPF_XADD, BPF_DW),
218 case BPF_S_JMP_JGE_K: 226 DL(BPF_ST, BPF_MEM, BPF_B),
219 fentry += (A >= K) ? fentry->jt : fentry->jf; 227 DL(BPF_ST, BPF_MEM, BPF_H),
220 continue; 228 DL(BPF_ST, BPF_MEM, BPF_W),
221 case BPF_S_JMP_JEQ_K: 229 DL(BPF_ST, BPF_MEM, BPF_DW),
222 fentry += (A == K) ? fentry->jt : fentry->jf; 230 DL(BPF_LDX, BPF_MEM, BPF_B),
223 continue; 231 DL(BPF_LDX, BPF_MEM, BPF_H),
224 case BPF_S_JMP_JSET_K: 232 DL(BPF_LDX, BPF_MEM, BPF_W),
225 fentry += (A & K) ? fentry->jt : fentry->jf; 233 DL(BPF_LDX, BPF_MEM, BPF_DW),
226 continue; 234 DL(BPF_LD, BPF_ABS, BPF_W),
227 case BPF_S_JMP_JGT_X: 235 DL(BPF_LD, BPF_ABS, BPF_H),
228 fentry += (A > X) ? fentry->jt : fentry->jf; 236 DL(BPF_LD, BPF_ABS, BPF_B),
229 continue; 237 DL(BPF_LD, BPF_IND, BPF_W),
230 case BPF_S_JMP_JGE_X: 238 DL(BPF_LD, BPF_IND, BPF_H),
231 fentry += (A >= X) ? fentry->jt : fentry->jf; 239 DL(BPF_LD, BPF_IND, BPF_B),
232 continue; 240#undef DL
233 case BPF_S_JMP_JEQ_X: 241 };
234 fentry += (A == X) ? fentry->jt : fentry->jf; 242
235 continue; 243 regs[FP_REG] = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
236 case BPF_S_JMP_JSET_X: 244 regs[ARG1_REG] = (u64) (unsigned long) ctx;
237 fentry += (A & X) ? fentry->jt : fentry->jf; 245
238 continue; 246select_insn:
239 case BPF_S_LD_W_ABS: 247 goto *jumptable[insn->code];
240 k = K; 248
241load_w: 249 /* ALU */
242 ptr = load_pointer(skb, k, 4, &tmp); 250#define ALU(OPCODE, OP) \
243 if (ptr != NULL) { 251 BPF_ALU64_##OPCODE##_BPF_X: \
244 A = get_unaligned_be32(ptr); 252 A = A OP X; \
245 continue; 253 CONT; \
246 } 254 BPF_ALU_##OPCODE##_BPF_X: \
247 return 0; 255 A = (u32) A OP (u32) X; \
248 case BPF_S_LD_H_ABS: 256 CONT; \
249 k = K; 257 BPF_ALU64_##OPCODE##_BPF_K: \
250load_h: 258 A = A OP K; \
251 ptr = load_pointer(skb, k, 2, &tmp); 259 CONT; \
252 if (ptr != NULL) { 260 BPF_ALU_##OPCODE##_BPF_K: \
253 A = get_unaligned_be16(ptr); 261 A = (u32) A OP (u32) K; \
254 continue; 262 CONT;
263
264 ALU(BPF_ADD, +)
265 ALU(BPF_SUB, -)
266 ALU(BPF_AND, &)
267 ALU(BPF_OR, |)
268 ALU(BPF_LSH, <<)
269 ALU(BPF_RSH, >>)
270 ALU(BPF_XOR, ^)
271 ALU(BPF_MUL, *)
272#undef ALU
273 BPF_ALU_BPF_NEG_0:
274 A = (u32) -A;
275 CONT;
276 BPF_ALU64_BPF_NEG_0:
277 A = -A;
278 CONT;
279 BPF_ALU_BPF_MOV_BPF_X:
280 A = (u32) X;
281 CONT;
282 BPF_ALU_BPF_MOV_BPF_K:
283 A = (u32) K;
284 CONT;
285 BPF_ALU64_BPF_MOV_BPF_X:
286 A = X;
287 CONT;
288 BPF_ALU64_BPF_MOV_BPF_K:
289 A = K;
290 CONT;
291 BPF_ALU64_BPF_ARSH_BPF_X:
292 (*(s64 *) &A) >>= X;
293 CONT;
294 BPF_ALU64_BPF_ARSH_BPF_K:
295 (*(s64 *) &A) >>= K;
296 CONT;
297 BPF_ALU64_BPF_MOD_BPF_X:
298 tmp = A;
299 if (X)
300 A = do_div(tmp, X);
301 CONT;
302 BPF_ALU_BPF_MOD_BPF_X:
303 tmp = (u32) A;
304 if (X)
305 A = do_div(tmp, (u32) X);
306 CONT;
307 BPF_ALU64_BPF_MOD_BPF_K:
308 tmp = A;
309 if (K)
310 A = do_div(tmp, K);
311 CONT;
312 BPF_ALU_BPF_MOD_BPF_K:
313 tmp = (u32) A;
314 if (K)
315 A = do_div(tmp, (u32) K);
316 CONT;
317 BPF_ALU64_BPF_DIV_BPF_X:
318 if (X)
319 do_div(A, X);
320 CONT;
321 BPF_ALU_BPF_DIV_BPF_X:
322 tmp = (u32) A;
323 if (X)
324 do_div(tmp, (u32) X);
325 A = (u32) tmp;
326 CONT;
327 BPF_ALU64_BPF_DIV_BPF_K:
328 if (K)
329 do_div(A, K);
330 CONT;
331 BPF_ALU_BPF_DIV_BPF_K:
332 tmp = (u32) A;
333 if (K)
334 do_div(tmp, (u32) K);
335 A = (u32) tmp;
336 CONT;
337 BPF_ALU_BPF_END_BPF_TO_BE:
338 switch (K) {
339 case 16:
340 A = (__force u16) cpu_to_be16(A);
341 break;
342 case 32:
343 A = (__force u32) cpu_to_be32(A);
344 break;
345 case 64:
346 A = (__force u64) cpu_to_be64(A);
347 break;
348 }
349 CONT;
350 BPF_ALU_BPF_END_BPF_TO_LE:
351 switch (K) {
352 case 16:
353 A = (__force u16) cpu_to_le16(A);
354 break;
355 case 32:
356 A = (__force u32) cpu_to_le32(A);
357 break;
358 case 64:
359 A = (__force u64) cpu_to_le64(A);
360 break;
361 }
362 CONT;
363
364 /* CALL */
365 BPF_JMP_BPF_CALL_0:
366 /* Function call scratches R1-R5 registers, preserves R6-R9,
367 * and stores return value into R0.
368 */
369 R0 = (__bpf_call_base + insn->imm)(regs[1], regs[2], regs[3],
370 regs[4], regs[5]);
371 CONT;
372
373 /* JMP */
374 BPF_JMP_BPF_JA_0:
375 insn += insn->off;
376 CONT;
377 BPF_JMP_BPF_JEQ_BPF_X:
378 if (A == X) {
379 insn += insn->off;
380 CONT_JMP;
381 }
382 CONT;
383 BPF_JMP_BPF_JEQ_BPF_K:
384 if (A == K) {
385 insn += insn->off;
386 CONT_JMP;
387 }
388 CONT;
389 BPF_JMP_BPF_JNE_BPF_X:
390 if (A != X) {
391 insn += insn->off;
392 CONT_JMP;
393 }
394 CONT;
395 BPF_JMP_BPF_JNE_BPF_K:
396 if (A != K) {
397 insn += insn->off;
398 CONT_JMP;
399 }
400 CONT;
401 BPF_JMP_BPF_JGT_BPF_X:
402 if (A > X) {
403 insn += insn->off;
404 CONT_JMP;
405 }
406 CONT;
407 BPF_JMP_BPF_JGT_BPF_K:
408 if (A > K) {
409 insn += insn->off;
410 CONT_JMP;
411 }
412 CONT;
413 BPF_JMP_BPF_JGE_BPF_X:
414 if (A >= X) {
415 insn += insn->off;
416 CONT_JMP;
417 }
418 CONT;
419 BPF_JMP_BPF_JGE_BPF_K:
420 if (A >= K) {
421 insn += insn->off;
422 CONT_JMP;
423 }
424 CONT;
425 BPF_JMP_BPF_JSGT_BPF_X:
426 if (((s64)A) > ((s64)X)) {
427 insn += insn->off;
428 CONT_JMP;
429 }
430 CONT;
431 BPF_JMP_BPF_JSGT_BPF_K:
432 if (((s64)A) > ((s64)K)) {
433 insn += insn->off;
434 CONT_JMP;
435 }
436 CONT;
437 BPF_JMP_BPF_JSGE_BPF_X:
438 if (((s64)A) >= ((s64)X)) {
439 insn += insn->off;
440 CONT_JMP;
441 }
442 CONT;
443 BPF_JMP_BPF_JSGE_BPF_K:
444 if (((s64)A) >= ((s64)K)) {
445 insn += insn->off;
446 CONT_JMP;
447 }
448 CONT;
449 BPF_JMP_BPF_JSET_BPF_X:
450 if (A & X) {
451 insn += insn->off;
452 CONT_JMP;
453 }
454 CONT;
455 BPF_JMP_BPF_JSET_BPF_K:
456 if (A & K) {
457 insn += insn->off;
458 CONT_JMP;
459 }
460 CONT;
461 BPF_JMP_BPF_EXIT_0:
462 return R0;
463
464 /* STX and ST and LDX*/
465#define LDST(SIZEOP, SIZE) \
466 BPF_STX_BPF_MEM_##SIZEOP: \
467 *(SIZE *)(unsigned long) (A + insn->off) = X; \
468 CONT; \
469 BPF_ST_BPF_MEM_##SIZEOP: \
470 *(SIZE *)(unsigned long) (A + insn->off) = K; \
471 CONT; \
472 BPF_LDX_BPF_MEM_##SIZEOP: \
473 A = *(SIZE *)(unsigned long) (X + insn->off); \
474 CONT;
475
476 LDST(BPF_B, u8)
477 LDST(BPF_H, u16)
478 LDST(BPF_W, u32)
479 LDST(BPF_DW, u64)
480#undef LDST
481 BPF_STX_BPF_XADD_BPF_W: /* lock xadd *(u32 *)(A + insn->off) += X */
482 atomic_add((u32) X, (atomic_t *)(unsigned long)
483 (A + insn->off));
484 CONT;
485 BPF_STX_BPF_XADD_BPF_DW: /* lock xadd *(u64 *)(A + insn->off) += X */
486 atomic64_add((u64) X, (atomic64_t *)(unsigned long)
487 (A + insn->off));
488 CONT;
489 BPF_LD_BPF_ABS_BPF_W: /* R0 = ntohl(*(u32 *) (skb->data + K)) */
490 off = K;
491load_word:
492 /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only
493 * appearing in the programs where ctx == skb. All programs
494 * keep 'ctx' in regs[CTX_REG] == R6, sk_convert_filter()
495 * saves it in R6, internal BPF verifier will check that
496 * R6 == ctx.
497 *
498 * BPF_ABS and BPF_IND are wrappers of function calls, so
499 * they scratch R1-R5 registers, preserve R6-R9, and store
500 * return value into R0.
501 *
502 * Implicit input:
503 * ctx
504 *
505 * Explicit input:
506 * X == any register
507 * K == 32-bit immediate
508 *
509 * Output:
510 * R0 - 8/16/32-bit skb data converted to cpu endianness
511 */
512 ptr = load_pointer((struct sk_buff *) ctx, off, 4, &tmp);
513 if (likely(ptr != NULL)) {
514 R0 = get_unaligned_be32(ptr);
515 CONT;
516 }
517 return 0;
518 BPF_LD_BPF_ABS_BPF_H: /* R0 = ntohs(*(u16 *) (skb->data + K)) */
519 off = K;
520load_half:
521 ptr = load_pointer((struct sk_buff *) ctx, off, 2, &tmp);
522 if (likely(ptr != NULL)) {
523 R0 = get_unaligned_be16(ptr);
524 CONT;
525 }
526 return 0;
527 BPF_LD_BPF_ABS_BPF_B: /* R0 = *(u8 *) (ctx + K) */
528 off = K;
529load_byte:
530 ptr = load_pointer((struct sk_buff *) ctx, off, 1, &tmp);
531 if (likely(ptr != NULL)) {
532 R0 = *(u8 *)ptr;
533 CONT;
534 }
535 return 0;
536 BPF_LD_BPF_IND_BPF_W: /* R0 = ntohl(*(u32 *) (skb->data + X + K)) */
537 off = K + X;
538 goto load_word;
539 BPF_LD_BPF_IND_BPF_H: /* R0 = ntohs(*(u16 *) (skb->data + X + K)) */
540 off = K + X;
541 goto load_half;
542 BPF_LD_BPF_IND_BPF_B: /* R0 = *(u8 *) (skb->data + X + K) */
543 off = K + X;
544 goto load_byte;
545
546 default_label:
547 /* If we ever reach this, we have a bug somewhere. */
548 WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
549 return 0;
550#undef CONT_JMP
551#undef CONT
552
553#undef R0
554#undef X
555#undef A
556#undef K
557}
558
559u32 sk_run_filter_int_seccomp(const struct seccomp_data *ctx,
560 const struct sock_filter_int *insni)
561 __attribute__ ((alias ("__sk_run_filter")));
562
563u32 sk_run_filter_int_skb(const struct sk_buff *ctx,
564 const struct sock_filter_int *insni)
565 __attribute__ ((alias ("__sk_run_filter")));
566EXPORT_SYMBOL_GPL(sk_run_filter_int_skb);
567
568/* Helper to find the offset of pkt_type in sk_buff structure. We want
569 * to make sure its still a 3bit field starting at a byte boundary;
570 * taken from arch/x86/net/bpf_jit_comp.c.
571 */
572#define PKT_TYPE_MAX 7
573static unsigned int pkt_type_offset(void)
574{
575 struct sk_buff skb_probe = { .pkt_type = ~0, };
576 u8 *ct = (u8 *) &skb_probe;
577 unsigned int off;
578
579 for (off = 0; off < sizeof(struct sk_buff); off++) {
580 if (ct[off] == PKT_TYPE_MAX)
581 return off;
582 }
583
584 pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__);
585 return -1;
586}
587
588static u64 __skb_get_pay_offset(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
589{
590 struct sk_buff *skb = (struct sk_buff *)(long) ctx;
591
592 return __skb_get_poff(skb);
593}
594
595static u64 __skb_get_nlattr(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
596{
597 struct sk_buff *skb = (struct sk_buff *)(long) ctx;
598 struct nlattr *nla;
599
600 if (skb_is_nonlinear(skb))
601 return 0;
602
603 if (A > skb->len - sizeof(struct nlattr))
604 return 0;
605
606 nla = nla_find((struct nlattr *) &skb->data[A], skb->len - A, X);
607 if (nla)
608 return (void *) nla - (void *) skb->data;
609
610 return 0;
611}
612
613static u64 __skb_get_nlattr_nest(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
614{
615 struct sk_buff *skb = (struct sk_buff *)(long) ctx;
616 struct nlattr *nla;
617
618 if (skb_is_nonlinear(skb))
619 return 0;
620
621 if (A > skb->len - sizeof(struct nlattr))
622 return 0;
623
624 nla = (struct nlattr *) &skb->data[A];
625 if (nla->nla_len > A - skb->len)
626 return 0;
627
628 nla = nla_find_nested(nla, X);
629 if (nla)
630 return (void *) nla - (void *) skb->data;
631
632 return 0;
633}
634
635static u64 __get_raw_cpu_id(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
636{
637 return raw_smp_processor_id();
638}
639
640/* Register mappings for user programs. */
641#define A_REG 0
642#define X_REG 7
643#define TMP_REG 8
644#define ARG2_REG 2
645#define ARG3_REG 3
646
647static bool convert_bpf_extensions(struct sock_filter *fp,
648 struct sock_filter_int **insnp)
649{
650 struct sock_filter_int *insn = *insnp;
651
652 switch (fp->k) {
653 case SKF_AD_OFF + SKF_AD_PROTOCOL:
654 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
655
656 insn->code = BPF_LDX | BPF_MEM | BPF_H;
657 insn->a_reg = A_REG;
658 insn->x_reg = CTX_REG;
659 insn->off = offsetof(struct sk_buff, protocol);
660 insn++;
661
662 /* A = ntohs(A) [emitting a nop or swap16] */
663 insn->code = BPF_ALU | BPF_END | BPF_FROM_BE;
664 insn->a_reg = A_REG;
665 insn->imm = 16;
666 break;
667
668 case SKF_AD_OFF + SKF_AD_PKTTYPE:
669 insn->code = BPF_LDX | BPF_MEM | BPF_B;
670 insn->a_reg = A_REG;
671 insn->x_reg = CTX_REG;
672 insn->off = pkt_type_offset();
673 if (insn->off < 0)
674 return false;
675 insn++;
676
677 insn->code = BPF_ALU | BPF_AND | BPF_K;
678 insn->a_reg = A_REG;
679 insn->imm = PKT_TYPE_MAX;
680 break;
681
682 case SKF_AD_OFF + SKF_AD_IFINDEX:
683 case SKF_AD_OFF + SKF_AD_HATYPE:
684 if (FIELD_SIZEOF(struct sk_buff, dev) == 8)
685 insn->code = BPF_LDX | BPF_MEM | BPF_DW;
686 else
687 insn->code = BPF_LDX | BPF_MEM | BPF_W;
688 insn->a_reg = TMP_REG;
689 insn->x_reg = CTX_REG;
690 insn->off = offsetof(struct sk_buff, dev);
691 insn++;
692
693 insn->code = BPF_JMP | BPF_JNE | BPF_K;
694 insn->a_reg = TMP_REG;
695 insn->imm = 0;
696 insn->off = 1;
697 insn++;
698
699 insn->code = BPF_JMP | BPF_EXIT;
700 insn++;
701
702 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
703 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
704
705 insn->a_reg = A_REG;
706 insn->x_reg = TMP_REG;
707
708 if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) {
709 insn->code = BPF_LDX | BPF_MEM | BPF_W;
710 insn->off = offsetof(struct net_device, ifindex);
711 } else {
712 insn->code = BPF_LDX | BPF_MEM | BPF_H;
713 insn->off = offsetof(struct net_device, type);
714 }
715 break;
716
717 case SKF_AD_OFF + SKF_AD_MARK:
718 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
719
720 insn->code = BPF_LDX | BPF_MEM | BPF_W;
721 insn->a_reg = A_REG;
722 insn->x_reg = CTX_REG;
723 insn->off = offsetof(struct sk_buff, mark);
724 break;
725
726 case SKF_AD_OFF + SKF_AD_RXHASH:
727 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
728
729 insn->code = BPF_LDX | BPF_MEM | BPF_W;
730 insn->a_reg = A_REG;
731 insn->x_reg = CTX_REG;
732 insn->off = offsetof(struct sk_buff, hash);
733 break;
734
735 case SKF_AD_OFF + SKF_AD_QUEUE:
736 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
737
738 insn->code = BPF_LDX | BPF_MEM | BPF_H;
739 insn->a_reg = A_REG;
740 insn->x_reg = CTX_REG;
741 insn->off = offsetof(struct sk_buff, queue_mapping);
742 break;
743
744 case SKF_AD_OFF + SKF_AD_VLAN_TAG:
745 case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
746 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
747
748 insn->code = BPF_LDX | BPF_MEM | BPF_H;
749 insn->a_reg = A_REG;
750 insn->x_reg = CTX_REG;
751 insn->off = offsetof(struct sk_buff, vlan_tci);
752 insn++;
753
754 BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
755
756 if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) {
757 insn->code = BPF_ALU | BPF_AND | BPF_K;
758 insn->a_reg = A_REG;
759 insn->imm = ~VLAN_TAG_PRESENT;
760 } else {
761 insn->code = BPF_ALU | BPF_RSH | BPF_K;
762 insn->a_reg = A_REG;
763 insn->imm = 12;
764 insn++;
765
766 insn->code = BPF_ALU | BPF_AND | BPF_K;
767 insn->a_reg = A_REG;
768 insn->imm = 1;
769 }
770 break;
771
772 case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
773 case SKF_AD_OFF + SKF_AD_NLATTR:
774 case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
775 case SKF_AD_OFF + SKF_AD_CPU:
776 /* arg1 = ctx */
777 insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
778 insn->a_reg = ARG1_REG;
779 insn->x_reg = CTX_REG;
780 insn++;
781
782 /* arg2 = A */
783 insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
784 insn->a_reg = ARG2_REG;
785 insn->x_reg = A_REG;
786 insn++;
787
788 /* arg3 = X */
789 insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
790 insn->a_reg = ARG3_REG;
791 insn->x_reg = X_REG;
792 insn++;
793
794 /* Emit call(ctx, arg2=A, arg3=X) */
795 insn->code = BPF_JMP | BPF_CALL;
796 switch (fp->k) {
797 case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
798 insn->imm = __skb_get_pay_offset - __bpf_call_base;
799 break;
800 case SKF_AD_OFF + SKF_AD_NLATTR:
801 insn->imm = __skb_get_nlattr - __bpf_call_base;
802 break;
803 case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
804 insn->imm = __skb_get_nlattr_nest - __bpf_call_base;
805 break;
806 case SKF_AD_OFF + SKF_AD_CPU:
807 insn->imm = __get_raw_cpu_id - __bpf_call_base;
808 break;
809 }
810 break;
811
812 case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
813 insn->code = BPF_ALU | BPF_XOR | BPF_X;
814 insn->a_reg = A_REG;
815 insn->x_reg = X_REG;
816 break;
817
818 default:
819 /* This is just a dummy call to avoid letting the compiler
820 * evict __bpf_call_base() as an optimization. Placed here
821 * where no-one bothers.
822 */
823 BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
824 return false;
825 }
826
827 *insnp = insn;
828 return true;
829}
830
831/**
832 * sk_convert_filter - convert filter program
833 * @prog: the user passed filter program
834 * @len: the length of the user passed filter program
835 * @new_prog: buffer where converted program will be stored
836 * @new_len: pointer to store length of converted program
837 *
838 * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style.
839 * Conversion workflow:
840 *
841 * 1) First pass for calculating the new program length:
842 * sk_convert_filter(old_prog, old_len, NULL, &new_len)
843 *
844 * 2) 2nd pass to remap in two passes: 1st pass finds new
845 * jump offsets, 2nd pass remapping:
846 * new_prog = kmalloc(sizeof(struct sock_filter_int) * new_len);
847 * sk_convert_filter(old_prog, old_len, new_prog, &new_len);
848 *
849 * User BPF's register A is mapped to our BPF register 6, user BPF
850 * register X is mapped to BPF register 7; frame pointer is always
851 * register 10; Context 'void *ctx' is stored in register 1, that is,
852 * for socket filters: ctx == 'struct sk_buff *', for seccomp:
853 * ctx == 'struct seccomp_data *'.
854 */
855int sk_convert_filter(struct sock_filter *prog, int len,
856 struct sock_filter_int *new_prog, int *new_len)
857{
858 int new_flen = 0, pass = 0, target, i;
859 struct sock_filter_int *new_insn;
860 struct sock_filter *fp;
861 int *addrs = NULL;
862 u8 bpf_src;
863
864 BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
865 BUILD_BUG_ON(FP_REG + 1 != MAX_BPF_REG);
866
867 if (len <= 0 || len >= BPF_MAXINSNS)
868 return -EINVAL;
869
870 if (new_prog) {
871 addrs = kzalloc(len * sizeof(*addrs), GFP_KERNEL);
872 if (!addrs)
873 return -ENOMEM;
874 }
875
876do_pass:
877 new_insn = new_prog;
878 fp = prog;
879
880 if (new_insn) {
881 new_insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
882 new_insn->a_reg = CTX_REG;
883 new_insn->x_reg = ARG1_REG;
884 }
885 new_insn++;
886
887 for (i = 0; i < len; fp++, i++) {
888 struct sock_filter_int tmp_insns[6] = { };
889 struct sock_filter_int *insn = tmp_insns;
890
891 if (addrs)
892 addrs[i] = new_insn - new_prog;
893
894 switch (fp->code) {
895 /* All arithmetic insns and skb loads map as-is. */
896 case BPF_ALU | BPF_ADD | BPF_X:
897 case BPF_ALU | BPF_ADD | BPF_K:
898 case BPF_ALU | BPF_SUB | BPF_X:
899 case BPF_ALU | BPF_SUB | BPF_K:
900 case BPF_ALU | BPF_AND | BPF_X:
901 case BPF_ALU | BPF_AND | BPF_K:
902 case BPF_ALU | BPF_OR | BPF_X:
903 case BPF_ALU | BPF_OR | BPF_K:
904 case BPF_ALU | BPF_LSH | BPF_X:
905 case BPF_ALU | BPF_LSH | BPF_K:
906 case BPF_ALU | BPF_RSH | BPF_X:
907 case BPF_ALU | BPF_RSH | BPF_K:
908 case BPF_ALU | BPF_XOR | BPF_X:
909 case BPF_ALU | BPF_XOR | BPF_K:
910 case BPF_ALU | BPF_MUL | BPF_X:
911 case BPF_ALU | BPF_MUL | BPF_K:
912 case BPF_ALU | BPF_DIV | BPF_X:
913 case BPF_ALU | BPF_DIV | BPF_K:
914 case BPF_ALU | BPF_MOD | BPF_X:
915 case BPF_ALU | BPF_MOD | BPF_K:
916 case BPF_ALU | BPF_NEG:
917 case BPF_LD | BPF_ABS | BPF_W:
918 case BPF_LD | BPF_ABS | BPF_H:
919 case BPF_LD | BPF_ABS | BPF_B:
920 case BPF_LD | BPF_IND | BPF_W:
921 case BPF_LD | BPF_IND | BPF_H:
922 case BPF_LD | BPF_IND | BPF_B:
923 /* Check for overloaded BPF extension and
924 * directly convert it if found, otherwise
925 * just move on with mapping.
926 */
927 if (BPF_CLASS(fp->code) == BPF_LD &&
928 BPF_MODE(fp->code) == BPF_ABS &&
929 convert_bpf_extensions(fp, &insn))
930 break;
931
932 insn->code = fp->code;
933 insn->a_reg = A_REG;
934 insn->x_reg = X_REG;
935 insn->imm = fp->k;
936 break;
937
938 /* Jump opcodes map as-is, but offsets need adjustment. */
939 case BPF_JMP | BPF_JA:
940 target = i + fp->k + 1;
941 insn->code = fp->code;
942#define EMIT_JMP \
943 do { \
944 if (target >= len || target < 0) \
945 goto err; \
946 insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0; \
947 /* Adjust pc relative offset for 2nd or 3rd insn. */ \
948 insn->off -= insn - tmp_insns; \
949 } while (0)
950
951 EMIT_JMP;
952 break;
953
954 case BPF_JMP | BPF_JEQ | BPF_K:
955 case BPF_JMP | BPF_JEQ | BPF_X:
956 case BPF_JMP | BPF_JSET | BPF_K:
957 case BPF_JMP | BPF_JSET | BPF_X:
958 case BPF_JMP | BPF_JGT | BPF_K:
959 case BPF_JMP | BPF_JGT | BPF_X:
960 case BPF_JMP | BPF_JGE | BPF_K:
961 case BPF_JMP | BPF_JGE | BPF_X:
962 if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
963 /* BPF immediates are signed, zero extend
964 * immediate into tmp register and use it
965 * in compare insn.
966 */
967 insn->code = BPF_ALU | BPF_MOV | BPF_K;
968 insn->a_reg = TMP_REG;
969 insn->imm = fp->k;
970 insn++;
971
972 insn->a_reg = A_REG;
973 insn->x_reg = TMP_REG;
974 bpf_src = BPF_X;
975 } else {
976 insn->a_reg = A_REG;
977 insn->x_reg = X_REG;
978 insn->imm = fp->k;
979 bpf_src = BPF_SRC(fp->code);
255 } 980 }
256 return 0; 981
257 case BPF_S_LD_B_ABS: 982 /* Common case where 'jump_false' is next insn. */
258 k = K; 983 if (fp->jf == 0) {
259load_b: 984 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
260 ptr = load_pointer(skb, k, 1, &tmp); 985 target = i + fp->jt + 1;
261 if (ptr != NULL) { 986 EMIT_JMP;
262 A = *(u8 *)ptr; 987 break;
263 continue;
264 } 988 }
265 return 0; 989
266 case BPF_S_LD_W_LEN: 990 /* Convert JEQ into JNE when 'jump_true' is next insn. */
267 A = skb->len; 991 if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
268 continue; 992 insn->code = BPF_JMP | BPF_JNE | bpf_src;
269 case BPF_S_LDX_W_LEN: 993 target = i + fp->jf + 1;
270 X = skb->len; 994 EMIT_JMP;
271 continue; 995 break;
272 case BPF_S_LD_W_IND:
273 k = X + K;
274 goto load_w;
275 case BPF_S_LD_H_IND:
276 k = X + K;
277 goto load_h;
278 case BPF_S_LD_B_IND:
279 k = X + K;
280 goto load_b;
281 case BPF_S_LDX_B_MSH:
282 ptr = load_pointer(skb, K, 1, &tmp);
283 if (ptr != NULL) {
284 X = (*(u8 *)ptr & 0xf) << 2;
285 continue;
286 } 996 }
287 return 0; 997
288 case BPF_S_LD_IMM: 998 /* Other jumps are mapped into two insns: Jxx and JA. */
289 A = K; 999 target = i + fp->jt + 1;
290 continue; 1000 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
291 case BPF_S_LDX_IMM: 1001 EMIT_JMP;
292 X = K; 1002 insn++;
293 continue; 1003
294 case BPF_S_LD_MEM: 1004 insn->code = BPF_JMP | BPF_JA;
295 A = mem[K]; 1005 target = i + fp->jf + 1;
296 continue; 1006 EMIT_JMP;
297 case BPF_S_LDX_MEM: 1007 break;
298 X = mem[K]; 1008
299 continue; 1009 /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */
300 case BPF_S_MISC_TAX: 1010 case BPF_LDX | BPF_MSH | BPF_B:
301 X = A; 1011 insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
302 continue; 1012 insn->a_reg = TMP_REG;
303 case BPF_S_MISC_TXA: 1013 insn->x_reg = A_REG;
304 A = X; 1014 insn++;
305 continue; 1015
306 case BPF_S_RET_K: 1016 insn->code = BPF_LD | BPF_ABS | BPF_B;
307 return K; 1017 insn->a_reg = A_REG;
308 case BPF_S_RET_A: 1018 insn->imm = fp->k;
309 return A; 1019 insn++;
310 case BPF_S_ST: 1020
311 mem[K] = A; 1021 insn->code = BPF_ALU | BPF_AND | BPF_K;
312 continue; 1022 insn->a_reg = A_REG;
313 case BPF_S_STX: 1023 insn->imm = 0xf;
314 mem[K] = X; 1024 insn++;
315 continue; 1025
316 case BPF_S_ANC_PROTOCOL: 1026 insn->code = BPF_ALU | BPF_LSH | BPF_K;
317 A = ntohs(skb->protocol); 1027 insn->a_reg = A_REG;
318 continue; 1028 insn->imm = 2;
319 case BPF_S_ANC_PKTTYPE: 1029 insn++;
320 A = skb->pkt_type; 1030
321 continue; 1031 insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
322 case BPF_S_ANC_IFINDEX: 1032 insn->a_reg = X_REG;
323 if (!skb->dev) 1033 insn->x_reg = A_REG;
324 return 0; 1034 insn++;
325 A = skb->dev->ifindex; 1035
326 continue; 1036 insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
327 case BPF_S_ANC_MARK: 1037 insn->a_reg = A_REG;
328 A = skb->mark; 1038 insn->x_reg = TMP_REG;
329 continue; 1039 break;
330 case BPF_S_ANC_QUEUE: 1040
331 A = skb->queue_mapping; 1041 /* RET_K, RET_A are remaped into 2 insns. */
332 continue; 1042 case BPF_RET | BPF_A:
333 case BPF_S_ANC_HATYPE: 1043 case BPF_RET | BPF_K:
334 if (!skb->dev) 1044 insn->code = BPF_ALU | BPF_MOV |
335 return 0; 1045 (BPF_RVAL(fp->code) == BPF_K ?
336 A = skb->dev->type; 1046 BPF_K : BPF_X);
337 continue; 1047 insn->a_reg = 0;
338 case BPF_S_ANC_RXHASH: 1048 insn->x_reg = A_REG;
339 A = skb->rxhash; 1049 insn->imm = fp->k;
340 continue; 1050 insn++;
341 case BPF_S_ANC_CPU: 1051
342 A = raw_smp_processor_id(); 1052 insn->code = BPF_JMP | BPF_EXIT;
343 continue; 1053 break;
344 case BPF_S_ANC_VLAN_TAG: 1054
345 A = vlan_tx_tag_get(skb); 1055 /* Store to stack. */
346 continue; 1056 case BPF_ST:
347 case BPF_S_ANC_VLAN_TAG_PRESENT: 1057 case BPF_STX:
348 A = !!vlan_tx_tag_present(skb); 1058 insn->code = BPF_STX | BPF_MEM | BPF_W;
349 continue; 1059 insn->a_reg = FP_REG;
350 case BPF_S_ANC_PAY_OFFSET: 1060 insn->x_reg = fp->code == BPF_ST ? A_REG : X_REG;
351 A = __skb_get_poff(skb); 1061 insn->off = -(BPF_MEMWORDS - fp->k) * 4;
352 continue; 1062 break;
353 case BPF_S_ANC_NLATTR: { 1063
354 struct nlattr *nla; 1064 /* Load from stack. */
355 1065 case BPF_LD | BPF_MEM:
356 if (skb_is_nonlinear(skb)) 1066 case BPF_LDX | BPF_MEM:
357 return 0; 1067 insn->code = BPF_LDX | BPF_MEM | BPF_W;
358 if (A > skb->len - sizeof(struct nlattr)) 1068 insn->a_reg = BPF_CLASS(fp->code) == BPF_LD ?
359 return 0; 1069 A_REG : X_REG;
360 1070 insn->x_reg = FP_REG;
361 nla = nla_find((struct nlattr *)&skb->data[A], 1071 insn->off = -(BPF_MEMWORDS - fp->k) * 4;
362 skb->len - A, X); 1072 break;
363 if (nla) 1073
364 A = (void *)nla - (void *)skb->data; 1074 /* A = K or X = K */
365 else 1075 case BPF_LD | BPF_IMM:
366 A = 0; 1076 case BPF_LDX | BPF_IMM:
367 continue; 1077 insn->code = BPF_ALU | BPF_MOV | BPF_K;
368 } 1078 insn->a_reg = BPF_CLASS(fp->code) == BPF_LD ?
369 case BPF_S_ANC_NLATTR_NEST: { 1079 A_REG : X_REG;
370 struct nlattr *nla; 1080 insn->imm = fp->k;
371 1081 break;
372 if (skb_is_nonlinear(skb)) 1082
373 return 0; 1083 /* X = A */
374 if (A > skb->len - sizeof(struct nlattr)) 1084 case BPF_MISC | BPF_TAX:
375 return 0; 1085 insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
376 1086 insn->a_reg = X_REG;
377 nla = (struct nlattr *)&skb->data[A]; 1087 insn->x_reg = A_REG;
378 if (nla->nla_len > A - skb->len) 1088 break;
379 return 0; 1089
380 1090 /* A = X */
381 nla = nla_find_nested(nla, X); 1091 case BPF_MISC | BPF_TXA:
382 if (nla) 1092 insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
383 A = (void *)nla - (void *)skb->data; 1093 insn->a_reg = A_REG;
384 else 1094 insn->x_reg = X_REG;
385 A = 0; 1095 break;
386 continue; 1096
387 } 1097 /* A = skb->len or X = skb->len */
388#ifdef CONFIG_SECCOMP_FILTER 1098 case BPF_LD | BPF_W | BPF_LEN:
389 case BPF_S_ANC_SECCOMP_LD_W: 1099 case BPF_LDX | BPF_W | BPF_LEN:
390 A = seccomp_bpf_load(fentry->k); 1100 insn->code = BPF_LDX | BPF_MEM | BPF_W;
391 continue; 1101 insn->a_reg = BPF_CLASS(fp->code) == BPF_LD ?
392#endif 1102 A_REG : X_REG;
1103 insn->x_reg = CTX_REG;
1104 insn->off = offsetof(struct sk_buff, len);
1105 break;
1106
1107 /* access seccomp_data fields */
1108 case BPF_LDX | BPF_ABS | BPF_W:
1109 insn->code = BPF_LDX | BPF_MEM | BPF_W;
1110 insn->a_reg = A_REG;
1111 insn->x_reg = CTX_REG;
1112 insn->off = fp->k;
1113 break;
1114
393 default: 1115 default:
394 WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n", 1116 goto err;
395 fentry->code, fentry->jt,
396 fentry->jf, fentry->k);
397 return 0;
398 } 1117 }
1118
1119 insn++;
1120 if (new_prog)
1121 memcpy(new_insn, tmp_insns,
1122 sizeof(*insn) * (insn - tmp_insns));
1123
1124 new_insn += insn - tmp_insns;
1125 }
1126
1127 if (!new_prog) {
1128 /* Only calculating new length. */
1129 *new_len = new_insn - new_prog;
1130 return 0;
399 } 1131 }
400 1132
1133 pass++;
1134 if (new_flen != new_insn - new_prog) {
1135 new_flen = new_insn - new_prog;
1136 if (pass > 2)
1137 goto err;
1138
1139 goto do_pass;
1140 }
1141
1142 kfree(addrs);
1143 BUG_ON(*new_len != new_flen);
401 return 0; 1144 return 0;
1145err:
1146 kfree(addrs);
1147 return -EINVAL;
402} 1148}
403EXPORT_SYMBOL(sk_run_filter);
404 1149
405/* 1150/* Security:
406 * Security : 1151 *
407 * A BPF program is able to use 16 cells of memory to store intermediate 1152 * A BPF program is able to use 16 cells of memory to store intermediate
408 * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()) 1153 * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()).
1154 *
409 * As we dont want to clear mem[] array for each packet going through 1155 * As we dont want to clear mem[] array for each packet going through
410 * sk_run_filter(), we check that filter loaded by user never try to read 1156 * sk_run_filter(), we check that filter loaded by user never try to read
411 * a cell if not previously written, and we check all branches to be sure 1157 * a cell if not previously written, and we check all branches to be sure
@@ -629,30 +1375,197 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
629} 1375}
630EXPORT_SYMBOL(sk_chk_filter); 1376EXPORT_SYMBOL(sk_chk_filter);
631 1377
1378static int sk_store_orig_filter(struct sk_filter *fp,
1379 const struct sock_fprog *fprog)
1380{
1381 unsigned int fsize = sk_filter_proglen(fprog);
1382 struct sock_fprog_kern *fkprog;
1383
1384 fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
1385 if (!fp->orig_prog)
1386 return -ENOMEM;
1387
1388 fkprog = fp->orig_prog;
1389 fkprog->len = fprog->len;
1390 fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL);
1391 if (!fkprog->filter) {
1392 kfree(fp->orig_prog);
1393 return -ENOMEM;
1394 }
1395
1396 return 0;
1397}
1398
1399static void sk_release_orig_filter(struct sk_filter *fp)
1400{
1401 struct sock_fprog_kern *fprog = fp->orig_prog;
1402
1403 if (fprog) {
1404 kfree(fprog->filter);
1405 kfree(fprog);
1406 }
1407}
1408
632/** 1409/**
633 * sk_filter_release_rcu - Release a socket filter by rcu_head 1410 * sk_filter_release_rcu - Release a socket filter by rcu_head
634 * @rcu: rcu_head that contains the sk_filter to free 1411 * @rcu: rcu_head that contains the sk_filter to free
635 */ 1412 */
636void sk_filter_release_rcu(struct rcu_head *rcu) 1413static void sk_filter_release_rcu(struct rcu_head *rcu)
637{ 1414{
638 struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); 1415 struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
639 1416
1417 sk_release_orig_filter(fp);
640 bpf_jit_free(fp); 1418 bpf_jit_free(fp);
641} 1419}
642EXPORT_SYMBOL(sk_filter_release_rcu);
643 1420
644static int __sk_prepare_filter(struct sk_filter *fp) 1421/**
1422 * sk_filter_release - release a socket filter
1423 * @fp: filter to remove
1424 *
1425 * Remove a filter from a socket and release its resources.
1426 */
1427static void sk_filter_release(struct sk_filter *fp)
1428{
1429 if (atomic_dec_and_test(&fp->refcnt))
1430 call_rcu(&fp->rcu, sk_filter_release_rcu);
1431}
1432
1433void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
1434{
1435 atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc);
1436 sk_filter_release(fp);
1437}
1438
1439void sk_filter_charge(struct sock *sk, struct sk_filter *fp)
1440{
1441 atomic_inc(&fp->refcnt);
1442 atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc);
1443}
1444
1445static struct sk_filter *__sk_migrate_realloc(struct sk_filter *fp,
1446 struct sock *sk,
1447 unsigned int len)
1448{
1449 struct sk_filter *fp_new;
1450
1451 if (sk == NULL)
1452 return krealloc(fp, len, GFP_KERNEL);
1453
1454 fp_new = sock_kmalloc(sk, len, GFP_KERNEL);
1455 if (fp_new) {
1456 memcpy(fp_new, fp, sizeof(struct sk_filter));
1457 /* As we're kepping orig_prog in fp_new along,
1458 * we need to make sure we're not evicting it
1459 * from the old fp.
1460 */
1461 fp->orig_prog = NULL;
1462 sk_filter_uncharge(sk, fp);
1463 }
1464
1465 return fp_new;
1466}
1467
1468static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp,
1469 struct sock *sk)
1470{
1471 struct sock_filter *old_prog;
1472 struct sk_filter *old_fp;
1473 int i, err, new_len, old_len = fp->len;
1474
1475 /* We are free to overwrite insns et al right here as it
1476 * won't be used at this point in time anymore internally
1477 * after the migration to the internal BPF instruction
1478 * representation.
1479 */
1480 BUILD_BUG_ON(sizeof(struct sock_filter) !=
1481 sizeof(struct sock_filter_int));
1482
1483 /* For now, we need to unfiddle BPF_S_* identifiers in place.
1484 * This can sooner or later on be subject to removal, e.g. when
1485 * JITs have been converted.
1486 */
1487 for (i = 0; i < fp->len; i++)
1488 sk_decode_filter(&fp->insns[i], &fp->insns[i]);
1489
1490 /* Conversion cannot happen on overlapping memory areas,
1491 * so we need to keep the user BPF around until the 2nd
1492 * pass. At this time, the user BPF is stored in fp->insns.
1493 */
1494 old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
1495 GFP_KERNEL);
1496 if (!old_prog) {
1497 err = -ENOMEM;
1498 goto out_err;
1499 }
1500
1501 /* 1st pass: calculate the new program length. */
1502 err = sk_convert_filter(old_prog, old_len, NULL, &new_len);
1503 if (err)
1504 goto out_err_free;
1505
1506 /* Expand fp for appending the new filter representation. */
1507 old_fp = fp;
1508 fp = __sk_migrate_realloc(old_fp, sk, sk_filter_size(new_len));
1509 if (!fp) {
1510 /* The old_fp is still around in case we couldn't
1511 * allocate new memory, so uncharge on that one.
1512 */
1513 fp = old_fp;
1514 err = -ENOMEM;
1515 goto out_err_free;
1516 }
1517
1518 fp->bpf_func = sk_run_filter_int_skb;
1519 fp->len = new_len;
1520
1521 /* 2nd pass: remap sock_filter insns into sock_filter_int insns. */
1522 err = sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
1523 if (err)
1524 /* 2nd sk_convert_filter() can fail only if it fails
1525 * to allocate memory, remapping must succeed. Note,
1526 * that at this time old_fp has already been released
1527 * by __sk_migrate_realloc().
1528 */
1529 goto out_err_free;
1530
1531 kfree(old_prog);
1532 return fp;
1533
1534out_err_free:
1535 kfree(old_prog);
1536out_err:
1537 /* Rollback filter setup. */
1538 if (sk != NULL)
1539 sk_filter_uncharge(sk, fp);
1540 else
1541 kfree(fp);
1542 return ERR_PTR(err);
1543}
1544
1545static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp,
1546 struct sock *sk)
645{ 1547{
646 int err; 1548 int err;
647 1549
648 fp->bpf_func = sk_run_filter; 1550 fp->bpf_func = NULL;
1551 fp->jited = 0;
649 1552
650 err = sk_chk_filter(fp->insns, fp->len); 1553 err = sk_chk_filter(fp->insns, fp->len);
651 if (err) 1554 if (err)
652 return err; 1555 return ERR_PTR(err);
653 1556
1557 /* Probe if we can JIT compile the filter and if so, do
1558 * the compilation of the filter.
1559 */
654 bpf_jit_compile(fp); 1560 bpf_jit_compile(fp);
655 return 0; 1561
1562 /* JIT compiler couldn't process this filter, so do the
1563 * internal BPF translation for the optimized interpreter.
1564 */
1565 if (!fp->jited)
1566 fp = __sk_migrate_filter(fp, sk);
1567
1568 return fp;
656} 1569}
657 1570
658/** 1571/**
@@ -668,9 +1581,8 @@ static int __sk_prepare_filter(struct sk_filter *fp)
668int sk_unattached_filter_create(struct sk_filter **pfp, 1581int sk_unattached_filter_create(struct sk_filter **pfp,
669 struct sock_fprog *fprog) 1582 struct sock_fprog *fprog)
670{ 1583{
1584 unsigned int fsize = sk_filter_proglen(fprog);
671 struct sk_filter *fp; 1585 struct sk_filter *fp;
672 unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
673 int err;
674 1586
675 /* Make sure new filter is there and in the right amounts. */ 1587 /* Make sure new filter is there and in the right amounts. */
676 if (fprog->filter == NULL) 1588 if (fprog->filter == NULL)
@@ -679,20 +1591,26 @@ int sk_unattached_filter_create(struct sk_filter **pfp,
679 fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL); 1591 fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL);
680 if (!fp) 1592 if (!fp)
681 return -ENOMEM; 1593 return -ENOMEM;
1594
682 memcpy(fp->insns, fprog->filter, fsize); 1595 memcpy(fp->insns, fprog->filter, fsize);
683 1596
684 atomic_set(&fp->refcnt, 1); 1597 atomic_set(&fp->refcnt, 1);
685 fp->len = fprog->len; 1598 fp->len = fprog->len;
1599 /* Since unattached filters are not copied back to user
1600 * space through sk_get_filter(), we do not need to hold
1601 * a copy here, and can spare us the work.
1602 */
1603 fp->orig_prog = NULL;
686 1604
687 err = __sk_prepare_filter(fp); 1605 /* __sk_prepare_filter() already takes care of uncharging
688 if (err) 1606 * memory in case something goes wrong.
689 goto free_mem; 1607 */
1608 fp = __sk_prepare_filter(fp, NULL);
1609 if (IS_ERR(fp))
1610 return PTR_ERR(fp);
690 1611
691 *pfp = fp; 1612 *pfp = fp;
692 return 0; 1613 return 0;
693free_mem:
694 kfree(fp);
695 return err;
696} 1614}
697EXPORT_SYMBOL_GPL(sk_unattached_filter_create); 1615EXPORT_SYMBOL_GPL(sk_unattached_filter_create);
698 1616
@@ -715,7 +1633,7 @@ EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy);
715int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) 1633int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
716{ 1634{
717 struct sk_filter *fp, *old_fp; 1635 struct sk_filter *fp, *old_fp;
718 unsigned int fsize = sizeof(struct sock_filter) * fprog->len; 1636 unsigned int fsize = sk_filter_proglen(fprog);
719 unsigned int sk_fsize = sk_filter_size(fprog->len); 1637 unsigned int sk_fsize = sk_filter_size(fprog->len);
720 int err; 1638 int err;
721 1639
@@ -729,6 +1647,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
729 fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL); 1647 fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL);
730 if (!fp) 1648 if (!fp)
731 return -ENOMEM; 1649 return -ENOMEM;
1650
732 if (copy_from_user(fp->insns, fprog->filter, fsize)) { 1651 if (copy_from_user(fp->insns, fprog->filter, fsize)) {
733 sock_kfree_s(sk, fp, sk_fsize); 1652 sock_kfree_s(sk, fp, sk_fsize);
734 return -EFAULT; 1653 return -EFAULT;
@@ -737,18 +1656,26 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
737 atomic_set(&fp->refcnt, 1); 1656 atomic_set(&fp->refcnt, 1);
738 fp->len = fprog->len; 1657 fp->len = fprog->len;
739 1658
740 err = __sk_prepare_filter(fp); 1659 err = sk_store_orig_filter(fp, fprog);
741 if (err) { 1660 if (err) {
742 sk_filter_uncharge(sk, fp); 1661 sk_filter_uncharge(sk, fp);
743 return err; 1662 return -ENOMEM;
744 } 1663 }
745 1664
1665 /* __sk_prepare_filter() already takes care of uncharging
1666 * memory in case something goes wrong.
1667 */
1668 fp = __sk_prepare_filter(fp, sk);
1669 if (IS_ERR(fp))
1670 return PTR_ERR(fp);
1671
746 old_fp = rcu_dereference_protected(sk->sk_filter, 1672 old_fp = rcu_dereference_protected(sk->sk_filter,
747 sock_owned_by_user(sk)); 1673 sock_owned_by_user(sk));
748 rcu_assign_pointer(sk->sk_filter, fp); 1674 rcu_assign_pointer(sk->sk_filter, fp);
749 1675
750 if (old_fp) 1676 if (old_fp)
751 sk_filter_uncharge(sk, old_fp); 1677 sk_filter_uncharge(sk, old_fp);
1678
752 return 0; 1679 return 0;
753} 1680}
754EXPORT_SYMBOL_GPL(sk_attach_filter); 1681EXPORT_SYMBOL_GPL(sk_attach_filter);
@@ -768,6 +1695,7 @@ int sk_detach_filter(struct sock *sk)
768 sk_filter_uncharge(sk, filter); 1695 sk_filter_uncharge(sk, filter);
769 ret = 0; 1696 ret = 0;
770 } 1697 }
1698
771 return ret; 1699 return ret;
772} 1700}
773EXPORT_SYMBOL_GPL(sk_detach_filter); 1701EXPORT_SYMBOL_GPL(sk_detach_filter);
@@ -850,34 +1778,41 @@ void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
850 to->k = filt->k; 1778 to->k = filt->k;
851} 1779}
852 1780
853int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len) 1781int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
1782 unsigned int len)
854{ 1783{
1784 struct sock_fprog_kern *fprog;
855 struct sk_filter *filter; 1785 struct sk_filter *filter;
856 int i, ret; 1786 int ret = 0;
857 1787
858 lock_sock(sk); 1788 lock_sock(sk);
859 filter = rcu_dereference_protected(sk->sk_filter, 1789 filter = rcu_dereference_protected(sk->sk_filter,
860 sock_owned_by_user(sk)); 1790 sock_owned_by_user(sk));
861 ret = 0;
862 if (!filter) 1791 if (!filter)
863 goto out; 1792 goto out;
864 ret = filter->len; 1793
1794 /* We're copying the filter that has been originally attached,
1795 * so no conversion/decode needed anymore.
1796 */
1797 fprog = filter->orig_prog;
1798
1799 ret = fprog->len;
865 if (!len) 1800 if (!len)
1801 /* User space only enquires number of filter blocks. */
866 goto out; 1802 goto out;
1803
867 ret = -EINVAL; 1804 ret = -EINVAL;
868 if (len < filter->len) 1805 if (len < fprog->len)
869 goto out; 1806 goto out;
870 1807
871 ret = -EFAULT; 1808 ret = -EFAULT;
872 for (i = 0; i < filter->len; i++) { 1809 if (copy_to_user(ubuf, fprog->filter, sk_filter_proglen(fprog)))
873 struct sock_filter fb; 1810 goto out;
874
875 sk_decode_filter(&filter->insns[i], &fb);
876 if (copy_to_user(&ubuf[i], &fb, sizeof(fb)))
877 goto out;
878 }
879 1811
880 ret = filter->len; 1812 /* Instead of bytes, the API requests to return the number
1813 * of filter blocks.
1814 */
1815 ret = fprog->len;
881out: 1816out:
882 release_sock(sk); 1817 release_sock(sk);
883 return ret; 1818 return ret;
diff --git a/net/core/flow.c b/net/core/flow.c
index dfa602ceb8cd..31cfb365e0c6 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -24,6 +24,7 @@
24#include <net/flow.h> 24#include <net/flow.h>
25#include <linux/atomic.h> 25#include <linux/atomic.h>
26#include <linux/security.h> 26#include <linux/security.h>
27#include <net/net_namespace.h>
27 28
28struct flow_cache_entry { 29struct flow_cache_entry {
29 union { 30 union {
@@ -38,37 +39,14 @@ struct flow_cache_entry {
38 struct flow_cache_object *object; 39 struct flow_cache_object *object;
39}; 40};
40 41
41struct flow_cache_percpu {
42 struct hlist_head *hash_table;
43 int hash_count;
44 u32 hash_rnd;
45 int hash_rnd_recalc;
46 struct tasklet_struct flush_tasklet;
47};
48
49struct flow_flush_info { 42struct flow_flush_info {
50 struct flow_cache *cache; 43 struct flow_cache *cache;
51 atomic_t cpuleft; 44 atomic_t cpuleft;
52 struct completion completion; 45 struct completion completion;
53}; 46};
54 47
55struct flow_cache {
56 u32 hash_shift;
57 struct flow_cache_percpu __percpu *percpu;
58 struct notifier_block hotcpu_notifier;
59 int low_watermark;
60 int high_watermark;
61 struct timer_list rnd_timer;
62};
63
64atomic_t flow_cache_genid = ATOMIC_INIT(0);
65EXPORT_SYMBOL(flow_cache_genid);
66static struct flow_cache flow_cache_global;
67static struct kmem_cache *flow_cachep __read_mostly; 48static struct kmem_cache *flow_cachep __read_mostly;
68 49
69static DEFINE_SPINLOCK(flow_cache_gc_lock);
70static LIST_HEAD(flow_cache_gc_list);
71
72#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift) 50#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift)
73#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) 51#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
74 52
@@ -84,16 +62,18 @@ static void flow_cache_new_hashrnd(unsigned long arg)
84 add_timer(&fc->rnd_timer); 62 add_timer(&fc->rnd_timer);
85} 63}
86 64
87static int flow_entry_valid(struct flow_cache_entry *fle) 65static int flow_entry_valid(struct flow_cache_entry *fle,
66 struct netns_xfrm *xfrm)
88{ 67{
89 if (atomic_read(&flow_cache_genid) != fle->genid) 68 if (atomic_read(&xfrm->flow_cache_genid) != fle->genid)
90 return 0; 69 return 0;
91 if (fle->object && !fle->object->ops->check(fle->object)) 70 if (fle->object && !fle->object->ops->check(fle->object))
92 return 0; 71 return 0;
93 return 1; 72 return 1;
94} 73}
95 74
96static void flow_entry_kill(struct flow_cache_entry *fle) 75static void flow_entry_kill(struct flow_cache_entry *fle,
76 struct netns_xfrm *xfrm)
97{ 77{
98 if (fle->object) 78 if (fle->object)
99 fle->object->ops->delete(fle->object); 79 fle->object->ops->delete(fle->object);
@@ -104,26 +84,28 @@ static void flow_cache_gc_task(struct work_struct *work)
104{ 84{
105 struct list_head gc_list; 85 struct list_head gc_list;
106 struct flow_cache_entry *fce, *n; 86 struct flow_cache_entry *fce, *n;
87 struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
88 flow_cache_gc_work);
107 89
108 INIT_LIST_HEAD(&gc_list); 90 INIT_LIST_HEAD(&gc_list);
109 spin_lock_bh(&flow_cache_gc_lock); 91 spin_lock_bh(&xfrm->flow_cache_gc_lock);
110 list_splice_tail_init(&flow_cache_gc_list, &gc_list); 92 list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list);
111 spin_unlock_bh(&flow_cache_gc_lock); 93 spin_unlock_bh(&xfrm->flow_cache_gc_lock);
112 94
113 list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) 95 list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
114 flow_entry_kill(fce); 96 flow_entry_kill(fce, xfrm);
115} 97}
116static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task);
117 98
118static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, 99static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
119 int deleted, struct list_head *gc_list) 100 int deleted, struct list_head *gc_list,
101 struct netns_xfrm *xfrm)
120{ 102{
121 if (deleted) { 103 if (deleted) {
122 fcp->hash_count -= deleted; 104 fcp->hash_count -= deleted;
123 spin_lock_bh(&flow_cache_gc_lock); 105 spin_lock_bh(&xfrm->flow_cache_gc_lock);
124 list_splice_tail(gc_list, &flow_cache_gc_list); 106 list_splice_tail(gc_list, &xfrm->flow_cache_gc_list);
125 spin_unlock_bh(&flow_cache_gc_lock); 107 spin_unlock_bh(&xfrm->flow_cache_gc_lock);
126 schedule_work(&flow_cache_gc_work); 108 schedule_work(&xfrm->flow_cache_gc_work);
127 } 109 }
128} 110}
129 111
@@ -135,6 +117,8 @@ static void __flow_cache_shrink(struct flow_cache *fc,
135 struct hlist_node *tmp; 117 struct hlist_node *tmp;
136 LIST_HEAD(gc_list); 118 LIST_HEAD(gc_list);
137 int i, deleted = 0; 119 int i, deleted = 0;
120 struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
121 flow_cache_global);
138 122
139 for (i = 0; i < flow_cache_hash_size(fc); i++) { 123 for (i = 0; i < flow_cache_hash_size(fc); i++) {
140 int saved = 0; 124 int saved = 0;
@@ -142,7 +126,7 @@ static void __flow_cache_shrink(struct flow_cache *fc,
142 hlist_for_each_entry_safe(fle, tmp, 126 hlist_for_each_entry_safe(fle, tmp,
143 &fcp->hash_table[i], u.hlist) { 127 &fcp->hash_table[i], u.hlist) {
144 if (saved < shrink_to && 128 if (saved < shrink_to &&
145 flow_entry_valid(fle)) { 129 flow_entry_valid(fle, xfrm)) {
146 saved++; 130 saved++;
147 } else { 131 } else {
148 deleted++; 132 deleted++;
@@ -152,7 +136,7 @@ static void __flow_cache_shrink(struct flow_cache *fc,
152 } 136 }
153 } 137 }
154 138
155 flow_cache_queue_garbage(fcp, deleted, &gc_list); 139 flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
156} 140}
157 141
158static void flow_cache_shrink(struct flow_cache *fc, 142static void flow_cache_shrink(struct flow_cache *fc,
@@ -208,7 +192,7 @@ struct flow_cache_object *
208flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, 192flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
209 flow_resolve_t resolver, void *ctx) 193 flow_resolve_t resolver, void *ctx)
210{ 194{
211 struct flow_cache *fc = &flow_cache_global; 195 struct flow_cache *fc = &net->xfrm.flow_cache_global;
212 struct flow_cache_percpu *fcp; 196 struct flow_cache_percpu *fcp;
213 struct flow_cache_entry *fle, *tfle; 197 struct flow_cache_entry *fle, *tfle;
214 struct flow_cache_object *flo; 198 struct flow_cache_object *flo;
@@ -258,7 +242,7 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
258 hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]); 242 hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
259 fcp->hash_count++; 243 fcp->hash_count++;
260 } 244 }
261 } else if (likely(fle->genid == atomic_read(&flow_cache_genid))) { 245 } else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) {
262 flo = fle->object; 246 flo = fle->object;
263 if (!flo) 247 if (!flo)
264 goto ret_object; 248 goto ret_object;
@@ -279,7 +263,7 @@ nocache:
279 } 263 }
280 flo = resolver(net, key, family, dir, flo, ctx); 264 flo = resolver(net, key, family, dir, flo, ctx);
281 if (fle) { 265 if (fle) {
282 fle->genid = atomic_read(&flow_cache_genid); 266 fle->genid = atomic_read(&net->xfrm.flow_cache_genid);
283 if (!IS_ERR(flo)) 267 if (!IS_ERR(flo))
284 fle->object = flo; 268 fle->object = flo;
285 else 269 else
@@ -303,12 +287,14 @@ static void flow_cache_flush_tasklet(unsigned long data)
303 struct hlist_node *tmp; 287 struct hlist_node *tmp;
304 LIST_HEAD(gc_list); 288 LIST_HEAD(gc_list);
305 int i, deleted = 0; 289 int i, deleted = 0;
290 struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
291 flow_cache_global);
306 292
307 fcp = this_cpu_ptr(fc->percpu); 293 fcp = this_cpu_ptr(fc->percpu);
308 for (i = 0; i < flow_cache_hash_size(fc); i++) { 294 for (i = 0; i < flow_cache_hash_size(fc); i++) {
309 hlist_for_each_entry_safe(fle, tmp, 295 hlist_for_each_entry_safe(fle, tmp,
310 &fcp->hash_table[i], u.hlist) { 296 &fcp->hash_table[i], u.hlist) {
311 if (flow_entry_valid(fle)) 297 if (flow_entry_valid(fle, xfrm))
312 continue; 298 continue;
313 299
314 deleted++; 300 deleted++;
@@ -317,7 +303,7 @@ static void flow_cache_flush_tasklet(unsigned long data)
317 } 303 }
318 } 304 }
319 305
320 flow_cache_queue_garbage(fcp, deleted, &gc_list); 306 flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
321 307
322 if (atomic_dec_and_test(&info->cpuleft)) 308 if (atomic_dec_and_test(&info->cpuleft))
323 complete(&info->completion); 309 complete(&info->completion);
@@ -351,10 +337,9 @@ static void flow_cache_flush_per_cpu(void *data)
351 tasklet_schedule(tasklet); 337 tasklet_schedule(tasklet);
352} 338}
353 339
354void flow_cache_flush(void) 340void flow_cache_flush(struct net *net)
355{ 341{
356 struct flow_flush_info info; 342 struct flow_flush_info info;
357 static DEFINE_MUTEX(flow_flush_sem);
358 cpumask_var_t mask; 343 cpumask_var_t mask;
359 int i, self; 344 int i, self;
360 345
@@ -365,8 +350,8 @@ void flow_cache_flush(void)
365 350
366 /* Don't want cpus going down or up during this. */ 351 /* Don't want cpus going down or up during this. */
367 get_online_cpus(); 352 get_online_cpus();
368 mutex_lock(&flow_flush_sem); 353 mutex_lock(&net->xfrm.flow_flush_sem);
369 info.cache = &flow_cache_global; 354 info.cache = &net->xfrm.flow_cache_global;
370 for_each_online_cpu(i) 355 for_each_online_cpu(i)
371 if (!flow_cache_percpu_empty(info.cache, i)) 356 if (!flow_cache_percpu_empty(info.cache, i))
372 cpumask_set_cpu(i, mask); 357 cpumask_set_cpu(i, mask);
@@ -386,21 +371,23 @@ void flow_cache_flush(void)
386 wait_for_completion(&info.completion); 371 wait_for_completion(&info.completion);
387 372
388done: 373done:
389 mutex_unlock(&flow_flush_sem); 374 mutex_unlock(&net->xfrm.flow_flush_sem);
390 put_online_cpus(); 375 put_online_cpus();
391 free_cpumask_var(mask); 376 free_cpumask_var(mask);
392} 377}
393 378
394static void flow_cache_flush_task(struct work_struct *work) 379static void flow_cache_flush_task(struct work_struct *work)
395{ 380{
396 flow_cache_flush(); 381 struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
397} 382 flow_cache_gc_work);
383 struct net *net = container_of(xfrm, struct net, xfrm);
398 384
399static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task); 385 flow_cache_flush(net);
386}
400 387
401void flow_cache_flush_deferred(void) 388void flow_cache_flush_deferred(struct net *net)
402{ 389{
403 schedule_work(&flow_cache_flush_work); 390 schedule_work(&net->xfrm.flow_cache_flush_work);
404} 391}
405 392
406static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) 393static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
@@ -425,7 +412,8 @@ static int flow_cache_cpu(struct notifier_block *nfb,
425 unsigned long action, 412 unsigned long action,
426 void *hcpu) 413 void *hcpu)
427{ 414{
428 struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier); 415 struct flow_cache *fc = container_of(nfb, struct flow_cache,
416 hotcpu_notifier);
429 int res, cpu = (unsigned long) hcpu; 417 int res, cpu = (unsigned long) hcpu;
430 struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); 418 struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
431 419
@@ -444,9 +432,20 @@ static int flow_cache_cpu(struct notifier_block *nfb,
444 return NOTIFY_OK; 432 return NOTIFY_OK;
445} 433}
446 434
447static int __init flow_cache_init(struct flow_cache *fc) 435int flow_cache_init(struct net *net)
448{ 436{
449 int i; 437 int i;
438 struct flow_cache *fc = &net->xfrm.flow_cache_global;
439
440 if (!flow_cachep)
441 flow_cachep = kmem_cache_create("flow_cache",
442 sizeof(struct flow_cache_entry),
443 0, SLAB_PANIC, NULL);
444 spin_lock_init(&net->xfrm.flow_cache_gc_lock);
445 INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list);
446 INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task);
447 INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task);
448 mutex_init(&net->xfrm.flow_flush_sem);
450 449
451 fc->hash_shift = 10; 450 fc->hash_shift = 10;
452 fc->low_watermark = 2 * flow_cache_hash_size(fc); 451 fc->low_watermark = 2 * flow_cache_hash_size(fc);
@@ -484,14 +483,23 @@ err:
484 483
485 return -ENOMEM; 484 return -ENOMEM;
486} 485}
486EXPORT_SYMBOL(flow_cache_init);
487 487
488static int __init flow_cache_init_global(void) 488void flow_cache_fini(struct net *net)
489{ 489{
490 flow_cachep = kmem_cache_create("flow_cache", 490 int i;
491 sizeof(struct flow_cache_entry), 491 struct flow_cache *fc = &net->xfrm.flow_cache_global;
492 0, SLAB_PANIC, NULL);
493 492
494 return flow_cache_init(&flow_cache_global); 493 del_timer_sync(&fc->rnd_timer);
495} 494 unregister_hotcpu_notifier(&fc->hotcpu_notifier);
496 495
497module_init(flow_cache_init_global); 496 for_each_possible_cpu(i) {
497 struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
498 kfree(fcp->hash_table);
499 fcp->hash_table = NULL;
500 }
501
502 free_percpu(fc->percpu);
503 fc->percpu = NULL;
504}
505EXPORT_SYMBOL(flow_cache_fini);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index e29e810663d7..107ed12a5323 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -61,7 +61,7 @@ bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow)
61 61
62again: 62again:
63 switch (proto) { 63 switch (proto) {
64 case __constant_htons(ETH_P_IP): { 64 case htons(ETH_P_IP): {
65 const struct iphdr *iph; 65 const struct iphdr *iph;
66 struct iphdr _iph; 66 struct iphdr _iph;
67ip: 67ip:
@@ -77,7 +77,7 @@ ip:
77 iph_to_flow_copy_addrs(flow, iph); 77 iph_to_flow_copy_addrs(flow, iph);
78 break; 78 break;
79 } 79 }
80 case __constant_htons(ETH_P_IPV6): { 80 case htons(ETH_P_IPV6): {
81 const struct ipv6hdr *iph; 81 const struct ipv6hdr *iph;
82 struct ipv6hdr _iph; 82 struct ipv6hdr _iph;
83ipv6: 83ipv6:
@@ -91,8 +91,8 @@ ipv6:
91 nhoff += sizeof(struct ipv6hdr); 91 nhoff += sizeof(struct ipv6hdr);
92 break; 92 break;
93 } 93 }
94 case __constant_htons(ETH_P_8021AD): 94 case htons(ETH_P_8021AD):
95 case __constant_htons(ETH_P_8021Q): { 95 case htons(ETH_P_8021Q): {
96 const struct vlan_hdr *vlan; 96 const struct vlan_hdr *vlan;
97 struct vlan_hdr _vlan; 97 struct vlan_hdr _vlan;
98 98
@@ -104,7 +104,7 @@ ipv6:
104 nhoff += sizeof(*vlan); 104 nhoff += sizeof(*vlan);
105 goto again; 105 goto again;
106 } 106 }
107 case __constant_htons(ETH_P_PPP_SES): { 107 case htons(ETH_P_PPP_SES): {
108 struct { 108 struct {
109 struct pppoe_hdr hdr; 109 struct pppoe_hdr hdr;
110 __be16 proto; 110 __be16 proto;
@@ -115,9 +115,9 @@ ipv6:
115 proto = hdr->proto; 115 proto = hdr->proto;
116 nhoff += PPPOE_SES_HLEN; 116 nhoff += PPPOE_SES_HLEN;
117 switch (proto) { 117 switch (proto) {
118 case __constant_htons(PPP_IP): 118 case htons(PPP_IP):
119 goto ip; 119 goto ip;
120 case __constant_htons(PPP_IPV6): 120 case htons(PPP_IPV6):
121 goto ipv6; 121 goto ipv6;
122 default: 122 default:
123 return false; 123 return false;
@@ -203,8 +203,8 @@ static __always_inline u32 __flow_hash_1word(u32 a)
203 203
204/* 204/*
205 * __skb_get_hash: calculate a flow hash based on src/dst addresses 205 * __skb_get_hash: calculate a flow hash based on src/dst addresses
206 * and src/dst port numbers. Sets rxhash in skb to non-zero hash value 206 * and src/dst port numbers. Sets hash in skb to non-zero hash value
207 * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb 207 * on success, zero indicates no valid hash. Also, sets l4_hash in skb
208 * if hash is a canonical 4-tuple hash over transport ports. 208 * if hash is a canonical 4-tuple hash over transport ports.
209 */ 209 */
210void __skb_get_hash(struct sk_buff *skb) 210void __skb_get_hash(struct sk_buff *skb)
@@ -216,7 +216,7 @@ void __skb_get_hash(struct sk_buff *skb)
216 return; 216 return;
217 217
218 if (keys.ports) 218 if (keys.ports)
219 skb->l4_rxhash = 1; 219 skb->l4_hash = 1;
220 220
221 /* get a consistent hash (same value on both flow directions) */ 221 /* get a consistent hash (same value on both flow directions) */
222 if (((__force u32)keys.dst < (__force u32)keys.src) || 222 if (((__force u32)keys.dst < (__force u32)keys.src) ||
@@ -232,7 +232,7 @@ void __skb_get_hash(struct sk_buff *skb)
232 if (!hash) 232 if (!hash)
233 hash = 1; 233 hash = 1;
234 234
235 skb->rxhash = hash; 235 skb->hash = hash;
236} 236}
237EXPORT_SYMBOL(__skb_get_hash); 237EXPORT_SYMBOL(__skb_get_hash);
238 238
@@ -344,7 +344,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
344 hash = skb->sk->sk_hash; 344 hash = skb->sk->sk_hash;
345 else 345 else
346 hash = (__force u16) skb->protocol ^ 346 hash = (__force u16) skb->protocol ^
347 skb->rxhash; 347 skb->hash;
348 hash = __flow_hash_1word(hash); 348 hash = __flow_hash_1word(hash);
349 queue_index = map->queues[ 349 queue_index = map->queues[
350 ((u64)hash * map->len) >> 32]; 350 ((u64)hash * map->len) >> 32];
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index e16129019c66..8f8a96ef9f3f 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -836,10 +836,10 @@ out:
836static __inline__ int neigh_max_probes(struct neighbour *n) 836static __inline__ int neigh_max_probes(struct neighbour *n)
837{ 837{
838 struct neigh_parms *p = n->parms; 838 struct neigh_parms *p = n->parms;
839 return (n->nud_state & NUD_PROBE) ? 839 int max_probes = NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES);
840 NEIGH_VAR(p, UCAST_PROBES) : 840 if (!(n->nud_state & NUD_PROBE))
841 NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) + 841 max_probes += NEIGH_VAR(p, MCAST_PROBES);
842 NEIGH_VAR(p, MCAST_PROBES); 842 return max_probes;
843} 843}
844 844
845static void neigh_invalidate(struct neighbour *neigh) 845static void neigh_invalidate(struct neighbour *neigh)
@@ -945,6 +945,7 @@ static void neigh_timer_handler(unsigned long arg)
945 neigh->nud_state = NUD_FAILED; 945 neigh->nud_state = NUD_FAILED;
946 notify = 1; 946 notify = 1;
947 neigh_invalidate(neigh); 947 neigh_invalidate(neigh);
948 goto out;
948 } 949 }
949 950
950 if (neigh->nud_state & NUD_IN_TIMER) { 951 if (neigh->nud_state & NUD_IN_TIMER) {
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 549043c078c9..1cac29ebb05b 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -104,6 +104,7 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
104} 104}
105 105
106NETDEVICE_SHOW_RO(dev_id, fmt_hex); 106NETDEVICE_SHOW_RO(dev_id, fmt_hex);
107NETDEVICE_SHOW_RO(dev_port, fmt_dec);
107NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec); 108NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec);
108NETDEVICE_SHOW_RO(addr_len, fmt_dec); 109NETDEVICE_SHOW_RO(addr_len, fmt_dec);
109NETDEVICE_SHOW_RO(iflink, fmt_dec); 110NETDEVICE_SHOW_RO(iflink, fmt_dec);
@@ -252,6 +253,16 @@ static ssize_t operstate_show(struct device *dev,
252} 253}
253static DEVICE_ATTR_RO(operstate); 254static DEVICE_ATTR_RO(operstate);
254 255
256static ssize_t carrier_changes_show(struct device *dev,
257 struct device_attribute *attr,
258 char *buf)
259{
260 struct net_device *netdev = to_net_dev(dev);
261 return sprintf(buf, fmt_dec,
262 atomic_read(&netdev->carrier_changes));
263}
264static DEVICE_ATTR_RO(carrier_changes);
265
255/* read-write attributes */ 266/* read-write attributes */
256 267
257static int change_mtu(struct net_device *net, unsigned long new_mtu) 268static int change_mtu(struct net_device *net, unsigned long new_mtu)
@@ -373,6 +384,7 @@ static struct attribute *net_class_attrs[] = {
373 &dev_attr_netdev_group.attr, 384 &dev_attr_netdev_group.attr,
374 &dev_attr_type.attr, 385 &dev_attr_type.attr,
375 &dev_attr_dev_id.attr, 386 &dev_attr_dev_id.attr,
387 &dev_attr_dev_port.attr,
376 &dev_attr_iflink.attr, 388 &dev_attr_iflink.attr,
377 &dev_attr_ifindex.attr, 389 &dev_attr_ifindex.attr,
378 &dev_attr_addr_assign_type.attr, 390 &dev_attr_addr_assign_type.attr,
@@ -384,6 +396,7 @@ static struct attribute *net_class_attrs[] = {
384 &dev_attr_duplex.attr, 396 &dev_attr_duplex.attr,
385 &dev_attr_dormant.attr, 397 &dev_attr_dormant.attr,
386 &dev_attr_operstate.attr, 398 &dev_attr_operstate.attr,
399 &dev_attr_carrier_changes.attr,
387 &dev_attr_ifalias.attr, 400 &dev_attr_ifalias.attr,
388 &dev_attr_carrier.attr, 401 &dev_attr_carrier.attr,
389 &dev_attr_mtu.attr, 402 &dev_attr_mtu.attr,
@@ -996,15 +1009,12 @@ static struct attribute_group dql_group = {
996#endif /* CONFIG_BQL */ 1009#endif /* CONFIG_BQL */
997 1010
998#ifdef CONFIG_XPS 1011#ifdef CONFIG_XPS
999static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) 1012static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
1000{ 1013{
1001 struct net_device *dev = queue->dev; 1014 struct net_device *dev = queue->dev;
1002 int i; 1015 unsigned int i;
1003
1004 for (i = 0; i < dev->num_tx_queues; i++)
1005 if (queue == &dev->_tx[i])
1006 break;
1007 1016
1017 i = queue - dev->_tx;
1008 BUG_ON(i >= dev->num_tx_queues); 1018 BUG_ON(i >= dev->num_tx_queues);
1009 1019
1010 return i; 1020 return i;
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index df9e6b1a9759..e33937fb32a0 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -46,13 +46,9 @@
46 46
47static struct sk_buff_head skb_pool; 47static struct sk_buff_head skb_pool;
48 48
49static atomic_t trapped;
50
51DEFINE_STATIC_SRCU(netpoll_srcu); 49DEFINE_STATIC_SRCU(netpoll_srcu);
52 50
53#define USEC_PER_POLL 50 51#define USEC_PER_POLL 50
54#define NETPOLL_RX_ENABLED 1
55#define NETPOLL_RX_DROP 2
56 52
57#define MAX_SKB_SIZE \ 53#define MAX_SKB_SIZE \
58 (sizeof(struct ethhdr) + \ 54 (sizeof(struct ethhdr) + \
@@ -61,7 +57,6 @@ DEFINE_STATIC_SRCU(netpoll_srcu);
61 MAX_UDP_CHUNK) 57 MAX_UDP_CHUNK)
62 58
63static void zap_completion_queue(void); 59static void zap_completion_queue(void);
64static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo);
65static void netpoll_async_cleanup(struct work_struct *work); 60static void netpoll_async_cleanup(struct work_struct *work);
66 61
67static unsigned int carrier_timeout = 4; 62static unsigned int carrier_timeout = 4;
@@ -74,6 +69,37 @@ module_param(carrier_timeout, uint, 0644);
74#define np_notice(np, fmt, ...) \ 69#define np_notice(np, fmt, ...) \
75 pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) 70 pr_notice("%s: " fmt, np->name, ##__VA_ARGS__)
76 71
72static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
73 struct netdev_queue *txq)
74{
75 const struct net_device_ops *ops = dev->netdev_ops;
76 int status = NETDEV_TX_OK;
77 netdev_features_t features;
78
79 features = netif_skb_features(skb);
80
81 if (vlan_tx_tag_present(skb) &&
82 !vlan_hw_offload_capable(features, skb->vlan_proto)) {
83 skb = __vlan_put_tag(skb, skb->vlan_proto,
84 vlan_tx_tag_get(skb));
85 if (unlikely(!skb)) {
86 /* This is actually a packet drop, but we
87 * don't want the code that calls this
88 * function to try and operate on a NULL skb.
89 */
90 goto out;
91 }
92 skb->vlan_tci = 0;
93 }
94
95 status = ops->ndo_start_xmit(skb, dev);
96 if (status == NETDEV_TX_OK)
97 txq_trans_update(txq);
98
99out:
100 return status;
101}
102
77static void queue_process(struct work_struct *work) 103static void queue_process(struct work_struct *work)
78{ 104{
79 struct netpoll_info *npinfo = 105 struct netpoll_info *npinfo =
@@ -83,51 +109,31 @@ static void queue_process(struct work_struct *work)
83 109
84 while ((skb = skb_dequeue(&npinfo->txq))) { 110 while ((skb = skb_dequeue(&npinfo->txq))) {
85 struct net_device *dev = skb->dev; 111 struct net_device *dev = skb->dev;
86 const struct net_device_ops *ops = dev->netdev_ops;
87 struct netdev_queue *txq; 112 struct netdev_queue *txq;
88 113
89 if (!netif_device_present(dev) || !netif_running(dev)) { 114 if (!netif_device_present(dev) || !netif_running(dev)) {
90 __kfree_skb(skb); 115 kfree_skb(skb);
91 continue; 116 continue;
92 } 117 }
93 118
94 txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); 119 txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
95 120
96 local_irq_save(flags); 121 local_irq_save(flags);
97 __netif_tx_lock(txq, smp_processor_id()); 122 HARD_TX_LOCK(dev, txq, smp_processor_id());
98 if (netif_xmit_frozen_or_stopped(txq) || 123 if (netif_xmit_frozen_or_stopped(txq) ||
99 ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) { 124 netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) {
100 skb_queue_head(&npinfo->txq, skb); 125 skb_queue_head(&npinfo->txq, skb);
101 __netif_tx_unlock(txq); 126 HARD_TX_UNLOCK(dev, txq);
102 local_irq_restore(flags); 127 local_irq_restore(flags);
103 128
104 schedule_delayed_work(&npinfo->tx_work, HZ/10); 129 schedule_delayed_work(&npinfo->tx_work, HZ/10);
105 return; 130 return;
106 } 131 }
107 __netif_tx_unlock(txq); 132 HARD_TX_UNLOCK(dev, txq);
108 local_irq_restore(flags); 133 local_irq_restore(flags);
109 } 134 }
110} 135}
111 136
112static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh,
113 unsigned short ulen, __be32 saddr, __be32 daddr)
114{
115 __wsum psum;
116
117 if (uh->check == 0 || skb_csum_unnecessary(skb))
118 return 0;
119
120 psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
121
122 if (skb->ip_summed == CHECKSUM_COMPLETE &&
123 !csum_fold(csum_add(psum, skb->csum)))
124 return 0;
125
126 skb->csum = psum;
127
128 return __skb_checksum_complete(skb);
129}
130
131/* 137/*
132 * Check whether delayed processing was scheduled for our NIC. If so, 138 * Check whether delayed processing was scheduled for our NIC. If so,
133 * we attempt to grab the poll lock and use ->poll() to pump the card. 139 * we attempt to grab the poll lock and use ->poll() to pump the card.
@@ -138,14 +144,8 @@ static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh,
138 * trylock here and interrupts are already disabled in the softirq 144 * trylock here and interrupts are already disabled in the softirq
139 * case. Further, we test the poll_owner to avoid recursion on UP 145 * case. Further, we test the poll_owner to avoid recursion on UP
140 * systems where the lock doesn't exist. 146 * systems where the lock doesn't exist.
141 *
142 * In cases where there is bi-directional communications, reading only
143 * one message at a time can lead to packets being dropped by the
144 * network adapter, forcing superfluous retries and possibly timeouts.
145 * Thus, we set our budget to greater than 1.
146 */ 147 */
147static int poll_one_napi(struct netpoll_info *npinfo, 148static int poll_one_napi(struct napi_struct *napi, int budget)
148 struct napi_struct *napi, int budget)
149{ 149{
150 int work; 150 int work;
151 151
@@ -156,52 +156,35 @@ static int poll_one_napi(struct netpoll_info *npinfo,
156 if (!test_bit(NAPI_STATE_SCHED, &napi->state)) 156 if (!test_bit(NAPI_STATE_SCHED, &napi->state))
157 return budget; 157 return budget;
158 158
159 npinfo->rx_flags |= NETPOLL_RX_DROP;
160 atomic_inc(&trapped);
161 set_bit(NAPI_STATE_NPSVC, &napi->state); 159 set_bit(NAPI_STATE_NPSVC, &napi->state);
162 160
163 work = napi->poll(napi, budget); 161 work = napi->poll(napi, budget);
162 WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll);
164 trace_napi_poll(napi); 163 trace_napi_poll(napi);
165 164
166 clear_bit(NAPI_STATE_NPSVC, &napi->state); 165 clear_bit(NAPI_STATE_NPSVC, &napi->state);
167 atomic_dec(&trapped);
168 npinfo->rx_flags &= ~NETPOLL_RX_DROP;
169 166
170 return budget - work; 167 return budget - work;
171} 168}
172 169
173static void poll_napi(struct net_device *dev) 170static void poll_napi(struct net_device *dev, int budget)
174{ 171{
175 struct napi_struct *napi; 172 struct napi_struct *napi;
176 int budget = 16;
177 173
178 list_for_each_entry(napi, &dev->napi_list, dev_list) { 174 list_for_each_entry(napi, &dev->napi_list, dev_list) {
179 if (napi->poll_owner != smp_processor_id() && 175 if (napi->poll_owner != smp_processor_id() &&
180 spin_trylock(&napi->poll_lock)) { 176 spin_trylock(&napi->poll_lock)) {
181 budget = poll_one_napi(rcu_dereference_bh(dev->npinfo), 177 budget = poll_one_napi(napi, budget);
182 napi, budget);
183 spin_unlock(&napi->poll_lock); 178 spin_unlock(&napi->poll_lock);
184
185 if (!budget)
186 break;
187 } 179 }
188 } 180 }
189} 181}
190 182
191static void service_neigh_queue(struct netpoll_info *npi)
192{
193 if (npi) {
194 struct sk_buff *skb;
195
196 while ((skb = skb_dequeue(&npi->neigh_tx)))
197 netpoll_neigh_reply(skb, npi);
198 }
199}
200
201static void netpoll_poll_dev(struct net_device *dev) 183static void netpoll_poll_dev(struct net_device *dev)
202{ 184{
203 const struct net_device_ops *ops; 185 const struct net_device_ops *ops;
204 struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); 186 struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo);
187 int budget = 0;
205 188
206 /* Don't do any rx activity if the dev_lock mutex is held 189 /* Don't do any rx activity if the dev_lock mutex is held
207 * the dev_open/close paths use this to block netpoll activity 190 * the dev_open/close paths use this to block netpoll activity
@@ -224,31 +207,14 @@ static void netpoll_poll_dev(struct net_device *dev)
224 /* Process pending work on NIC */ 207 /* Process pending work on NIC */
225 ops->ndo_poll_controller(dev); 208 ops->ndo_poll_controller(dev);
226 209
227 poll_napi(dev); 210 poll_napi(dev, budget);
228 211
229 up(&ni->dev_lock); 212 up(&ni->dev_lock);
230 213
231 if (dev->flags & IFF_SLAVE) {
232 if (ni) {
233 struct net_device *bond_dev;
234 struct sk_buff *skb;
235 struct netpoll_info *bond_ni;
236
237 bond_dev = netdev_master_upper_dev_get_rcu(dev);
238 bond_ni = rcu_dereference_bh(bond_dev->npinfo);
239 while ((skb = skb_dequeue(&ni->neigh_tx))) {
240 skb->dev = bond_dev;
241 skb_queue_tail(&bond_ni->neigh_tx, skb);
242 }
243 }
244 }
245
246 service_neigh_queue(ni);
247
248 zap_completion_queue(); 214 zap_completion_queue();
249} 215}
250 216
251void netpoll_rx_disable(struct net_device *dev) 217void netpoll_poll_disable(struct net_device *dev)
252{ 218{
253 struct netpoll_info *ni; 219 struct netpoll_info *ni;
254 int idx; 220 int idx;
@@ -259,9 +225,9 @@ void netpoll_rx_disable(struct net_device *dev)
259 down(&ni->dev_lock); 225 down(&ni->dev_lock);
260 srcu_read_unlock(&netpoll_srcu, idx); 226 srcu_read_unlock(&netpoll_srcu, idx);
261} 227}
262EXPORT_SYMBOL(netpoll_rx_disable); 228EXPORT_SYMBOL(netpoll_poll_disable);
263 229
264void netpoll_rx_enable(struct net_device *dev) 230void netpoll_poll_enable(struct net_device *dev)
265{ 231{
266 struct netpoll_info *ni; 232 struct netpoll_info *ni;
267 rcu_read_lock(); 233 rcu_read_lock();
@@ -270,7 +236,7 @@ void netpoll_rx_enable(struct net_device *dev)
270 up(&ni->dev_lock); 236 up(&ni->dev_lock);
271 rcu_read_unlock(); 237 rcu_read_unlock();
272} 238}
273EXPORT_SYMBOL(netpoll_rx_enable); 239EXPORT_SYMBOL(netpoll_poll_enable);
274 240
275static void refill_skbs(void) 241static void refill_skbs(void)
276{ 242{
@@ -304,7 +270,7 @@ static void zap_completion_queue(void)
304 while (clist != NULL) { 270 while (clist != NULL) {
305 struct sk_buff *skb = clist; 271 struct sk_buff *skb = clist;
306 clist = clist->next; 272 clist = clist->next;
307 if (skb->destructor) { 273 if (!skb_irq_freeable(skb)) {
308 atomic_inc(&skb->users); 274 atomic_inc(&skb->users);
309 dev_kfree_skb_any(skb); /* put this one back */ 275 dev_kfree_skb_any(skb); /* put this one back */
310 } else { 276 } else {
@@ -359,7 +325,6 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
359{ 325{
360 int status = NETDEV_TX_BUSY; 326 int status = NETDEV_TX_BUSY;
361 unsigned long tries; 327 unsigned long tries;
362 const struct net_device_ops *ops = dev->netdev_ops;
363 /* It is up to the caller to keep npinfo alive. */ 328 /* It is up to the caller to keep npinfo alive. */
364 struct netpoll_info *npinfo; 329 struct netpoll_info *npinfo;
365 330
@@ -367,7 +332,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
367 332
368 npinfo = rcu_dereference_bh(np->dev->npinfo); 333 npinfo = rcu_dereference_bh(np->dev->npinfo);
369 if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { 334 if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
370 __kfree_skb(skb); 335 dev_kfree_skb_irq(skb);
371 return; 336 return;
372 } 337 }
373 338
@@ -380,29 +345,11 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
380 /* try until next clock tick */ 345 /* try until next clock tick */
381 for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; 346 for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
382 tries > 0; --tries) { 347 tries > 0; --tries) {
383 if (__netif_tx_trylock(txq)) { 348 if (HARD_TX_TRYLOCK(dev, txq)) {
384 if (!netif_xmit_stopped(txq)) { 349 if (!netif_xmit_stopped(txq))
385 if (vlan_tx_tag_present(skb) && 350 status = netpoll_start_xmit(skb, dev, txq);
386 !vlan_hw_offload_capable(netif_skb_features(skb), 351
387 skb->vlan_proto)) { 352 HARD_TX_UNLOCK(dev, txq);
388 skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb));
389 if (unlikely(!skb)) {
390 /* This is actually a packet drop, but we
391 * don't want the code at the end of this
392 * function to try and re-queue a NULL skb.
393 */
394 status = NETDEV_TX_OK;
395 goto unlock_txq;
396 }
397 skb->vlan_tci = 0;
398 }
399
400 status = ops->ndo_start_xmit(skb, dev);
401 if (status == NETDEV_TX_OK)
402 txq_trans_update(txq);
403 }
404 unlock_txq:
405 __netif_tx_unlock(txq);
406 353
407 if (status == NETDEV_TX_OK) 354 if (status == NETDEV_TX_OK)
408 break; 355 break;
@@ -417,7 +364,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
417 364
418 WARN_ONCE(!irqs_disabled(), 365 WARN_ONCE(!irqs_disabled(),
419 "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n", 366 "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n",
420 dev->name, ops->ndo_start_xmit); 367 dev->name, dev->netdev_ops->ndo_start_xmit);
421 368
422 } 369 }
423 370
@@ -529,384 +476,6 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
529} 476}
530EXPORT_SYMBOL(netpoll_send_udp); 477EXPORT_SYMBOL(netpoll_send_udp);
531 478
532static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo)
533{
534 int size, type = ARPOP_REPLY;
535 __be32 sip, tip;
536 unsigned char *sha;
537 struct sk_buff *send_skb;
538 struct netpoll *np, *tmp;
539 unsigned long flags;
540 int hlen, tlen;
541 int hits = 0, proto;
542
543 if (list_empty(&npinfo->rx_np))
544 return;
545
546 /* Before checking the packet, we do some early
547 inspection whether this is interesting at all */
548 spin_lock_irqsave(&npinfo->rx_lock, flags);
549 list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
550 if (np->dev == skb->dev)
551 hits++;
552 }
553 spin_unlock_irqrestore(&npinfo->rx_lock, flags);
554
555 /* No netpoll struct is using this dev */
556 if (!hits)
557 return;
558
559 proto = ntohs(eth_hdr(skb)->h_proto);
560 if (proto == ETH_P_ARP) {
561 struct arphdr *arp;
562 unsigned char *arp_ptr;
563 /* No arp on this interface */
564 if (skb->dev->flags & IFF_NOARP)
565 return;
566
567 if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
568 return;
569
570 skb_reset_network_header(skb);
571 skb_reset_transport_header(skb);
572 arp = arp_hdr(skb);
573
574 if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
575 arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
576 arp->ar_pro != htons(ETH_P_IP) ||
577 arp->ar_op != htons(ARPOP_REQUEST))
578 return;
579
580 arp_ptr = (unsigned char *)(arp+1);
581 /* save the location of the src hw addr */
582 sha = arp_ptr;
583 arp_ptr += skb->dev->addr_len;
584 memcpy(&sip, arp_ptr, 4);
585 arp_ptr += 4;
586 /* If we actually cared about dst hw addr,
587 it would get copied here */
588 arp_ptr += skb->dev->addr_len;
589 memcpy(&tip, arp_ptr, 4);
590
591 /* Should we ignore arp? */
592 if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
593 return;
594
595 size = arp_hdr_len(skb->dev);
596
597 spin_lock_irqsave(&npinfo->rx_lock, flags);
598 list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
599 if (tip != np->local_ip.ip)
600 continue;
601
602 hlen = LL_RESERVED_SPACE(np->dev);
603 tlen = np->dev->needed_tailroom;
604 send_skb = find_skb(np, size + hlen + tlen, hlen);
605 if (!send_skb)
606 continue;
607
608 skb_reset_network_header(send_skb);
609 arp = (struct arphdr *) skb_put(send_skb, size);
610 send_skb->dev = skb->dev;
611 send_skb->protocol = htons(ETH_P_ARP);
612
613 /* Fill the device header for the ARP frame */
614 if (dev_hard_header(send_skb, skb->dev, ETH_P_ARP,
615 sha, np->dev->dev_addr,
616 send_skb->len) < 0) {
617 kfree_skb(send_skb);
618 continue;
619 }
620
621 /*
622 * Fill out the arp protocol part.
623 *
624 * we only support ethernet device type,
625 * which (according to RFC 1390) should
626 * always equal 1 (Ethernet).
627 */
628
629 arp->ar_hrd = htons(np->dev->type);
630 arp->ar_pro = htons(ETH_P_IP);
631 arp->ar_hln = np->dev->addr_len;
632 arp->ar_pln = 4;
633 arp->ar_op = htons(type);
634
635 arp_ptr = (unsigned char *)(arp + 1);
636 memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len);
637 arp_ptr += np->dev->addr_len;
638 memcpy(arp_ptr, &tip, 4);
639 arp_ptr += 4;
640 memcpy(arp_ptr, sha, np->dev->addr_len);
641 arp_ptr += np->dev->addr_len;
642 memcpy(arp_ptr, &sip, 4);
643
644 netpoll_send_skb(np, send_skb);
645
646 /* If there are several rx_skb_hooks for the same
647 * address we're fine by sending a single reply
648 */
649 break;
650 }
651 spin_unlock_irqrestore(&npinfo->rx_lock, flags);
652 } else if( proto == ETH_P_IPV6) {
653#if IS_ENABLED(CONFIG_IPV6)
654 struct nd_msg *msg;
655 u8 *lladdr = NULL;
656 struct ipv6hdr *hdr;
657 struct icmp6hdr *icmp6h;
658 const struct in6_addr *saddr;
659 const struct in6_addr *daddr;
660 struct inet6_dev *in6_dev = NULL;
661 struct in6_addr *target;
662
663 in6_dev = in6_dev_get(skb->dev);
664 if (!in6_dev || !in6_dev->cnf.accept_ra)
665 return;
666
667 if (!pskb_may_pull(skb, skb->len))
668 return;
669
670 msg = (struct nd_msg *)skb_transport_header(skb);
671
672 __skb_push(skb, skb->data - skb_transport_header(skb));
673
674 if (ipv6_hdr(skb)->hop_limit != 255)
675 return;
676 if (msg->icmph.icmp6_code != 0)
677 return;
678 if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION)
679 return;
680
681 saddr = &ipv6_hdr(skb)->saddr;
682 daddr = &ipv6_hdr(skb)->daddr;
683
684 size = sizeof(struct icmp6hdr) + sizeof(struct in6_addr);
685
686 spin_lock_irqsave(&npinfo->rx_lock, flags);
687 list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
688 if (!ipv6_addr_equal(daddr, &np->local_ip.in6))
689 continue;
690
691 hlen = LL_RESERVED_SPACE(np->dev);
692 tlen = np->dev->needed_tailroom;
693 send_skb = find_skb(np, size + hlen + tlen, hlen);
694 if (!send_skb)
695 continue;
696
697 send_skb->protocol = htons(ETH_P_IPV6);
698 send_skb->dev = skb->dev;
699
700 skb_reset_network_header(send_skb);
701 hdr = (struct ipv6hdr *) skb_put(send_skb, sizeof(struct ipv6hdr));
702 *(__be32*)hdr = htonl(0x60000000);
703 hdr->payload_len = htons(size);
704 hdr->nexthdr = IPPROTO_ICMPV6;
705 hdr->hop_limit = 255;
706 hdr->saddr = *saddr;
707 hdr->daddr = *daddr;
708
709 icmp6h = (struct icmp6hdr *) skb_put(send_skb, sizeof(struct icmp6hdr));
710 icmp6h->icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
711 icmp6h->icmp6_router = 0;
712 icmp6h->icmp6_solicited = 1;
713
714 target = (struct in6_addr *) skb_put(send_skb, sizeof(struct in6_addr));
715 *target = msg->target;
716 icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, size,
717 IPPROTO_ICMPV6,
718 csum_partial(icmp6h,
719 size, 0));
720
721 if (dev_hard_header(send_skb, skb->dev, ETH_P_IPV6,
722 lladdr, np->dev->dev_addr,
723 send_skb->len) < 0) {
724 kfree_skb(send_skb);
725 continue;
726 }
727
728 netpoll_send_skb(np, send_skb);
729
730 /* If there are several rx_skb_hooks for the same
731 * address, we're fine by sending a single reply
732 */
733 break;
734 }
735 spin_unlock_irqrestore(&npinfo->rx_lock, flags);
736#endif
737 }
738}
739
740static bool pkt_is_ns(struct sk_buff *skb)
741{
742 struct nd_msg *msg;
743 struct ipv6hdr *hdr;
744
745 if (skb->protocol != htons(ETH_P_IPV6))
746 return false;
747 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg)))
748 return false;
749
750 msg = (struct nd_msg *)skb_transport_header(skb);
751 __skb_push(skb, skb->data - skb_transport_header(skb));
752 hdr = ipv6_hdr(skb);
753
754 if (hdr->nexthdr != IPPROTO_ICMPV6)
755 return false;
756 if (hdr->hop_limit != 255)
757 return false;
758 if (msg->icmph.icmp6_code != 0)
759 return false;
760 if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION)
761 return false;
762
763 return true;
764}
765
766int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo)
767{
768 int proto, len, ulen, data_len;
769 int hits = 0, offset;
770 const struct iphdr *iph;
771 struct udphdr *uh;
772 struct netpoll *np, *tmp;
773 uint16_t source;
774
775 if (list_empty(&npinfo->rx_np))
776 goto out;
777
778 if (skb->dev->type != ARPHRD_ETHER)
779 goto out;
780
781 /* check if netpoll clients need ARP */
782 if (skb->protocol == htons(ETH_P_ARP) && atomic_read(&trapped)) {
783 skb_queue_tail(&npinfo->neigh_tx, skb);
784 return 1;
785 } else if (pkt_is_ns(skb) && atomic_read(&trapped)) {
786 skb_queue_tail(&npinfo->neigh_tx, skb);
787 return 1;
788 }
789
790 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
791 skb = vlan_untag(skb);
792 if (unlikely(!skb))
793 goto out;
794 }
795
796 proto = ntohs(eth_hdr(skb)->h_proto);
797 if (proto != ETH_P_IP && proto != ETH_P_IPV6)
798 goto out;
799 if (skb->pkt_type == PACKET_OTHERHOST)
800 goto out;
801 if (skb_shared(skb))
802 goto out;
803
804 if (proto == ETH_P_IP) {
805 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
806 goto out;
807 iph = (struct iphdr *)skb->data;
808 if (iph->ihl < 5 || iph->version != 4)
809 goto out;
810 if (!pskb_may_pull(skb, iph->ihl*4))
811 goto out;
812 iph = (struct iphdr *)skb->data;
813 if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
814 goto out;
815
816 len = ntohs(iph->tot_len);
817 if (skb->len < len || len < iph->ihl*4)
818 goto out;
819
820 /*
821 * Our transport medium may have padded the buffer out.
822 * Now We trim to the true length of the frame.
823 */
824 if (pskb_trim_rcsum(skb, len))
825 goto out;
826
827 iph = (struct iphdr *)skb->data;
828 if (iph->protocol != IPPROTO_UDP)
829 goto out;
830
831 len -= iph->ihl*4;
832 uh = (struct udphdr *)(((char *)iph) + iph->ihl*4);
833 offset = (unsigned char *)(uh + 1) - skb->data;
834 ulen = ntohs(uh->len);
835 data_len = skb->len - offset;
836 source = ntohs(uh->source);
837
838 if (ulen != len)
839 goto out;
840 if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr))
841 goto out;
842 list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
843 if (np->local_ip.ip && np->local_ip.ip != iph->daddr)
844 continue;
845 if (np->remote_ip.ip && np->remote_ip.ip != iph->saddr)
846 continue;
847 if (np->local_port && np->local_port != ntohs(uh->dest))
848 continue;
849
850 np->rx_skb_hook(np, source, skb, offset, data_len);
851 hits++;
852 }
853 } else {
854#if IS_ENABLED(CONFIG_IPV6)
855 const struct ipv6hdr *ip6h;
856
857 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
858 goto out;
859 ip6h = (struct ipv6hdr *)skb->data;
860 if (ip6h->version != 6)
861 goto out;
862 len = ntohs(ip6h->payload_len);
863 if (!len)
864 goto out;
865 if (len + sizeof(struct ipv6hdr) > skb->len)
866 goto out;
867 if (pskb_trim_rcsum(skb, len + sizeof(struct ipv6hdr)))
868 goto out;
869 ip6h = ipv6_hdr(skb);
870 if (!pskb_may_pull(skb, sizeof(struct udphdr)))
871 goto out;
872 uh = udp_hdr(skb);
873 offset = (unsigned char *)(uh + 1) - skb->data;
874 ulen = ntohs(uh->len);
875 data_len = skb->len - offset;
876 source = ntohs(uh->source);
877 if (ulen != skb->len)
878 goto out;
879 if (udp6_csum_init(skb, uh, IPPROTO_UDP))
880 goto out;
881 list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
882 if (!ipv6_addr_equal(&np->local_ip.in6, &ip6h->daddr))
883 continue;
884 if (!ipv6_addr_equal(&np->remote_ip.in6, &ip6h->saddr))
885 continue;
886 if (np->local_port && np->local_port != ntohs(uh->dest))
887 continue;
888
889 np->rx_skb_hook(np, source, skb, offset, data_len);
890 hits++;
891 }
892#endif
893 }
894
895 if (!hits)
896 goto out;
897
898 kfree_skb(skb);
899 return 1;
900
901out:
902 if (atomic_read(&trapped)) {
903 kfree_skb(skb);
904 return 1;
905 }
906
907 return 0;
908}
909
910void netpoll_print_options(struct netpoll *np) 479void netpoll_print_options(struct netpoll *np)
911{ 480{
912 np_info(np, "local port %d\n", np->local_port); 481 np_info(np, "local port %d\n", np->local_port);
@@ -1026,11 +595,10 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
1026} 595}
1027EXPORT_SYMBOL(netpoll_parse_options); 596EXPORT_SYMBOL(netpoll_parse_options);
1028 597
1029int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp) 598int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
1030{ 599{
1031 struct netpoll_info *npinfo; 600 struct netpoll_info *npinfo;
1032 const struct net_device_ops *ops; 601 const struct net_device_ops *ops;
1033 unsigned long flags;
1034 int err; 602 int err;
1035 603
1036 np->dev = ndev; 604 np->dev = ndev;
@@ -1046,18 +614,13 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)
1046 } 614 }
1047 615
1048 if (!ndev->npinfo) { 616 if (!ndev->npinfo) {
1049 npinfo = kmalloc(sizeof(*npinfo), gfp); 617 npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
1050 if (!npinfo) { 618 if (!npinfo) {
1051 err = -ENOMEM; 619 err = -ENOMEM;
1052 goto out; 620 goto out;
1053 } 621 }
1054 622
1055 npinfo->rx_flags = 0;
1056 INIT_LIST_HEAD(&npinfo->rx_np);
1057
1058 spin_lock_init(&npinfo->rx_lock);
1059 sema_init(&npinfo->dev_lock, 1); 623 sema_init(&npinfo->dev_lock, 1);
1060 skb_queue_head_init(&npinfo->neigh_tx);
1061 skb_queue_head_init(&npinfo->txq); 624 skb_queue_head_init(&npinfo->txq);
1062 INIT_DELAYED_WORK(&npinfo->tx_work, queue_process); 625 INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
1063 626
@@ -1065,7 +628,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)
1065 628
1066 ops = np->dev->netdev_ops; 629 ops = np->dev->netdev_ops;
1067 if (ops->ndo_netpoll_setup) { 630 if (ops->ndo_netpoll_setup) {
1068 err = ops->ndo_netpoll_setup(ndev, npinfo, gfp); 631 err = ops->ndo_netpoll_setup(ndev, npinfo);
1069 if (err) 632 if (err)
1070 goto free_npinfo; 633 goto free_npinfo;
1071 } 634 }
@@ -1076,13 +639,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)
1076 639
1077 npinfo->netpoll = np; 640 npinfo->netpoll = np;
1078 641
1079 if (np->rx_skb_hook) {
1080 spin_lock_irqsave(&npinfo->rx_lock, flags);
1081 npinfo->rx_flags |= NETPOLL_RX_ENABLED;
1082 list_add_tail(&np->rx, &npinfo->rx_np);
1083 spin_unlock_irqrestore(&npinfo->rx_lock, flags);
1084 }
1085
1086 /* last thing to do is link it to the net device structure */ 642 /* last thing to do is link it to the net device structure */
1087 rcu_assign_pointer(ndev->npinfo, npinfo); 643 rcu_assign_pointer(ndev->npinfo, npinfo);
1088 644
@@ -1204,7 +760,7 @@ int netpoll_setup(struct netpoll *np)
1204 /* fill up the skb queue */ 760 /* fill up the skb queue */
1205 refill_skbs(); 761 refill_skbs();
1206 762
1207 err = __netpoll_setup(np, ndev, GFP_KERNEL); 763 err = __netpoll_setup(np, ndev);
1208 if (err) 764 if (err)
1209 goto put; 765 goto put;
1210 766
@@ -1231,7 +787,6 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
1231 struct netpoll_info *npinfo = 787 struct netpoll_info *npinfo =
1232 container_of(rcu_head, struct netpoll_info, rcu); 788 container_of(rcu_head, struct netpoll_info, rcu);
1233 789
1234 skb_queue_purge(&npinfo->neigh_tx);
1235 skb_queue_purge(&npinfo->txq); 790 skb_queue_purge(&npinfo->txq);
1236 791
1237 /* we can't call cancel_delayed_work_sync here, as we are in softirq */ 792 /* we can't call cancel_delayed_work_sync here, as we are in softirq */
@@ -1247,7 +802,6 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
1247void __netpoll_cleanup(struct netpoll *np) 802void __netpoll_cleanup(struct netpoll *np)
1248{ 803{
1249 struct netpoll_info *npinfo; 804 struct netpoll_info *npinfo;
1250 unsigned long flags;
1251 805
1252 /* rtnl_dereference would be preferable here but 806 /* rtnl_dereference would be preferable here but
1253 * rcu_cleanup_netpoll path can put us in here safely without 807 * rcu_cleanup_netpoll path can put us in here safely without
@@ -1257,14 +811,6 @@ void __netpoll_cleanup(struct netpoll *np)
1257 if (!npinfo) 811 if (!npinfo)
1258 return; 812 return;
1259 813
1260 if (!list_empty(&npinfo->rx_np)) {
1261 spin_lock_irqsave(&npinfo->rx_lock, flags);
1262 list_del(&np->rx);
1263 if (list_empty(&npinfo->rx_np))
1264 npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
1265 spin_unlock_irqrestore(&npinfo->rx_lock, flags);
1266 }
1267
1268 synchronize_srcu(&netpoll_srcu); 814 synchronize_srcu(&netpoll_srcu);
1269 815
1270 if (atomic_dec_and_test(&npinfo->refcnt)) { 816 if (atomic_dec_and_test(&npinfo->refcnt)) {
@@ -1274,7 +820,7 @@ void __netpoll_cleanup(struct netpoll *np)
1274 if (ops->ndo_netpoll_cleanup) 820 if (ops->ndo_netpoll_cleanup)
1275 ops->ndo_netpoll_cleanup(np->dev); 821 ops->ndo_netpoll_cleanup(np->dev);
1276 822
1277 rcu_assign_pointer(np->dev->npinfo, NULL); 823 RCU_INIT_POINTER(np->dev->npinfo, NULL);
1278 call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info); 824 call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info);
1279 } 825 }
1280} 826}
@@ -1308,18 +854,3 @@ out:
1308 rtnl_unlock(); 854 rtnl_unlock();
1309} 855}
1310EXPORT_SYMBOL(netpoll_cleanup); 856EXPORT_SYMBOL(netpoll_cleanup);
1311
1312int netpoll_trap(void)
1313{
1314 return atomic_read(&trapped);
1315}
1316EXPORT_SYMBOL(netpoll_trap);
1317
1318void netpoll_set_trap(int trap)
1319{
1320 if (trap)
1321 atomic_inc(&trapped);
1322 else
1323 atomic_dec(&trapped);
1324}
1325EXPORT_SYMBOL(netpoll_set_trap);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index fdac61cac1bd..d0dac57291af 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -476,23 +476,22 @@ static int pgctrl_show(struct seq_file *seq, void *v)
476static ssize_t pgctrl_write(struct file *file, const char __user *buf, 476static ssize_t pgctrl_write(struct file *file, const char __user *buf,
477 size_t count, loff_t *ppos) 477 size_t count, loff_t *ppos)
478{ 478{
479 int err = 0;
480 char data[128]; 479 char data[128];
481 struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id); 480 struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id);
482 481
483 if (!capable(CAP_NET_ADMIN)) { 482 if (!capable(CAP_NET_ADMIN))
484 err = -EPERM; 483 return -EPERM;
485 goto out; 484
486 } 485 if (count == 0)
486 return -EINVAL;
487 487
488 if (count > sizeof(data)) 488 if (count > sizeof(data))
489 count = sizeof(data); 489 count = sizeof(data);
490 490
491 if (copy_from_user(data, buf, count)) { 491 if (copy_from_user(data, buf, count))
492 err = -EFAULT; 492 return -EFAULT;
493 goto out; 493
494 } 494 data[count - 1] = 0; /* Strip trailing '\n' and terminate string */
495 data[count - 1] = 0; /* Make string */
496 495
497 if (!strcmp(data, "stop")) 496 if (!strcmp(data, "stop"))
498 pktgen_stop_all_threads_ifs(pn); 497 pktgen_stop_all_threads_ifs(pn);
@@ -506,10 +505,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,
506 else 505 else
507 pr_warning("Unknown command: %s\n", data); 506 pr_warning("Unknown command: %s\n", data);
508 507
509 err = count; 508 return count;
510
511out:
512 return err;
513} 509}
514 510
515static int pgctrl_open(struct inode *inode, struct file *file) 511static int pgctrl_open(struct inode *inode, struct file *file)
@@ -1251,7 +1247,13 @@ static ssize_t pktgen_if_write(struct file *file,
1251 "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", 1247 "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
1252 f, 1248 f,
1253 "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, " 1249 "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, "
1254 "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC, NODE_ALLOC\n"); 1250 "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, "
1251 "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, "
1252 "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, "
1253#ifdef CONFIG_XFRM
1254 "IPSEC, "
1255#endif
1256 "NODE_ALLOC\n");
1255 return count; 1257 return count;
1256 } 1258 }
1257 sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags); 1259 sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c
new file mode 100644
index 000000000000..eaba0f68f860
--- /dev/null
+++ b/net/core/ptp_classifier.c
@@ -0,0 +1,141 @@
1/* PTP classifier
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12
13/* The below program is the bpf_asm (tools/net/) representation of
14 * the opcode array in the ptp_filter structure.
15 *
16 * For convenience, this can easily be altered and reviewed with
17 * bpf_asm and bpf_dbg, e.g. `./bpf_asm -c prog` where prog is a
18 * simple file containing the below program:
19 *
20 * ldh [12] ; load ethertype
21 *
22 * ; PTP over UDP over IPv4 over Ethernet
23 * test_ipv4:
24 * jneq #0x800, test_ipv6 ; ETH_P_IP ?
25 * ldb [23] ; load proto
26 * jneq #17, drop_ipv4 ; IPPROTO_UDP ?
27 * ldh [20] ; load frag offset field
28 * jset #0x1fff, drop_ipv4 ; don't allow fragments
29 * ldxb 4*([14]&0xf) ; load IP header len
30 * ldh [x + 16] ; load UDP dst port
31 * jneq #319, drop_ipv4 ; is port PTP_EV_PORT ?
32 * ldh [x + 22] ; load payload
33 * and #0xf ; mask PTP_CLASS_VMASK
34 * or #0x10 ; PTP_CLASS_IPV4
35 * ret a ; return PTP class
36 * drop_ipv4: ret #0x0 ; PTP_CLASS_NONE
37 *
38 * ; PTP over UDP over IPv6 over Ethernet
39 * test_ipv6:
40 * jneq #0x86dd, test_8021q ; ETH_P_IPV6 ?
41 * ldb [20] ; load proto
42 * jneq #17, drop_ipv6 ; IPPROTO_UDP ?
43 * ldh [56] ; load UDP dst port
44 * jneq #319, drop_ipv6 ; is port PTP_EV_PORT ?
45 * ldh [62] ; load payload
46 * and #0xf ; mask PTP_CLASS_VMASK
47 * or #0x20 ; PTP_CLASS_IPV6
48 * ret a ; return PTP class
49 * drop_ipv6: ret #0x0 ; PTP_CLASS_NONE
50 *
51 * ; PTP over 802.1Q over Ethernet
52 * test_8021q:
53 * jneq #0x8100, test_ieee1588 ; ETH_P_8021Q ?
54 * ldh [16] ; load inner type
55 * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ?
56 * ldb [18] ; load payload
57 * and #0x8 ; as we don't have ports here, test
58 * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
59 * ldh [18] ; reload payload
60 * and #0xf ; mask PTP_CLASS_VMASK
61 * or #0x40 ; PTP_CLASS_V2_VLAN
62 * ret a ; return PTP class
63 *
64 * ; PTP over Ethernet
65 * test_ieee1588:
66 * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ?
67 * ldb [14] ; load payload
68 * and #0x8 ; as we don't have ports here, test
69 * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
70 * ldh [14] ; reload payload
71 * and #0xf ; mask PTP_CLASS_VMASK
72 * or #0x30 ; PTP_CLASS_L2
73 * ret a ; return PTP class
74 * drop_ieee1588: ret #0x0 ; PTP_CLASS_NONE
75 */
76
77#include <linux/skbuff.h>
78#include <linux/filter.h>
79#include <linux/ptp_classify.h>
80
81static struct sk_filter *ptp_insns __read_mostly;
82
83unsigned int ptp_classify_raw(const struct sk_buff *skb)
84{
85 return SK_RUN_FILTER(ptp_insns, skb);
86}
87EXPORT_SYMBOL_GPL(ptp_classify_raw);
88
89void __init ptp_classifier_init(void)
90{
91 static struct sock_filter ptp_filter[] = {
92 { 0x28, 0, 0, 0x0000000c },
93 { 0x15, 0, 12, 0x00000800 },
94 { 0x30, 0, 0, 0x00000017 },
95 { 0x15, 0, 9, 0x00000011 },
96 { 0x28, 0, 0, 0x00000014 },
97 { 0x45, 7, 0, 0x00001fff },
98 { 0xb1, 0, 0, 0x0000000e },
99 { 0x48, 0, 0, 0x00000010 },
100 { 0x15, 0, 4, 0x0000013f },
101 { 0x48, 0, 0, 0x00000016 },
102 { 0x54, 0, 0, 0x0000000f },
103 { 0x44, 0, 0, 0x00000010 },
104 { 0x16, 0, 0, 0x00000000 },
105 { 0x06, 0, 0, 0x00000000 },
106 { 0x15, 0, 9, 0x000086dd },
107 { 0x30, 0, 0, 0x00000014 },
108 { 0x15, 0, 6, 0x00000011 },
109 { 0x28, 0, 0, 0x00000038 },
110 { 0x15, 0, 4, 0x0000013f },
111 { 0x28, 0, 0, 0x0000003e },
112 { 0x54, 0, 0, 0x0000000f },
113 { 0x44, 0, 0, 0x00000020 },
114 { 0x16, 0, 0, 0x00000000 },
115 { 0x06, 0, 0, 0x00000000 },
116 { 0x15, 0, 9, 0x00008100 },
117 { 0x28, 0, 0, 0x00000010 },
118 { 0x15, 0, 15, 0x000088f7 },
119 { 0x30, 0, 0, 0x00000012 },
120 { 0x54, 0, 0, 0x00000008 },
121 { 0x15, 0, 12, 0x00000000 },
122 { 0x28, 0, 0, 0x00000012 },
123 { 0x54, 0, 0, 0x0000000f },
124 { 0x44, 0, 0, 0x00000040 },
125 { 0x16, 0, 0, 0x00000000 },
126 { 0x15, 0, 7, 0x000088f7 },
127 { 0x30, 0, 0, 0x0000000e },
128 { 0x54, 0, 0, 0x00000008 },
129 { 0x15, 0, 4, 0x00000000 },
130 { 0x28, 0, 0, 0x0000000e },
131 { 0x54, 0, 0, 0x0000000f },
132 { 0x44, 0, 0, 0x00000030 },
133 { 0x16, 0, 0, 0x00000000 },
134 { 0x06, 0, 0, 0x00000000 },
135 };
136 struct sock_fprog ptp_prog = {
137 .len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter,
138 };
139
140 BUG_ON(sk_unattached_filter_create(&ptp_insns, &ptp_prog));
141}
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 4425148d2b51..467f326126e0 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -221,5 +221,4 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
221out: 221out:
222 spin_unlock_bh(&fastopenq->lock); 222 spin_unlock_bh(&fastopenq->lock);
223 sock_put(lsk); 223 sock_put(lsk);
224 return;
225} 224}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 120eecc0f5a4..d4ff41739b0f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -822,6 +822,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
822 + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ 822 + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
823 + nla_total_size(1) /* IFLA_OPERSTATE */ 823 + nla_total_size(1) /* IFLA_OPERSTATE */
824 + nla_total_size(1) /* IFLA_LINKMODE */ 824 + nla_total_size(1) /* IFLA_LINKMODE */
825 + nla_total_size(4) /* IFLA_CARRIER_CHANGES */
825 + nla_total_size(ext_filter_mask 826 + nla_total_size(ext_filter_mask
826 & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */ 827 & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */
827 + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ 828 + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
@@ -970,7 +971,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
970 (dev->qdisc && 971 (dev->qdisc &&
971 nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || 972 nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||
972 (dev->ifalias && 973 (dev->ifalias &&
973 nla_put_string(skb, IFLA_IFALIAS, dev->ifalias))) 974 nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) ||
975 nla_put_u32(skb, IFLA_CARRIER_CHANGES,
976 atomic_read(&dev->carrier_changes)))
974 goto nla_put_failure; 977 goto nla_put_failure;
975 978
976 if (1) { 979 if (1) {
@@ -1121,56 +1124,7 @@ nla_put_failure:
1121 return -EMSGSIZE; 1124 return -EMSGSIZE;
1122} 1125}
1123 1126
1124static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) 1127static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
1125{
1126 struct net *net = sock_net(skb->sk);
1127 int h, s_h;
1128 int idx = 0, s_idx;
1129 struct net_device *dev;
1130 struct hlist_head *head;
1131 struct nlattr *tb[IFLA_MAX+1];
1132 u32 ext_filter_mask = 0;
1133
1134 s_h = cb->args[0];
1135 s_idx = cb->args[1];
1136
1137 rcu_read_lock();
1138 cb->seq = net->dev_base_seq;
1139
1140 if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX,
1141 ifla_policy) >= 0) {
1142
1143 if (tb[IFLA_EXT_MASK])
1144 ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
1145 }
1146
1147 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
1148 idx = 0;
1149 head = &net->dev_index_head[h];
1150 hlist_for_each_entry_rcu(dev, head, index_hlist) {
1151 if (idx < s_idx)
1152 goto cont;
1153 if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
1154 NETLINK_CB(cb->skb).portid,
1155 cb->nlh->nlmsg_seq, 0,
1156 NLM_F_MULTI,
1157 ext_filter_mask) <= 0)
1158 goto out;
1159
1160 nl_dump_check_consistent(cb, nlmsg_hdr(skb));
1161cont:
1162 idx++;
1163 }
1164 }
1165out:
1166 rcu_read_unlock();
1167 cb->args[1] = idx;
1168 cb->args[0] = h;
1169
1170 return skb->len;
1171}
1172
1173const struct nla_policy ifla_policy[IFLA_MAX+1] = {
1174 [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, 1128 [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 },
1175 [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, 1129 [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
1176 [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, 1130 [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
@@ -1196,8 +1150,8 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
1196 [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, 1150 [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 },
1197 [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, 1151 [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 },
1198 [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN }, 1152 [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN },
1153 [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */
1199}; 1154};
1200EXPORT_SYMBOL(ifla_policy);
1201 1155
1202static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { 1156static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
1203 [IFLA_INFO_KIND] = { .type = NLA_STRING }, 1157 [IFLA_INFO_KIND] = { .type = NLA_STRING },
@@ -1235,6 +1189,61 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
1235 [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, 1189 [IFLA_PORT_RESPONSE] = { .type = NLA_U16, },
1236}; 1190};
1237 1191
1192static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
1193{
1194 struct net *net = sock_net(skb->sk);
1195 int h, s_h;
1196 int idx = 0, s_idx;
1197 struct net_device *dev;
1198 struct hlist_head *head;
1199 struct nlattr *tb[IFLA_MAX+1];
1200 u32 ext_filter_mask = 0;
1201
1202 s_h = cb->args[0];
1203 s_idx = cb->args[1];
1204
1205 rcu_read_lock();
1206 cb->seq = net->dev_base_seq;
1207
1208 if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX,
1209 ifla_policy) >= 0) {
1210
1211 if (tb[IFLA_EXT_MASK])
1212 ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
1213 }
1214
1215 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
1216 idx = 0;
1217 head = &net->dev_index_head[h];
1218 hlist_for_each_entry_rcu(dev, head, index_hlist) {
1219 if (idx < s_idx)
1220 goto cont;
1221 if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
1222 NETLINK_CB(cb->skb).portid,
1223 cb->nlh->nlmsg_seq, 0,
1224 NLM_F_MULTI,
1225 ext_filter_mask) <= 0)
1226 goto out;
1227
1228 nl_dump_check_consistent(cb, nlmsg_hdr(skb));
1229cont:
1230 idx++;
1231 }
1232 }
1233out:
1234 rcu_read_unlock();
1235 cb->args[1] = idx;
1236 cb->args[0] = h;
1237
1238 return skb->len;
1239}
1240
1241int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len)
1242{
1243 return nla_parse(tb, IFLA_MAX, head, len, ifla_policy);
1244}
1245EXPORT_SYMBOL(rtnl_nla_parse_ifla);
1246
1238struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) 1247struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
1239{ 1248{
1240 struct net *net; 1249 struct net *net;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 90b96a11b974..30c7d35dd862 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3300,6 +3300,32 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
3300 return elt; 3300 return elt;
3301} 3301}
3302 3302
3303/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given
3304 * sglist without mark the sg which contain last skb data as the end.
3305 * So the caller can mannipulate sg list as will when padding new data after
3306 * the first call without calling sg_unmark_end to expend sg list.
3307 *
3308 * Scenario to use skb_to_sgvec_nomark:
3309 * 1. sg_init_table
3310 * 2. skb_to_sgvec_nomark(payload1)
3311 * 3. skb_to_sgvec_nomark(payload2)
3312 *
3313 * This is equivalent to:
3314 * 1. sg_init_table
3315 * 2. skb_to_sgvec(payload1)
3316 * 3. sg_unmark_end
3317 * 4. skb_to_sgvec(payload2)
3318 *
3319 * When mapping mutilple payload conditionally, skb_to_sgvec_nomark
3320 * is more preferable.
3321 */
3322int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
3323 int offset, int len)
3324{
3325 return __skb_to_sgvec(skb, sg, offset, len);
3326}
3327EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);
3328
3303int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) 3329int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
3304{ 3330{
3305 int nsg = __skb_to_sgvec(skb, sg, offset, len); 3331 int nsg = __skb_to_sgvec(skb, sg, offset, len);
@@ -3562,15 +3588,47 @@ static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
3562 return 0; 3588 return 0;
3563} 3589}
3564 3590
3591#define MAX_TCP_HDR_LEN (15 * 4)
3592
3593static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
3594 typeof(IPPROTO_IP) proto,
3595 unsigned int off)
3596{
3597 switch (proto) {
3598 int err;
3599
3600 case IPPROTO_TCP:
3601 err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
3602 off + MAX_TCP_HDR_LEN);
3603 if (!err && !skb_partial_csum_set(skb, off,
3604 offsetof(struct tcphdr,
3605 check)))
3606 err = -EPROTO;
3607 return err ? ERR_PTR(err) : &tcp_hdr(skb)->check;
3608
3609 case IPPROTO_UDP:
3610 err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
3611 off + sizeof(struct udphdr));
3612 if (!err && !skb_partial_csum_set(skb, off,
3613 offsetof(struct udphdr,
3614 check)))
3615 err = -EPROTO;
3616 return err ? ERR_PTR(err) : &udp_hdr(skb)->check;
3617 }
3618
3619 return ERR_PTR(-EPROTO);
3620}
3621
3565/* This value should be large enough to cover a tagged ethernet header plus 3622/* This value should be large enough to cover a tagged ethernet header plus
3566 * maximally sized IP and TCP or UDP headers. 3623 * maximally sized IP and TCP or UDP headers.
3567 */ 3624 */
3568#define MAX_IP_HDR_LEN 128 3625#define MAX_IP_HDR_LEN 128
3569 3626
3570static int skb_checksum_setup_ip(struct sk_buff *skb, bool recalculate) 3627static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
3571{ 3628{
3572 unsigned int off; 3629 unsigned int off;
3573 bool fragment; 3630 bool fragment;
3631 __sum16 *csum;
3574 int err; 3632 int err;
3575 3633
3576 fragment = false; 3634 fragment = false;
@@ -3591,51 +3649,15 @@ static int skb_checksum_setup_ip(struct sk_buff *skb, bool recalculate)
3591 if (fragment) 3649 if (fragment)
3592 goto out; 3650 goto out;
3593 3651
3594 switch (ip_hdr(skb)->protocol) { 3652 csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
3595 case IPPROTO_TCP: 3653 if (IS_ERR(csum))
3596 err = skb_maybe_pull_tail(skb, 3654 return PTR_ERR(csum);
3597 off + sizeof(struct tcphdr),
3598 MAX_IP_HDR_LEN);
3599 if (err < 0)
3600 goto out;
3601
3602 if (!skb_partial_csum_set(skb, off,
3603 offsetof(struct tcphdr, check))) {
3604 err = -EPROTO;
3605 goto out;
3606 }
3607
3608 if (recalculate)
3609 tcp_hdr(skb)->check =
3610 ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
3611 ip_hdr(skb)->daddr,
3612 skb->len - off,
3613 IPPROTO_TCP, 0);
3614 break;
3615 case IPPROTO_UDP:
3616 err = skb_maybe_pull_tail(skb,
3617 off + sizeof(struct udphdr),
3618 MAX_IP_HDR_LEN);
3619 if (err < 0)
3620 goto out;
3621
3622 if (!skb_partial_csum_set(skb, off,
3623 offsetof(struct udphdr, check))) {
3624 err = -EPROTO;
3625 goto out;
3626 }
3627
3628 if (recalculate)
3629 udp_hdr(skb)->check =
3630 ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
3631 ip_hdr(skb)->daddr,
3632 skb->len - off,
3633 IPPROTO_UDP, 0);
3634 break;
3635 default:
3636 goto out;
3637 }
3638 3655
3656 if (recalculate)
3657 *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
3658 ip_hdr(skb)->daddr,
3659 skb->len - off,
3660 ip_hdr(skb)->protocol, 0);
3639 err = 0; 3661 err = 0;
3640 3662
3641out: 3663out:
@@ -3658,6 +3680,7 @@ static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
3658 unsigned int len; 3680 unsigned int len;
3659 bool fragment; 3681 bool fragment;
3660 bool done; 3682 bool done;
3683 __sum16 *csum;
3661 3684
3662 fragment = false; 3685 fragment = false;
3663 done = false; 3686 done = false;
@@ -3735,51 +3758,14 @@ static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
3735 if (!done || fragment) 3758 if (!done || fragment)
3736 goto out; 3759 goto out;
3737 3760
3738 switch (nexthdr) { 3761 csum = skb_checksum_setup_ip(skb, nexthdr, off);
3739 case IPPROTO_TCP: 3762 if (IS_ERR(csum))
3740 err = skb_maybe_pull_tail(skb, 3763 return PTR_ERR(csum);
3741 off + sizeof(struct tcphdr),
3742 MAX_IPV6_HDR_LEN);
3743 if (err < 0)
3744 goto out;
3745
3746 if (!skb_partial_csum_set(skb, off,
3747 offsetof(struct tcphdr, check))) {
3748 err = -EPROTO;
3749 goto out;
3750 }
3751
3752 if (recalculate)
3753 tcp_hdr(skb)->check =
3754 ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
3755 &ipv6_hdr(skb)->daddr,
3756 skb->len - off,
3757 IPPROTO_TCP, 0);
3758 break;
3759 case IPPROTO_UDP:
3760 err = skb_maybe_pull_tail(skb,
3761 off + sizeof(struct udphdr),
3762 MAX_IPV6_HDR_LEN);
3763 if (err < 0)
3764 goto out;
3765
3766 if (!skb_partial_csum_set(skb, off,
3767 offsetof(struct udphdr, check))) {
3768 err = -EPROTO;
3769 goto out;
3770 }
3771
3772 if (recalculate)
3773 udp_hdr(skb)->check =
3774 ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
3775 &ipv6_hdr(skb)->daddr,
3776 skb->len - off,
3777 IPPROTO_UDP, 0);
3778 break;
3779 default:
3780 goto out;
3781 }
3782 3764
3765 if (recalculate)
3766 *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
3767 &ipv6_hdr(skb)->daddr,
3768 skb->len - off, nexthdr, 0);
3783 err = 0; 3769 err = 0;
3784 3770
3785out: 3771out:
@@ -3797,7 +3783,7 @@ int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
3797 3783
3798 switch (skb->protocol) { 3784 switch (skb->protocol) {
3799 case htons(ETH_P_IP): 3785 case htons(ETH_P_IP):
3800 err = skb_checksum_setup_ip(skb, recalculate); 3786 err = skb_checksum_setup_ipv4(skb, recalculate);
3801 break; 3787 break;
3802 3788
3803 case htons(ETH_P_IPV6): 3789 case htons(ETH_P_IPV6):
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index a0e9cf6379de..d7af18859322 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -52,9 +52,10 @@ EXPORT_SYMBOL_GPL(sock_diag_put_meminfo);
52int sock_diag_put_filterinfo(struct user_namespace *user_ns, struct sock *sk, 52int sock_diag_put_filterinfo(struct user_namespace *user_ns, struct sock *sk,
53 struct sk_buff *skb, int attrtype) 53 struct sk_buff *skb, int attrtype)
54{ 54{
55 struct nlattr *attr; 55 struct sock_fprog_kern *fprog;
56 struct sk_filter *filter; 56 struct sk_filter *filter;
57 unsigned int len; 57 struct nlattr *attr;
58 unsigned int flen;
58 int err = 0; 59 int err = 0;
59 60
60 if (!ns_capable(user_ns, CAP_NET_ADMIN)) { 61 if (!ns_capable(user_ns, CAP_NET_ADMIN)) {
@@ -63,24 +64,20 @@ int sock_diag_put_filterinfo(struct user_namespace *user_ns, struct sock *sk,
63 } 64 }
64 65
65 rcu_read_lock(); 66 rcu_read_lock();
66
67 filter = rcu_dereference(sk->sk_filter); 67 filter = rcu_dereference(sk->sk_filter);
68 len = filter ? filter->len * sizeof(struct sock_filter) : 0; 68 if (!filter)
69 goto out;
69 70
70 attr = nla_reserve(skb, attrtype, len); 71 fprog = filter->orig_prog;
72 flen = sk_filter_proglen(fprog);
73
74 attr = nla_reserve(skb, attrtype, flen);
71 if (attr == NULL) { 75 if (attr == NULL) {
72 err = -EMSGSIZE; 76 err = -EMSGSIZE;
73 goto out; 77 goto out;
74 } 78 }
75 79
76 if (filter) { 80 memcpy(nla_data(attr), fprog->filter, flen);
77 struct sock_filter *fb = (struct sock_filter *)nla_data(attr);
78 int i;
79
80 for (i = 0; i < filter->len; i++, fb++)
81 sk_decode_filter(&filter->insns[i], fb);
82 }
83
84out: 81out:
85 rcu_read_unlock(); 82 rcu_read_unlock();
86 return err; 83 return err;
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index 661b5a40ec10..6521dfd8b7c8 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -23,16 +23,11 @@
23#include <linux/skbuff.h> 23#include <linux/skbuff.h>
24#include <linux/export.h> 24#include <linux/export.h>
25 25
26static struct sock_filter ptp_filter[] = {
27 PTP_FILTER
28};
29
30static unsigned int classify(const struct sk_buff *skb) 26static unsigned int classify(const struct sk_buff *skb)
31{ 27{
32 if (likely(skb->dev && 28 if (likely(skb->dev && skb->dev->phydev &&
33 skb->dev->phydev &&
34 skb->dev->phydev->drv)) 29 skb->dev->phydev->drv))
35 return sk_run_filter(skb, ptp_filter); 30 return ptp_classify_raw(skb);
36 else 31 else
37 return PTP_CLASS_NONE; 32 return PTP_CLASS_NONE;
38} 33}
@@ -60,11 +55,13 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)
60 if (likely(phydev->drv->txtstamp)) { 55 if (likely(phydev->drv->txtstamp)) {
61 if (!atomic_inc_not_zero(&sk->sk_refcnt)) 56 if (!atomic_inc_not_zero(&sk->sk_refcnt))
62 return; 57 return;
58
63 clone = skb_clone(skb, GFP_ATOMIC); 59 clone = skb_clone(skb, GFP_ATOMIC);
64 if (!clone) { 60 if (!clone) {
65 sock_put(sk); 61 sock_put(sk);
66 return; 62 return;
67 } 63 }
64
68 clone->sk = sk; 65 clone->sk = sk;
69 phydev->drv->txtstamp(phydev, clone, type); 66 phydev->drv->txtstamp(phydev, clone, type);
70 } 67 }
@@ -89,12 +86,15 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
89 } 86 }
90 87
91 *skb_hwtstamps(skb) = *hwtstamps; 88 *skb_hwtstamps(skb) = *hwtstamps;
89
92 serr = SKB_EXT_ERR(skb); 90 serr = SKB_EXT_ERR(skb);
93 memset(serr, 0, sizeof(*serr)); 91 memset(serr, 0, sizeof(*serr));
94 serr->ee.ee_errno = ENOMSG; 92 serr->ee.ee_errno = ENOMSG;
95 serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; 93 serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
96 skb->sk = NULL; 94 skb->sk = NULL;
95
97 err = sock_queue_err_skb(sk, skb); 96 err = sock_queue_err_skb(sk, skb);
97
98 sock_put(sk); 98 sock_put(sk);
99 if (err) 99 if (err)
100 kfree_skb(skb); 100 kfree_skb(skb);
@@ -132,8 +132,3 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)
132 return false; 132 return false;
133} 133}
134EXPORT_SYMBOL_GPL(skb_defer_rx_timestamp); 134EXPORT_SYMBOL_GPL(skb_defer_rx_timestamp);
135
136void __init skb_timestamping_init(void)
137{
138 BUG_ON(sk_chk_filter(ptp_filter, ARRAY_SIZE(ptp_filter)));
139}