path: root/net
author     Linus Torvalds <torvalds@linux-foundation.org>   2018-06-06 21:39:49 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2018-06-06 21:39:49 -0400
commit     1c8c5a9d38f607c0b6fd12c91cbe1a4418762a21 (patch)
tree       dcc97181d4d187252e0cc8fdf29d9b365fa3ffd0 /net
parent     285767604576148fc1be7fcd112e4a90eb0d6ad2 (diff)
parent     7170e6045a6a8b33f4fa5753589dc77b16198e2d (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:

 1) Add Maglev hashing scheduler to IPVS, from Inju Song.
 2) Lots of new TC subsystem tests from Roman Mashak.
 3) Add TCP zero copy receive and fix delayed acks and autotuning with SO_RCVLOWAT, from Eric Dumazet.
 4) Add XDP_REDIRECT support to mlx5 driver, from Jesper Dangaard Brouer.
 5) Add ttl inherit support to vxlan, from Hangbin Liu.
 6) Properly separate ipv6 routes into their logically independent components: fib6_info for the routing table, and fib6_nh for sets of nexthops, which thus can be shared. From David Ahern.
 7) Add bpf_xdp_adjust_tail helper, which can be used to generate ICMP messages from XDP programs. From Nikita V. Shirokov.
 8) Lots of long overdue cleanups to the r8169 driver, from Heiner Kallweit.
 9) Add BTF ("BPF Type Format"), from Martin KaFai Lau.
10) Add traffic condition monitoring to iwlwifi, from Luca Coelho.
11) Plumb extack down into fib_rules, from Roopa Prabhu.
12) Add Flower classifier offload support to igb, from Vinicius Costa Gomes.
13) Add UDP GSO support, from Willem de Bruijn.
14) Add documentation for eBPF helpers, from Quentin Monnet.
15) Add TLS tx offload to mlx5, from Ilya Lesokhin.
16) Allow applications to be given the number of bytes available to read on a socket via a control message returned from recvmsg(), from Soheil Hassas Yeganeh.
17) Add x86_32 eBPF JIT compiler, from Wang YanQing.
18) Add AF_XDP sockets, with zerocopy support infrastructure as well. From Björn Töpel.
19) Remove indirect load support from all of the BPF JITs and handle these operations in the verifier by translating them into native BPF instead. From Daniel Borkmann.
20) Add GRO support to ipv6 gre tunnels, from Eran Ben Elisha.
21) Allow XDP programs to do lookups in the main kernel routing tables for forwarding. From David Ahern.
22) Allow drivers to store hardware state into an ELF section of kernel dump vmcore files, and use it in cxgb4. From Rahul Lakkireddy.
23) Various RACK and loss detection improvements in TCP, from Yuchung Cheng.
24) Add TCP SACK compression, from Eric Dumazet.
25) Add User Mode Helper support and basic bpfilter infrastructure, from Alexei Starovoitov.
26) Support ports and protocol values in RTM_GETROUTE, from Roopa Prabhu.
27) Support bulking in ->ndo_xdp_xmit() API, from Jesper Dangaard Brouer.
28) Add lots of forwarding selftests, from Petr Machata.
29) Add generic network device failover driver, from Sridhar Samudrala.

* ra.kernel.org:/pub/scm/linux/kernel/git/davem/net-next: (1959 commits)
  strparser: Add __strp_unpause and use it in ktls.
  rxrpc: Fix terminal retransmission connection ID to include the channel
  net: hns3: Optimize PF CMDQ interrupt switching process
  net: hns3: Fix for VF mailbox receiving unknown message
  net: hns3: Fix for VF mailbox cannot receiving PF response
  bnx2x: use the right constant
  Revert "net: sched: cls: Fix offloading when ingress dev is vxlan"
  net: dsa: b53: Fix for brcm tag issue in Cygnus SoC
  enic: fix UDP rss bits
  netdev-FAQ: clarify DaveM's position for stable backports
  rtnetlink: validate attributes in do_setlink()
  mlxsw: Add extack messages for port_{un, }split failures
  netdevsim: Add extack error message for devlink reload
  devlink: Add extack to reload and port_{un, }split operations
  net: metrics: add proper netlink validation
  ipmr: fix error path when ipmr_new_table fails
  ip6mr: only set ip6mr_table from setsockopt when ip6mr_new_table succeeds
  net: hns3: remove unused hclgevf_cfg_func_mta_filter
  netfilter: provide udp*_lib_lookup for nf_tproxy
  qed*: Utilize FW 8.37.2.0
  ...
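One of the userspace-visible additions called out above, the UDP GSO support in item 13, is exposed to applications through the UDP_SEGMENT socket option. The sketch below is a minimal, hedged illustration of enabling it on a sending socket; the option name and value match what this cycle adds to include/uapi/linux/udp.h, but the program itself is an illustrative example, not code from the tree.

    /* Illustrative only: send one large buffer as a train of 1400-byte UDP
     * datagrams using the UDP_SEGMENT option merged in this cycle.
     */
    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    #ifndef SOL_UDP
    #define SOL_UDP 17              /* same value as IPPROTO_UDP */
    #endif
    #ifndef UDP_SEGMENT
    #define UDP_SEGMENT 103         /* from include/uapi/linux/udp.h */
    #endif

    int main(void)
    {
            int gso_size = 1400;            /* payload bytes per emitted datagram */
            char payload[1400 * 8];         /* will be split into 8 datagrams */
            struct sockaddr_in dst = {
                    .sin_family = AF_INET,
                    .sin_port = htons(9000),
                    .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
            };
            int fd = socket(AF_INET, SOCK_DGRAM, 0);

            if (fd < 0)
                    return 1;
            memset(payload, 0, sizeof(payload));

            /* Ask the stack to segment each write into gso_size chunks. */
            if (setsockopt(fd, SOL_UDP, UDP_SEGMENT, &gso_size, sizeof(gso_size)) < 0)
                    perror("UDP_SEGMENT (requires a kernel with this merge)");

            if (sendto(fd, payload, sizeof(payload), 0,
                       (struct sockaddr *)&dst, sizeof(dst)) < 0)
                    perror("sendto");

            close(fd);
            return 0;
    }

The net/ipv4/udp.c and net/ipv4/udp_offload.c entries in the diffstat below carry the kernel side of this feature.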
Diffstat (limited to 'net')
-rw-r--r--net/8021q/vlan.c11
-rw-r--r--net/8021q/vlan.h3
-rw-r--r--net/8021q/vlan_dev.c4
-rw-r--r--net/8021q/vlan_netlink.c45
-rw-r--r--net/9p/mod.c2
-rw-r--r--net/Kconfig22
-rw-r--r--net/Makefile6
-rw-r--r--net/batman-adv/Kconfig6
-rw-r--r--net/batman-adv/bat_v_elp.c15
-rw-r--r--net/batman-adv/main.h2
-rw-r--r--net/batman-adv/multicast.c29
-rw-r--r--net/batman-adv/soft-interface.c5
-rw-r--r--net/batman-adv/types.h23
-rw-r--r--net/bluetooth/hci_core.c54
-rw-r--r--net/bluetooth/hci_debugfs.c24
-rw-r--r--net/bluetooth/hci_event.c12
-rw-r--r--net/bluetooth/hci_request.c30
-rw-r--r--net/bluetooth/smp.c12
-rw-r--r--net/bpf/test_run.c3
-rw-r--r--net/bpfilter/Kconfig16
-rw-r--r--net/bpfilter/Makefile32
-rw-r--r--net/bpfilter/bpfilter_kern.c114
-rw-r--r--net/bpfilter/main.c63
-rw-r--r--net/bpfilter/msgfmt.h17
-rw-r--r--net/bridge/br.c16
-rw-r--r--net/bridge/br_fdb.c69
-rw-r--r--net/bridge/br_forward.c6
-rw-r--r--net/bridge/br_if.c11
-rw-r--r--net/bridge/br_input.c1
-rw-r--r--net/bridge/br_netlink.c9
-rw-r--r--net/bridge/br_private.h41
-rw-r--r--net/bridge/br_switchdev.c37
-rw-r--r--net/bridge/br_sysfs_if.c2
-rw-r--r--net/bridge/br_vlan.c144
-rw-r--r--net/bridge/netfilter/Kconfig7
-rw-r--r--net/bridge/netfilter/Makefile1
-rw-r--r--net/bridge/netfilter/ebtables.c63
-rw-r--r--net/bridge/netfilter/nft_meta_bridge.c135
-rw-r--r--net/core/Makefile2
-rw-r--r--net/core/dev.c148
-rw-r--r--net/core/devlink.c111
-rw-r--r--net/core/dst.c1
-rw-r--r--net/core/ethtool.c63
-rw-r--r--net/core/failover.c315
-rw-r--r--net/core/fib_rules.c495
-rw-r--r--net/core/filter.c1423
-rw-r--r--net/core/flow_dissector.c19
-rw-r--r--net/core/neighbour.c8
-rw-r--r--net/core/net-traces.c4
-rw-r--r--net/core/page_pool.c317
-rw-r--r--net/core/rtnetlink.c34
-rw-r--r--net/core/skbuff.c25
-rw-r--r--net/core/sock.c40
-rw-r--r--net/core/xdp.c299
-rw-r--r--net/dcb/dcbnl.c20
-rw-r--r--net/dccp/minisocks.c1
-rw-r--r--net/decnet/dn_rules.c7
-rw-r--r--net/dsa/Kconfig2
-rw-r--r--net/dsa/dsa2.c24
-rw-r--r--net/dsa/dsa_priv.h9
-rw-r--r--net/dsa/master.c62
-rw-r--r--net/dsa/port.c96
-rw-r--r--net/dsa/slave.c307
-rw-r--r--net/ethernet/eth.c6
-rw-r--r--net/ipv4/Makefile5
-rw-r--r--net/ipv4/af_inet.c5
-rw-r--r--net/ipv4/bpfilter/Makefile2
-rw-r--r--net/ipv4/bpfilter/sockopt.c43
-rw-r--r--net/ipv4/devinet.c15
-rw-r--r--net/ipv4/fib_frontend.c58
-rw-r--r--net/ipv4/fib_rules.c7
-rw-r--r--net/ipv4/fib_semantics.c45
-rw-r--r--net/ipv4/fib_trie.c14
-rw-r--r--net/ipv4/inet_connection_sock.c5
-rw-r--r--net/ipv4/ip_gre.c12
-rw-r--r--net/ipv4/ip_output.c45
-rw-r--r--net/ipv4/ip_sockglue.c17
-rw-r--r--net/ipv4/ip_tunnel_core.c6
-rw-r--r--net/ipv4/ipconfig.c150
-rw-r--r--net/ipv4/ipmr.c3
-rw-r--r--net/ipv4/ipmr_base.c8
-rw-r--r--net/ipv4/metrics.c55
-rw-r--r--net/ipv4/netfilter/Kconfig10
-rw-r--r--net/ipv4/netfilter/Makefile5
-rw-r--r--net/ipv4/netfilter/ip_tables.c7
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c2
-rw-r--r--net/ipv4/netfilter/iptable_nat.c88
-rw-r--r--net/ipv4/netfilter/nf_flow_table_ipv4.c255
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c143
-rw-r--r--net/ipv4/netfilter/nf_nat_masquerade_ipv4.c8
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_gre.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_icmp.c2
-rw-r--r--net/ipv4/netfilter/nf_tproxy_ipv4.c147
-rw-r--r--net/ipv4/netfilter/nft_chain_nat_ipv4.c53
-rw-r--r--net/ipv4/netfilter/nft_masq_ipv4.c2
-rw-r--r--net/ipv4/netlink.c23
-rw-r--r--net/ipv4/proc.c3
-rw-r--r--net/ipv4/route.c177
-rw-r--r--net/ipv4/sysctl_net_ipv4.c22
-rw-r--r--net/ipv4/tcp.c204
-rw-r--r--net/ipv4/tcp_input.c271
-rw-r--r--net/ipv4/tcp_ipv4.c55
-rw-r--r--net/ipv4/tcp_minisocks.c2
-rw-r--r--net/ipv4/tcp_output.c112
-rw-r--r--net/ipv4/tcp_recovery.c80
-rw-r--r--net/ipv4/tcp_timer.c27
-rw-r--r--net/ipv4/udp.c120
-rw-r--r--net/ipv4/udp_offload.c101
-rw-r--r--net/ipv6/Kconfig5
-rw-r--r--net/ipv6/addrconf.c527
-rw-r--r--net/ipv6/addrconf_core.c41
-rw-r--r--net/ipv6/af_inet6.c65
-rw-r--r--net/ipv6/anycast.c33
-rw-r--r--net/ipv6/exthdrs.c55
-rw-r--r--net/ipv6/exthdrs_core.c2
-rw-r--r--net/ipv6/fib6_rules.c145
-rw-r--r--net/ipv6/ip6_fib.c639
-rw-r--r--net/ipv6/ip6_gre.c51
-rw-r--r--net/ipv6/ip6_input.c2
-rw-r--r--net/ipv6/ip6_offload.c6
-rw-r--r--net/ipv6/ip6_output.c96
-rw-r--r--net/ipv6/ip6_vti.c2
-rw-r--r--net/ipv6/ip6mr.c24
-rw-r--r--net/ipv6/ndisc.c48
-rw-r--r--net/ipv6/netfilter/Kconfig10
-rw-r--r--net/ipv6/netfilter/Makefile3
-rw-r--r--net/ipv6/netfilter/ip6_tables.c6
-rw-r--r--net/ipv6/netfilter/ip6t_MASQUERADE.c2
-rw-r--r--net/ipv6/netfilter/ip6t_rpfilter.c2
-rw-r--r--net/ipv6/netfilter/ip6t_srh.c173
-rw-r--r--net/ipv6/netfilter/ip6table_nat.c87
-rw-r--r--net/ipv6/netfilter/nf_flow_table_ipv6.c246
-rw-r--r--net/ipv6/netfilter/nf_nat_l3proto_ipv6.c137
-rw-r--r--net/ipv6/netfilter/nf_nat_masquerade_ipv6.c8
-rw-r--r--net/ipv6/netfilter/nf_nat_proto_icmpv6.c2
-rw-r--r--net/ipv6/netfilter/nf_tproxy_ipv6.c146
-rw-r--r--net/ipv6/netfilter/nft_chain_nat_ipv6.c51
-rw-r--r--net/ipv6/netfilter/nft_masq_ipv6.c2
-rw-r--r--net/ipv6/netfilter/nft_redir_ipv6.c2
-rw-r--r--net/ipv6/reassembly.c25
-rw-r--r--net/ipv6/route.c1895
-rw-r--r--net/ipv6/seg6.c1
-rw-r--r--net/ipv6/seg6_iptunnel.c24
-rw-r--r--net/ipv6/seg6_local.c190
-rw-r--r--net/ipv6/sysctl_net_ipv6.c8
-rw-r--r--net/ipv6/tcp_ipv6.c8
-rw-r--r--net/ipv6/udp.c72
-rw-r--r--net/ipv6/udp_offload.c5
-rw-r--r--net/ipv6/xfrm6_policy.c2
-rw-r--r--net/ipv6/xfrm6_state.c6
-rw-r--r--net/l2tp/l2tp_debugfs.c20
-rw-r--r--net/l2tp/l2tp_ppp.c56
-rw-r--r--net/mac80211/cfg.c103
-rw-r--r--net/mac80211/driver-ops.h8
-rw-r--r--net/mac80211/ethtool.c13
-rw-r--r--net/mac80211/ht.c44
-rw-r--r--net/mac80211/ieee80211_i.h3
-rw-r--r--net/mac80211/main.c3
-rw-r--r--net/mac80211/mlme.c17
-rw-r--r--net/mac80211/rx.c40
-rw-r--r--net/mac80211/sta_info.c38
-rw-r--r--net/mac80211/sta_info.h5
-rw-r--r--net/mac80211/status.c2
-rw-r--r--net/mac80211/trace.h25
-rw-r--r--net/mac80211/tx.c45
-rw-r--r--net/mac80211/util.c6
-rw-r--r--net/ncsi/internal.h34
-rw-r--r--net/ncsi/ncsi-manage.c226
-rw-r--r--net/ncsi/ncsi-netlink.c21
-rw-r--r--net/ncsi/ncsi-rsp.c179
-rw-r--r--net/netfilter/Kconfig51
-rw-r--r--net/netfilter/Makefile12
-rw-r--r--net/netfilter/core.c102
-rw-r--r--net/netfilter/ipvs/Kconfig37
-rw-r--r--net/netfilter/ipvs/Makefile1
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c24
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_dh.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c467
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_mh.c540
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c101
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c8
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c5
-rw-r--r--net/netfilter/nf_conncount.c36
-rw-r--r--net/netfilter/nf_conntrack_core.c92
-rw-r--r--net/netfilter/nf_conntrack_ftp.c3
-rw-r--r--net/netfilter/nf_conntrack_irc.c6
-rw-r--r--net/netfilter/nf_conntrack_netlink.c13
-rw-r--r--net/netfilter/nf_conntrack_sane.c3
-rw-r--r--net/netfilter/nf_conntrack_sip.c2
-rw-r--r--net/netfilter/nf_conntrack_tftp.c2
-rw-r--r--net/netfilter/nf_flow_table_core.c (renamed from net/netfilter/nf_flow_table.c)309
-rw-r--r--net/netfilter/nf_flow_table_inet.c3
-rw-r--r--net/netfilter/nf_flow_table_ip.c489
-rw-r--r--net/netfilter/nf_internals.h5
-rw-r--r--net/netfilter/nf_nat_core.c321
-rw-r--r--net/netfilter/nf_nat_helper.c2
-rw-r--r--net/netfilter/nf_nat_proto_common.c9
-rw-r--r--net/netfilter/nf_nat_proto_dccp.c2
-rw-r--r--net/netfilter/nf_nat_proto_sctp.c2
-rw-r--r--net/netfilter/nf_nat_proto_tcp.c2
-rw-r--r--net/netfilter/nf_nat_proto_udp.c4
-rw-r--r--net/netfilter/nf_nat_proto_unknown.c2
-rw-r--r--net/netfilter/nf_nat_redirect.c10
-rw-r--r--net/netfilter/nf_nat_sip.c2
-rw-r--r--net/netfilter/nf_osf.c218
-rw-r--r--net/netfilter/nf_tables_api.c1325
-rw-r--r--net/netfilter/nf_tables_core.c72
-rw-r--r--net/netfilter/nfnetlink.c44
-rw-r--r--net/netfilter/nfnetlink_log.c8
-rw-r--r--net/netfilter/nfnetlink_queue.c28
-rw-r--r--net/netfilter/nft_compat.c29
-rw-r--r--net/netfilter/nft_connlimit.c297
-rw-r--r--net/netfilter/nft_counter.c4
-rw-r--r--net/netfilter/nft_ct.c3
-rw-r--r--net/netfilter/nft_dynset.c16
-rw-r--r--net/netfilter/nft_exthdr.c23
-rw-r--r--net/netfilter/nft_flow_offload.c5
-rw-r--r--net/netfilter/nft_fwd_netdev.c146
-rw-r--r--net/netfilter/nft_hash.c127
-rw-r--r--net/netfilter/nft_immediate.c27
-rw-r--r--net/netfilter/nft_log.c92
-rw-r--r--net/netfilter/nft_lookup.c47
-rw-r--r--net/netfilter/nft_meta.c112
-rw-r--r--net/netfilter/nft_nat.c2
-rw-r--r--net/netfilter/nft_numgen.c158
-rw-r--r--net/netfilter/nft_objref.c4
-rw-r--r--net/netfilter/nft_rt.c22
-rw-r--r--net/netfilter/nft_set_bitmap.c34
-rw-r--r--net/netfilter/nft_set_hash.c174
-rw-r--r--net/netfilter/nft_set_rbtree.c109
-rw-r--r--net/netfilter/nft_socket.c144
-rw-r--r--net/netfilter/xt_NETMAP.c8
-rw-r--r--net/netfilter/xt_NFLOG.c15
-rw-r--r--net/netfilter/xt_REDIRECT.c2
-rw-r--r--net/netfilter/xt_TPROXY.c366
-rw-r--r--net/netfilter/xt_nat.c72
-rw-r--r--net/netfilter/xt_osf.c202
-rw-r--r--net/netfilter/xt_socket.c4
-rw-r--r--net/nfc/netlink.c17
-rw-r--r--net/openvswitch/Kconfig3
-rw-r--r--net/openvswitch/conntrack.c555
-rw-r--r--net/openvswitch/conntrack.h9
-rw-r--r--net/openvswitch/datapath.c7
-rw-r--r--net/openvswitch/datapath.h3
-rw-r--r--net/packet/af_packet.c44
-rw-r--r--net/qrtr/Kconfig7
-rw-r--r--net/qrtr/Makefile2
-rw-r--r--net/qrtr/tun.c161
-rw-r--r--net/rfkill/core.c66
-rw-r--r--net/rxrpc/ar-internal.h2
-rw-r--r--net/rxrpc/call_event.c8
-rw-r--r--net/rxrpc/conn_event.c2
-rw-r--r--net/rxrpc/input.c10
-rw-r--r--net/sched/act_api.c20
-rw-r--r--net/sched/act_csum.c6
-rw-r--r--net/sched/cls_api.c443
-rw-r--r--net/sched/cls_basic.c24
-rw-r--r--net/sched/cls_bpf.c22
-rw-r--r--net/sched/cls_cgroup.c23
-rw-r--r--net/sched/cls_flow.c24
-rw-r--r--net/sched/cls_flower.c317
-rw-r--r--net/sched/cls_fw.c24
-rw-r--r--net/sched/cls_matchall.c21
-rw-r--r--net/sched/cls_route.c23
-rw-r--r--net/sched/cls_rsvp.h20
-rw-r--r--net/sched/cls_tcindex.c41
-rw-r--r--net/sched/cls_u32.c37
-rw-r--r--net/sched/sch_generic.c49
-rw-r--r--net/sched/sch_mq.c37
-rw-r--r--net/sctp/associola.c85
-rw-r--r--net/sctp/chunk.c12
-rw-r--r--net/sctp/output.c28
-rw-r--r--net/sctp/outqueue.c660
-rw-r--r--net/sctp/sm_make_chunk.c143
-rw-r--r--net/sctp/socket.c43
-rw-r--r--net/sctp/transport.c39
-rw-r--r--net/smc/af_smc.c803
-rw-r--r--net/smc/smc.h68
-rw-r--r--net/smc/smc_cdc.c101
-rw-r--r--net/smc/smc_cdc.h15
-rw-r--r--net/smc/smc_clc.c6
-rw-r--r--net/smc/smc_clc.h2
-rw-r--r--net/smc/smc_core.c199
-rw-r--r--net/smc/smc_core.h29
-rw-r--r--net/smc/smc_diag.c44
-rw-r--r--net/smc/smc_ib.c13
-rw-r--r--net/smc/smc_llc.c242
-rw-r--r--net/smc/smc_llc.h8
-rw-r--r--net/smc/smc_rx.c308
-rw-r--r--net/smc/smc_rx.h11
-rw-r--r--net/smc/smc_tx.c111
-rw-r--r--net/smc/smc_tx.h5
-rw-r--r--net/smc/smc_wr.c1
-rw-r--r--net/strparser/strparser.c13
-rw-r--r--net/tipc/bearer.c29
-rw-r--r--net/tipc/bearer.h3
-rw-r--r--net/tipc/name_table.c103
-rw-r--r--net/tipc/node.c33
-rw-r--r--net/tipc/node.h3
-rw-r--r--net/tipc/socket.c13
-rw-r--r--net/tipc/udp_media.c4
-rw-r--r--net/tipc/udp_media.h14
-rw-r--r--net/tls/Kconfig10
-rw-r--r--net/tls/Makefile2
-rw-r--r--net/tls/tls_device.c766
-rw-r--r--net/tls/tls_device_fallback.c450
-rw-r--r--net/tls/tls_main.c139
-rw-r--r--net/tls/tls_sw.c143
-rw-r--r--net/wireless/core.c4
-rw-r--r--net/wireless/nl80211.c304
-rw-r--r--net/wireless/rdev-ops.h12
-rw-r--r--net/wireless/reg.c39
-rw-r--r--net/wireless/sme.c88
-rw-r--r--net/wireless/trace.h14
-rw-r--r--net/wireless/util.c11
-rw-r--r--net/xdp/Kconfig7
-rw-r--r--net/xdp/Makefile1
-rw-r--r--net/xdp/xdp_umem.c361
-rw-r--r--net/xdp/xdp_umem.h30
-rw-r--r--net/xdp/xdp_umem_props.h14
-rw-r--r--net/xdp/xsk.c788
-rw-r--r--net/xdp/xsk_queue.c63
-rw-r--r--net/xdp/xsk_queue.h265
-rw-r--r--net/xfrm/xfrm_state.c9
332 files changed, 20336 insertions, 8128 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 5505ee6ebdbe..73a65789271b 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -118,17 +118,21 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
118} 118}
119 119
120int vlan_check_real_dev(struct net_device *real_dev, 120int vlan_check_real_dev(struct net_device *real_dev,
121 __be16 protocol, u16 vlan_id) 121 __be16 protocol, u16 vlan_id,
122 struct netlink_ext_ack *extack)
122{ 123{
123 const char *name = real_dev->name; 124 const char *name = real_dev->name;
124 125
125 if (real_dev->features & NETIF_F_VLAN_CHALLENGED) { 126 if (real_dev->features & NETIF_F_VLAN_CHALLENGED) {
126 pr_info("VLANs not supported on %s\n", name); 127 pr_info("VLANs not supported on %s\n", name);
128 NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device");
127 return -EOPNOTSUPP; 129 return -EOPNOTSUPP;
128 } 130 }
129 131
130 if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) 132 if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) {
133 NL_SET_ERR_MSG_MOD(extack, "VLAN device already exists");
131 return -EEXIST; 134 return -EEXIST;
135 }
132 136
133 return 0; 137 return 0;
134} 138}
@@ -215,7 +219,8 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
215 if (vlan_id >= VLAN_VID_MASK) 219 if (vlan_id >= VLAN_VID_MASK)
216 return -ERANGE; 220 return -ERANGE;
217 221
218 err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id); 222 err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id,
223 NULL);
219 if (err < 0) 224 if (err < 0)
220 return err; 225 return err;
221 226
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index e23aac3e4d37..44df1c3df02d 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -109,7 +109,8 @@ int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask);
109void vlan_dev_get_realdev_name(const struct net_device *dev, char *result); 109void vlan_dev_get_realdev_name(const struct net_device *dev, char *result);
110 110
111int vlan_check_real_dev(struct net_device *real_dev, 111int vlan_check_real_dev(struct net_device *real_dev,
112 __be16 protocol, u16 vlan_id); 112 __be16 protocol, u16 vlan_id,
113 struct netlink_ext_ack *extack);
113void vlan_setup(struct net_device *dev); 114void vlan_setup(struct net_device *dev);
114int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack); 115int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack);
115void unregister_vlan_dev(struct net_device *dev, struct list_head *head); 116void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 236452ebbd9e..546af0e73ac3 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -215,7 +215,9 @@ int vlan_dev_set_egress_priority(const struct net_device *dev,
215 return 0; 215 return 0;
216} 216}
217 217
218/* Flags are defined in the vlan_flags enum in include/linux/if_vlan.h file. */ 218/* Flags are defined in the vlan_flags enum in
219 * include/uapi/linux/if_vlan.h file.
220 */
219int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask) 221int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask)
220{ 222{
221 struct vlan_dev_priv *vlan = vlan_dev_priv(dev); 223 struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index 6689c0b272a7..9b60c1e399e2 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -47,14 +47,20 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[],
47 int err; 47 int err;
48 48
49 if (tb[IFLA_ADDRESS]) { 49 if (tb[IFLA_ADDRESS]) {
50 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 50 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
51 NL_SET_ERR_MSG_MOD(extack, "Invalid link address");
51 return -EINVAL; 52 return -EINVAL;
52 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 53 }
54 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
55 NL_SET_ERR_MSG_MOD(extack, "Invalid link address");
53 return -EADDRNOTAVAIL; 56 return -EADDRNOTAVAIL;
57 }
54 } 58 }
55 59
56 if (!data) 60 if (!data) {
61 NL_SET_ERR_MSG_MOD(extack, "VLAN properties not specified");
57 return -EINVAL; 62 return -EINVAL;
63 }
58 64
59 if (data[IFLA_VLAN_PROTOCOL]) { 65 if (data[IFLA_VLAN_PROTOCOL]) {
60 switch (nla_get_be16(data[IFLA_VLAN_PROTOCOL])) { 66 switch (nla_get_be16(data[IFLA_VLAN_PROTOCOL])) {
@@ -62,29 +68,38 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[],
62 case htons(ETH_P_8021AD): 68 case htons(ETH_P_8021AD):
63 break; 69 break;
64 default: 70 default:
71 NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN protocol");
65 return -EPROTONOSUPPORT; 72 return -EPROTONOSUPPORT;
66 } 73 }
67 } 74 }
68 75
69 if (data[IFLA_VLAN_ID]) { 76 if (data[IFLA_VLAN_ID]) {
70 id = nla_get_u16(data[IFLA_VLAN_ID]); 77 id = nla_get_u16(data[IFLA_VLAN_ID]);
71 if (id >= VLAN_VID_MASK) 78 if (id >= VLAN_VID_MASK) {
79 NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN id");
72 return -ERANGE; 80 return -ERANGE;
81 }
73 } 82 }
74 if (data[IFLA_VLAN_FLAGS]) { 83 if (data[IFLA_VLAN_FLAGS]) {
75 flags = nla_data(data[IFLA_VLAN_FLAGS]); 84 flags = nla_data(data[IFLA_VLAN_FLAGS]);
76 if ((flags->flags & flags->mask) & 85 if ((flags->flags & flags->mask) &
77 ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP | 86 ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP |
78 VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP)) 87 VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP)) {
88 NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN flags");
79 return -EINVAL; 89 return -EINVAL;
90 }
80 } 91 }
81 92
82 err = vlan_validate_qos_map(data[IFLA_VLAN_INGRESS_QOS]); 93 err = vlan_validate_qos_map(data[IFLA_VLAN_INGRESS_QOS]);
83 if (err < 0) 94 if (err < 0) {
95 NL_SET_ERR_MSG_MOD(extack, "Invalid ingress QOS map");
84 return err; 96 return err;
97 }
85 err = vlan_validate_qos_map(data[IFLA_VLAN_EGRESS_QOS]); 98 err = vlan_validate_qos_map(data[IFLA_VLAN_EGRESS_QOS]);
86 if (err < 0) 99 if (err < 0) {
100 NL_SET_ERR_MSG_MOD(extack, "Invalid egress QOS map");
87 return err; 101 return err;
102 }
88 return 0; 103 return 0;
89} 104}
90 105
@@ -126,14 +141,21 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev,
126 __be16 proto; 141 __be16 proto;
127 int err; 142 int err;
128 143
129 if (!data[IFLA_VLAN_ID]) 144 if (!data[IFLA_VLAN_ID]) {
145 NL_SET_ERR_MSG_MOD(extack, "VLAN id not specified");
130 return -EINVAL; 146 return -EINVAL;
147 }
131 148
132 if (!tb[IFLA_LINK]) 149 if (!tb[IFLA_LINK]) {
150 NL_SET_ERR_MSG_MOD(extack, "link not specified");
133 return -EINVAL; 151 return -EINVAL;
152 }
153
134 real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); 154 real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
135 if (!real_dev) 155 if (!real_dev) {
156 NL_SET_ERR_MSG_MOD(extack, "link does not exist");
136 return -ENODEV; 157 return -ENODEV;
158 }
137 159
138 if (data[IFLA_VLAN_PROTOCOL]) 160 if (data[IFLA_VLAN_PROTOCOL])
139 proto = nla_get_be16(data[IFLA_VLAN_PROTOCOL]); 161 proto = nla_get_be16(data[IFLA_VLAN_PROTOCOL]);
@@ -146,7 +168,8 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev,
146 dev->priv_flags |= (real_dev->priv_flags & IFF_XMIT_DST_RELEASE); 168 dev->priv_flags |= (real_dev->priv_flags & IFF_XMIT_DST_RELEASE);
147 vlan->flags = VLAN_FLAG_REORDER_HDR; 169 vlan->flags = VLAN_FLAG_REORDER_HDR;
148 170
149 err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id); 171 err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id,
172 extack);
150 if (err < 0) 173 if (err < 0)
151 return err; 174 return err;
152 175
diff --git a/net/9p/mod.c b/net/9p/mod.c
index 6ab36aea7727..eb9777f05755 100644
--- a/net/9p/mod.c
+++ b/net/9p/mod.c
@@ -104,7 +104,7 @@ EXPORT_SYMBOL(v9fs_unregister_trans);
104 104
105/** 105/**
106 * v9fs_get_trans_by_name - get transport with the matching name 106 * v9fs_get_trans_by_name - get transport with the matching name
107 * @name: string identifying transport 107 * @s: string identifying transport
108 * 108 *
109 */ 109 */
110struct p9_trans_module *v9fs_get_trans_by_name(char *s) 110struct p9_trans_module *v9fs_get_trans_by_name(char *s)
diff --git a/net/Kconfig b/net/Kconfig
index 0428f12c25c2..f738a6f27665 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -59,6 +59,7 @@ source "net/tls/Kconfig"
59source "net/xfrm/Kconfig" 59source "net/xfrm/Kconfig"
60source "net/iucv/Kconfig" 60source "net/iucv/Kconfig"
61source "net/smc/Kconfig" 61source "net/smc/Kconfig"
62source "net/xdp/Kconfig"
62 63
63config INET 64config INET
64 bool "TCP/IP networking" 65 bool "TCP/IP networking"
@@ -201,6 +202,8 @@ source "net/bridge/netfilter/Kconfig"
201 202
202endif 203endif
203 204
205source "net/bpfilter/Kconfig"
206
204source "net/dccp/Kconfig" 207source "net/dccp/Kconfig"
205source "net/sctp/Kconfig" 208source "net/sctp/Kconfig"
206source "net/rds/Kconfig" 209source "net/rds/Kconfig"
@@ -407,6 +410,9 @@ config GRO_CELLS
407 bool 410 bool
408 default n 411 default n
409 412
413config SOCK_VALIDATE_XMIT
414 bool
415
410config NET_DEVLINK 416config NET_DEVLINK
411 tristate "Network physical/parent device Netlink interface" 417 tristate "Network physical/parent device Netlink interface"
412 help 418 help
@@ -423,6 +429,22 @@ config MAY_USE_DEVLINK
423 on MAY_USE_DEVLINK to ensure they do not cause link errors when 429 on MAY_USE_DEVLINK to ensure they do not cause link errors when
424 devlink is a loadable module and the driver using it is built-in. 430 devlink is a loadable module and the driver using it is built-in.
425 431
432config PAGE_POOL
433 bool
434
435config FAILOVER
436 tristate "Generic failover module"
437 help
438 The failover module provides a generic interface for paravirtual
439 drivers to register a netdev and a set of ops with a failover
440 instance. The ops are used as event handlers that get called to
441 handle netdev register/unregister/link change/name change events
442 on slave pci ethernet devices with the same mac address as the
443 failover netdev. This enables paravirtual drivers to use a
444 VF as an accelerated low latency datapath. It also allows live
445 migration of VMs with direct attached VFs by failing over to the
446 paravirtual datapath when the VF is unplugged.
447
426endif # if NET 448endif # if NET
427 449
428# Used by archs to tell that they support BPF JIT compiler plus which flavour. 450# Used by archs to tell that they support BPF JIT compiler plus which flavour.
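The FAILOVER help text above describes the model this merge introduces in net/core/failover.c: a paravirtual driver registers its netdev together with a set of ops, and the failover core invokes those ops as slave PCI devices with the same MAC address appear, disappear, or change link state. The sketch below illustrates that registration flow; failover_register()/failover_unregister() are the entry points added here, but the particular ops fields and the surrounding driver code are assumptions for illustration and should be checked against include/net/failover.h.

    /* Hedged sketch of a paravirtual driver hooking into the generic failover
     * core (net/core/failover.c).  The ops callbacks shown are representative
     * examples, not a verified listing of the real failover_ops layout.
     */
    #include <linux/err.h>
    #include <linux/netdevice.h>
    #include <net/failover.h>

    static int my_slave_register(struct net_device *slave_dev,
                                 struct net_device *failover_dev)
    {
            /* a VF with the same MAC as failover_dev showed up: enslave it */
            netdev_info(failover_dev, "slave %s registered\n", slave_dev->name);
            return 0;
    }

    static int my_slave_link_change(struct net_device *slave_dev,
                                    struct net_device *failover_dev)
    {
            /* switch traffic between the VF and the paravirtual datapath */
            return 0;
    }

    static struct failover_ops my_failover_ops = {
            .slave_register    = my_slave_register,   /* assumed field names */
            .slave_link_change = my_slave_link_change,
    };

    static struct failover *my_failover;

    static int my_pv_probe(struct net_device *pv_netdev)
    {
            my_failover = failover_register(pv_netdev, &my_failover_ops);
            return PTR_ERR_OR_ZERO(my_failover);
    }

    static void my_pv_remove(void)
    {
            failover_unregister(my_failover);
    }

The net/core/failover.c entry in the diffstat above (315 lines) is the core this sketch registers against.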
diff --git a/net/Makefile b/net/Makefile
index a6147c61b174..13ec0d5415c7 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -20,6 +20,11 @@ obj-$(CONFIG_TLS) += tls/
20obj-$(CONFIG_XFRM) += xfrm/ 20obj-$(CONFIG_XFRM) += xfrm/
21obj-$(CONFIG_UNIX) += unix/ 21obj-$(CONFIG_UNIX) += unix/
22obj-$(CONFIG_NET) += ipv6/ 22obj-$(CONFIG_NET) += ipv6/
23ifneq ($(CC_CAN_LINK),y)
24$(warning CC cannot link executables. Skipping bpfilter.)
25else
26obj-$(CONFIG_BPFILTER) += bpfilter/
27endif
23obj-$(CONFIG_PACKET) += packet/ 28obj-$(CONFIG_PACKET) += packet/
24obj-$(CONFIG_NET_KEY) += key/ 29obj-$(CONFIG_NET_KEY) += key/
25obj-$(CONFIG_BRIDGE) += bridge/ 30obj-$(CONFIG_BRIDGE) += bridge/
@@ -85,3 +90,4 @@ obj-y += l3mdev/
85endif 90endif
86obj-$(CONFIG_QRTR) += qrtr/ 91obj-$(CONFIG_QRTR) += qrtr/
87obj-$(CONFIG_NET_NCSI) += ncsi/ 92obj-$(CONFIG_NET_NCSI) += ncsi/
93obj-$(CONFIG_XDP_SOCKETS) += xdp/
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index e4e2e02b7380..de8034d80623 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -35,7 +35,7 @@ config BATMAN_ADV
35config BATMAN_ADV_BATMAN_V 35config BATMAN_ADV_BATMAN_V
36 bool "B.A.T.M.A.N. V protocol (experimental)" 36 bool "B.A.T.M.A.N. V protocol (experimental)"
37 depends on BATMAN_ADV && !(CFG80211=m && BATMAN_ADV=y) 37 depends on BATMAN_ADV && !(CFG80211=m && BATMAN_ADV=y)
38 default n 38 default y
39 help 39 help
40 This option enables the B.A.T.M.A.N. V protocol, the successor 40 This option enables the B.A.T.M.A.N. V protocol, the successor
41 of the currently used B.A.T.M.A.N. IV protocol. The main 41 of the currently used B.A.T.M.A.N. IV protocol. The main
@@ -94,13 +94,13 @@ config BATMAN_ADV_DEBUGFS
94 bool "batman-adv debugfs entries" 94 bool "batman-adv debugfs entries"
95 depends on BATMAN_ADV 95 depends on BATMAN_ADV
96 depends on DEBUG_FS 96 depends on DEBUG_FS
97 default y 97 default n
98 help 98 help
99 Enable this to export routing related debug tables via debugfs. 99 Enable this to export routing related debug tables via debugfs.
100 The information for each soft-interface and used hard-interface can be 100 The information for each soft-interface and used hard-interface can be
101 found under batman_adv/ 101 found under batman_adv/
102 102
103 If unsure, say Y. 103 If unsure, say N.
104 104
105config BATMAN_ADV_DEBUG 105config BATMAN_ADV_DEBUG
106 bool "B.A.T.M.A.N. debugging" 106 bool "B.A.T.M.A.N. debugging"
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 28687493599f..71c20c1d4002 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -127,7 +127,20 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
127 rtnl_lock(); 127 rtnl_lock();
128 ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings); 128 ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings);
129 rtnl_unlock(); 129 rtnl_unlock();
130 if (ret == 0) { 130
131 /* Virtual interface drivers such as tun / tap interfaces, VLAN, etc
132 * tend to initialize the interface throughput with some value for the
133 * sake of having a throughput number to export via ethtool. This
134 * exported throughput leaves batman-adv to conclude the interface
135 * throughput is genuine (reflecting reality), thus no measurements
136 * are necessary.
137 *
138 * Based on the observation that those interface types also tend to set
139 * the link auto-negotiation to 'off', batman-adv shall check this
140 * setting to differentiate between genuine link throughput information
141 * and placeholders installed by virtual interfaces.
142 */
143 if (ret == 0 && link_settings.base.autoneg == AUTONEG_ENABLE) {
131 /* link characteristics might change over time */ 144 /* link characteristics might change over time */
132 if (link_settings.base.duplex == DUPLEX_FULL) 145 if (link_settings.base.duplex == DUPLEX_FULL)
133 hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX; 146 hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX;
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 057a28a9fe88..8da3c9336111 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -25,7 +25,7 @@
25#define BATADV_DRIVER_DEVICE "batman-adv" 25#define BATADV_DRIVER_DEVICE "batman-adv"
26 26
27#ifndef BATADV_SOURCE_VERSION 27#ifndef BATADV_SOURCE_VERSION
28#define BATADV_SOURCE_VERSION "2018.1" 28#define BATADV_SOURCE_VERSION "2018.2"
29#endif 29#endif
30 30
31/* B.A.T.M.A.N. parameters */ 31/* B.A.T.M.A.N. parameters */
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index a35f597e8c8b..86725d792e15 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -815,9 +815,6 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv,
815 if (!atomic_read(&bat_priv->multicast_mode)) 815 if (!atomic_read(&bat_priv->multicast_mode))
816 return -EINVAL; 816 return -EINVAL;
817 817
818 if (atomic_read(&bat_priv->mcast.num_disabled))
819 return -EINVAL;
820
821 switch (ntohs(ethhdr->h_proto)) { 818 switch (ntohs(ethhdr->h_proto)) {
822 case ETH_P_IP: 819 case ETH_P_IP:
823 return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb, 820 return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb,
@@ -1183,33 +1180,23 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv,
1183{ 1180{
1184 bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND); 1181 bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
1185 u8 mcast_flags = BATADV_NO_FLAGS; 1182 u8 mcast_flags = BATADV_NO_FLAGS;
1186 bool orig_initialized;
1187 1183
1188 if (orig_mcast_enabled && tvlv_value && 1184 if (orig_mcast_enabled && tvlv_value &&
1189 tvlv_value_len >= sizeof(mcast_flags)) 1185 tvlv_value_len >= sizeof(mcast_flags))
1190 mcast_flags = *(u8 *)tvlv_value; 1186 mcast_flags = *(u8 *)tvlv_value;
1191 1187
1188 if (!orig_mcast_enabled) {
1189 mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4;
1190 mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6;
1191 }
1192
1192 spin_lock_bh(&orig->mcast_handler_lock); 1193 spin_lock_bh(&orig->mcast_handler_lock);
1193 orig_initialized = test_bit(BATADV_ORIG_CAPA_HAS_MCAST,
1194 &orig->capa_initialized);
1195 1194
1196 /* If mcast support is turned on decrease the disabled mcast node
1197 * counter only if we had increased it for this node before. If this
1198 * is a completely new orig_node no need to decrease the counter.
1199 */
1200 if (orig_mcast_enabled && 1195 if (orig_mcast_enabled &&
1201 !test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) { 1196 !test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) {
1202 if (orig_initialized)
1203 atomic_dec(&bat_priv->mcast.num_disabled);
1204 set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); 1197 set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities);
1205 /* If mcast support is being switched off or if this is an initial
1206 * OGM without mcast support then increase the disabled mcast
1207 * node counter.
1208 */
1209 } else if (!orig_mcast_enabled && 1198 } else if (!orig_mcast_enabled &&
1210 (test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities) || 1199 test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) {
1211 !orig_initialized)) {
1212 atomic_inc(&bat_priv->mcast.num_disabled);
1213 clear_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); 1200 clear_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities);
1214 } 1201 }
1215 1202
@@ -1595,10 +1582,6 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig)
1595 1582
1596 spin_lock_bh(&orig->mcast_handler_lock); 1583 spin_lock_bh(&orig->mcast_handler_lock);
1597 1584
1598 if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities) &&
1599 test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized))
1600 atomic_dec(&bat_priv->mcast.num_disabled);
1601
1602 batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS); 1585 batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS);
1603 batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS); 1586 batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS);
1604 batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS); 1587 batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS);
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index edeffcb9f3a2..1485263a348b 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -188,8 +188,8 @@ static void batadv_interface_set_rx_mode(struct net_device *dev)
188{ 188{
189} 189}
190 190
191static int batadv_interface_tx(struct sk_buff *skb, 191static netdev_tx_t batadv_interface_tx(struct sk_buff *skb,
192 struct net_device *soft_iface) 192 struct net_device *soft_iface)
193{ 193{
194 struct ethhdr *ethhdr; 194 struct ethhdr *ethhdr;
195 struct batadv_priv *bat_priv = netdev_priv(soft_iface); 195 struct batadv_priv *bat_priv = netdev_priv(soft_iface);
@@ -796,7 +796,6 @@ static int batadv_softif_init_late(struct net_device *dev)
796 bat_priv->mcast.querier_ipv6.shadowing = false; 796 bat_priv->mcast.querier_ipv6.shadowing = false;
797 bat_priv->mcast.flags = BATADV_NO_FLAGS; 797 bat_priv->mcast.flags = BATADV_NO_FLAGS;
798 atomic_set(&bat_priv->multicast_mode, 1); 798 atomic_set(&bat_priv->multicast_mode, 1);
799 atomic_set(&bat_priv->mcast.num_disabled, 0);
800 atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0); 799 atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0);
801 atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0); 800 atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0);
802 atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0); 801 atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0);
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 476b052ad982..360357f83f20 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -215,10 +215,12 @@ struct batadv_hard_iface {
215 struct batadv_hard_iface_bat_v bat_v; 215 struct batadv_hard_iface_bat_v bat_v;
216#endif 216#endif
217 217
218#ifdef CONFIG_BATMAN_ADV_DEBUGFS
218 /** 219 /**
219 * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs 220 * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
220 */ 221 */
221 struct dentry *debug_dir; 222 struct dentry *debug_dir;
223#endif
222 224
223 /** 225 /**
224 * @neigh_list: list of unique single hop neighbors via this interface 226 * @neigh_list: list of unique single hop neighbors via this interface
@@ -1160,13 +1162,13 @@ struct batadv_priv_dat {
1160 */ 1162 */
1161struct batadv_mcast_querier_state { 1163struct batadv_mcast_querier_state {
1162 /** @exists: whether a querier exists in the mesh */ 1164 /** @exists: whether a querier exists in the mesh */
1163 bool exists; 1165 unsigned char exists:1;
1164 1166
1165 /** 1167 /**
1166 * @shadowing: if a querier exists, whether it is potentially shadowing 1168 * @shadowing: if a querier exists, whether it is potentially shadowing
1167 * multicast listeners (i.e. querier is behind our own bridge segment) 1169 * multicast listeners (i.e. querier is behind our own bridge segment)
1168 */ 1170 */
1169 bool shadowing; 1171 unsigned char shadowing:1;
1170}; 1172};
1171 1173
1172/** 1174/**
@@ -1207,13 +1209,10 @@ struct batadv_priv_mcast {
1207 u8 flags; 1209 u8 flags;
1208 1210
1209 /** @enabled: whether the multicast tvlv is currently enabled */ 1211 /** @enabled: whether the multicast tvlv is currently enabled */
1210 bool enabled; 1212 unsigned char enabled:1;
1211 1213
1212 /** @bridged: whether the soft interface has a bridge on top */ 1214 /** @bridged: whether the soft interface has a bridge on top */
1213 bool bridged; 1215 unsigned char bridged:1;
1214
1215 /** @num_disabled: number of nodes that have no mcast tvlv */
1216 atomic_t num_disabled;
1217 1216
1218 /** 1217 /**
1219 * @num_want_all_unsnoopables: number of nodes wanting unsnoopable IP 1218 * @num_want_all_unsnoopables: number of nodes wanting unsnoopable IP
@@ -1245,10 +1244,12 @@ struct batadv_priv_nc {
1245 /** @work: work queue callback item for cleanup */ 1244 /** @work: work queue callback item for cleanup */
1246 struct delayed_work work; 1245 struct delayed_work work;
1247 1246
1247#ifdef CONFIG_BATMAN_ADV_DEBUGFS
1248 /** 1248 /**
1249 * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs 1249 * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
1250 */ 1250 */
1251 struct dentry *debug_dir; 1251 struct dentry *debug_dir;
1252#endif
1252 1253
1253 /** 1254 /**
1254 * @min_tq: only consider neighbors for encoding if neigh_tq > min_tq 1255 * @min_tq: only consider neighbors for encoding if neigh_tq > min_tq
@@ -1392,7 +1393,7 @@ struct batadv_tp_vars {
1392 atomic_t dup_acks; 1393 atomic_t dup_acks;
1393 1394
1394 /** @fast_recovery: true if in Fast Recovery mode */ 1395 /** @fast_recovery: true if in Fast Recovery mode */
1395 bool fast_recovery; 1396 unsigned char fast_recovery:1;
1396 1397
1397 /** @recover: last sent seqno when entering Fast Recovery */ 1398 /** @recover: last sent seqno when entering Fast Recovery */
1398 u32 recover; 1399 u32 recover;
@@ -1601,8 +1602,10 @@ struct batadv_priv {
1601 /** @mesh_obj: kobject for sysfs mesh subdirectory */ 1602 /** @mesh_obj: kobject for sysfs mesh subdirectory */
1602 struct kobject *mesh_obj; 1603 struct kobject *mesh_obj;
1603 1604
1605#ifdef CONFIG_BATMAN_ADV_DEBUGFS
1604 /** @debug_dir: dentry for debugfs batman-adv subdirectory */ 1606 /** @debug_dir: dentry for debugfs batman-adv subdirectory */
1605 struct dentry *debug_dir; 1607 struct dentry *debug_dir;
1608#endif
1606 1609
1607 /** @forw_bat_list: list of aggregated OGMs that will be forwarded */ 1610 /** @forw_bat_list: list of aggregated OGMs that will be forwarded */
1608 struct hlist_head forw_bat_list; 1611 struct hlist_head forw_bat_list;
@@ -2049,10 +2052,10 @@ struct batadv_skb_cb {
2049 * @decoded: Marks a skb as decoded, which is checked when searching for 2052 * @decoded: Marks a skb as decoded, which is checked when searching for
2050 * coding opportunities in network-coding.c 2053 * coding opportunities in network-coding.c
2051 */ 2054 */
2052 bool decoded; 2055 unsigned char decoded:1;
2053 2056
2054 /** @num_bcasts: Counter for broadcast packet retransmissions */ 2057 /** @num_bcasts: Counter for broadcast packet retransmissions */
2055 unsigned int num_bcasts; 2058 unsigned char num_bcasts;
2056}; 2059};
2057 2060
2058/** 2061/**
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 40d260f2bea5..1dec33790198 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -76,19 +76,15 @@ static ssize_t dut_mode_write(struct file *file, const char __user *user_buf,
76{ 76{
77 struct hci_dev *hdev = file->private_data; 77 struct hci_dev *hdev = file->private_data;
78 struct sk_buff *skb; 78 struct sk_buff *skb;
79 char buf[32];
80 size_t buf_size = min(count, (sizeof(buf)-1));
81 bool enable; 79 bool enable;
80 int err;
82 81
83 if (!test_bit(HCI_UP, &hdev->flags)) 82 if (!test_bit(HCI_UP, &hdev->flags))
84 return -ENETDOWN; 83 return -ENETDOWN;
85 84
86 if (copy_from_user(buf, user_buf, buf_size)) 85 err = kstrtobool_from_user(user_buf, count, &enable);
87 return -EFAULT; 86 if (err)
88 87 return err;
89 buf[buf_size] = '\0';
90 if (strtobool(buf, &enable))
91 return -EINVAL;
92 88
93 if (enable == hci_dev_test_flag(hdev, HCI_DUT_MODE)) 89 if (enable == hci_dev_test_flag(hdev, HCI_DUT_MODE))
94 return -EALREADY; 90 return -EALREADY;
@@ -135,17 +131,12 @@ static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf,
135 size_t count, loff_t *ppos) 131 size_t count, loff_t *ppos)
136{ 132{
137 struct hci_dev *hdev = file->private_data; 133 struct hci_dev *hdev = file->private_data;
138 char buf[32];
139 size_t buf_size = min(count, (sizeof(buf)-1));
140 bool enable; 134 bool enable;
141 int err; 135 int err;
142 136
143 if (copy_from_user(buf, user_buf, buf_size)) 137 err = kstrtobool_from_user(user_buf, count, &enable);
144 return -EFAULT; 138 if (err)
145 139 return err;
146 buf[buf_size] = '\0';
147 if (strtobool(buf, &enable))
148 return -EINVAL;
149 140
150 /* When the diagnostic flags are not persistent and the transport 141 /* When the diagnostic flags are not persistent and the transport
151 * is not active or in user channel operation, then there is no need 142 * is not active or in user channel operation, then there is no need
@@ -3422,6 +3413,37 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen,
3422 return 0; 3413 return 0;
3423} 3414}
3424 3415
3416int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen,
3417 const void *param)
3418{
3419 struct sk_buff *skb;
3420
3421 if (hci_opcode_ogf(opcode) != 0x3f) {
3422 /* A controller receiving a command shall respond with either
3423 * a Command Status Event or a Command Complete Event.
3424 * Therefore, all standard HCI commands must be sent via the
3425 * standard API, using hci_send_cmd or hci_cmd_sync helpers.
3426 * Some vendors do not comply with this rule for vendor-specific
3427 * commands and do not return any event. We want to support
3428 * unresponded commands for such cases only.
3429 */
3430 bt_dev_err(hdev, "unresponded command not supported");
3431 return -EINVAL;
3432 }
3433
3434 skb = hci_prepare_cmd(hdev, opcode, plen, param);
3435 if (!skb) {
3436 bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)",
3437 opcode);
3438 return -ENOMEM;
3439 }
3440
3441 hci_send_frame(hdev, skb);
3442
3443 return 0;
3444}
3445EXPORT_SYMBOL(__hci_cmd_send);
3446
3425/* Get data from the previously sent command */ 3447/* Get data from the previously sent command */
3426void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode) 3448void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode)
3427{ 3449{
diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
index 418b76e557b0..0d8ab5b3c177 100644
--- a/net/bluetooth/hci_debugfs.c
+++ b/net/bluetooth/hci_debugfs.c
@@ -47,19 +47,15 @@ static ssize_t __name ## _write(struct file *file, \
47 size_t count, loff_t *ppos) \ 47 size_t count, loff_t *ppos) \
48{ \ 48{ \
49 struct hci_dev *hdev = file->private_data; \ 49 struct hci_dev *hdev = file->private_data; \
50 char buf[32]; \
51 size_t buf_size = min(count, (sizeof(buf) - 1)); \
52 bool enable; \ 50 bool enable; \
51 int err; \
53 \ 52 \
54 if (test_bit(HCI_UP, &hdev->flags)) \ 53 if (test_bit(HCI_UP, &hdev->flags)) \
55 return -EBUSY; \ 54 return -EBUSY; \
56 \ 55 \
57 if (copy_from_user(buf, user_buf, buf_size)) \ 56 err = kstrtobool_from_user(user_buf, count, &enable); \
58 return -EFAULT; \ 57 if (err) \
59 \ 58 return err; \
60 buf[buf_size] = '\0'; \
61 if (strtobool(buf, &enable)) \
62 return -EINVAL; \
63 \ 59 \
64 if (enable == test_bit(__quirk, &hdev->quirks)) \ 60 if (enable == test_bit(__quirk, &hdev->quirks)) \
65 return -EALREADY; \ 61 return -EALREADY; \
@@ -658,19 +654,15 @@ static ssize_t force_static_address_write(struct file *file,
658 size_t count, loff_t *ppos) 654 size_t count, loff_t *ppos)
659{ 655{
660 struct hci_dev *hdev = file->private_data; 656 struct hci_dev *hdev = file->private_data;
661 char buf[32];
662 size_t buf_size = min(count, (sizeof(buf)-1));
663 bool enable; 657 bool enable;
658 int err;
664 659
665 if (test_bit(HCI_UP, &hdev->flags)) 660 if (test_bit(HCI_UP, &hdev->flags))
666 return -EBUSY; 661 return -EBUSY;
667 662
668 if (copy_from_user(buf, user_buf, buf_size)) 663 err = kstrtobool_from_user(user_buf, count, &enable);
669 return -EFAULT; 664 if (err)
670 665 return err;
671 buf[buf_size] = '\0';
672 if (strtobool(buf, &enable))
673 return -EINVAL;
674 666
675 if (enable == hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR)) 667 if (enable == hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR))
676 return -EALREADY; 668 return -EALREADY;
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 139707cd9d35..235b5aaab23d 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -4942,10 +4942,14 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb)
4942 struct hci_ev_le_advertising_info *ev = ptr; 4942 struct hci_ev_le_advertising_info *ev = ptr;
4943 s8 rssi; 4943 s8 rssi;
4944 4944
4945 rssi = ev->data[ev->length]; 4945 if (ev->length <= HCI_MAX_AD_LENGTH) {
4946 process_adv_report(hdev, ev->evt_type, &ev->bdaddr, 4946 rssi = ev->data[ev->length];
4947 ev->bdaddr_type, NULL, 0, rssi, 4947 process_adv_report(hdev, ev->evt_type, &ev->bdaddr,
4948 ev->data, ev->length); 4948 ev->bdaddr_type, NULL, 0, rssi,
4949 ev->data, ev->length);
4950 } else {
4951 bt_dev_err(hdev, "Dropping invalid advertising data");
4952 }
4949 4953
4950 ptr += sizeof(*ev) + ev->length + 1; 4954 ptr += sizeof(*ev) + ev->length + 1;
4951 } 4955 }
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 66c0781773df..e44d34734834 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -122,7 +122,6 @@ void hci_req_sync_cancel(struct hci_dev *hdev, int err)
122struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, 122struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen,
123 const void *param, u8 event, u32 timeout) 123 const void *param, u8 event, u32 timeout)
124{ 124{
125 DECLARE_WAITQUEUE(wait, current);
126 struct hci_request req; 125 struct hci_request req;
127 struct sk_buff *skb; 126 struct sk_buff *skb;
128 int err = 0; 127 int err = 0;
@@ -135,21 +134,14 @@ struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen,
135 134
136 hdev->req_status = HCI_REQ_PEND; 135 hdev->req_status = HCI_REQ_PEND;
137 136
138 add_wait_queue(&hdev->req_wait_q, &wait);
139 set_current_state(TASK_INTERRUPTIBLE);
140
141 err = hci_req_run_skb(&req, hci_req_sync_complete); 137 err = hci_req_run_skb(&req, hci_req_sync_complete);
142 if (err < 0) { 138 if (err < 0)
143 remove_wait_queue(&hdev->req_wait_q, &wait);
144 set_current_state(TASK_RUNNING);
145 return ERR_PTR(err); 139 return ERR_PTR(err);
146 }
147 140
148 schedule_timeout(timeout); 141 err = wait_event_interruptible_timeout(hdev->req_wait_q,
142 hdev->req_status != HCI_REQ_PEND, timeout);
149 143
150 remove_wait_queue(&hdev->req_wait_q, &wait); 144 if (err == -ERESTARTSYS)
151
152 if (signal_pending(current))
153 return ERR_PTR(-EINTR); 145 return ERR_PTR(-EINTR);
154 146
155 switch (hdev->req_status) { 147 switch (hdev->req_status) {
@@ -197,7 +189,6 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
197 unsigned long opt, u32 timeout, u8 *hci_status) 189 unsigned long opt, u32 timeout, u8 *hci_status)
198{ 190{
199 struct hci_request req; 191 struct hci_request req;
200 DECLARE_WAITQUEUE(wait, current);
201 int err = 0; 192 int err = 0;
202 193
203 BT_DBG("%s start", hdev->name); 194 BT_DBG("%s start", hdev->name);
@@ -213,16 +204,10 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
213 return err; 204 return err;
214 } 205 }
215 206
216 add_wait_queue(&hdev->req_wait_q, &wait);
217 set_current_state(TASK_INTERRUPTIBLE);
218
219 err = hci_req_run_skb(&req, hci_req_sync_complete); 207 err = hci_req_run_skb(&req, hci_req_sync_complete);
220 if (err < 0) { 208 if (err < 0) {
221 hdev->req_status = 0; 209 hdev->req_status = 0;
222 210
223 remove_wait_queue(&hdev->req_wait_q, &wait);
224 set_current_state(TASK_RUNNING);
225
226 /* ENODATA means the HCI request command queue is empty. 211 /* ENODATA means the HCI request command queue is empty.
227 * This can happen when a request with conditionals doesn't 212 * This can happen when a request with conditionals doesn't
228 * trigger any commands to be sent. This is normal behavior 213 * trigger any commands to be sent. This is normal behavior
@@ -240,11 +225,10 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
240 return err; 225 return err;
241 } 226 }
242 227
243 schedule_timeout(timeout); 228 err = wait_event_interruptible_timeout(hdev->req_wait_q,
244 229 hdev->req_status != HCI_REQ_PEND, timeout);
245 remove_wait_queue(&hdev->req_wait_q, &wait);
246 230
247 if (signal_pending(current)) 231 if (err == -ERESTARTSYS)
248 return -EINTR; 232 return -EINTR;
249 233
250 switch (hdev->req_status) { 234 switch (hdev->req_status) {
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index a2ddae2f37d7..ae91e2d40056 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -3315,16 +3315,12 @@ static ssize_t force_bredr_smp_write(struct file *file,
3315 size_t count, loff_t *ppos) 3315 size_t count, loff_t *ppos)
3316{ 3316{
3317 struct hci_dev *hdev = file->private_data; 3317 struct hci_dev *hdev = file->private_data;
3318 char buf[32];
3319 size_t buf_size = min(count, (sizeof(buf)-1));
3320 bool enable; 3318 bool enable;
3319 int err;
3321 3320
3322 if (copy_from_user(buf, user_buf, buf_size)) 3321 err = kstrtobool_from_user(user_buf, count, &enable);
3323 return -EFAULT; 3322 if (err)
3324 3323 return err;
3325 buf[buf_size] = '\0';
3326 if (strtobool(buf, &enable))
3327 return -EINVAL;
3328 3324
3329 if (enable == hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) 3325 if (enable == hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP))
3330 return -EALREADY; 3326 return -EALREADY;
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 2ced48662c1f..68c3578343b4 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -170,7 +170,8 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
170 xdp.rxq = &rxqueue->xdp_rxq; 170 xdp.rxq = &rxqueue->xdp_rxq;
171 171
172 retval = bpf_test_run(prog, &xdp, repeat, &duration); 172 retval = bpf_test_run(prog, &xdp, repeat, &duration);
173 if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN) 173 if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN ||
174 xdp.data_end != xdp.data + size)
174 size = xdp.data_end - xdp.data; 175 size = xdp.data_end - xdp.data;
175 ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); 176 ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration);
176 kfree(data); 177 kfree(data);
diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
new file mode 100644
index 000000000000..a948b072c28f
--- /dev/null
+++ b/net/bpfilter/Kconfig
@@ -0,0 +1,16 @@
1menuconfig BPFILTER
2 bool "BPF based packet filtering framework (BPFILTER)"
3 default n
4 depends on NET && BPF && INET
5 help
6 This builds experimental bpfilter framework that is aiming to
7 provide netfilter compatible functionality via BPF
8
9if BPFILTER
10config BPFILTER_UMH
11 tristate "bpfilter kernel module with user mode helper"
12 default m
13 help
14 This builds bpfilter kernel module with embedded user mode helper
15endif
16
diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
new file mode 100644
index 000000000000..aafa72001fcd
--- /dev/null
+++ b/net/bpfilter/Makefile
@@ -0,0 +1,32 @@
1# SPDX-License-Identifier: GPL-2.0
2#
3# Makefile for the Linux BPFILTER layer.
4#
5
6hostprogs-y := bpfilter_umh
7bpfilter_umh-objs := main.o
8HOSTCFLAGS += -I. -Itools/include/ -Itools/include/uapi
9HOSTCC := $(CC)
10
11ifeq ($(CONFIG_BPFILTER_UMH), y)
12# builtin bpfilter_umh should be compiled with -static
13# since rootfs isn't mounted at the time of __init
14# function is called and do_execv won't find elf interpreter
15HOSTLDFLAGS += -static
16endif
17
18# a bit of elf magic to convert bpfilter_umh binary into a binary blob
19# inside bpfilter_umh.o elf file referenced by
20# _binary_net_bpfilter_bpfilter_umh_start symbol
21# which bpfilter_kern.c passes further into umh blob loader at run-time
22quiet_cmd_copy_umh = GEN $@
23 cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \
24 $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \
25 -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \
26 --rename-section .data=.init.rodata $< $@
27
28$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh
29 $(call cmd,copy_umh)
30
31obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o
32bpfilter-objs += bpfilter_kern.o bpfilter_umh.o
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
new file mode 100644
index 000000000000..b13d058f8c34
--- /dev/null
+++ b/net/bpfilter/bpfilter_kern.c
@@ -0,0 +1,114 @@
1// SPDX-License-Identifier: GPL-2.0
2#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3#include <linux/init.h>
4#include <linux/module.h>
5#include <linux/umh.h>
6#include <linux/bpfilter.h>
7#include <linux/sched.h>
8#include <linux/sched/signal.h>
9#include <linux/fs.h>
10#include <linux/file.h>
11#include "msgfmt.h"
12
13#define UMH_start _binary_net_bpfilter_bpfilter_umh_start
14#define UMH_end _binary_net_bpfilter_bpfilter_umh_end
15
16extern char UMH_start;
17extern char UMH_end;
18
19static struct umh_info info;
20/* since ip_getsockopt() can run in parallel, serialize access to umh */
21static DEFINE_MUTEX(bpfilter_lock);
22
23static void shutdown_umh(struct umh_info *info)
24{
25 struct task_struct *tsk;
26
27 tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID);
28 if (tsk)
29 force_sig(SIGKILL, tsk);
30 fput(info->pipe_to_umh);
31 fput(info->pipe_from_umh);
32}
33
34static void __stop_umh(void)
35{
36 if (IS_ENABLED(CONFIG_INET) &&
37 bpfilter_process_sockopt) {
38 bpfilter_process_sockopt = NULL;
39 shutdown_umh(&info);
40 }
41}
42
43static void stop_umh(void)
44{
45 mutex_lock(&bpfilter_lock);
46 __stop_umh();
47 mutex_unlock(&bpfilter_lock);
48}
49
50static int __bpfilter_process_sockopt(struct sock *sk, int optname,
51 char __user *optval,
52 unsigned int optlen, bool is_set)
53{
54 struct mbox_request req;
55 struct mbox_reply reply;
56 loff_t pos;
57 ssize_t n;
58 int ret;
59
60 req.is_set = is_set;
61 req.pid = current->pid;
62 req.cmd = optname;
63 req.addr = (long)optval;
64 req.len = optlen;
65 mutex_lock(&bpfilter_lock);
66 n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos);
67 if (n != sizeof(req)) {
68 pr_err("write fail %zd\n", n);
69 __stop_umh();
70 ret = -EFAULT;
71 goto out;
72 }
73 pos = 0;
74 n = kernel_read(info.pipe_from_umh, &reply, sizeof(reply), &pos);
75 if (n != sizeof(reply)) {
76 pr_err("read fail %zd\n", n);
77 __stop_umh();
78 ret = -EFAULT;
79 goto out;
80 }
81 ret = reply.status;
82out:
83 mutex_unlock(&bpfilter_lock);
84 return ret;
85}
86
87static int __init load_umh(void)
88{
89 int err;
90
91 /* fork usermode process */
92 err = fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info);
93 if (err)
94 return err;
95 pr_info("Loaded bpfilter_umh pid %d\n", info.pid);
96
97 /* health check that usermode process started correctly */
98 if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) {
99 stop_umh();
100 return -EFAULT;
101 }
102 if (IS_ENABLED(CONFIG_INET))
103 bpfilter_process_sockopt = &__bpfilter_process_sockopt;
104
105 return 0;
106}
107
108static void __exit fini_umh(void)
109{
110 stop_umh();
111}
112module_init(load_umh);
113module_exit(fini_umh);
114MODULE_LICENSE("GPL");
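A note on the pipe wiring used above (an assumption based on how fork_usermode_blob() populates struct umh_info): the kernel side writes requests into info.pipe_to_umh and reads replies from info.pipe_from_umh, while the user mode helper sees those same two pipes as its stdin and stdout. That is why the helper program in net/bpfilter/main.c below simply read()s fd 0 and write()s fd 1.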
diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
new file mode 100644
index 000000000000..1317f108df8a
--- /dev/null
+++ b/net/bpfilter/main.c
@@ -0,0 +1,63 @@
1// SPDX-License-Identifier: GPL-2.0
2#define _GNU_SOURCE
3#include <sys/uio.h>
4#include <errno.h>
5#include <stdio.h>
6#include <sys/socket.h>
7#include <fcntl.h>
8#include <unistd.h>
9#include "include/uapi/linux/bpf.h"
10#include <asm/unistd.h>
11#include "msgfmt.h"
12
13int debug_fd;
14
15static int handle_get_cmd(struct mbox_request *cmd)
16{
17 switch (cmd->cmd) {
18 case 0:
19 return 0;
20 default:
21 break;
22 }
23 return -ENOPROTOOPT;
24}
25
26static int handle_set_cmd(struct mbox_request *cmd)
27{
28 return -ENOPROTOOPT;
29}
30
31static void loop(void)
32{
33 while (1) {
34 struct mbox_request req;
35 struct mbox_reply reply;
36 int n;
37
38 n = read(0, &req, sizeof(req));
39 if (n != sizeof(req)) {
40 dprintf(debug_fd, "invalid request %d\n", n);
41 return;
42 }
43
44 reply.status = req.is_set ?
45 handle_set_cmd(&req) :
46 handle_get_cmd(&req);
47
48 n = write(1, &reply, sizeof(reply));
49 if (n != sizeof(reply)) {
50 dprintf(debug_fd, "reply failed %d\n", n);
51 return;
52 }
53 }
54}
55
56int main(void)
57{
58 debug_fd = open("/dev/console", 00000002);
59 dprintf(debug_fd, "Started bpfilter\n");
60 loop();
61 close(debug_fd);
62 return 0;
63}
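How the two halves meet: the health check in load_umh() sends a request with is_set = 0 and cmd = 0, which lands in handle_get_cmd() above; its "case 0" returns 0, so reply.status is 0 and module init concludes the helper is alive. Any other cmd currently yields -ENOPROTOOPT, which is what a get/setsockopt caller will see until real command handling is filled in.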
diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h
new file mode 100644
index 000000000000..98d121c62945
--- /dev/null
+++ b/net/bpfilter/msgfmt.h
@@ -0,0 +1,17 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _NET_BPFILTER_MSGFMT_H
3#define _NET_BPFILTER_MSGFMT_H
4
5struct mbox_request {
6 __u64 addr;
7 __u32 len;
8 __u32 is_set;
9 __u32 cmd;
10 __u32 pid;
11};
12
13struct mbox_reply {
14 __u32 status;
15};
16
17#endif
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 671d13c10f6f..b0a0b82e2d91 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -34,6 +34,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
34 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 34 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
35 struct net_bridge_port *p; 35 struct net_bridge_port *p;
36 struct net_bridge *br; 36 struct net_bridge *br;
37 bool notified = false;
37 bool changed_addr; 38 bool changed_addr;
38 int err; 39 int err;
39 40
@@ -67,7 +68,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
67 break; 68 break;
68 69
69 case NETDEV_CHANGE: 70 case NETDEV_CHANGE:
70 br_port_carrier_check(p); 71 br_port_carrier_check(p, &notified);
71 break; 72 break;
72 73
73 case NETDEV_FEAT_CHANGE: 74 case NETDEV_FEAT_CHANGE:
@@ -76,8 +77,10 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
76 77
77 case NETDEV_DOWN: 78 case NETDEV_DOWN:
78 spin_lock_bh(&br->lock); 79 spin_lock_bh(&br->lock);
79 if (br->dev->flags & IFF_UP) 80 if (br->dev->flags & IFF_UP) {
80 br_stp_disable_port(p); 81 br_stp_disable_port(p);
82 notified = true;
83 }
81 spin_unlock_bh(&br->lock); 84 spin_unlock_bh(&br->lock);
82 break; 85 break;
83 86
@@ -85,6 +88,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
85 if (netif_running(br->dev) && netif_oper_up(dev)) { 88 if (netif_running(br->dev) && netif_oper_up(dev)) {
86 spin_lock_bh(&br->lock); 89 spin_lock_bh(&br->lock);
87 br_stp_enable_port(p); 90 br_stp_enable_port(p);
91 notified = true;
88 spin_unlock_bh(&br->lock); 92 spin_unlock_bh(&br->lock);
89 } 93 }
90 break; 94 break;
@@ -110,8 +114,8 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
110 } 114 }
111 115
112 /* Events that may cause spanning tree to refresh */ 116 /* Events that may cause spanning tree to refresh */
113 if (event == NETDEV_CHANGEADDR || event == NETDEV_UP || 117 if (!notified && (event == NETDEV_CHANGEADDR || event == NETDEV_UP ||
114 event == NETDEV_CHANGE || event == NETDEV_DOWN) 118 event == NETDEV_CHANGE || event == NETDEV_DOWN))
115 br_ifinfo_notify(RTM_NEWLINK, NULL, p); 119 br_ifinfo_notify(RTM_NEWLINK, NULL, p);
116 120
117 return NOTIFY_DONE; 121 return NOTIFY_DONE;
@@ -141,7 +145,7 @@ static int br_switchdev_event(struct notifier_block *unused,
141 case SWITCHDEV_FDB_ADD_TO_BRIDGE: 145 case SWITCHDEV_FDB_ADD_TO_BRIDGE:
142 fdb_info = ptr; 146 fdb_info = ptr;
143 err = br_fdb_external_learn_add(br, p, fdb_info->addr, 147 err = br_fdb_external_learn_add(br, p, fdb_info->addr,
144 fdb_info->vid); 148 fdb_info->vid, false);
145 if (err) { 149 if (err) {
146 err = notifier_from_errno(err); 150 err = notifier_from_errno(err);
147 break; 151 break;
@@ -152,7 +156,7 @@ static int br_switchdev_event(struct notifier_block *unused,
152 case SWITCHDEV_FDB_DEL_TO_BRIDGE: 156 case SWITCHDEV_FDB_DEL_TO_BRIDGE:
153 fdb_info = ptr; 157 fdb_info = ptr;
154 err = br_fdb_external_learn_del(br, p, fdb_info->addr, 158 err = br_fdb_external_learn_del(br, p, fdb_info->addr,
155 fdb_info->vid); 159 fdb_info->vid, false);
156 if (err) 160 if (err)
157 err = notifier_from_errno(err); 161 err = notifier_from_errno(err);
158 break; 162 break;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index d9e69e4514be..b19e3104afd6 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -40,7 +40,7 @@ static struct kmem_cache *br_fdb_cache __read_mostly;
40static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, 40static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
41 const unsigned char *addr, u16 vid); 41 const unsigned char *addr, u16 vid);
42static void fdb_notify(struct net_bridge *br, 42static void fdb_notify(struct net_bridge *br,
43 const struct net_bridge_fdb_entry *, int); 43 const struct net_bridge_fdb_entry *, int, bool);
44 44
45int __init br_fdb_init(void) 45int __init br_fdb_init(void)
46{ 46{
@@ -121,6 +121,28 @@ static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br,
121 return fdb; 121 return fdb;
122} 122}
123 123
124struct net_device *br_fdb_find_port(const struct net_device *br_dev,
125 const unsigned char *addr,
126 __u16 vid)
127{
128 struct net_bridge_fdb_entry *f;
129 struct net_device *dev = NULL;
130 struct net_bridge *br;
131
132 ASSERT_RTNL();
133
134 if (!netif_is_bridge_master(br_dev))
135 return NULL;
136
137 br = netdev_priv(br_dev);
138 f = br_fdb_find(br, addr, vid);
139 if (f && f->dst)
140 dev = f->dst->dev;
141
142 return dev;
143}
144EXPORT_SYMBOL_GPL(br_fdb_find_port);
145
124struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br, 146struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
125 const unsigned char *addr, 147 const unsigned char *addr,
126 __u16 vid) 148 __u16 vid)
@@ -173,7 +195,8 @@ static void fdb_del_hw_addr(struct net_bridge *br, const unsigned char *addr)
173 } 195 }
174} 196}
175 197
176static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f) 198static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f,
199 bool swdev_notify)
177{ 200{
178 trace_fdb_delete(br, f); 201 trace_fdb_delete(br, f);
179 202
@@ -183,7 +206,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
183 hlist_del_init_rcu(&f->fdb_node); 206 hlist_del_init_rcu(&f->fdb_node);
184 rhashtable_remove_fast(&br->fdb_hash_tbl, &f->rhnode, 207 rhashtable_remove_fast(&br->fdb_hash_tbl, &f->rhnode,
185 br_fdb_rht_params); 208 br_fdb_rht_params);
186 fdb_notify(br, f, RTM_DELNEIGH); 209 fdb_notify(br, f, RTM_DELNEIGH, swdev_notify);
187 call_rcu(&f->rcu, fdb_rcu_free); 210 call_rcu(&f->rcu, fdb_rcu_free);
188} 211}
189 212
@@ -219,7 +242,7 @@ static void fdb_delete_local(struct net_bridge *br,
219 return; 242 return;
220 } 243 }
221 244
222 fdb_delete(br, f); 245 fdb_delete(br, f, true);
223} 246}
224 247
225void br_fdb_find_delete_local(struct net_bridge *br, 248void br_fdb_find_delete_local(struct net_bridge *br,
@@ -334,7 +357,7 @@ void br_fdb_cleanup(struct work_struct *work)
334 } else { 357 } else {
335 spin_lock_bh(&br->hash_lock); 358 spin_lock_bh(&br->hash_lock);
336 if (!hlist_unhashed(&f->fdb_node)) 359 if (!hlist_unhashed(&f->fdb_node))
337 fdb_delete(br, f); 360 fdb_delete(br, f, true);
338 spin_unlock_bh(&br->hash_lock); 361 spin_unlock_bh(&br->hash_lock);
339 } 362 }
340 } 363 }
@@ -354,7 +377,7 @@ void br_fdb_flush(struct net_bridge *br)
354 spin_lock_bh(&br->hash_lock); 377 spin_lock_bh(&br->hash_lock);
355 hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) { 378 hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) {
356 if (!f->is_static) 379 if (!f->is_static)
357 fdb_delete(br, f); 380 fdb_delete(br, f, true);
358 } 381 }
359 spin_unlock_bh(&br->hash_lock); 382 spin_unlock_bh(&br->hash_lock);
360} 383}
@@ -383,7 +406,7 @@ void br_fdb_delete_by_port(struct net_bridge *br,
383 if (f->is_local) 406 if (f->is_local)
384 fdb_delete_local(br, p, f); 407 fdb_delete_local(br, p, f);
385 else 408 else
386 fdb_delete(br, f); 409 fdb_delete(br, f, true);
387 } 410 }
388 spin_unlock_bh(&br->hash_lock); 411 spin_unlock_bh(&br->hash_lock);
389} 412}
@@ -509,7 +532,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
509 return 0; 532 return 0;
510 br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n", 533 br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n",
511 source ? source->dev->name : br->dev->name, addr, vid); 534 source ? source->dev->name : br->dev->name, addr, vid);
512 fdb_delete(br, fdb); 535 fdb_delete(br, fdb, true);
513 } 536 }
514 537
515 fdb = fdb_create(br, source, addr, vid, 1, 1); 538 fdb = fdb_create(br, source, addr, vid, 1, 1);
@@ -517,7 +540,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
517 return -ENOMEM; 540 return -ENOMEM;
518 541
519 fdb_add_hw_addr(br, addr); 542 fdb_add_hw_addr(br, addr);
520 fdb_notify(br, fdb, RTM_NEWNEIGH); 543 fdb_notify(br, fdb, RTM_NEWNEIGH, true);
521 return 0; 544 return 0;
522} 545}
523 546
@@ -572,7 +595,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
572 fdb->added_by_user = 1; 595 fdb->added_by_user = 1;
573 if (unlikely(fdb_modified)) { 596 if (unlikely(fdb_modified)) {
574 trace_br_fdb_update(br, source, addr, vid, added_by_user); 597 trace_br_fdb_update(br, source, addr, vid, added_by_user);
575 fdb_notify(br, fdb, RTM_NEWNEIGH); 598 fdb_notify(br, fdb, RTM_NEWNEIGH, true);
576 } 599 }
577 } 600 }
578 } else { 601 } else {
@@ -583,7 +606,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
583 fdb->added_by_user = 1; 606 fdb->added_by_user = 1;
584 trace_br_fdb_update(br, source, addr, vid, 607 trace_br_fdb_update(br, source, addr, vid,
585 added_by_user); 608 added_by_user);
586 fdb_notify(br, fdb, RTM_NEWNEIGH); 609 fdb_notify(br, fdb, RTM_NEWNEIGH, true);
587 } 610 }
588 /* else we lose race and someone else inserts 611 /* else we lose race and someone else inserts
589 * it first, don't bother updating 612 * it first, don't bother updating
@@ -665,13 +688,15 @@ static inline size_t fdb_nlmsg_size(void)
665} 688}
666 689
667static void fdb_notify(struct net_bridge *br, 690static void fdb_notify(struct net_bridge *br,
668 const struct net_bridge_fdb_entry *fdb, int type) 691 const struct net_bridge_fdb_entry *fdb, int type,
692 bool swdev_notify)
669{ 693{
670 struct net *net = dev_net(br->dev); 694 struct net *net = dev_net(br->dev);
671 struct sk_buff *skb; 695 struct sk_buff *skb;
672 int err = -ENOBUFS; 696 int err = -ENOBUFS;
673 697
674 br_switchdev_fdb_notify(fdb, type); 698 if (swdev_notify)
699 br_switchdev_fdb_notify(fdb, type);
675 700
676 skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC); 701 skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC);
677 if (skb == NULL) 702 if (skb == NULL)
@@ -810,7 +835,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
810 fdb->used = jiffies; 835 fdb->used = jiffies;
811 if (modified) { 836 if (modified) {
812 fdb->updated = jiffies; 837 fdb->updated = jiffies;
813 fdb_notify(br, fdb, RTM_NEWNEIGH); 838 fdb_notify(br, fdb, RTM_NEWNEIGH, true);
814 } 839 }
815 840
816 return 0; 841 return 0;
@@ -834,7 +859,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
834 rcu_read_unlock(); 859 rcu_read_unlock();
835 local_bh_enable(); 860 local_bh_enable();
836 } else if (ndm->ndm_flags & NTF_EXT_LEARNED) { 861 } else if (ndm->ndm_flags & NTF_EXT_LEARNED) {
837 err = br_fdb_external_learn_add(br, p, addr, vid); 862 err = br_fdb_external_learn_add(br, p, addr, vid, true);
838 } else { 863 } else {
839 spin_lock_bh(&br->hash_lock); 864 spin_lock_bh(&br->hash_lock);
840 err = fdb_add_entry(br, p, addr, ndm->ndm_state, 865 err = fdb_add_entry(br, p, addr, ndm->ndm_state,
@@ -923,7 +948,7 @@ static int fdb_delete_by_addr_and_port(struct net_bridge *br,
923 if (!fdb || fdb->dst != p) 948 if (!fdb || fdb->dst != p)
924 return -ENOENT; 949 return -ENOENT;
925 950
926 fdb_delete(br, fdb); 951 fdb_delete(br, fdb, true);
927 952
928 return 0; 953 return 0;
929} 954}
@@ -1043,7 +1068,8 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p)
1043} 1068}
1044 1069
1045int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, 1070int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
1046 const unsigned char *addr, u16 vid) 1071 const unsigned char *addr, u16 vid,
1072 bool swdev_notify)
1047{ 1073{
1048 struct net_bridge_fdb_entry *fdb; 1074 struct net_bridge_fdb_entry *fdb;
1049 bool modified = false; 1075 bool modified = false;
@@ -1061,7 +1087,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
1061 goto err_unlock; 1087 goto err_unlock;
1062 } 1088 }
1063 fdb->added_by_external_learn = 1; 1089 fdb->added_by_external_learn = 1;
1064 fdb_notify(br, fdb, RTM_NEWNEIGH); 1090 fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify);
1065 } else { 1091 } else {
1066 fdb->updated = jiffies; 1092 fdb->updated = jiffies;
1067 1093
@@ -1080,7 +1106,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
1080 } 1106 }
1081 1107
1082 if (modified) 1108 if (modified)
1083 fdb_notify(br, fdb, RTM_NEWNEIGH); 1109 fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify);
1084 } 1110 }
1085 1111
1086err_unlock: 1112err_unlock:
@@ -1090,7 +1116,8 @@ err_unlock:
1090} 1116}
1091 1117
1092int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, 1118int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
1093 const unsigned char *addr, u16 vid) 1119 const unsigned char *addr, u16 vid,
1120 bool swdev_notify)
1094{ 1121{
1095 struct net_bridge_fdb_entry *fdb; 1122 struct net_bridge_fdb_entry *fdb;
1096 int err = 0; 1123 int err = 0;
@@ -1099,7 +1126,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
1099 1126
1100 fdb = br_fdb_find(br, addr, vid); 1127 fdb = br_fdb_find(br, addr, vid);
1101 if (fdb && fdb->added_by_external_learn) 1128 if (fdb && fdb->added_by_external_learn)
1102 fdb_delete(br, fdb); 1129 fdb_delete(br, fdb, swdev_notify);
1103 else 1130 else
1104 err = -ENOENT; 1131 err = -ENOENT;
1105 1132
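br_fdb_find_port() added above is exported for drivers and must be called with RTNL held. A minimal, hypothetical caller could look like the sketch below; the function name and message are illustrative, and the declaration is assumed to be reachable via linux/if_bridge.h:

#include <linux/if_bridge.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

/* Hypothetical driver helper: report which bridge port currently owns a
 * given MAC/VLAN pair on bridge device br_dev. Caller holds RTNL. */
static void my_report_fdb_owner(struct net_device *br_dev,
				const unsigned char *mac, u16 vid)
{
	struct net_device *port_dev;

	ASSERT_RTNL();
	port_dev = br_fdb_find_port(br_dev, mac, vid);
	if (port_dev)
		netdev_info(port_dev, "%pM vid %u is behind this port\n",
			    mac, vid);
}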
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index b4eed113d2ec..9019f326fe81 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -30,7 +30,8 @@ static inline int should_deliver(const struct net_bridge_port *p,
30 vg = nbp_vlan_group_rcu(p); 30 vg = nbp_vlan_group_rcu(p);
31 return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && 31 return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
32 br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING && 32 br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING &&
33 nbp_switchdev_allowed_egress(p, skb); 33 nbp_switchdev_allowed_egress(p, skb) &&
34 !br_skb_isolated(p, skb);
34} 35}
35 36
36int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) 37int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
@@ -274,8 +275,7 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
274 struct net_bridge_port *port, *lport, *rport; 275 struct net_bridge_port *port, *lport, *rport;
275 276
276 lport = p ? p->port : NULL; 277 lport = p ? p->port : NULL;
277 rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) : 278 rport = hlist_entry_safe(rp, struct net_bridge_port, rlist);
278 NULL;
279 279
280 if ((unsigned long)lport > (unsigned long)rport) { 280 if ((unsigned long)lport > (unsigned long)rport) {
281 port = lport; 281 port = lport;
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 5bb6681fa91e..05e42d86882d 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -64,7 +64,7 @@ static int port_cost(struct net_device *dev)
64 64
65 65
66/* Check for port carrier transitions. */ 66/* Check for port carrier transitions. */
67void br_port_carrier_check(struct net_bridge_port *p) 67void br_port_carrier_check(struct net_bridge_port *p, bool *notified)
68{ 68{
69 struct net_device *dev = p->dev; 69 struct net_device *dev = p->dev;
70 struct net_bridge *br = p->br; 70 struct net_bridge *br = p->br;
@@ -73,16 +73,21 @@ void br_port_carrier_check(struct net_bridge_port *p)
73 netif_running(dev) && netif_oper_up(dev)) 73 netif_running(dev) && netif_oper_up(dev))
74 p->path_cost = port_cost(dev); 74 p->path_cost = port_cost(dev);
75 75
76 *notified = false;
76 if (!netif_running(br->dev)) 77 if (!netif_running(br->dev))
77 return; 78 return;
78 79
79 spin_lock_bh(&br->lock); 80 spin_lock_bh(&br->lock);
80 if (netif_running(dev) && netif_oper_up(dev)) { 81 if (netif_running(dev) && netif_oper_up(dev)) {
81 if (p->state == BR_STATE_DISABLED) 82 if (p->state == BR_STATE_DISABLED) {
82 br_stp_enable_port(p); 83 br_stp_enable_port(p);
84 *notified = true;
85 }
83 } else { 86 } else {
84 if (p->state != BR_STATE_DISABLED) 87 if (p->state != BR_STATE_DISABLED) {
85 br_stp_disable_port(p); 88 br_stp_disable_port(p);
89 *notified = true;
90 }
86 } 91 }
87 spin_unlock_bh(&br->lock); 92 spin_unlock_bh(&br->lock);
88} 93}
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 7f98a7d25866..72074276c088 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -114,6 +114,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
114 goto drop; 114 goto drop;
115 115
116 BR_INPUT_SKB_CB(skb)->brdev = br->dev; 116 BR_INPUT_SKB_CB(skb)->brdev = br->dev;
117 BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED);
117 118
118 if (IS_ENABLED(CONFIG_INET) && 119 if (IS_ENABLED(CONFIG_INET) &&
119 (skb->protocol == htons(ETH_P_ARP) || 120 (skb->protocol == htons(ETH_P_ARP) ||
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 015f465c514b..9f5eb05b0373 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -139,6 +139,7 @@ static inline size_t br_port_info_size(void)
139 + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ 139 + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */
140 + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */ 140 + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */
141 + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */ 141 + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */
142 + nla_total_size(1) /* IFLA_BRPORT_ISOLATED */
142 + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ 143 + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */
143 + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ 144 + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */
144 + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ 145 + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */
@@ -213,7 +214,8 @@ static int br_port_fill_attrs(struct sk_buff *skb,
213 BR_VLAN_TUNNEL)) || 214 BR_VLAN_TUNNEL)) ||
214 nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) || 215 nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) ||
215 nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS, 216 nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS,
216 !!(p->flags & BR_NEIGH_SUPPRESS))) 217 !!(p->flags & BR_NEIGH_SUPPRESS)) ||
218 nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)))
217 return -EMSGSIZE; 219 return -EMSGSIZE;
218 220
219 timerval = br_timer_value(&p->message_age_timer); 221 timerval = br_timer_value(&p->message_age_timer);
@@ -660,6 +662,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
660 [IFLA_BRPORT_VLAN_TUNNEL] = { .type = NLA_U8 }, 662 [IFLA_BRPORT_VLAN_TUNNEL] = { .type = NLA_U8 },
661 [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, 663 [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 },
662 [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, 664 [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 },
665 [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 },
663}; 666};
664 667
665/* Change the state of the port and notify spanning tree */ 668/* Change the state of the port and notify spanning tree */
@@ -810,6 +813,10 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
810 if (err) 813 if (err)
811 return err; 814 return err;
812 815
816 err = br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED);
817 if (err)
818 return err;
819
813 br_port_flags_change(p, old_flags ^ p->flags); 820 br_port_flags_change(p, old_flags ^ p->flags);
814 return 0; 821 return 0;
815} 822}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index a7cb3ece5031..5216a524b537 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -423,6 +423,7 @@ struct br_input_skb_cb {
423#endif 423#endif
424 424
425 bool proxyarp_replied; 425 bool proxyarp_replied;
426 bool src_port_isolated;
426 427
427#ifdef CONFIG_BRIDGE_VLAN_FILTERING 428#ifdef CONFIG_BRIDGE_VLAN_FILTERING
428 bool vlan_filtered; 429 bool vlan_filtered;
@@ -553,9 +554,11 @@ int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
553int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p); 554int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p);
554void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p); 555void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p);
555int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, 556int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
556 const unsigned char *addr, u16 vid); 557 const unsigned char *addr, u16 vid,
558 bool swdev_notify);
557int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, 559int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
558 const unsigned char *addr, u16 vid); 560 const unsigned char *addr, u16 vid,
561 bool swdev_notify);
559void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, 562void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
560 const unsigned char *addr, u16 vid); 563 const unsigned char *addr, u16 vid);
561 564
@@ -572,8 +575,16 @@ int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
572void br_flood(struct net_bridge *br, struct sk_buff *skb, 575void br_flood(struct net_bridge *br, struct sk_buff *skb,
573 enum br_pkt_type pkt_type, bool local_rcv, bool local_orig); 576 enum br_pkt_type pkt_type, bool local_rcv, bool local_orig);
574 577
578/* return true if both source port and dest port are isolated */
579static inline bool br_skb_isolated(const struct net_bridge_port *to,
580 const struct sk_buff *skb)
581{
582 return BR_INPUT_SKB_CB(skb)->src_port_isolated &&
583 (to->flags & BR_ISOLATED);
584}
585
575/* br_if.c */ 586/* br_if.c */
576void br_port_carrier_check(struct net_bridge_port *p); 587void br_port_carrier_check(struct net_bridge_port *p, bool *notified);
577int br_add_bridge(struct net *net, const char *name); 588int br_add_bridge(struct net *net, const char *name);
578int br_del_bridge(struct net *net, const char *name); 589int br_del_bridge(struct net *net, const char *name);
579int br_add_if(struct net_bridge *br, struct net_device *dev, 590int br_add_if(struct net_bridge *br, struct net_device *dev,
@@ -594,11 +605,22 @@ static inline bool br_rx_handler_check_rcu(const struct net_device *dev)
594 return rcu_dereference(dev->rx_handler) == br_handle_frame; 605 return rcu_dereference(dev->rx_handler) == br_handle_frame;
595} 606}
596 607
608static inline bool br_rx_handler_check_rtnl(const struct net_device *dev)
609{
610 return rcu_dereference_rtnl(dev->rx_handler) == br_handle_frame;
611}
612
597static inline struct net_bridge_port *br_port_get_check_rcu(const struct net_device *dev) 613static inline struct net_bridge_port *br_port_get_check_rcu(const struct net_device *dev)
598{ 614{
599 return br_rx_handler_check_rcu(dev) ? br_port_get_rcu(dev) : NULL; 615 return br_rx_handler_check_rcu(dev) ? br_port_get_rcu(dev) : NULL;
600} 616}
601 617
618static inline struct net_bridge_port *
619br_port_get_check_rtnl(const struct net_device *dev)
620{
621 return br_rx_handler_check_rtnl(dev) ? br_port_get_rtnl_rcu(dev) : NULL;
622}
623
602/* br_ioctl.c */ 624/* br_ioctl.c */
603int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); 625int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
604int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, 626int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd,
@@ -1117,6 +1139,8 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p,
1117 unsigned long mask); 1139 unsigned long mask);
1118void br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, 1140void br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb,
1119 int type); 1141 int type);
1142int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags);
1143int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid);
1120 1144
1121static inline void br_switchdev_frame_unmark(struct sk_buff *skb) 1145static inline void br_switchdev_frame_unmark(struct sk_buff *skb)
1122{ 1146{
@@ -1146,6 +1170,17 @@ static inline int br_switchdev_set_port_flag(struct net_bridge_port *p,
1146 return 0; 1170 return 0;
1147} 1171}
1148 1172
1173static inline int br_switchdev_port_vlan_add(struct net_device *dev,
1174 u16 vid, u16 flags)
1175{
1176 return -EOPNOTSUPP;
1177}
1178
1179static inline int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid)
1180{
1181 return -EOPNOTSUPP;
1182}
1183
1149static inline void 1184static inline void
1150br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) 1185br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
1151{ 1186{
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index ee775f4ff76c..d77f807420c4 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -102,13 +102,15 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p,
102 102
103static void 103static void
104br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac, 104br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
105 u16 vid, struct net_device *dev) 105 u16 vid, struct net_device *dev,
106 bool added_by_user)
106{ 107{
107 struct switchdev_notifier_fdb_info info; 108 struct switchdev_notifier_fdb_info info;
108 unsigned long notifier_type; 109 unsigned long notifier_type;
109 110
110 info.addr = mac; 111 info.addr = mac;
111 info.vid = vid; 112 info.vid = vid;
113 info.added_by_user = added_by_user;
112 notifier_type = adding ? SWITCHDEV_FDB_ADD_TO_DEVICE : SWITCHDEV_FDB_DEL_TO_DEVICE; 114 notifier_type = adding ? SWITCHDEV_FDB_ADD_TO_DEVICE : SWITCHDEV_FDB_DEL_TO_DEVICE;
113 call_switchdev_notifiers(notifier_type, dev, &info.info); 115 call_switchdev_notifiers(notifier_type, dev, &info.info);
114} 116}
@@ -116,19 +118,46 @@ br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
116void 118void
117br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) 119br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
118{ 120{
119 if (!fdb->added_by_user || !fdb->dst) 121 if (!fdb->dst)
120 return; 122 return;
121 123
122 switch (type) { 124 switch (type) {
123 case RTM_DELNEIGH: 125 case RTM_DELNEIGH:
124 br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr, 126 br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr,
125 fdb->key.vlan_id, 127 fdb->key.vlan_id,
126 fdb->dst->dev); 128 fdb->dst->dev,
129 fdb->added_by_user);
127 break; 130 break;
128 case RTM_NEWNEIGH: 131 case RTM_NEWNEIGH:
129 br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr, 132 br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr,
130 fdb->key.vlan_id, 133 fdb->key.vlan_id,
131 fdb->dst->dev); 134 fdb->dst->dev,
135 fdb->added_by_user);
132 break; 136 break;
133 } 137 }
134} 138}
139
140int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags)
141{
142 struct switchdev_obj_port_vlan v = {
143 .obj.orig_dev = dev,
144 .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
145 .flags = flags,
146 .vid_begin = vid,
147 .vid_end = vid,
148 };
149
150 return switchdev_port_obj_add(dev, &v.obj);
151}
152
153int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid)
154{
155 struct switchdev_obj_port_vlan v = {
156 .obj.orig_dev = dev,
157 .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
158 .vid_begin = vid,
159 .vid_end = vid,
160 };
161
162 return switchdev_port_obj_del(dev, &v.obj);
163}
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index fd31ad83ec7b..f99c5bf5c906 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -192,6 +192,7 @@ BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI);
192BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD); 192BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD);
193BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD); 193BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD);
194BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS); 194BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS);
195BRPORT_ATTR_FLAG(isolated, BR_ISOLATED);
195 196
196#ifdef CONFIG_BRIDGE_IGMP_SNOOPING 197#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
197static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) 198static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
@@ -243,6 +244,7 @@ static const struct brport_attribute *brport_attrs[] = {
243 &brport_attr_broadcast_flood, 244 &brport_attr_broadcast_flood,
244 &brport_attr_group_fwd_mask, 245 &brport_attr_group_fwd_mask,
245 &brport_attr_neigh_suppress, 246 &brport_attr_neigh_suppress,
247 &brport_attr_isolated,
246 NULL 248 NULL
247}; 249};
248 250
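With the netlink and sysfs plumbing above, the new per-port isolation knob is exposed both as the IFLA_BRPORT_ISOLATED attribute and, via BRPORT_ATTR_FLAG, as /sys/class/net/<port>/brport/isolated; writing 1 there should set BR_ISOLATED on the port, assuming the usual brport flag store semantics. Traffic is then dropped only when both the ingress and the egress port carry the flag, per br_skb_isolated() added earlier in br_private.h.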
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 9896f4975353..7df269092103 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -82,19 +82,12 @@ static bool __vlan_add_flags(struct net_bridge_vlan *v, u16 flags)
82static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, 82static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br,
83 u16 vid, u16 flags) 83 u16 vid, u16 flags)
84{ 84{
85 struct switchdev_obj_port_vlan v = {
86 .obj.orig_dev = dev,
87 .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
88 .flags = flags,
89 .vid_begin = vid,
90 .vid_end = vid,
91 };
92 int err; 85 int err;
93 86
94 /* Try switchdev op first. In case it is not supported, fallback to 87 /* Try switchdev op first. In case it is not supported, fallback to
95 * 8021q add. 88 * 8021q add.
96 */ 89 */
97 err = switchdev_port_obj_add(dev, &v.obj); 90 err = br_switchdev_port_vlan_add(dev, vid, flags);
98 if (err == -EOPNOTSUPP) 91 if (err == -EOPNOTSUPP)
99 return vlan_vid_add(dev, br->vlan_proto, vid); 92 return vlan_vid_add(dev, br->vlan_proto, vid);
100 return err; 93 return err;
@@ -130,18 +123,12 @@ static void __vlan_del_list(struct net_bridge_vlan *v)
130static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, 123static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br,
131 u16 vid) 124 u16 vid)
132{ 125{
133 struct switchdev_obj_port_vlan v = {
134 .obj.orig_dev = dev,
135 .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
136 .vid_begin = vid,
137 .vid_end = vid,
138 };
139 int err; 126 int err;
140 127
141 /* Try switchdev op first. In case it is not supported, fallback to 128 /* Try switchdev op first. In case it is not supported, fallback to
142 * 8021q del. 129 * 8021q del.
143 */ 130 */
144 err = switchdev_port_obj_del(dev, &v.obj); 131 err = br_switchdev_port_vlan_del(dev, vid);
145 if (err == -EOPNOTSUPP) { 132 if (err == -EOPNOTSUPP) {
146 vlan_vid_del(dev, br->vlan_proto, vid); 133 vlan_vid_del(dev, br->vlan_proto, vid);
147 return 0; 134 return 0;
@@ -259,6 +246,10 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags)
259 goto out_filt; 246 goto out_filt;
260 v->brvlan = masterv; 247 v->brvlan = masterv;
261 v->stats = masterv->stats; 248 v->stats = masterv->stats;
249 } else {
250 err = br_switchdev_port_vlan_add(dev, v->vid, flags);
251 if (err && err != -EOPNOTSUPP)
252 goto out;
262 } 253 }
263 254
264 /* Add the dev mac and count the vlan only if it's usable */ 255 /* Add the dev mac and count the vlan only if it's usable */
@@ -294,6 +285,8 @@ out_filt:
294 br_vlan_put_master(masterv); 285 br_vlan_put_master(masterv);
295 v->brvlan = NULL; 286 v->brvlan = NULL;
296 } 287 }
288 } else {
289 br_switchdev_port_vlan_del(dev, v->vid);
297 } 290 }
298 291
299 goto out; 292 goto out;
@@ -319,6 +312,11 @@ static int __vlan_del(struct net_bridge_vlan *v)
319 err = __vlan_vid_del(p->dev, p->br, v->vid); 312 err = __vlan_vid_del(p->dev, p->br, v->vid);
320 if (err) 313 if (err)
321 goto out; 314 goto out;
315 } else {
316 err = br_switchdev_port_vlan_del(v->br->dev, v->vid);
317 if (err && err != -EOPNOTSUPP)
318 goto out;
319 err = 0;
322 } 320 }
323 321
324 if (br_vlan_should_use(v)) { 322 if (br_vlan_should_use(v)) {
@@ -564,6 +562,48 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid)
564 return false; 562 return false;
565} 563}
566 564
565static int br_vlan_add_existing(struct net_bridge *br,
566 struct net_bridge_vlan_group *vg,
567 struct net_bridge_vlan *vlan,
568 u16 flags, bool *changed)
569{
570 int err;
571
572 err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags);
573 if (err && err != -EOPNOTSUPP)
574 return err;
575
576 if (!br_vlan_is_brentry(vlan)) {
577 /* Trying to change flags of non-existent bridge vlan */
578 if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) {
579 err = -EINVAL;
580 goto err_flags;
581 }
582 /* It was only kept for port vlans, now make it real */
583 err = br_fdb_insert(br, NULL, br->dev->dev_addr,
584 vlan->vid);
585 if (err) {
586 br_err(br, "failed to insert local address into bridge forwarding table\n");
587 goto err_fdb_insert;
588 }
589
590 refcount_inc(&vlan->refcnt);
591 vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY;
592 vg->num_vlans++;
593 *changed = true;
594 }
595
596 if (__vlan_add_flags(vlan, flags))
597 *changed = true;
598
599 return 0;
600
601err_fdb_insert:
602err_flags:
603 br_switchdev_port_vlan_del(br->dev, vlan->vid);
604 return err;
605}
606
567/* Must be protected by RTNL. 607/* Must be protected by RTNL.
568 * Must be called with vid in range from 1 to 4094 inclusive. 608 * Must be called with vid in range from 1 to 4094 inclusive.
569 * changed must be true only if the vlan was created or updated 609 * changed must be true only if the vlan was created or updated
@@ -579,28 +619,8 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed)
579 *changed = false; 619 *changed = false;
580 vg = br_vlan_group(br); 620 vg = br_vlan_group(br);
581 vlan = br_vlan_find(vg, vid); 621 vlan = br_vlan_find(vg, vid);
582 if (vlan) { 622 if (vlan)
583 if (!br_vlan_is_brentry(vlan)) { 623 return br_vlan_add_existing(br, vg, vlan, flags, changed);
584 /* Trying to change flags of non-existent bridge vlan */
585 if (!(flags & BRIDGE_VLAN_INFO_BRENTRY))
586 return -EINVAL;
587 /* It was only kept for port vlans, now make it real */
588 ret = br_fdb_insert(br, NULL, br->dev->dev_addr,
589 vlan->vid);
590 if (ret) {
591 br_err(br, "failed insert local address into bridge forwarding table\n");
592 return ret;
593 }
594 refcount_inc(&vlan->refcnt);
595 vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY;
596 vg->num_vlans++;
597 *changed = true;
598 }
599 if (__vlan_add_flags(vlan, flags))
600 *changed = true;
601
602 return 0;
603 }
604 624
605 vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); 625 vlan = kzalloc(sizeof(*vlan), GFP_KERNEL);
606 if (!vlan) 626 if (!vlan)
@@ -1053,13 +1073,6 @@ err_vlan_enabled:
1053int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, 1073int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
1054 bool *changed) 1074 bool *changed)
1055{ 1075{
1056 struct switchdev_obj_port_vlan v = {
1057 .obj.orig_dev = port->dev,
1058 .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
1059 .flags = flags,
1060 .vid_begin = vid,
1061 .vid_end = vid,
1062 };
1063 struct net_bridge_vlan *vlan; 1076 struct net_bridge_vlan *vlan;
1064 int ret; 1077 int ret;
1065 1078
@@ -1069,7 +1082,7 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
1069 vlan = br_vlan_find(nbp_vlan_group(port), vid); 1082 vlan = br_vlan_find(nbp_vlan_group(port), vid);
1070 if (vlan) { 1083 if (vlan) {
1071 /* Pass the flags to the hardware bridge */ 1084 /* Pass the flags to the hardware bridge */
1072 ret = switchdev_port_obj_add(port->dev, &v.obj); 1085 ret = br_switchdev_port_vlan_add(port->dev, vid, flags);
1073 if (ret && ret != -EOPNOTSUPP) 1086 if (ret && ret != -EOPNOTSUPP)
1074 return ret; 1087 return ret;
1075 *changed = __vlan_add_flags(vlan, flags); 1088 *changed = __vlan_add_flags(vlan, flags);
@@ -1149,3 +1162,44 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v,
1149 stats->tx_packets += txpackets; 1162 stats->tx_packets += txpackets;
1150 } 1163 }
1151} 1164}
1165
1166int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
1167{
1168 struct net_bridge_vlan_group *vg;
1169
1170 ASSERT_RTNL();
1171 if (netif_is_bridge_master(dev))
1172 vg = br_vlan_group(netdev_priv(dev));
1173 else
1174 return -EINVAL;
1175
1176 *p_pvid = br_get_pvid(vg);
1177 return 0;
1178}
1179EXPORT_SYMBOL_GPL(br_vlan_get_pvid);
1180
1181int br_vlan_get_info(const struct net_device *dev, u16 vid,
1182 struct bridge_vlan_info *p_vinfo)
1183{
1184 struct net_bridge_vlan_group *vg;
1185 struct net_bridge_vlan *v;
1186 struct net_bridge_port *p;
1187
1188 ASSERT_RTNL();
1189 p = br_port_get_check_rtnl(dev);
1190 if (p)
1191 vg = nbp_vlan_group(p);
1192 else if (netif_is_bridge_master(dev))
1193 vg = br_vlan_group(netdev_priv(dev));
1194 else
1195 return -EINVAL;
1196
1197 v = br_vlan_find(vg, vid);
1198 if (!v)
1199 return -ENOENT;
1200
1201 p_vinfo->vid = vid;
1202 p_vinfo->flags = v->flags;
1203 return 0;
1204}
1205EXPORT_SYMBOL_GPL(br_vlan_get_info);
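br_vlan_get_pvid() and br_vlan_get_info() added above are exported so that drivers can query bridge VLAN state under RTNL. A sketch of a hypothetical caller follows; the names are illustrative, and the declarations plus struct bridge_vlan_info are assumed to come in via linux/if_bridge.h:

#include <linux/if_bridge.h>
#include <linux/rtnetlink.h>

/* Hypothetical check: is vid the PVID of bridge device br_dev, and is it
 * actually configured as a bridge VLAN entry? Caller holds RTNL. */
static bool my_vid_is_bridge_pvid(struct net_device *br_dev, u16 vid)
{
	struct bridge_vlan_info vinfo;
	u16 pvid;

	ASSERT_RTNL();
	if (br_vlan_get_pvid(br_dev, &pvid) || pvid != vid)
		return false;
	return br_vlan_get_info(br_dev, vid, &vinfo) == 0;
}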
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index f212447794bd..9a0159aebe1a 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -8,13 +8,6 @@ menuconfig NF_TABLES_BRIDGE
8 bool "Ethernet Bridge nf_tables support" 8 bool "Ethernet Bridge nf_tables support"
9 9
10if NF_TABLES_BRIDGE 10if NF_TABLES_BRIDGE
11
12config NFT_BRIDGE_META
13 tristate "Netfilter nf_table bridge meta support"
14 depends on NFT_META
15 help
16 Add support for bridge dedicated meta key.
17
18config NFT_BRIDGE_REJECT 11config NFT_BRIDGE_REJECT
19 tristate "Netfilter nf_tables bridge reject support" 12 tristate "Netfilter nf_tables bridge reject support"
20 depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6 13 depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6
diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile
index 4bc758dd4a8c..9b868861f21a 100644
--- a/net/bridge/netfilter/Makefile
+++ b/net/bridge/netfilter/Makefile
@@ -3,7 +3,6 @@
3# Makefile for the netfilter modules for Link Layer filtering on a bridge. 3# Makefile for the netfilter modules for Link Layer filtering on a bridge.
4# 4#
5 5
6obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o
7obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o 6obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o
8 7
9# packet logging 8# packet logging
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 0e27c51331fb..28f68a2ec911 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -101,7 +101,7 @@ ebt_do_match(struct ebt_entry_match *m, const struct sk_buff *skb,
101{ 101{
102 par->match = m->u.match; 102 par->match = m->u.match;
103 par->matchinfo = m->data; 103 par->matchinfo = m->data;
104 return m->u.match->match(skb, par) ? EBT_MATCH : EBT_NOMATCH; 104 return !m->u.match->match(skb, par);
105} 105}
106 106
107static inline int 107static inline int
@@ -177,6 +177,12 @@ struct ebt_entry *ebt_next_entry(const struct ebt_entry *entry)
177 return (void *)entry + entry->next_offset; 177 return (void *)entry + entry->next_offset;
178} 178}
179 179
180static inline const struct ebt_entry_target *
181ebt_get_target_c(const struct ebt_entry *e)
182{
183 return ebt_get_target((struct ebt_entry *)e);
184}
185
180/* Do some firewalling */ 186/* Do some firewalling */
181unsigned int ebt_do_table(struct sk_buff *skb, 187unsigned int ebt_do_table(struct sk_buff *skb,
182 const struct nf_hook_state *state, 188 const struct nf_hook_state *state,
@@ -230,8 +236,7 @@ unsigned int ebt_do_table(struct sk_buff *skb,
230 */ 236 */
231 EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, &acpar); 237 EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, &acpar);
232 238
233 t = (struct ebt_entry_target *) 239 t = ebt_get_target_c(point);
234 (((char *)point) + point->target_offset);
235 /* standard target */ 240 /* standard target */
236 if (!t->u.target->target) 241 if (!t->u.target->target)
237 verdict = ((struct ebt_standard_target *)t)->verdict; 242 verdict = ((struct ebt_standard_target *)t)->verdict;
@@ -343,6 +348,16 @@ find_table_lock(struct net *net, const char *name, int *error,
343 "ebtable_", error, mutex); 348 "ebtable_", error, mutex);
344} 349}
345 350
351static inline void ebt_free_table_info(struct ebt_table_info *info)
352{
353 int i;
354
355 if (info->chainstack) {
356 for_each_possible_cpu(i)
357 vfree(info->chainstack[i]);
358 vfree(info->chainstack);
359 }
360}
346static inline int 361static inline int
347ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par, 362ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par,
348 unsigned int *cnt) 363 unsigned int *cnt)
@@ -627,7 +642,7 @@ ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt)
627 return 1; 642 return 1;
628 EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, NULL); 643 EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, NULL);
629 EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL); 644 EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL);
630 t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); 645 t = ebt_get_target(e);
631 646
632 par.net = net; 647 par.net = net;
633 par.target = t->u.target; 648 par.target = t->u.target;
@@ -706,7 +721,7 @@ ebt_check_entry(struct ebt_entry *e, struct net *net,
706 ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, &tgpar, &j); 721 ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, &tgpar, &j);
707 if (ret != 0) 722 if (ret != 0)
708 goto cleanup_watchers; 723 goto cleanup_watchers;
709 t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); 724 t = ebt_get_target(e);
710 gap = e->next_offset - e->target_offset; 725 gap = e->next_offset - e->target_offset;
711 726
712 target = xt_request_find_target(NFPROTO_BRIDGE, t->u.name, 0); 727 target = xt_request_find_target(NFPROTO_BRIDGE, t->u.name, 0);
@@ -779,8 +794,7 @@ static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack
779 if (pos == nentries) 794 if (pos == nentries)
780 continue; 795 continue;
781 } 796 }
782 t = (struct ebt_entry_target *) 797 t = ebt_get_target_c(e);
783 (((char *)e) + e->target_offset);
784 if (strcmp(t->u.name, EBT_STANDARD_TARGET)) 798 if (strcmp(t->u.name, EBT_STANDARD_TARGET))
785 goto letscontinue; 799 goto letscontinue;
786 if (e->target_offset + sizeof(struct ebt_standard_target) > 800 if (e->target_offset + sizeof(struct ebt_standard_target) >
@@ -975,7 +989,7 @@ static void get_counters(const struct ebt_counter *oldcounters,
975static int do_replace_finish(struct net *net, struct ebt_replace *repl, 989static int do_replace_finish(struct net *net, struct ebt_replace *repl,
976 struct ebt_table_info *newinfo) 990 struct ebt_table_info *newinfo)
977{ 991{
978 int ret, i; 992 int ret;
979 struct ebt_counter *counterstmp = NULL; 993 struct ebt_counter *counterstmp = NULL;
980 /* used to be able to unlock earlier */ 994 /* used to be able to unlock earlier */
981 struct ebt_table_info *table; 995 struct ebt_table_info *table;
@@ -1051,13 +1065,8 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
1051 ebt_cleanup_entry, net, NULL); 1065 ebt_cleanup_entry, net, NULL);
1052 1066
1053 vfree(table->entries); 1067 vfree(table->entries);
1054 if (table->chainstack) { 1068 ebt_free_table_info(table);
1055 for_each_possible_cpu(i)
1056 vfree(table->chainstack[i]);
1057 vfree(table->chainstack);
1058 }
1059 vfree(table); 1069 vfree(table);
1060
1061 vfree(counterstmp); 1070 vfree(counterstmp);
1062 1071
1063#ifdef CONFIG_AUDIT 1072#ifdef CONFIG_AUDIT
@@ -1078,11 +1087,7 @@ free_iterate:
1078free_counterstmp: 1087free_counterstmp:
1079 vfree(counterstmp); 1088 vfree(counterstmp);
1080 /* can be initialized in translate_table() */ 1089 /* can be initialized in translate_table() */
1081 if (newinfo->chainstack) { 1090 ebt_free_table_info(newinfo);
1082 for_each_possible_cpu(i)
1083 vfree(newinfo->chainstack[i]);
1084 vfree(newinfo->chainstack);
1085 }
1086 return ret; 1091 return ret;
1087} 1092}
1088 1093
@@ -1147,8 +1152,6 @@ free_newinfo:
1147 1152
1148static void __ebt_unregister_table(struct net *net, struct ebt_table *table) 1153static void __ebt_unregister_table(struct net *net, struct ebt_table *table)
1149{ 1154{
1150 int i;
1151
1152 mutex_lock(&ebt_mutex); 1155 mutex_lock(&ebt_mutex);
1153 list_del(&table->list); 1156 list_del(&table->list);
1154 mutex_unlock(&ebt_mutex); 1157 mutex_unlock(&ebt_mutex);
@@ -1157,11 +1160,7 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table)
1157 if (table->private->nentries) 1160 if (table->private->nentries)
1158 module_put(table->me); 1161 module_put(table->me);
1159 vfree(table->private->entries); 1162 vfree(table->private->entries);
1160 if (table->private->chainstack) { 1163 ebt_free_table_info(table->private);
1161 for_each_possible_cpu(i)
1162 vfree(table->private->chainstack[i]);
1163 vfree(table->private->chainstack);
1164 }
1165 vfree(table->private); 1164 vfree(table->private);
1166 kfree(table); 1165 kfree(table);
1167} 1166}
@@ -1263,11 +1262,7 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table,
1263free_unlock: 1262free_unlock:
1264 mutex_unlock(&ebt_mutex); 1263 mutex_unlock(&ebt_mutex);
1265free_chainstack: 1264free_chainstack:
1266 if (newinfo->chainstack) { 1265 ebt_free_table_info(newinfo);
1267 for_each_possible_cpu(i)
1268 vfree(newinfo->chainstack[i]);
1269 vfree(newinfo->chainstack);
1270 }
1271 vfree(newinfo->entries); 1266 vfree(newinfo->entries);
1272free_newinfo: 1267free_newinfo:
1273 vfree(newinfo); 1268 vfree(newinfo);
@@ -1405,7 +1400,7 @@ static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base,
1405 return -EFAULT; 1400 return -EFAULT;
1406 1401
1407 hlp = ubase + (((char *)e + e->target_offset) - base); 1402 hlp = ubase + (((char *)e + e->target_offset) - base);
1408 t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); 1403 t = ebt_get_target_c(e);
1409 1404
1410 ret = EBT_MATCH_ITERATE(e, ebt_match_to_user, base, ubase); 1405 ret = EBT_MATCH_ITERATE(e, ebt_match_to_user, base, ubase);
1411 if (ret != 0) 1406 if (ret != 0)
@@ -1746,7 +1741,7 @@ static int compat_copy_entry_to_user(struct ebt_entry *e, void __user **dstptr,
1746 return ret; 1741 return ret;
1747 target_offset = e->target_offset - (origsize - *size); 1742 target_offset = e->target_offset - (origsize - *size);
1748 1743
1749 t = (struct ebt_entry_target *) ((char *) e + e->target_offset); 1744 t = ebt_get_target(e);
1750 1745
1751 ret = compat_target_to_user(t, dstptr, size); 1746 ret = compat_target_to_user(t, dstptr, size);
1752 if (ret) 1747 if (ret)
@@ -1794,7 +1789,7 @@ static int compat_calc_entry(const struct ebt_entry *e,
1794 EBT_MATCH_ITERATE(e, compat_calc_match, &off); 1789 EBT_MATCH_ITERATE(e, compat_calc_match, &off);
1795 EBT_WATCHER_ITERATE(e, compat_calc_watcher, &off); 1790 EBT_WATCHER_ITERATE(e, compat_calc_watcher, &off);
1796 1791
1797 t = (const struct ebt_entry_target *) ((char *) e + e->target_offset); 1792 t = ebt_get_target_c(e);
1798 1793
1799 off += xt_compat_target_offset(t->u.target); 1794 off += xt_compat_target_offset(t->u.target);
1800 off += ebt_compat_entry_padsize(); 1795 off += ebt_compat_entry_padsize();
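The repeated "(struct ebt_entry_target *)((char *)e + e->target_offset)" expressions are replaced with ebt_get_target()/ebt_get_target_c() throughout. Only the const wrapper is visible in this diff; the underlying helper is not shown here, but given what it replaces it is presumably equivalent to the following sketch (assumed to live in the ebtables uapi header):

static inline struct ebt_entry_target *
ebt_get_target(struct ebt_entry *e)
{
	return (struct ebt_entry_target *)((char *)e + e->target_offset);
}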
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
deleted file mode 100644
index bb63c9aed55d..000000000000
--- a/net/bridge/netfilter/nft_meta_bridge.c
+++ /dev/null
@@ -1,135 +0,0 @@
1/*
2 * Copyright (c) 2014 Intel Corporation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 */
9
10#include <linux/kernel.h>
11#include <linux/init.h>
12#include <linux/module.h>
13#include <linux/netlink.h>
14#include <linux/netfilter.h>
15#include <linux/netfilter/nf_tables.h>
16#include <net/netfilter/nf_tables.h>
17#include <net/netfilter/nft_meta.h>
18
19#include "../br_private.h"
20
21static void nft_meta_bridge_get_eval(const struct nft_expr *expr,
22 struct nft_regs *regs,
23 const struct nft_pktinfo *pkt)
24{
25 const struct nft_meta *priv = nft_expr_priv(expr);
26 const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
27 u32 *dest = &regs->data[priv->dreg];
28 const struct net_bridge_port *p;
29
30 switch (priv->key) {
31 case NFT_META_BRI_IIFNAME:
32 if (in == NULL || (p = br_port_get_rcu(in)) == NULL)
33 goto err;
34 break;
35 case NFT_META_BRI_OIFNAME:
36 if (out == NULL || (p = br_port_get_rcu(out)) == NULL)
37 goto err;
38 break;
39 default:
40 goto out;
41 }
42
43 strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
44 return;
45out:
46 return nft_meta_get_eval(expr, regs, pkt);
47err:
48 regs->verdict.code = NFT_BREAK;
49}
50
51static int nft_meta_bridge_get_init(const struct nft_ctx *ctx,
52 const struct nft_expr *expr,
53 const struct nlattr * const tb[])
54{
55 struct nft_meta *priv = nft_expr_priv(expr);
56 unsigned int len;
57
58 priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
59 switch (priv->key) {
60 case NFT_META_BRI_IIFNAME:
61 case NFT_META_BRI_OIFNAME:
62 len = IFNAMSIZ;
63 break;
64 default:
65 return nft_meta_get_init(ctx, expr, tb);
66 }
67
68 priv->dreg = nft_parse_register(tb[NFTA_META_DREG]);
69 return nft_validate_register_store(ctx, priv->dreg, NULL,
70 NFT_DATA_VALUE, len);
71}
72
73static struct nft_expr_type nft_meta_bridge_type;
74static const struct nft_expr_ops nft_meta_bridge_get_ops = {
75 .type = &nft_meta_bridge_type,
76 .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
77 .eval = nft_meta_bridge_get_eval,
78 .init = nft_meta_bridge_get_init,
79 .dump = nft_meta_get_dump,
80};
81
82static const struct nft_expr_ops nft_meta_bridge_set_ops = {
83 .type = &nft_meta_bridge_type,
84 .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
85 .eval = nft_meta_set_eval,
86 .init = nft_meta_set_init,
87 .destroy = nft_meta_set_destroy,
88 .dump = nft_meta_set_dump,
89 .validate = nft_meta_set_validate,
90};
91
92static const struct nft_expr_ops *
93nft_meta_bridge_select_ops(const struct nft_ctx *ctx,
94 const struct nlattr * const tb[])
95{
96 if (tb[NFTA_META_KEY] == NULL)
97 return ERR_PTR(-EINVAL);
98
99 if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG])
100 return ERR_PTR(-EINVAL);
101
102 if (tb[NFTA_META_DREG])
103 return &nft_meta_bridge_get_ops;
104
105 if (tb[NFTA_META_SREG])
106 return &nft_meta_bridge_set_ops;
107
108 return ERR_PTR(-EINVAL);
109}
110
111static struct nft_expr_type nft_meta_bridge_type __read_mostly = {
112 .family = NFPROTO_BRIDGE,
113 .name = "meta",
114 .select_ops = nft_meta_bridge_select_ops,
115 .policy = nft_meta_policy,
116 .maxattr = NFTA_META_MAX,
117 .owner = THIS_MODULE,
118};
119
120static int __init nft_meta_bridge_module_init(void)
121{
122 return nft_register_expr(&nft_meta_bridge_type);
123}
124
125static void __exit nft_meta_bridge_module_exit(void)
126{
127 nft_unregister_expr(&nft_meta_bridge_type);
128}
129
130module_init(nft_meta_bridge_module_init);
131module_exit(nft_meta_bridge_module_exit);
132
133MODULE_LICENSE("GPL");
134MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>");
135MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta");
diff --git a/net/core/Makefile b/net/core/Makefile
index 6dbbba8c57ae..80175e6a2eb8 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -14,6 +14,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
14 fib_notifier.o xdp.o 14 fib_notifier.o xdp.o
15 15
16obj-y += net-sysfs.o 16obj-y += net-sysfs.o
17obj-$(CONFIG_PAGE_POOL) += page_pool.o
17obj-$(CONFIG_PROC_FS) += net-procfs.o 18obj-$(CONFIG_PROC_FS) += net-procfs.o
18obj-$(CONFIG_NET_PKTGEN) += pktgen.o 19obj-$(CONFIG_NET_PKTGEN) += pktgen.o
19obj-$(CONFIG_NETPOLL) += netpoll.o 20obj-$(CONFIG_NETPOLL) += netpoll.o
@@ -30,3 +31,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o
30obj-$(CONFIG_HWBM) += hwbm.o 31obj-$(CONFIG_HWBM) += hwbm.o
31obj-$(CONFIG_NET_DEVLINK) += devlink.o 32obj-$(CONFIG_NET_DEVLINK) += devlink.o
32obj-$(CONFIG_GRO_CELLS) += gro_cells.o 33obj-$(CONFIG_GRO_CELLS) += gro_cells.o
34obj-$(CONFIG_FAILOVER) += failover.o
diff --git a/net/core/dev.c b/net/core/dev.c
index 9c149238a4ce..6e18242a1cae 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1285,6 +1285,7 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1285 1285
1286 return len; 1286 return len;
1287} 1287}
1288EXPORT_SYMBOL(dev_set_alias);
1288 1289
1289/** 1290/**
1290 * dev_get_alias - get ifalias of a device 1291 * dev_get_alias - get ifalias of a device
@@ -1586,7 +1587,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1586 N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) 1587 N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1587 N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) 1588 N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1588 N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) 1589 N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1589 }; 1590 }
1590#undef N 1591#undef N
1591 return "UNKNOWN_NETDEV_EVENT"; 1592 return "UNKNOWN_NETDEV_EVENT";
1592} 1593}
@@ -1754,38 +1755,38 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1754EXPORT_SYMBOL(call_netdevice_notifiers); 1755EXPORT_SYMBOL(call_netdevice_notifiers);
1755 1756
1756#ifdef CONFIG_NET_INGRESS 1757#ifdef CONFIG_NET_INGRESS
1757static struct static_key ingress_needed __read_mostly; 1758static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
1758 1759
1759void net_inc_ingress_queue(void) 1760void net_inc_ingress_queue(void)
1760{ 1761{
1761 static_key_slow_inc(&ingress_needed); 1762 static_branch_inc(&ingress_needed_key);
1762} 1763}
1763EXPORT_SYMBOL_GPL(net_inc_ingress_queue); 1764EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1764 1765
1765void net_dec_ingress_queue(void) 1766void net_dec_ingress_queue(void)
1766{ 1767{
1767 static_key_slow_dec(&ingress_needed); 1768 static_branch_dec(&ingress_needed_key);
1768} 1769}
1769EXPORT_SYMBOL_GPL(net_dec_ingress_queue); 1770EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1770#endif 1771#endif
1771 1772
1772#ifdef CONFIG_NET_EGRESS 1773#ifdef CONFIG_NET_EGRESS
1773static struct static_key egress_needed __read_mostly; 1774static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
1774 1775
1775void net_inc_egress_queue(void) 1776void net_inc_egress_queue(void)
1776{ 1777{
1777 static_key_slow_inc(&egress_needed); 1778 static_branch_inc(&egress_needed_key);
1778} 1779}
1779EXPORT_SYMBOL_GPL(net_inc_egress_queue); 1780EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1780 1781
1781void net_dec_egress_queue(void) 1782void net_dec_egress_queue(void)
1782{ 1783{
1783 static_key_slow_dec(&egress_needed); 1784 static_branch_dec(&egress_needed_key);
1784} 1785}
1785EXPORT_SYMBOL_GPL(net_dec_egress_queue); 1786EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1786#endif 1787#endif
1787 1788
1788static struct static_key netstamp_needed __read_mostly; 1789static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
1789#ifdef HAVE_JUMP_LABEL 1790#ifdef HAVE_JUMP_LABEL
1790static atomic_t netstamp_needed_deferred; 1791static atomic_t netstamp_needed_deferred;
1791static atomic_t netstamp_wanted; 1792static atomic_t netstamp_wanted;
@@ -1796,9 +1797,9 @@ static void netstamp_clear(struct work_struct *work)
1796 1797
1797 wanted = atomic_add_return(deferred, &netstamp_wanted); 1798 wanted = atomic_add_return(deferred, &netstamp_wanted);
1798 if (wanted > 0) 1799 if (wanted > 0)
1799 static_key_enable(&netstamp_needed); 1800 static_branch_enable(&netstamp_needed_key);
1800 else 1801 else
1801 static_key_disable(&netstamp_needed); 1802 static_branch_disable(&netstamp_needed_key);
1802} 1803}
1803static DECLARE_WORK(netstamp_work, netstamp_clear); 1804static DECLARE_WORK(netstamp_work, netstamp_clear);
1804#endif 1805#endif
@@ -1818,7 +1819,7 @@ void net_enable_timestamp(void)
1818 atomic_inc(&netstamp_needed_deferred); 1819 atomic_inc(&netstamp_needed_deferred);
1819 schedule_work(&netstamp_work); 1820 schedule_work(&netstamp_work);
1820#else 1821#else
1821 static_key_slow_inc(&netstamp_needed); 1822 static_branch_inc(&netstamp_needed_key);
1822#endif 1823#endif
1823} 1824}
1824EXPORT_SYMBOL(net_enable_timestamp); 1825EXPORT_SYMBOL(net_enable_timestamp);
@@ -1838,7 +1839,7 @@ void net_disable_timestamp(void)
1838 atomic_dec(&netstamp_needed_deferred); 1839 atomic_dec(&netstamp_needed_deferred);
1839 schedule_work(&netstamp_work); 1840 schedule_work(&netstamp_work);
1840#else 1841#else
1841 static_key_slow_dec(&netstamp_needed); 1842 static_branch_dec(&netstamp_needed_key);
1842#endif 1843#endif
1843} 1844}
1844EXPORT_SYMBOL(net_disable_timestamp); 1845EXPORT_SYMBOL(net_disable_timestamp);
@@ -1846,15 +1847,15 @@ EXPORT_SYMBOL(net_disable_timestamp);
1846static inline void net_timestamp_set(struct sk_buff *skb) 1847static inline void net_timestamp_set(struct sk_buff *skb)
1847{ 1848{
1848 skb->tstamp = 0; 1849 skb->tstamp = 0;
1849 if (static_key_false(&netstamp_needed)) 1850 if (static_branch_unlikely(&netstamp_needed_key))
1850 __net_timestamp(skb); 1851 __net_timestamp(skb);
1851} 1852}
1852 1853
1853#define net_timestamp_check(COND, SKB) \ 1854#define net_timestamp_check(COND, SKB) \
1854 if (static_key_false(&netstamp_needed)) { \ 1855 if (static_branch_unlikely(&netstamp_needed_key)) { \
1855 if ((COND) && !(SKB)->tstamp) \ 1856 if ((COND) && !(SKB)->tstamp) \
1856 __net_timestamp(SKB); \ 1857 __net_timestamp(SKB); \
1857 } \ 1858 } \
1858 1859
1859bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) 1860bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1860{ 1861{
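
The hunks above convert dev.c's boolean fast-path gates from the old struct static_key API to static branches. A minimal sketch of the same pattern, with example_* names invented for illustration:

#include <linux/jump_label.h>

static void example_slow_work(void) { /* only runs while the key is enabled */ }

/* Compiled as a NOP on the fast path until somebody increments the key. */
static DEFINE_STATIC_KEY_FALSE(example_needed_key);

void example_feature_inc(void)
{
	static_branch_inc(&example_needed_key);		/* was static_key_slow_inc() */
}

void example_feature_dec(void)
{
	static_branch_dec(&example_needed_key);		/* was static_key_slow_dec() */
}

void example_fast_path(void)
{
	/* was static_key_false(); hints the branch as unlikely */
	if (static_branch_unlikely(&example_needed_key))
		example_slow_work();
}
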
@@ -2614,17 +2615,16 @@ EXPORT_SYMBOL(netif_device_attach);
2614 * Returns a Tx hash based on the given packet descriptor a Tx queues' number 2615 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2615 * to be used as a distribution range. 2616 * to be used as a distribution range.
2616 */ 2617 */
2617u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, 2618static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb)
2618 unsigned int num_tx_queues)
2619{ 2619{
2620 u32 hash; 2620 u32 hash;
2621 u16 qoffset = 0; 2621 u16 qoffset = 0;
2622 u16 qcount = num_tx_queues; 2622 u16 qcount = dev->real_num_tx_queues;
2623 2623
2624 if (skb_rx_queue_recorded(skb)) { 2624 if (skb_rx_queue_recorded(skb)) {
2625 hash = skb_get_rx_queue(skb); 2625 hash = skb_get_rx_queue(skb);
2626 while (unlikely(hash >= num_tx_queues)) 2626 while (unlikely(hash >= qcount))
2627 hash -= num_tx_queues; 2627 hash -= qcount;
2628 return hash; 2628 return hash;
2629 } 2629 }
2630 2630
@@ -2637,7 +2637,6 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2637 2637
2638 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; 2638 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2639} 2639}
2640EXPORT_SYMBOL(__skb_tx_hash);
2641 2640
2642static void skb_warn_bad_offload(const struct sk_buff *skb) 2641static void skb_warn_bad_offload(const struct sk_buff *skb)
2643{ 2642{
@@ -3095,6 +3094,10 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
3095 if (unlikely(!skb)) 3094 if (unlikely(!skb))
3096 goto out_null; 3095 goto out_null;
3097 3096
3097 skb = sk_validate_xmit_skb(skb, dev);
3098 if (unlikely(!skb))
3099 goto out_null;
3100
3098 if (netif_needs_gso(skb, features)) { 3101 if (netif_needs_gso(skb, features)) {
3099 struct sk_buff *segs; 3102 struct sk_buff *segs;
3100 3103
@@ -3223,7 +3226,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3223 rc = NET_XMIT_DROP; 3226 rc = NET_XMIT_DROP;
3224 } else { 3227 } else {
3225 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; 3228 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3226 __qdisc_run(q); 3229 qdisc_run(q);
3227 } 3230 }
3228 3231
3229 if (unlikely(to_free)) 3232 if (unlikely(to_free))
@@ -3511,7 +3514,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3511#ifdef CONFIG_NET_CLS_ACT 3514#ifdef CONFIG_NET_CLS_ACT
3512 skb->tc_at_ingress = 0; 3515 skb->tc_at_ingress = 0;
3513# ifdef CONFIG_NET_EGRESS 3516# ifdef CONFIG_NET_EGRESS
3514 if (static_key_false(&egress_needed)) { 3517 if (static_branch_unlikely(&egress_needed_key)) {
3515 skb = sch_handle_egress(skb, &rc, dev); 3518 skb = sch_handle_egress(skb, &rc, dev);
3516 if (!skb) 3519 if (!skb)
3517 goto out; 3520 goto out;
@@ -3606,6 +3609,44 @@ int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3606} 3609}
3607EXPORT_SYMBOL(dev_queue_xmit_accel); 3610EXPORT_SYMBOL(dev_queue_xmit_accel);
3608 3611
3612int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
3613{
3614 struct net_device *dev = skb->dev;
3615 struct sk_buff *orig_skb = skb;
3616 struct netdev_queue *txq;
3617 int ret = NETDEV_TX_BUSY;
3618 bool again = false;
3619
3620 if (unlikely(!netif_running(dev) ||
3621 !netif_carrier_ok(dev)))
3622 goto drop;
3623
3624 skb = validate_xmit_skb_list(skb, dev, &again);
3625 if (skb != orig_skb)
3626 goto drop;
3627
3628 skb_set_queue_mapping(skb, queue_id);
3629 txq = skb_get_tx_queue(dev, skb);
3630
3631 local_bh_disable();
3632
3633 HARD_TX_LOCK(dev, txq, smp_processor_id());
3634 if (!netif_xmit_frozen_or_drv_stopped(txq))
3635 ret = netdev_start_xmit(skb, dev, txq, false);
3636 HARD_TX_UNLOCK(dev, txq);
3637
3638 local_bh_enable();
3639
3640 if (!dev_xmit_complete(ret))
3641 kfree_skb(skb);
3642
3643 return ret;
3644drop:
3645 atomic_long_inc(&dev->tx_dropped);
3646 kfree_skb_list(skb);
3647 return NET_XMIT_DROP;
3648}
3649EXPORT_SYMBOL(dev_direct_xmit);
3609 3650
3610/************************************************************************* 3651/*************************************************************************
3611 * Receiver routines 3652 * Receiver routines
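
dev_direct_xmit() above lets a caller (the AF_XDP generic transmit path in this series) push a fully built skb onto one specific TX queue, bypassing the qdisc layer. A hedged sketch of a caller, assuming skb->dev is already set and the skb is otherwise ready for transmission:

/* Illustrative only: send one prepared skb on the given queue. */
static int example_direct_send(struct sk_buff *skb, u16 queue_id)
{
	int ret;

	ret = dev_direct_xmit(skb, queue_id);
	if (ret == NET_XMIT_DROP)
		return -ENOBUFS;	/* device down or queue frozen; skb was freed */

	return dev_xmit_complete(ret) ? 0 : -EBUSY;
}
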
@@ -3975,12 +4016,12 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
3975} 4016}
3976 4017
3977static u32 netif_receive_generic_xdp(struct sk_buff *skb, 4018static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4019 struct xdp_buff *xdp,
3978 struct bpf_prog *xdp_prog) 4020 struct bpf_prog *xdp_prog)
3979{ 4021{
3980 struct netdev_rx_queue *rxqueue; 4022 struct netdev_rx_queue *rxqueue;
4023 void *orig_data, *orig_data_end;
3981 u32 metalen, act = XDP_DROP; 4024 u32 metalen, act = XDP_DROP;
3982 struct xdp_buff xdp;
3983 void *orig_data;
3984 int hlen, off; 4025 int hlen, off;
3985 u32 mac_len; 4026 u32 mac_len;
3986 4027
@@ -4015,31 +4056,42 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4015 */ 4056 */
4016 mac_len = skb->data - skb_mac_header(skb); 4057 mac_len = skb->data - skb_mac_header(skb);
4017 hlen = skb_headlen(skb) + mac_len; 4058 hlen = skb_headlen(skb) + mac_len;
4018 xdp.data = skb->data - mac_len; 4059 xdp->data = skb->data - mac_len;
4019 xdp.data_meta = xdp.data; 4060 xdp->data_meta = xdp->data;
4020 xdp.data_end = xdp.data + hlen; 4061 xdp->data_end = xdp->data + hlen;
4021 xdp.data_hard_start = skb->data - skb_headroom(skb); 4062 xdp->data_hard_start = skb->data - skb_headroom(skb);
4022 orig_data = xdp.data; 4063 orig_data_end = xdp->data_end;
4064 orig_data = xdp->data;
4023 4065
4024 rxqueue = netif_get_rxqueue(skb); 4066 rxqueue = netif_get_rxqueue(skb);
4025 xdp.rxq = &rxqueue->xdp_rxq; 4067 xdp->rxq = &rxqueue->xdp_rxq;
4026 4068
4027 act = bpf_prog_run_xdp(xdp_prog, &xdp); 4069 act = bpf_prog_run_xdp(xdp_prog, xdp);
4028 4070
4029 off = xdp.data - orig_data; 4071 off = xdp->data - orig_data;
4030 if (off > 0) 4072 if (off > 0)
4031 __skb_pull(skb, off); 4073 __skb_pull(skb, off);
4032 else if (off < 0) 4074 else if (off < 0)
4033 __skb_push(skb, -off); 4075 __skb_push(skb, -off);
4034 skb->mac_header += off; 4076 skb->mac_header += off;
4035 4077
 4078 /* Check if bpf_xdp_adjust_tail was used; it can only "shrink"
 4079 * the packet.
 4080 */
4081 off = orig_data_end - xdp->data_end;
4082 if (off != 0) {
4083 skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4084 skb->len -= off;
4085
4086 }
4087
4036 switch (act) { 4088 switch (act) {
4037 case XDP_REDIRECT: 4089 case XDP_REDIRECT:
4038 case XDP_TX: 4090 case XDP_TX:
4039 __skb_push(skb, mac_len); 4091 __skb_push(skb, mac_len);
4040 break; 4092 break;
4041 case XDP_PASS: 4093 case XDP_PASS:
4042 metalen = xdp.data - xdp.data_meta; 4094 metalen = xdp->data - xdp->data_meta;
4043 if (metalen) 4095 if (metalen)
4044 skb_metadata_set(skb, metalen); 4096 skb_metadata_set(skb, metalen);
4045 break; 4097 break;
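
The generic-XDP receive path now honours bpf_xdp_adjust_tail(), which at this point can only shrink a frame. A hedged example of an XDP program using the helper (assumes the usual bpf_helpers.h declarations shipped with the kernel samples/tools):

#include <linux/bpf.h>
#include "bpf_helpers.h"

/* Trim every frame to at most 256 bytes before it continues up the stack. */
SEC("xdp")
int xdp_trim_to_256(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	long len = data_end - data;

	if (len > 256)
		/* Negative delta shrinks the tail; growing is not supported. */
		bpf_xdp_adjust_tail(ctx, 256 - len);

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";
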
@@ -4084,22 +4136,24 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4084} 4136}
4085EXPORT_SYMBOL_GPL(generic_xdp_tx); 4137EXPORT_SYMBOL_GPL(generic_xdp_tx);
4086 4138
4087static struct static_key generic_xdp_needed __read_mostly; 4139static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
4088 4140
4089int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) 4141int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
4090{ 4142{
4091 if (xdp_prog) { 4143 if (xdp_prog) {
4092 u32 act = netif_receive_generic_xdp(skb, xdp_prog); 4144 struct xdp_buff xdp;
4145 u32 act;
4093 int err; 4146 int err;
4094 4147
4148 act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
4095 if (act != XDP_PASS) { 4149 if (act != XDP_PASS) {
4096 switch (act) { 4150 switch (act) {
4097 case XDP_REDIRECT: 4151 case XDP_REDIRECT:
4098 err = xdp_do_generic_redirect(skb->dev, skb, 4152 err = xdp_do_generic_redirect(skb->dev, skb,
4099 xdp_prog); 4153 &xdp, xdp_prog);
4100 if (err) 4154 if (err)
4101 goto out_redir; 4155 goto out_redir;
4102 /* fallthru to submit skb */ 4156 break;
4103 case XDP_TX: 4157 case XDP_TX:
4104 generic_xdp_tx(skb, xdp_prog); 4158 generic_xdp_tx(skb, xdp_prog);
4105 break; 4159 break;
@@ -4122,7 +4176,7 @@ static int netif_rx_internal(struct sk_buff *skb)
4122 4176
4123 trace_netif_rx(skb); 4177 trace_netif_rx(skb);
4124 4178
4125 if (static_key_false(&generic_xdp_needed)) { 4179 if (static_branch_unlikely(&generic_xdp_needed_key)) {
4126 int ret; 4180 int ret;
4127 4181
4128 preempt_disable(); 4182 preempt_disable();
@@ -4494,7 +4548,7 @@ another_round:
4494 4548
4495skip_taps: 4549skip_taps:
4496#ifdef CONFIG_NET_INGRESS 4550#ifdef CONFIG_NET_INGRESS
4497 if (static_key_false(&ingress_needed)) { 4551 if (static_branch_unlikely(&ingress_needed_key)) {
4498 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); 4552 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4499 if (!skb) 4553 if (!skb)
4500 goto out; 4554 goto out;
@@ -4654,9 +4708,9 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
4654 bpf_prog_put(old); 4708 bpf_prog_put(old);
4655 4709
4656 if (old && !new) { 4710 if (old && !new) {
4657 static_key_slow_dec(&generic_xdp_needed); 4711 static_branch_dec(&generic_xdp_needed_key);
4658 } else if (new && !old) { 4712 } else if (new && !old) {
4659 static_key_slow_inc(&generic_xdp_needed); 4713 static_branch_inc(&generic_xdp_needed_key);
4660 dev_disable_lro(dev); 4714 dev_disable_lro(dev);
4661 dev_disable_gro_hw(dev); 4715 dev_disable_gro_hw(dev);
4662 } 4716 }
@@ -4684,7 +4738,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
4684 if (skb_defer_rx_timestamp(skb)) 4738 if (skb_defer_rx_timestamp(skb))
4685 return NET_RX_SUCCESS; 4739 return NET_RX_SUCCESS;
4686 4740
4687 if (static_key_false(&generic_xdp_needed)) { 4741 if (static_branch_unlikely(&generic_xdp_needed_key)) {
4688 int ret; 4742 int ret;
4689 4743
4690 preempt_disable(); 4744 preempt_disable();
@@ -7852,6 +7906,8 @@ int register_netdevice(struct net_device *dev)
7852 int ret; 7906 int ret;
7853 struct net *net = dev_net(dev); 7907 struct net *net = dev_net(dev);
7854 7908
7909 BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
7910 NETDEV_FEATURE_COUNT);
7855 BUG_ON(dev_boot_phase); 7911 BUG_ON(dev_boot_phase);
7856 ASSERT_RTNL(); 7912 ASSERT_RTNL();
7857 7913
diff --git a/net/core/devlink.c b/net/core/devlink.c
index ad1317376798..22099705cc41 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -453,6 +453,27 @@ static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
453 msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); 453 msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
454} 454}
455 455
456static int devlink_nl_port_attrs_put(struct sk_buff *msg,
457 struct devlink_port *devlink_port)
458{
459 struct devlink_port_attrs *attrs = &devlink_port->attrs;
460
461 if (!attrs->set)
462 return 0;
463 if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
464 return -EMSGSIZE;
465 if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number))
466 return -EMSGSIZE;
467 if (!attrs->split)
468 return 0;
469 if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->port_number))
470 return -EMSGSIZE;
471 if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,
472 attrs->split_subport_number))
473 return -EMSGSIZE;
474 return 0;
475}
476
456static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, 477static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
457 struct devlink_port *devlink_port, 478 struct devlink_port *devlink_port,
458 enum devlink_command cmd, u32 portid, 479 enum devlink_command cmd, u32 portid,
@@ -492,9 +513,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
492 ibdev->name)) 513 ibdev->name))
493 goto nla_put_failure; 514 goto nla_put_failure;
494 } 515 }
495 if (devlink_port->split && 516 if (devlink_nl_port_attrs_put(msg, devlink_port))
496 nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
497 devlink_port->split_group))
498 goto nla_put_failure; 517 goto nla_put_failure;
499 518
500 genlmsg_end(msg, hdr); 519 genlmsg_end(msg, hdr);
@@ -683,12 +702,13 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
683 return 0; 702 return 0;
684} 703}
685 704
686static int devlink_port_split(struct devlink *devlink, 705static int devlink_port_split(struct devlink *devlink, u32 port_index,
687 u32 port_index, u32 count) 706 u32 count, struct netlink_ext_ack *extack)
688 707
689{ 708{
690 if (devlink->ops && devlink->ops->port_split) 709 if (devlink->ops && devlink->ops->port_split)
691 return devlink->ops->port_split(devlink, port_index, count); 710 return devlink->ops->port_split(devlink, port_index, count,
711 extack);
692 return -EOPNOTSUPP; 712 return -EOPNOTSUPP;
693} 713}
694 714
@@ -705,14 +725,15 @@ static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb,
705 725
706 port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); 726 port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
707 count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); 727 count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]);
708 return devlink_port_split(devlink, port_index, count); 728 return devlink_port_split(devlink, port_index, count, info->extack);
709} 729}
710 730
711static int devlink_port_unsplit(struct devlink *devlink, u32 port_index) 731static int devlink_port_unsplit(struct devlink *devlink, u32 port_index,
732 struct netlink_ext_ack *extack)
712 733
713{ 734{
714 if (devlink->ops && devlink->ops->port_unsplit) 735 if (devlink->ops && devlink->ops->port_unsplit)
715 return devlink->ops->port_unsplit(devlink, port_index); 736 return devlink->ops->port_unsplit(devlink, port_index, extack);
716 return -EOPNOTSUPP; 737 return -EOPNOTSUPP;
717} 738}
718 739
@@ -726,7 +747,7 @@ static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb,
726 return -EINVAL; 747 return -EINVAL;
727 748
728 port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); 749 port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
729 return devlink_port_unsplit(devlink, port_index); 750 return devlink_port_unsplit(devlink, port_index, info->extack);
730} 751}
731 752
732static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, 753static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink,
@@ -1807,7 +1828,6 @@ send_done:
1807nla_put_failure: 1828nla_put_failure:
1808 err = -EMSGSIZE; 1829 err = -EMSGSIZE;
1809err_table_put: 1830err_table_put:
1810 genlmsg_cancel(skb, hdr);
1811 nlmsg_free(skb); 1831 nlmsg_free(skb);
1812 return err; 1832 return err;
1813} 1833}
@@ -2013,7 +2033,6 @@ int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx)
2013 return 0; 2033 return 0;
2014 2034
2015nla_put_failure: 2035nla_put_failure:
2016 genlmsg_cancel(dump_ctx->skb, dump_ctx->hdr);
2017 nlmsg_free(dump_ctx->skb); 2036 nlmsg_free(dump_ctx->skb);
2018 return -EMSGSIZE; 2037 return -EMSGSIZE;
2019} 2038}
@@ -2230,7 +2249,6 @@ send_done:
2230nla_put_failure: 2249nla_put_failure:
2231 err = -EMSGSIZE; 2250 err = -EMSGSIZE;
2232err_table_put: 2251err_table_put:
2233 genlmsg_cancel(skb, hdr);
2234 nlmsg_free(skb); 2252 nlmsg_free(skb);
2235 return err; 2253 return err;
2236} 2254}
@@ -2532,7 +2550,6 @@ nla_put_failure:
2532 err = -EMSGSIZE; 2550 err = -EMSGSIZE;
2533err_resource_put: 2551err_resource_put:
2534err_skb_send_alloc: 2552err_skb_send_alloc:
2535 genlmsg_cancel(skb, hdr);
2536 nlmsg_free(skb); 2553 nlmsg_free(skb);
2537 return err; 2554 return err;
2538} 2555}
@@ -2584,7 +2601,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
2584 NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed"); 2601 NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed");
2585 return err; 2602 return err;
2586 } 2603 }
2587 return devlink->ops->reload(devlink); 2604 return devlink->ops->reload(devlink, info->extack);
2588} 2605}
2589 2606
2590static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { 2607static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
@@ -2737,7 +2754,8 @@ static const struct genl_ops devlink_nl_ops[] = {
2737 .doit = devlink_nl_cmd_eswitch_set_doit, 2754 .doit = devlink_nl_cmd_eswitch_set_doit,
2738 .policy = devlink_nl_policy, 2755 .policy = devlink_nl_policy,
2739 .flags = GENL_ADMIN_PERM, 2756 .flags = GENL_ADMIN_PERM,
2740 .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, 2757 .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
2758 DEVLINK_NL_FLAG_NO_LOCK,
2741 }, 2759 },
2742 { 2760 {
2743 .cmd = DEVLINK_CMD_DPIPE_TABLE_GET, 2761 .cmd = DEVLINK_CMD_DPIPE_TABLE_GET,
@@ -2971,19 +2989,64 @@ void devlink_port_type_clear(struct devlink_port *devlink_port)
2971EXPORT_SYMBOL_GPL(devlink_port_type_clear); 2989EXPORT_SYMBOL_GPL(devlink_port_type_clear);
2972 2990
2973/** 2991/**
2974 * devlink_port_split_set - Set port is split 2992 * devlink_port_attrs_set - Set port attributes
2975 * 2993 *
2976 * @devlink_port: devlink port 2994 * @devlink_port: devlink port
2977 * @split_group: split group - identifies group split port is part of 2995 * @flavour: flavour of the port
 2996 * @port_number: number of the port that is facing the user, for example
 2997 * the front panel port number
 2998 * @split: indicates if this is a split port
 2999 * @split_subport_number: if the port is split, this is the number
 3000 * of the subport.
2978 */ 3001 */
2979void devlink_port_split_set(struct devlink_port *devlink_port, 3002void devlink_port_attrs_set(struct devlink_port *devlink_port,
2980 u32 split_group) 3003 enum devlink_port_flavour flavour,
2981{ 3004 u32 port_number, bool split,
2982 devlink_port->split = true; 3005 u32 split_subport_number)
2983 devlink_port->split_group = split_group; 3006{
3007 struct devlink_port_attrs *attrs = &devlink_port->attrs;
3008
3009 attrs->set = true;
3010 attrs->flavour = flavour;
3011 attrs->port_number = port_number;
3012 attrs->split = split;
3013 attrs->split_subport_number = split_subport_number;
2984 devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); 3014 devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
2985} 3015}
2986EXPORT_SYMBOL_GPL(devlink_port_split_set); 3016EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
3017
3018int devlink_port_get_phys_port_name(struct devlink_port *devlink_port,
3019 char *name, size_t len)
3020{
3021 struct devlink_port_attrs *attrs = &devlink_port->attrs;
3022 int n = 0;
3023
3024 if (!attrs->set)
3025 return -EOPNOTSUPP;
3026
3027 switch (attrs->flavour) {
3028 case DEVLINK_PORT_FLAVOUR_PHYSICAL:
3029 if (!attrs->split)
3030 n = snprintf(name, len, "p%u", attrs->port_number);
3031 else
3032 n = snprintf(name, len, "p%us%u", attrs->port_number,
3033 attrs->split_subport_number);
3034 break;
3035 case DEVLINK_PORT_FLAVOUR_CPU:
3036 case DEVLINK_PORT_FLAVOUR_DSA:
 3037 /* As CPU and DSA ports do not have a netdevice associated
 3038 * with them, this case should never happen.
 3039 */
3040 WARN_ON(1);
3041 return -EINVAL;
3042 }
3043
3044 if (n >= len)
3045 return -EINVAL;
3046
3047 return 0;
3048}
3049EXPORT_SYMBOL_GPL(devlink_port_get_phys_port_name);
2987 3050
2988int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, 3051int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
2989 u32 size, u16 ingress_pools_count, 3052 u32 size, u16 ingress_pools_count,
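
devlink_port_split_set() is gone; drivers now describe a port once with devlink_port_attrs_set() and can derive phys_port_name from the same attributes. A hedged sketch of the driver side (port numbers and helper names are illustrative):

/* Registering a front-panel port: physical port 3, split, second subport.
 * devlink_port_get_phys_port_name() would render this as "p3s2".
 */
static void example_port_attrs_init(struct devlink_port *dl_port)
{
	devlink_port_attrs_set(dl_port, DEVLINK_PORT_FLAVOUR_PHYSICAL,
			       3,	/* front-panel port number */
			       true,	/* this is a split port */
			       2);	/* subport number */
}

/* Helper a driver's .ndo_get_phys_port_name() might call, given the
 * devlink_port backing the netdev:
 */
static int example_phys_port_name(struct devlink_port *dl_port,
				  char *name, size_t len)
{
	return devlink_port_get_phys_port_name(dl_port, name, len);
}
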
diff --git a/net/core/dst.c b/net/core/dst.c
index 007aa0b08291..2d9b37f8944a 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -58,6 +58,7 @@ const struct dst_metrics dst_default_metrics = {
58 */ 58 */
59 .refcnt = REFCOUNT_INIT(1), 59 .refcnt = REFCOUNT_INIT(1),
60}; 60};
61EXPORT_SYMBOL(dst_default_metrics);
61 62
62void dst_init(struct dst_entry *dst, struct dst_ops *ops, 63void dst_init(struct dst_entry *dst, struct dst_ops *ops,
63 struct net_device *dev, int initial_ref, int initial_obsolete, 64 struct net_device *dev, int initial_ref, int initial_obsolete,
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index ba02f0dfe85c..c15075dc7572 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -92,6 +92,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
92 [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", 92 [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial",
93 [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", 93 [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation",
94 [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation", 94 [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation",
95 [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation",
95 96
96 [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", 97 [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
97 [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", 98 [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
@@ -109,6 +110,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
109 [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", 110 [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload",
110 [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", 111 [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload",
111 [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", 112 [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record",
113 [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload",
112}; 114};
113 115
114static const char 116static const char
@@ -210,23 +212,6 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
210 return ret; 212 return ret;
211} 213}
212 214
213static int phy_get_sset_count(struct phy_device *phydev)
214{
215 int ret;
216
217 if (phydev->drv->get_sset_count &&
218 phydev->drv->get_strings &&
219 phydev->drv->get_stats) {
220 mutex_lock(&phydev->lock);
221 ret = phydev->drv->get_sset_count(phydev);
222 mutex_unlock(&phydev->lock);
223
224 return ret;
225 }
226
227 return -EOPNOTSUPP;
228}
229
230static int __ethtool_get_sset_count(struct net_device *dev, int sset) 215static int __ethtool_get_sset_count(struct net_device *dev, int sset)
231{ 216{
232 const struct ethtool_ops *ops = dev->ethtool_ops; 217 const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -243,12 +228,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
243 if (sset == ETH_SS_PHY_TUNABLES) 228 if (sset == ETH_SS_PHY_TUNABLES)
244 return ARRAY_SIZE(phy_tunable_strings); 229 return ARRAY_SIZE(phy_tunable_strings);
245 230
246 if (sset == ETH_SS_PHY_STATS) { 231 if (sset == ETH_SS_PHY_STATS && dev->phydev &&
247 if (dev->phydev) 232 !ops->get_ethtool_phy_stats)
248 return phy_get_sset_count(dev->phydev); 233 return phy_ethtool_get_sset_count(dev->phydev);
249 else
250 return -EOPNOTSUPP;
251 }
252 234
253 if (ops->get_sset_count && ops->get_strings) 235 if (ops->get_sset_count && ops->get_strings)
254 return ops->get_sset_count(dev, sset); 236 return ops->get_sset_count(dev, sset);
@@ -271,17 +253,10 @@ static void __ethtool_get_strings(struct net_device *dev,
271 memcpy(data, tunable_strings, sizeof(tunable_strings)); 253 memcpy(data, tunable_strings, sizeof(tunable_strings));
272 else if (stringset == ETH_SS_PHY_TUNABLES) 254 else if (stringset == ETH_SS_PHY_TUNABLES)
273 memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings)); 255 memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings));
274 else if (stringset == ETH_SS_PHY_STATS) { 256 else if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
275 struct phy_device *phydev = dev->phydev; 257 !ops->get_ethtool_phy_stats)
276 258 phy_ethtool_get_strings(dev->phydev, data);
277 if (phydev) { 259 else
278 mutex_lock(&phydev->lock);
279 phydev->drv->get_strings(phydev, data);
280 mutex_unlock(&phydev->lock);
281 } else {
282 return;
283 }
284 } else
285 /* ops->get_strings is valid because checked earlier */ 260 /* ops->get_strings is valid because checked earlier */
286 ops->get_strings(dev, stringset, data); 261 ops->get_strings(dev, stringset, data);
287} 262}
@@ -1998,15 +1973,19 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
1998 1973
1999static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) 1974static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
2000{ 1975{
2001 struct ethtool_stats stats; 1976 const struct ethtool_ops *ops = dev->ethtool_ops;
2002 struct phy_device *phydev = dev->phydev; 1977 struct phy_device *phydev = dev->phydev;
1978 struct ethtool_stats stats;
2003 u64 *data; 1979 u64 *data;
2004 int ret, n_stats; 1980 int ret, n_stats;
2005 1981
2006 if (!phydev) 1982 if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count))
2007 return -EOPNOTSUPP; 1983 return -EOPNOTSUPP;
2008 1984
2009 n_stats = phy_get_sset_count(phydev); 1985 if (dev->phydev && !ops->get_ethtool_phy_stats)
1986 n_stats = phy_ethtool_get_sset_count(dev->phydev);
1987 else
1988 n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
2010 if (n_stats < 0) 1989 if (n_stats < 0)
2011 return n_stats; 1990 return n_stats;
2012 if (n_stats > S32_MAX / sizeof(u64)) 1991 if (n_stats > S32_MAX / sizeof(u64))
@@ -2021,9 +2000,13 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
2021 if (n_stats && !data) 2000 if (n_stats && !data)
2022 return -ENOMEM; 2001 return -ENOMEM;
2023 2002
2024 mutex_lock(&phydev->lock); 2003 if (dev->phydev && !ops->get_ethtool_phy_stats) {
2025 phydev->drv->get_stats(phydev, &stats, data); 2004 ret = phy_ethtool_get_stats(dev->phydev, &stats, data);
2026 mutex_unlock(&phydev->lock); 2005 if (ret < 0)
2006 return ret;
2007 } else {
2008 ops->get_ethtool_phy_stats(dev, &stats, data);
2009 }
2027 2010
2028 ret = -EFAULT; 2011 ret = -EFAULT;
2029 if (copy_to_user(useraddr, &stats, sizeof(stats))) 2012 if (copy_to_user(useraddr, &stats, sizeof(stats)))
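
The ethtool core now prefers a driver-supplied get_ethtool_phy_stats() plus get_sset_count(ETH_SS_PHY_STATS) pair and only falls back to the generic phylib helpers when the driver provides none. A hedged sketch of the driver side (counter values are placeholders a real driver would read from hardware):

static int example_get_sset_count(struct net_device *dev, int sset)
{
	if (sset == ETH_SS_PHY_STATS)
		return 2;	/* number of PHY counters exposed */
	return -EOPNOTSUPP;
}

static void example_get_ethtool_phy_stats(struct net_device *dev,
					  struct ethtool_stats *stats,
					  u64 *data)
{
	/* One u64 per counter, in the same order as the ETH_SS_PHY_STATS
	 * strings; placeholders here.
	 */
	data[0] = 0;
	data[1] = 0;
}

static const struct ethtool_ops example_ethtool_ops = {
	.get_sset_count		= example_get_sset_count,
	.get_ethtool_phy_stats	= example_get_ethtool_phy_stats,
	/* .get_strings for ETH_SS_PHY_STATS omitted for brevity */
};
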
diff --git a/net/core/failover.c b/net/core/failover.c
new file mode 100644
index 000000000000..4a92a98ccce9
--- /dev/null
+++ b/net/core/failover.c
@@ -0,0 +1,315 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright (c) 2018, Intel Corporation. */
3
4/* A common module to handle registrations and notifications for paravirtual
5 * drivers to enable accelerated datapath and support VF live migration.
6 *
7 * The notifier and event handling code is based on netvsc driver.
8 */
9
10#include <linux/module.h>
11#include <linux/etherdevice.h>
12#include <uapi/linux/if_arp.h>
13#include <linux/rtnetlink.h>
14#include <linux/if_vlan.h>
15#include <net/failover.h>
16
17static LIST_HEAD(failover_list);
18static DEFINE_SPINLOCK(failover_lock);
19
20static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops)
21{
22 struct net_device *failover_dev;
23 struct failover *failover;
24
25 spin_lock(&failover_lock);
26 list_for_each_entry(failover, &failover_list, list) {
27 failover_dev = rtnl_dereference(failover->failover_dev);
28 if (ether_addr_equal(failover_dev->perm_addr, mac)) {
29 *ops = rtnl_dereference(failover->ops);
30 spin_unlock(&failover_lock);
31 return failover_dev;
32 }
33 }
34 spin_unlock(&failover_lock);
35 return NULL;
36}
37
38/**
39 * failover_slave_register - Register a slave netdev
40 *
41 * @slave_dev: slave netdev that is being registered
42 *
43 * Registers a slave device to a failover instance. Only ethernet devices
44 * are supported.
45 */
46static int failover_slave_register(struct net_device *slave_dev)
47{
48 struct netdev_lag_upper_info lag_upper_info;
49 struct net_device *failover_dev;
50 struct failover_ops *fops;
51 int err;
52
53 if (slave_dev->type != ARPHRD_ETHER)
54 goto done;
55
56 ASSERT_RTNL();
57
58 failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
59 if (!failover_dev)
60 goto done;
61
62 if (fops && fops->slave_pre_register &&
63 fops->slave_pre_register(slave_dev, failover_dev))
64 goto done;
65
66 err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame,
67 failover_dev);
68 if (err) {
69 netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n",
70 err);
71 goto done;
72 }
73
74 lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP;
75 err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL,
76 &lag_upper_info, NULL);
77 if (err) {
78 netdev_err(slave_dev, "can not set failover device %s (err = %d)\n",
79 failover_dev->name, err);
80 goto err_upper_link;
81 }
82
83 slave_dev->priv_flags |= IFF_FAILOVER_SLAVE;
84
85 if (fops && fops->slave_register &&
86 !fops->slave_register(slave_dev, failover_dev))
87 return NOTIFY_OK;
88
89 netdev_upper_dev_unlink(slave_dev, failover_dev);
90 slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
91err_upper_link:
92 netdev_rx_handler_unregister(slave_dev);
93done:
94 return NOTIFY_DONE;
95}
96
97/**
98 * failover_slave_unregister - Unregister a slave netdev
99 *
100 * @slave_dev: slave netdev that is being unregistered
101 *
102 * Unregisters a slave device from a failover instance.
103 */
104int failover_slave_unregister(struct net_device *slave_dev)
105{
106 struct net_device *failover_dev;
107 struct failover_ops *fops;
108
109 if (!netif_is_failover_slave(slave_dev))
110 goto done;
111
112 ASSERT_RTNL();
113
114 failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
115 if (!failover_dev)
116 goto done;
117
118 if (fops && fops->slave_pre_unregister &&
119 fops->slave_pre_unregister(slave_dev, failover_dev))
120 goto done;
121
122 netdev_rx_handler_unregister(slave_dev);
123 netdev_upper_dev_unlink(slave_dev, failover_dev);
124 slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
125
126 if (fops && fops->slave_unregister &&
127 !fops->slave_unregister(slave_dev, failover_dev))
128 return NOTIFY_OK;
129
130done:
131 return NOTIFY_DONE;
132}
133EXPORT_SYMBOL_GPL(failover_slave_unregister);
134
135static int failover_slave_link_change(struct net_device *slave_dev)
136{
137 struct net_device *failover_dev;
138 struct failover_ops *fops;
139
140 if (!netif_is_failover_slave(slave_dev))
141 goto done;
142
143 ASSERT_RTNL();
144
145 failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
146 if (!failover_dev)
147 goto done;
148
149 if (!netif_running(failover_dev))
150 goto done;
151
152 if (fops && fops->slave_link_change &&
153 !fops->slave_link_change(slave_dev, failover_dev))
154 return NOTIFY_OK;
155
156done:
157 return NOTIFY_DONE;
158}
159
160static int failover_slave_name_change(struct net_device *slave_dev)
161{
162 struct net_device *failover_dev;
163 struct failover_ops *fops;
164
165 if (!netif_is_failover_slave(slave_dev))
166 goto done;
167
168 ASSERT_RTNL();
169
170 failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
171 if (!failover_dev)
172 goto done;
173
174 if (!netif_running(failover_dev))
175 goto done;
176
177 if (fops && fops->slave_name_change &&
178 !fops->slave_name_change(slave_dev, failover_dev))
179 return NOTIFY_OK;
180
181done:
182 return NOTIFY_DONE;
183}
184
185static int
186failover_event(struct notifier_block *this, unsigned long event, void *ptr)
187{
188 struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
189
190 /* Skip parent events */
191 if (netif_is_failover(event_dev))
192 return NOTIFY_DONE;
193
194 switch (event) {
195 case NETDEV_REGISTER:
196 return failover_slave_register(event_dev);
197 case NETDEV_UNREGISTER:
198 return failover_slave_unregister(event_dev);
199 case NETDEV_UP:
200 case NETDEV_DOWN:
201 case NETDEV_CHANGE:
202 return failover_slave_link_change(event_dev);
203 case NETDEV_CHANGENAME:
204 return failover_slave_name_change(event_dev);
205 default:
206 return NOTIFY_DONE;
207 }
208}
209
210static struct notifier_block failover_notifier = {
211 .notifier_call = failover_event,
212};
213
214static void
215failover_existing_slave_register(struct net_device *failover_dev)
216{
217 struct net *net = dev_net(failover_dev);
218 struct net_device *dev;
219
220 rtnl_lock();
221 for_each_netdev(net, dev) {
222 if (netif_is_failover(dev))
223 continue;
224 if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr))
225 failover_slave_register(dev);
226 }
227 rtnl_unlock();
228}
229
230/**
231 * failover_register - Register a failover instance
232 *
233 * @dev: failover netdev
234 * @ops: failover ops
235 *
236 * Allocate and register a failover instance for a failover netdev. ops
237 * provides handlers for slave device register/unregister/link change/
238 * name change events.
239 *
240 * Return: pointer to failover instance
241 */
242struct failover *failover_register(struct net_device *dev,
243 struct failover_ops *ops)
244{
245 struct failover *failover;
246
247 if (dev->type != ARPHRD_ETHER)
248 return ERR_PTR(-EINVAL);
249
250 failover = kzalloc(sizeof(*failover), GFP_KERNEL);
251 if (!failover)
252 return ERR_PTR(-ENOMEM);
253
254 rcu_assign_pointer(failover->ops, ops);
255 dev_hold(dev);
256 dev->priv_flags |= IFF_FAILOVER;
257 rcu_assign_pointer(failover->failover_dev, dev);
258
259 spin_lock(&failover_lock);
260 list_add_tail(&failover->list, &failover_list);
261 spin_unlock(&failover_lock);
262
263 netdev_info(dev, "failover master:%s registered\n", dev->name);
264
265 failover_existing_slave_register(dev);
266
267 return failover;
268}
269EXPORT_SYMBOL_GPL(failover_register);
270
271/**
272 * failover_unregister - Unregister a failover instance
273 *
274 * @failover: pointer to failover instance
275 *
276 * Unregisters and frees a failover instance.
277 */
278void failover_unregister(struct failover *failover)
279{
280 struct net_device *failover_dev;
281
282 failover_dev = rcu_dereference(failover->failover_dev);
283
284 netdev_info(failover_dev, "failover master:%s unregistered\n",
285 failover_dev->name);
286
287 failover_dev->priv_flags &= ~IFF_FAILOVER;
288 dev_put(failover_dev);
289
290 spin_lock(&failover_lock);
291 list_del(&failover->list);
292 spin_unlock(&failover_lock);
293
294 kfree(failover);
295}
296EXPORT_SYMBOL_GPL(failover_unregister);
297
298static __init int
299failover_init(void)
300{
301 register_netdevice_notifier(&failover_notifier);
302
303 return 0;
304}
305module_init(failover_init);
306
307static __exit
308void failover_exit(void)
309{
310 unregister_netdevice_notifier(&failover_notifier);
311}
312module_exit(failover_exit);
313
314MODULE_DESCRIPTION("Generic failover infrastructure/interface");
315MODULE_LICENSE("GPL v2");
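
The exported surface of the new module is small: a paravirtual driver registers its master netdev together with a set of failover_ops callbacks, and the notifier above enslaves any existing or future VF whose permanent MAC matches. A hedged sketch of the driver side (callback bodies elided, names illustrative, prototypes as implied by the calls above):

#include <net/failover.h>

static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
	/* Steer frames received on the slave into the failover master here. */
	return RX_HANDLER_PASS;
}

static int example_slave_register(struct net_device *slave_dev,
				  struct net_device *failover_dev)
{
	return 0;	/* set up per-slave state; non-zero rejects the slave */
}

static struct failover_ops example_failover_ops = {
	.slave_register		= example_slave_register,
	.slave_handle_frame	= example_handle_frame,
	/* the remaining callbacks are optional and omitted here */
};

static struct failover *example_fo;

/* In the master driver's probe path: */
static int example_failover_setup(struct net_device *master_dev)
{
	example_fo = failover_register(master_dev, &example_failover_ops);
	return PTR_ERR_OR_ZERO(example_fo);
}
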
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 33958f84c173..126ffc5bc630 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -387,247 +387,304 @@ unsigned int fib_rules_seq_read(struct net *net, int family)
387} 387}
388EXPORT_SYMBOL_GPL(fib_rules_seq_read); 388EXPORT_SYMBOL_GPL(fib_rules_seq_read);
389 389
390static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb, 390static struct fib_rule *rule_find(struct fib_rules_ops *ops,
391 struct fib_rules_ops *ops) 391 struct fib_rule_hdr *frh,
392{ 392 struct nlattr **tb,
393 int err = -EINVAL; 393 struct fib_rule *rule,
394 394 bool user_priority)
395 if (frh->src_len)
396 if (tb[FRA_SRC] == NULL ||
397 frh->src_len > (ops->addr_size * 8) ||
398 nla_len(tb[FRA_SRC]) != ops->addr_size)
399 goto errout;
400
401 if (frh->dst_len)
402 if (tb[FRA_DST] == NULL ||
403 frh->dst_len > (ops->addr_size * 8) ||
404 nla_len(tb[FRA_DST]) != ops->addr_size)
405 goto errout;
406
407 err = 0;
408errout:
409 return err;
410}
411
412static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
413 struct nlattr **tb, struct fib_rule *rule)
414{ 395{
415 struct fib_rule *r; 396 struct fib_rule *r;
416 397
417 list_for_each_entry(r, &ops->rules_list, list) { 398 list_for_each_entry(r, &ops->rules_list, list) {
418 if (r->action != rule->action) 399 if (rule->action && r->action != rule->action)
419 continue; 400 continue;
420 401
421 if (r->table != rule->table) 402 if (rule->table && r->table != rule->table)
422 continue; 403 continue;
423 404
424 if (r->pref != rule->pref) 405 if (user_priority && r->pref != rule->pref)
425 continue; 406 continue;
426 407
427 if (memcmp(r->iifname, rule->iifname, IFNAMSIZ)) 408 if (rule->iifname[0] &&
409 memcmp(r->iifname, rule->iifname, IFNAMSIZ))
428 continue; 410 continue;
429 411
430 if (memcmp(r->oifname, rule->oifname, IFNAMSIZ)) 412 if (rule->oifname[0] &&
413 memcmp(r->oifname, rule->oifname, IFNAMSIZ))
431 continue; 414 continue;
432 415
433 if (r->mark != rule->mark) 416 if (rule->mark && r->mark != rule->mark)
434 continue; 417 continue;
435 418
436 if (r->mark_mask != rule->mark_mask) 419 if (rule->mark_mask && r->mark_mask != rule->mark_mask)
437 continue; 420 continue;
438 421
439 if (r->tun_id != rule->tun_id) 422 if (rule->tun_id && r->tun_id != rule->tun_id)
440 continue; 423 continue;
441 424
442 if (r->fr_net != rule->fr_net) 425 if (r->fr_net != rule->fr_net)
443 continue; 426 continue;
444 427
445 if (r->l3mdev != rule->l3mdev) 428 if (rule->l3mdev && r->l3mdev != rule->l3mdev)
446 continue; 429 continue;
447 430
448 if (!uid_eq(r->uid_range.start, rule->uid_range.start) || 431 if (uid_range_set(&rule->uid_range) &&
449 !uid_eq(r->uid_range.end, rule->uid_range.end)) 432 (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
433 !uid_eq(r->uid_range.end, rule->uid_range.end)))
450 continue; 434 continue;
451 435
452 if (r->ip_proto != rule->ip_proto) 436 if (rule->ip_proto && r->ip_proto != rule->ip_proto)
453 continue; 437 continue;
454 438
455 if (!fib_rule_port_range_compare(&r->sport_range, 439 if (fib_rule_port_range_set(&rule->sport_range) &&
440 !fib_rule_port_range_compare(&r->sport_range,
456 &rule->sport_range)) 441 &rule->sport_range))
457 continue; 442 continue;
458 443
459 if (!fib_rule_port_range_compare(&r->dport_range, 444 if (fib_rule_port_range_set(&rule->dport_range) &&
445 !fib_rule_port_range_compare(&r->dport_range,
460 &rule->dport_range)) 446 &rule->dport_range))
461 continue; 447 continue;
462 448
463 if (!ops->compare(r, frh, tb)) 449 if (!ops->compare(r, frh, tb))
464 continue; 450 continue;
465 return 1; 451 return r;
452 }
453
454 return NULL;
455}
456
457#ifdef CONFIG_NET_L3_MASTER_DEV
458static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
459 struct netlink_ext_ack *extack)
460{
461 nlrule->l3mdev = nla_get_u8(nla);
462 if (nlrule->l3mdev != 1) {
463 NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute");
464 return -1;
466 } 465 }
466
467 return 0; 467 return 0;
468} 468}
469#else
470static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
471 struct netlink_ext_ack *extack)
472{
473 NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel");
474 return -1;
475}
476#endif
469 477
470int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, 478static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
471 struct netlink_ext_ack *extack) 479 struct netlink_ext_ack *extack,
480 struct fib_rules_ops *ops,
481 struct nlattr *tb[],
482 struct fib_rule **rule,
483 bool *user_priority)
472{ 484{
473 struct net *net = sock_net(skb->sk); 485 struct net *net = sock_net(skb->sk);
474 struct fib_rule_hdr *frh = nlmsg_data(nlh); 486 struct fib_rule_hdr *frh = nlmsg_data(nlh);
475 struct fib_rules_ops *ops = NULL; 487 struct fib_rule *nlrule = NULL;
476 struct fib_rule *rule, *r, *last = NULL; 488 int err = -EINVAL;
477 struct nlattr *tb[FRA_MAX+1];
478 int err = -EINVAL, unresolved = 0;
479
480 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
481 goto errout;
482 489
483 ops = lookup_rules_ops(net, frh->family); 490 if (frh->src_len)
484 if (ops == NULL) { 491 if (!tb[FRA_SRC] ||
485 err = -EAFNOSUPPORT; 492 frh->src_len > (ops->addr_size * 8) ||
486 goto errout; 493 nla_len(tb[FRA_SRC]) != ops->addr_size) {
494 NL_SET_ERR_MSG(extack, "Invalid source address");
495 goto errout;
487 } 496 }
488 497
489 err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); 498 if (frh->dst_len)
490 if (err < 0) 499 if (!tb[FRA_DST] ||
491 goto errout; 500 frh->dst_len > (ops->addr_size * 8) ||
492 501 nla_len(tb[FRA_DST]) != ops->addr_size) {
493 err = validate_rulemsg(frh, tb, ops); 502 NL_SET_ERR_MSG(extack, "Invalid dst address");
494 if (err < 0) 503 goto errout;
495 goto errout; 504 }
496 505
497 rule = kzalloc(ops->rule_size, GFP_KERNEL); 506 nlrule = kzalloc(ops->rule_size, GFP_KERNEL);
498 if (rule == NULL) { 507 if (!nlrule) {
499 err = -ENOMEM; 508 err = -ENOMEM;
500 goto errout; 509 goto errout;
501 } 510 }
502 refcount_set(&rule->refcnt, 1); 511 refcount_set(&nlrule->refcnt, 1);
503 rule->fr_net = net; 512 nlrule->fr_net = net;
504 513
505 rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY]) 514 if (tb[FRA_PRIORITY]) {
506 : fib_default_rule_pref(ops); 515 nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]);
516 *user_priority = true;
517 } else {
518 nlrule->pref = fib_default_rule_pref(ops);
519 }
507 520
508 rule->proto = tb[FRA_PROTOCOL] ? 521 nlrule->proto = tb[FRA_PROTOCOL] ?
509 nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC; 522 nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC;
510 523
511 if (tb[FRA_IIFNAME]) { 524 if (tb[FRA_IIFNAME]) {
512 struct net_device *dev; 525 struct net_device *dev;
513 526
514 rule->iifindex = -1; 527 nlrule->iifindex = -1;
515 nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); 528 nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
516 dev = __dev_get_by_name(net, rule->iifname); 529 dev = __dev_get_by_name(net, nlrule->iifname);
517 if (dev) 530 if (dev)
518 rule->iifindex = dev->ifindex; 531 nlrule->iifindex = dev->ifindex;
519 } 532 }
520 533
521 if (tb[FRA_OIFNAME]) { 534 if (tb[FRA_OIFNAME]) {
522 struct net_device *dev; 535 struct net_device *dev;
523 536
524 rule->oifindex = -1; 537 nlrule->oifindex = -1;
525 nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); 538 nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
526 dev = __dev_get_by_name(net, rule->oifname); 539 dev = __dev_get_by_name(net, nlrule->oifname);
527 if (dev) 540 if (dev)
528 rule->oifindex = dev->ifindex; 541 nlrule->oifindex = dev->ifindex;
529 } 542 }
530 543
531 if (tb[FRA_FWMARK]) { 544 if (tb[FRA_FWMARK]) {
532 rule->mark = nla_get_u32(tb[FRA_FWMARK]); 545 nlrule->mark = nla_get_u32(tb[FRA_FWMARK]);
533 if (rule->mark) 546 if (nlrule->mark)
534 /* compatibility: if the mark value is non-zero all bits 547 /* compatibility: if the mark value is non-zero all bits
535 * are compared unless a mask is explicitly specified. 548 * are compared unless a mask is explicitly specified.
536 */ 549 */
537 rule->mark_mask = 0xFFFFFFFF; 550 nlrule->mark_mask = 0xFFFFFFFF;
538 } 551 }
539 552
540 if (tb[FRA_FWMASK]) 553 if (tb[FRA_FWMASK])
541 rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); 554 nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
542 555
543 if (tb[FRA_TUN_ID]) 556 if (tb[FRA_TUN_ID])
544 rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); 557 nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
545 558
546 err = -EINVAL; 559 err = -EINVAL;
547 if (tb[FRA_L3MDEV]) { 560 if (tb[FRA_L3MDEV] &&
548#ifdef CONFIG_NET_L3_MASTER_DEV 561 fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0)
549 rule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]); 562 goto errout_free;
550 if (rule->l3mdev != 1)
551#endif
552 goto errout_free;
553 }
554 563
555 rule->action = frh->action; 564 nlrule->action = frh->action;
556 rule->flags = frh->flags; 565 nlrule->flags = frh->flags;
557 rule->table = frh_get_table(frh, tb); 566 nlrule->table = frh_get_table(frh, tb);
558 if (tb[FRA_SUPPRESS_PREFIXLEN]) 567 if (tb[FRA_SUPPRESS_PREFIXLEN])
559 rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); 568 nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
560 else 569 else
561 rule->suppress_prefixlen = -1; 570 nlrule->suppress_prefixlen = -1;
562 571
563 if (tb[FRA_SUPPRESS_IFGROUP]) 572 if (tb[FRA_SUPPRESS_IFGROUP])
564 rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); 573 nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
565 else 574 else
566 rule->suppress_ifgroup = -1; 575 nlrule->suppress_ifgroup = -1;
567 576
568 if (tb[FRA_GOTO]) { 577 if (tb[FRA_GOTO]) {
569 if (rule->action != FR_ACT_GOTO) 578 if (nlrule->action != FR_ACT_GOTO) {
579 NL_SET_ERR_MSG(extack, "Unexpected goto");
570 goto errout_free; 580 goto errout_free;
581 }
571 582
572 rule->target = nla_get_u32(tb[FRA_GOTO]); 583 nlrule->target = nla_get_u32(tb[FRA_GOTO]);
573 /* Backward jumps are prohibited to avoid endless loops */ 584 /* Backward jumps are prohibited to avoid endless loops */
574 if (rule->target <= rule->pref) 585 if (nlrule->target <= nlrule->pref) {
586 NL_SET_ERR_MSG(extack, "Backward goto not supported");
575 goto errout_free; 587 goto errout_free;
576
577 list_for_each_entry(r, &ops->rules_list, list) {
578 if (r->pref == rule->target) {
579 RCU_INIT_POINTER(rule->ctarget, r);
580 break;
581 }
582 } 588 }
583 589 } else if (nlrule->action == FR_ACT_GOTO) {
584 if (rcu_dereference_protected(rule->ctarget, 1) == NULL) 590 NL_SET_ERR_MSG(extack, "Missing goto target for action goto");
585 unresolved = 1;
586 } else if (rule->action == FR_ACT_GOTO)
587 goto errout_free; 591 goto errout_free;
592 }
588 593
589 if (rule->l3mdev && rule->table) 594 if (nlrule->l3mdev && nlrule->table) {
595 NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive");
590 goto errout_free; 596 goto errout_free;
597 }
591 598
592 if (tb[FRA_UID_RANGE]) { 599 if (tb[FRA_UID_RANGE]) {
593 if (current_user_ns() != net->user_ns) { 600 if (current_user_ns() != net->user_ns) {
594 err = -EPERM; 601 err = -EPERM;
602 NL_SET_ERR_MSG(extack, "No permission to set uid");
595 goto errout_free; 603 goto errout_free;
596 } 604 }
597 605
598 rule->uid_range = nla_get_kuid_range(tb); 606 nlrule->uid_range = nla_get_kuid_range(tb);
599 607
600 if (!uid_range_set(&rule->uid_range) || 608 if (!uid_range_set(&nlrule->uid_range) ||
601 !uid_lte(rule->uid_range.start, rule->uid_range.end)) 609 !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) {
610 NL_SET_ERR_MSG(extack, "Invalid uid range");
602 goto errout_free; 611 goto errout_free;
612 }
603 } else { 613 } else {
604 rule->uid_range = fib_kuid_range_unset; 614 nlrule->uid_range = fib_kuid_range_unset;
605 } 615 }
606 616
607 if (tb[FRA_IP_PROTO]) 617 if (tb[FRA_IP_PROTO])
608 rule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); 618 nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);
609 619
610 if (tb[FRA_SPORT_RANGE]) { 620 if (tb[FRA_SPORT_RANGE]) {
611 err = nla_get_port_range(tb[FRA_SPORT_RANGE], 621 err = nla_get_port_range(tb[FRA_SPORT_RANGE],
612 &rule->sport_range); 622 &nlrule->sport_range);
613 if (err) 623 if (err) {
624 NL_SET_ERR_MSG(extack, "Invalid sport range");
614 goto errout_free; 625 goto errout_free;
626 }
615 } 627 }
616 628
617 if (tb[FRA_DPORT_RANGE]) { 629 if (tb[FRA_DPORT_RANGE]) {
618 err = nla_get_port_range(tb[FRA_DPORT_RANGE], 630 err = nla_get_port_range(tb[FRA_DPORT_RANGE],
619 &rule->dport_range); 631 &nlrule->dport_range);
620 if (err) 632 if (err) {
633 NL_SET_ERR_MSG(extack, "Invalid dport range");
621 goto errout_free; 634 goto errout_free;
635 }
622 } 636 }
623 637
638 *rule = nlrule;
639
640 return 0;
641
642errout_free:
643 kfree(nlrule);
644errout:
645 return err;
646}
647
648int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
649 struct netlink_ext_ack *extack)
650{
651 struct net *net = sock_net(skb->sk);
652 struct fib_rule_hdr *frh = nlmsg_data(nlh);
653 struct fib_rules_ops *ops = NULL;
654 struct fib_rule *rule = NULL, *r, *last = NULL;
655 struct nlattr *tb[FRA_MAX + 1];
656 int err = -EINVAL, unresolved = 0;
657 bool user_priority = false;
658
659 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
660 NL_SET_ERR_MSG(extack, "Invalid msg length");
661 goto errout;
662 }
663
664 ops = lookup_rules_ops(net, frh->family);
665 if (!ops) {
666 err = -EAFNOSUPPORT;
667 NL_SET_ERR_MSG(extack, "Rule family not supported");
668 goto errout;
669 }
670
671 err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
672 if (err < 0) {
673 NL_SET_ERR_MSG(extack, "Error parsing msg");
674 goto errout;
675 }
676
677 err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority);
678 if (err)
679 goto errout;
680
624 if ((nlh->nlmsg_flags & NLM_F_EXCL) && 681 if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
625 rule_exists(ops, frh, tb, rule)) { 682 rule_find(ops, frh, tb, rule, user_priority)) {
626 err = -EEXIST; 683 err = -EEXIST;
627 goto errout_free; 684 goto errout_free;
628 } 685 }
629 686
630 err = ops->configure(rule, skb, frh, tb); 687 err = ops->configure(rule, skb, frh, tb, extack);
631 if (err < 0) 688 if (err < 0)
632 goto errout_free; 689 goto errout_free;
633 690
@@ -637,6 +694,16 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
637 goto errout_free; 694 goto errout_free;
638 695
639 list_for_each_entry(r, &ops->rules_list, list) { 696 list_for_each_entry(r, &ops->rules_list, list) {
697 if (r->pref == rule->target) {
698 RCU_INIT_POINTER(rule->ctarget, r);
699 break;
700 }
701 }
702
703 if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
704 unresolved = 1;
705
706 list_for_each_entry(r, &ops->rules_list, list) {
640 if (r->pref > rule->pref) 707 if (r->pref > rule->pref)
641 break; 708 break;
642 last = r; 709 last = r;
@@ -690,171 +757,97 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
690{ 757{
691 struct net *net = sock_net(skb->sk); 758 struct net *net = sock_net(skb->sk);
692 struct fib_rule_hdr *frh = nlmsg_data(nlh); 759 struct fib_rule_hdr *frh = nlmsg_data(nlh);
693 struct fib_rule_port_range sprange = {0, 0};
694 struct fib_rule_port_range dprange = {0, 0};
695 struct fib_rules_ops *ops = NULL; 760 struct fib_rules_ops *ops = NULL;
696 struct fib_rule *rule, *r; 761 struct fib_rule *rule = NULL, *r, *nlrule = NULL;
697 struct nlattr *tb[FRA_MAX+1]; 762 struct nlattr *tb[FRA_MAX+1];
698 struct fib_kuid_range range;
699 int err = -EINVAL; 763 int err = -EINVAL;
764 bool user_priority = false;
700 765
701 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) 766 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
767 NL_SET_ERR_MSG(extack, "Invalid msg length");
702 goto errout; 768 goto errout;
769 }
703 770
704 ops = lookup_rules_ops(net, frh->family); 771 ops = lookup_rules_ops(net, frh->family);
705 if (ops == NULL) { 772 if (ops == NULL) {
706 err = -EAFNOSUPPORT; 773 err = -EAFNOSUPPORT;
774 NL_SET_ERR_MSG(extack, "Rule family not supported");
707 goto errout; 775 goto errout;
708 } 776 }
709 777
710 err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); 778 err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
711 if (err < 0) 779 if (err < 0) {
780 NL_SET_ERR_MSG(extack, "Error parsing msg");
712 goto errout; 781 goto errout;
782 }
713 783
714 err = validate_rulemsg(frh, tb, ops); 784 err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority);
715 if (err < 0) 785 if (err)
716 goto errout; 786 goto errout;
717 787
718 if (tb[FRA_UID_RANGE]) { 788 rule = rule_find(ops, frh, tb, nlrule, user_priority);
719 range = nla_get_kuid_range(tb); 789 if (!rule) {
720 if (!uid_range_set(&range)) { 790 err = -ENOENT;
721 err = -EINVAL; 791 goto errout;
722 goto errout;
723 }
724 } else {
725 range = fib_kuid_range_unset;
726 } 792 }
727 793
728 if (tb[FRA_SPORT_RANGE]) { 794 if (rule->flags & FIB_RULE_PERMANENT) {
729 err = nla_get_port_range(tb[FRA_SPORT_RANGE], 795 err = -EPERM;
730 &sprange); 796 goto errout;
731 if (err)
732 goto errout;
733 } 797 }
734 798
735 if (tb[FRA_DPORT_RANGE]) { 799 if (ops->delete) {
736 err = nla_get_port_range(tb[FRA_DPORT_RANGE], 800 err = ops->delete(rule);
737 &dprange);
738 if (err) 801 if (err)
739 goto errout; 802 goto errout;
740 } 803 }
741 804
742 list_for_each_entry(rule, &ops->rules_list, list) { 805 if (rule->tun_id)
743 if (tb[FRA_PROTOCOL] && 806 ip_tunnel_unneed_metadata();
744 (rule->proto != nla_get_u8(tb[FRA_PROTOCOL])))
745 continue;
746
747 if (frh->action && (frh->action != rule->action))
748 continue;
749
750 if (frh_get_table(frh, tb) &&
751 (frh_get_table(frh, tb) != rule->table))
752 continue;
753
754 if (tb[FRA_PRIORITY] &&
755 (rule->pref != nla_get_u32(tb[FRA_PRIORITY])))
756 continue;
757
758 if (tb[FRA_IIFNAME] &&
759 nla_strcmp(tb[FRA_IIFNAME], rule->iifname))
760 continue;
761
762 if (tb[FRA_OIFNAME] &&
763 nla_strcmp(tb[FRA_OIFNAME], rule->oifname))
764 continue;
765
766 if (tb[FRA_FWMARK] &&
767 (rule->mark != nla_get_u32(tb[FRA_FWMARK])))
768 continue;
769
770 if (tb[FRA_FWMASK] &&
771 (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
772 continue;
773
774 if (tb[FRA_TUN_ID] &&
775 (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID])))
776 continue;
777
778 if (tb[FRA_L3MDEV] &&
779 (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV])))
780 continue;
781
782 if (uid_range_set(&range) &&
783 (!uid_eq(rule->uid_range.start, range.start) ||
784 !uid_eq(rule->uid_range.end, range.end)))
785 continue;
786
787 if (tb[FRA_IP_PROTO] &&
788 (rule->ip_proto != nla_get_u8(tb[FRA_IP_PROTO])))
789 continue;
790
791 if (fib_rule_port_range_set(&sprange) &&
792 !fib_rule_port_range_compare(&rule->sport_range, &sprange))
793 continue;
794
795 if (fib_rule_port_range_set(&dprange) &&
796 !fib_rule_port_range_compare(&rule->dport_range, &dprange))
797 continue;
798
799 if (!ops->compare(rule, frh, tb))
800 continue;
801
802 if (rule->flags & FIB_RULE_PERMANENT) {
803 err = -EPERM;
804 goto errout;
805 }
806
807 if (ops->delete) {
808 err = ops->delete(rule);
809 if (err)
810 goto errout;
811 }
812 807
813 if (rule->tun_id) 808 list_del_rcu(&rule->list);
814 ip_tunnel_unneed_metadata();
815 809
816 list_del_rcu(&rule->list); 810 if (rule->action == FR_ACT_GOTO) {
817 811 ops->nr_goto_rules--;
818 if (rule->action == FR_ACT_GOTO) { 812 if (rtnl_dereference(rule->ctarget) == NULL)
819 ops->nr_goto_rules--; 813 ops->unresolved_rules--;
820 if (rtnl_dereference(rule->ctarget) == NULL) 814 }
821 ops->unresolved_rules--;
822 }
823 815
824 /* 816 /*
825 * Check if this rule is a target to any of them. If so, 817 * Check if this rule is a target to any of them. If so,
826 * adjust to the next one with the same preference or 818 * adjust to the next one with the same preference or
827 * disable them. As this operation is eventually very 819 * disable them. As this operation is eventually very
828 * expensive, it is only performed if goto rules, except 820 * expensive, it is only performed if goto rules, except
829 * current if it is goto rule, have actually been added. 821 * current if it is goto rule, have actually been added.
830 */ 822 */
831 if (ops->nr_goto_rules > 0) { 823 if (ops->nr_goto_rules > 0) {
832 struct fib_rule *n; 824 struct fib_rule *n;
833 825
834 n = list_next_entry(rule, list); 826 n = list_next_entry(rule, list);
835 if (&n->list == &ops->rules_list || n->pref != rule->pref) 827 if (&n->list == &ops->rules_list || n->pref != rule->pref)
836 n = NULL; 828 n = NULL;
837 list_for_each_entry(r, &ops->rules_list, list) { 829 list_for_each_entry(r, &ops->rules_list, list) {
838 if (rtnl_dereference(r->ctarget) != rule) 830 if (rtnl_dereference(r->ctarget) != rule)
839 continue; 831 continue;
840 rcu_assign_pointer(r->ctarget, n); 832 rcu_assign_pointer(r->ctarget, n);
841 if (!n) 833 if (!n)
842 ops->unresolved_rules++; 834 ops->unresolved_rules++;
843 }
844 } 835 }
845
846 call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops,
847 NULL);
848 notify_rule_change(RTM_DELRULE, rule, ops, nlh,
849 NETLINK_CB(skb).portid);
850 fib_rule_put(rule);
851 flush_route_cache(ops);
852 rules_ops_put(ops);
853 return 0;
854 } 836 }
855 837
856 err = -ENOENT; 838 call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops,
839 NULL);
840 notify_rule_change(RTM_DELRULE, rule, ops, nlh,
841 NETLINK_CB(skb).portid);
842 fib_rule_put(rule);
843 flush_route_cache(ops);
844 rules_ops_put(ops);
845 kfree(nlrule);
846 return 0;
847
857errout: 848errout:
849 if (nlrule)
850 kfree(nlrule);
858 rules_ops_put(ops); 851 rules_ops_put(ops);
859 return err; 852 return err;
860} 853}
diff --git a/net/core/filter.c b/net/core/filter.c
index 201ff36b17a8..3d9ba7e5965a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -57,7 +57,17 @@
57#include <net/sock_reuseport.h> 57#include <net/sock_reuseport.h>
58#include <net/busy_poll.h> 58#include <net/busy_poll.h>
59#include <net/tcp.h> 59#include <net/tcp.h>
60#include <net/xfrm.h>
60#include <linux/bpf_trace.h> 61#include <linux/bpf_trace.h>
62#include <net/xdp_sock.h>
63#include <linux/inetdevice.h>
64#include <net/ip_fib.h>
65#include <net/flow.h>
66#include <net/arp.h>
67#include <net/ipv6.h>
68#include <linux/seg6_local.h>
69#include <net/seg6.h>
70#include <net/seg6_local.h>
61 71
62/** 72/**
63 * sk_filter_trim_cap - run a packet through a socket filter 73 * sk_filter_trim_cap - run a packet through a socket filter
@@ -111,12 +121,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
111} 121}
112EXPORT_SYMBOL(sk_filter_trim_cap); 122EXPORT_SYMBOL(sk_filter_trim_cap);
113 123
114BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb) 124BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
115{ 125{
116 return skb_get_poff(skb); 126 return skb_get_poff(skb);
117} 127}
118 128
119BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) 129BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
120{ 130{
121 struct nlattr *nla; 131 struct nlattr *nla;
122 132
@@ -136,7 +146,7 @@ BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
136 return 0; 146 return 0;
137} 147}
138 148
139BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) 149BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
140{ 150{
141 struct nlattr *nla; 151 struct nlattr *nla;
142 152
@@ -160,13 +170,94 @@ BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
160 return 0; 170 return 0;
161} 171}
162 172
163BPF_CALL_0(__get_raw_cpu_id) 173BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
174 data, int, headlen, int, offset)
175{
176 u8 tmp, *ptr;
177 const int len = sizeof(tmp);
178
179 if (offset >= 0) {
180 if (headlen - offset >= len)
181 return *(u8 *)(data + offset);
182 if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
183 return tmp;
184 } else {
185 ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
186 if (likely(ptr))
187 return *(u8 *)ptr;
188 }
189
190 return -EFAULT;
191}
192
193BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
194 int, offset)
195{
196 return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len,
197 offset);
198}
199
200BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
201 data, int, headlen, int, offset)
202{
203 u16 tmp, *ptr;
204 const int len = sizeof(tmp);
205
206 if (offset >= 0) {
207 if (headlen - offset >= len)
208 return get_unaligned_be16(data + offset);
209 if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
210 return be16_to_cpu(tmp);
211 } else {
212 ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
213 if (likely(ptr))
214 return get_unaligned_be16(ptr);
215 }
216
217 return -EFAULT;
218}
219
220BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
221 int, offset)
222{
223 return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len,
224 offset);
225}
226
227BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
228 data, int, headlen, int, offset)
229{
230 u32 tmp, *ptr;
231 const int len = sizeof(tmp);
232
233 if (likely(offset >= 0)) {
234 if (headlen - offset >= len)
235 return get_unaligned_be32(data + offset);
236 if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
237 return be32_to_cpu(tmp);
238 } else {
239 ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
240 if (likely(ptr))
241 return get_unaligned_be32(ptr);
242 }
243
244 return -EFAULT;
245}
246
247BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
248 int, offset)
249{
250 return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len,
251 offset);
252}
253
254BPF_CALL_0(bpf_get_raw_cpu_id)
164{ 255{
165 return raw_smp_processor_id(); 256 return raw_smp_processor_id();
166} 257}
167 258
168static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { 259static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
169 .func = __get_raw_cpu_id, 260 .func = bpf_get_raw_cpu_id,
170 .gpl_only = false, 261 .gpl_only = false,
171 .ret_type = RET_INTEGER, 262 .ret_type = RET_INTEGER,
172}; 263};
@@ -316,16 +407,16 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
316 /* Emit call(arg1=CTX, arg2=A, arg3=X) */ 407 /* Emit call(arg1=CTX, arg2=A, arg3=X) */
317 switch (fp->k) { 408 switch (fp->k) {
318 case SKF_AD_OFF + SKF_AD_PAY_OFFSET: 409 case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
319 *insn = BPF_EMIT_CALL(__skb_get_pay_offset); 410 *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset);
320 break; 411 break;
321 case SKF_AD_OFF + SKF_AD_NLATTR: 412 case SKF_AD_OFF + SKF_AD_NLATTR:
322 *insn = BPF_EMIT_CALL(__skb_get_nlattr); 413 *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr);
323 break; 414 break;
324 case SKF_AD_OFF + SKF_AD_NLATTR_NEST: 415 case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
325 *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest); 416 *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest);
326 break; 417 break;
327 case SKF_AD_OFF + SKF_AD_CPU: 418 case SKF_AD_OFF + SKF_AD_CPU:
328 *insn = BPF_EMIT_CALL(__get_raw_cpu_id); 419 *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id);
329 break; 420 break;
330 case SKF_AD_OFF + SKF_AD_RANDOM: 421 case SKF_AD_OFF + SKF_AD_RANDOM:
331 *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); 422 *insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
@@ -352,26 +443,87 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
352 return true; 443 return true;
353} 444}
354 445
446static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
447{
448 const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS);
449 int size = bpf_size_to_bytes(BPF_SIZE(fp->code));
450 bool endian = BPF_SIZE(fp->code) == BPF_H ||
451 BPF_SIZE(fp->code) == BPF_W;
452 bool indirect = BPF_MODE(fp->code) == BPF_IND;
453 const int ip_align = NET_IP_ALIGN;
454 struct bpf_insn *insn = *insnp;
455 int offset = fp->k;
456
457 if (!indirect &&
458 ((unaligned_ok && offset >= 0) ||
459 (!unaligned_ok && offset >= 0 &&
460 offset + ip_align >= 0 &&
461 offset + ip_align % size == 0))) {
462 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
463 *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
464 *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian);
465 *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D,
466 offset);
467 if (endian)
468 *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
469 *insn++ = BPF_JMP_A(8);
470 }
471
472 *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
473 *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D);
474 *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H);
475 if (!indirect) {
476 *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset);
477 } else {
478 *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X);
479 if (fp->k)
480 *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset);
481 }
482
483 switch (BPF_SIZE(fp->code)) {
484 case BPF_B:
485 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8);
486 break;
487 case BPF_H:
488 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16);
489 break;
490 case BPF_W:
491 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32);
492 break;
493 default:
494 return false;
495 }
496
497 *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2);
498 *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
499 *insn = BPF_EXIT_INSN();
500
501 *insnp = insn;
502 return true;
503}
504
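For context, LD_ABS/LD_IND are the packet loads emitted by classic BPF filters; convert_bpf_ld_abs() above now open-codes the in-bounds fast path and falls back to the bpf_skb_load_helper_*() calls added earlier. A minimal, purely illustrative user of that instruction class (not part of this patch) is an ordinary SO_ATTACH_FILTER socket filter:

#include <linux/filter.h>
#include <linux/if_ether.h>
#include <sys/socket.h>

/* Accept only IPv4 frames on a packet socket; the BPF_LD|BPF_ABS load of the
 * ethertype at offset 12 is exactly the kind of instruction the new
 * conversion path rewrites into native eBPF.
 */
static int attach_ipv4_only(int sock_fd)
{
	struct sock_filter insns[] = {
		BPF_STMT(BPF_LD  | BPF_H | BPF_ABS, 12),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETH_P_IP, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, 0xffff),	/* accept, keep up to 0xffff bytes */
		BPF_STMT(BPF_RET | BPF_K, 0),		/* drop */
	};
	struct sock_fprog prog = {
		.len	= sizeof(insns) / sizeof(insns[0]),
		.filter	= insns,
	};

	return setsockopt(sock_fd, SOL_SOCKET, SO_ATTACH_FILTER,
			  &prog, sizeof(prog));
}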
355/** 505/**
356 * bpf_convert_filter - convert filter program 506 * bpf_convert_filter - convert filter program
357 * @prog: the user passed filter program 507 * @prog: the user passed filter program
358 * @len: the length of the user passed filter program 508 * @len: the length of the user passed filter program
359 * @new_prog: allocated 'struct bpf_prog' or NULL 509 * @new_prog: allocated 'struct bpf_prog' or NULL
360 * @new_len: pointer to store length of converted program 510 * @new_len: pointer to store length of converted program
511 * @seen_ld_abs: bool whether we've seen ld_abs/ind
361 * 512 *
362 * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' 513 * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
363 * style extended BPF (eBPF). 514 * style extended BPF (eBPF).
364 * Conversion workflow: 515 * Conversion workflow:
365 * 516 *
366 * 1) First pass for calculating the new program length: 517 * 1) First pass for calculating the new program length:
367 * bpf_convert_filter(old_prog, old_len, NULL, &new_len) 518 * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs)
368 * 519 *
369 * 2) 2nd pass to remap in two passes: 1st pass finds new 520 * 2) 2nd pass to remap in two passes: 1st pass finds new
370 * jump offsets, 2nd pass remapping: 521 * jump offsets, 2nd pass remapping:
371 * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); 522 * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs)
372 */ 523 */
373static int bpf_convert_filter(struct sock_filter *prog, int len, 524static int bpf_convert_filter(struct sock_filter *prog, int len,
374 struct bpf_prog *new_prog, int *new_len) 525 struct bpf_prog *new_prog, int *new_len,
526 bool *seen_ld_abs)
375{ 527{
376 int new_flen = 0, pass = 0, target, i, stack_off; 528 int new_flen = 0, pass = 0, target, i, stack_off;
377 struct bpf_insn *new_insn, *first_insn = NULL; 529 struct bpf_insn *new_insn, *first_insn = NULL;
@@ -410,12 +562,27 @@ do_pass:
410 * do this ourself. Initial CTX is present in BPF_REG_ARG1. 562 * do this ourself. Initial CTX is present in BPF_REG_ARG1.
411 */ 563 */
412 *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); 564 *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
565 if (*seen_ld_abs) {
566 /* For packet access in classic BPF, cache skb->data
567 * in callee-saved BPF R8 and skb->len - skb->data_len
568 * (headlen) in BPF R9. Since classic BPF is read-only
569 * on CTX, we only need to cache it once.
570 */
571 *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
572 BPF_REG_D, BPF_REG_CTX,
573 offsetof(struct sk_buff, data));
574 *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX,
575 offsetof(struct sk_buff, len));
576 *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX,
577 offsetof(struct sk_buff, data_len));
578 *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP);
579 }
413 } else { 580 } else {
414 new_insn += 3; 581 new_insn += 3;
415 } 582 }
416 583
417 for (i = 0; i < len; fp++, i++) { 584 for (i = 0; i < len; fp++, i++) {
418 struct bpf_insn tmp_insns[6] = { }; 585 struct bpf_insn tmp_insns[32] = { };
419 struct bpf_insn *insn = tmp_insns; 586 struct bpf_insn *insn = tmp_insns;
420 587
421 if (addrs) 588 if (addrs)
@@ -458,6 +625,11 @@ do_pass:
458 BPF_MODE(fp->code) == BPF_ABS && 625 BPF_MODE(fp->code) == BPF_ABS &&
459 convert_bpf_extensions(fp, &insn)) 626 convert_bpf_extensions(fp, &insn))
460 break; 627 break;
628 if (BPF_CLASS(fp->code) == BPF_LD &&
629 convert_bpf_ld_abs(fp, &insn)) {
630 *seen_ld_abs = true;
631 break;
632 }
461 633
462 if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || 634 if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
463 fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { 635 fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
@@ -567,21 +739,31 @@ jmp_rest:
567 break; 739 break;
568 740
 569 /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */ 741 /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
570 case BPF_LDX | BPF_MSH | BPF_B: 742 case BPF_LDX | BPF_MSH | BPF_B: {
571 /* tmp = A */ 743 struct sock_filter tmp = {
572 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); 744 .code = BPF_LD | BPF_ABS | BPF_B,
745 .k = fp->k,
746 };
747
748 *seen_ld_abs = true;
749
750 /* X = A */
751 *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
573 /* A = BPF_R0 = *(u8 *) (skb->data + K) */ 752 /* A = BPF_R0 = *(u8 *) (skb->data + K) */
574 *insn++ = BPF_LD_ABS(BPF_B, fp->k); 753 convert_bpf_ld_abs(&tmp, &insn);
754 insn++;
575 /* A &= 0xf */ 755 /* A &= 0xf */
576 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); 756 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
577 /* A <<= 2 */ 757 /* A <<= 2 */
578 *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); 758 *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
759 /* tmp = X */
760 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X);
579 /* X = A */ 761 /* X = A */
580 *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); 762 *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
581 /* A = tmp */ 763 /* A = tmp */
582 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); 764 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
583 break; 765 break;
584 766 }
 585 /* RET_K is remapped into 2 insns. RET_A case doesn't need an 767 /* RET_K is remapped into 2 insns. RET_A case doesn't need an
586 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. 768 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
587 */ 769 */
@@ -663,6 +845,8 @@ jmp_rest:
663 if (!new_prog) { 845 if (!new_prog) {
664 /* Only calculating new length. */ 846 /* Only calculating new length. */
665 *new_len = new_insn - first_insn; 847 *new_len = new_insn - first_insn;
848 if (*seen_ld_abs)
849 *new_len += 4; /* Prologue bits. */
666 return 0; 850 return 0;
667 } 851 }
668 852
@@ -1024,6 +1208,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
1024 struct sock_filter *old_prog; 1208 struct sock_filter *old_prog;
1025 struct bpf_prog *old_fp; 1209 struct bpf_prog *old_fp;
1026 int err, new_len, old_len = fp->len; 1210 int err, new_len, old_len = fp->len;
1211 bool seen_ld_abs = false;
1027 1212
1028 /* We are free to overwrite insns et al right here as it 1213 /* We are free to overwrite insns et al right here as it
1029 * won't be used at this point in time anymore internally 1214 * won't be used at this point in time anymore internally
@@ -1045,7 +1230,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
1045 } 1230 }
1046 1231
1047 /* 1st pass: calculate the new program length. */ 1232 /* 1st pass: calculate the new program length. */
1048 err = bpf_convert_filter(old_prog, old_len, NULL, &new_len); 1233 err = bpf_convert_filter(old_prog, old_len, NULL, &new_len,
1234 &seen_ld_abs);
1049 if (err) 1235 if (err)
1050 goto out_err_free; 1236 goto out_err_free;
1051 1237
@@ -1064,7 +1250,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
1064 fp->len = new_len; 1250 fp->len = new_len;
1065 1251
1066 /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ 1252 /* 2nd pass: remap sock_filter insns into bpf_insn insns. */
1067 err = bpf_convert_filter(old_prog, old_len, fp, &new_len); 1253 err = bpf_convert_filter(old_prog, old_len, fp, &new_len,
1254 &seen_ld_abs);
1068 if (err) 1255 if (err)
1069 /* 2nd bpf_convert_filter() can fail only if it fails 1256 /* 2nd bpf_convert_filter() can fail only if it fails
1070 * to allocate memory, remapping must succeed. Note, 1257 * to allocate memory, remapping must succeed. Note,
@@ -1512,6 +1699,47 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
1512 .arg4_type = ARG_CONST_SIZE, 1699 .arg4_type = ARG_CONST_SIZE,
1513}; 1700};
1514 1701
1702BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
1703 u32, offset, void *, to, u32, len, u32, start_header)
1704{
1705 u8 *ptr;
1706
1707 if (unlikely(offset > 0xffff || len > skb_headlen(skb)))
1708 goto err_clear;
1709
1710 switch (start_header) {
1711 case BPF_HDR_START_MAC:
1712 ptr = skb_mac_header(skb) + offset;
1713 break;
1714 case BPF_HDR_START_NET:
1715 ptr = skb_network_header(skb) + offset;
1716 break;
1717 default:
1718 goto err_clear;
1719 }
1720
1721 if (likely(ptr >= skb_mac_header(skb) &&
1722 ptr + len <= skb_tail_pointer(skb))) {
1723 memcpy(to, ptr, len);
1724 return 0;
1725 }
1726
1727err_clear:
1728 memset(to, 0, len);
1729 return -EFAULT;
1730}
1731
1732static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = {
1733 .func = bpf_skb_load_bytes_relative,
1734 .gpl_only = false,
1735 .ret_type = RET_INTEGER,
1736 .arg1_type = ARG_PTR_TO_CTX,
1737 .arg2_type = ARG_ANYTHING,
1738 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
1739 .arg4_type = ARG_CONST_SIZE,
1740 .arg5_type = ARG_ANYTHING,
1741};
1742
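To illustrate the new helper (a hedged sketch, not part of this patch): bpf_skb_load_bytes_relative() lets a program read relative to the MAC or network header, so the same code works whether skb->data currently points at L2 or L3. Assuming the usual libbpf "bpf_helpers.h" wrappers, a tc classifier could do:

#include <linux/bpf.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/pkt_cls.h>
#include "bpf_helpers.h"	/* assumed: SEC() and bpf_* helper wrappers */

SEC("classifier")
int tcp_only(struct __sk_buff *skb)
{
	struct iphdr iph;

	/* Read the IPv4 header relative to the network header, independent
	 * of whether a MAC header is present at skb->data.
	 */
	if (bpf_skb_load_bytes_relative(skb, 0, &iph, sizeof(iph),
					BPF_HDR_START_NET) < 0)
		return TC_ACT_OK;

	return iph.protocol == IPPROTO_TCP ? TC_ACT_OK : TC_ACT_SHOT;
}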
1515BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) 1743BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
1516{ 1744{
1517 /* Idea is the following: should the needed direct read/write 1745 /* Idea is the following: should the needed direct read/write
@@ -1857,6 +2085,33 @@ static const struct bpf_func_proto bpf_redirect_proto = {
1857 .arg2_type = ARG_ANYTHING, 2085 .arg2_type = ARG_ANYTHING,
1858}; 2086};
1859 2087
2088BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
2089 struct bpf_map *, map, void *, key, u64, flags)
2090{
2091 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
2092
2093 /* If user passes invalid input drop the packet. */
2094 if (unlikely(flags & ~(BPF_F_INGRESS)))
2095 return SK_DROP;
2096
2097 tcb->bpf.flags = flags;
2098 tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
2099 if (!tcb->bpf.sk_redir)
2100 return SK_DROP;
2101
2102 return SK_PASS;
2103}
2104
2105static const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
2106 .func = bpf_sk_redirect_hash,
2107 .gpl_only = false,
2108 .ret_type = RET_INTEGER,
2109 .arg1_type = ARG_PTR_TO_CTX,
2110 .arg2_type = ARG_CONST_MAP_PTR,
2111 .arg3_type = ARG_PTR_TO_MAP_KEY,
2112 .arg4_type = ARG_ANYTHING,
2113};
2114
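A hedged sketch of the consumer side (not from this patch): an SK_SKB verdict program attached to the new BPF_MAP_TYPE_SOCKHASH can steer each skb with bpf_sk_redirect_hash(), which takes a pointer to a map key rather than a fixed u32 slot:

#include <linux/bpf.h>
#include "bpf_helpers.h"	/* assumed libbpf helper header */

struct bpf_map_def SEC("maps") sock_hash = {
	.type		= BPF_MAP_TYPE_SOCKHASH,
	.key_size	= sizeof(__u32),
	.value_size	= sizeof(__u32),
	.max_entries	= 1024,
};

SEC("sk_skb/stream_verdict")
int prog_verdict(struct __sk_buff *skb)
{
	__u32 key = skb->remote_port;	/* example key: peer port, network order */

	/* SK_DROP is returned for us when the hash lookup misses. */
	return bpf_sk_redirect_hash(skb, &sock_hash, &key, 0);
}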
1860BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, 2115BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
1861 struct bpf_map *, map, u32, key, u64, flags) 2116 struct bpf_map *, map, u32, key, u64, flags)
1862{ 2117{
@@ -1866,9 +2121,10 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
1866 if (unlikely(flags & ~(BPF_F_INGRESS))) 2121 if (unlikely(flags & ~(BPF_F_INGRESS)))
1867 return SK_DROP; 2122 return SK_DROP;
1868 2123
1869 tcb->bpf.key = key;
1870 tcb->bpf.flags = flags; 2124 tcb->bpf.flags = flags;
1871 tcb->bpf.map = map; 2125 tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
2126 if (!tcb->bpf.sk_redir)
2127 return SK_DROP;
1872 2128
1873 return SK_PASS; 2129 return SK_PASS;
1874} 2130}
@@ -1876,16 +2132,8 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
1876struct sock *do_sk_redirect_map(struct sk_buff *skb) 2132struct sock *do_sk_redirect_map(struct sk_buff *skb)
1877{ 2133{
1878 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); 2134 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
1879 struct sock *sk = NULL;
1880
1881 if (tcb->bpf.map) {
1882 sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key);
1883 2135
1884 tcb->bpf.key = 0; 2136 return tcb->bpf.sk_redir;
1885 tcb->bpf.map = NULL;
1886 }
1887
1888 return sk;
1889} 2137}
1890 2138
1891static const struct bpf_func_proto bpf_sk_redirect_map_proto = { 2139static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
@@ -1898,32 +2146,49 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
1898 .arg4_type = ARG_ANYTHING, 2146 .arg4_type = ARG_ANYTHING,
1899}; 2147};
1900 2148
1901BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, 2149BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
1902 struct bpf_map *, map, u32, key, u64, flags) 2150 struct bpf_map *, map, void *, key, u64, flags)
1903{ 2151{
1904 /* If user passes invalid input drop the packet. */ 2152 /* If user passes invalid input drop the packet. */
1905 if (unlikely(flags & ~(BPF_F_INGRESS))) 2153 if (unlikely(flags & ~(BPF_F_INGRESS)))
1906 return SK_DROP; 2154 return SK_DROP;
1907 2155
1908 msg->key = key;
1909 msg->flags = flags; 2156 msg->flags = flags;
1910 msg->map = map; 2157 msg->sk_redir = __sock_hash_lookup_elem(map, key);
2158 if (!msg->sk_redir)
2159 return SK_DROP;
1911 2160
1912 return SK_PASS; 2161 return SK_PASS;
1913} 2162}
1914 2163
1915struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) 2164static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
2165 .func = bpf_msg_redirect_hash,
2166 .gpl_only = false,
2167 .ret_type = RET_INTEGER,
2168 .arg1_type = ARG_PTR_TO_CTX,
2169 .arg2_type = ARG_CONST_MAP_PTR,
2170 .arg3_type = ARG_PTR_TO_MAP_KEY,
2171 .arg4_type = ARG_ANYTHING,
2172};
2173
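The sendmsg-side counterpart looks much the same; a hedged sketch of an SK_MSG program using the new bpf_msg_redirect_hash() against the same kind of SOCKHASH map:

#include <linux/bpf.h>
#include "bpf_helpers.h"	/* assumed libbpf helper header */

struct bpf_map_def SEC("maps") msg_hash = {
	.type		= BPF_MAP_TYPE_SOCKHASH,
	.key_size	= sizeof(__u32),
	.value_size	= sizeof(__u32),
	.max_entries	= 1024,
};

SEC("sk_msg")
int prog_msg(struct sk_msg_md *msg)
{
	__u32 key = 0;	/* fixed slot for this sketch; real keys are loader-defined */

	return bpf_msg_redirect_hash(msg, &msg_hash, &key, BPF_F_INGRESS);
}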
2174BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
2175 struct bpf_map *, map, u32, key, u64, flags)
1916{ 2176{
1917 struct sock *sk = NULL; 2177 /* If user passes invalid input drop the packet. */
2178 if (unlikely(flags & ~(BPF_F_INGRESS)))
2179 return SK_DROP;
1918 2180
1919 if (msg->map) { 2181 msg->flags = flags;
1920 sk = __sock_map_lookup_elem(msg->map, msg->key); 2182 msg->sk_redir = __sock_map_lookup_elem(map, key);
2183 if (!msg->sk_redir)
2184 return SK_DROP;
1921 2185
1922 msg->key = 0; 2186 return SK_PASS;
1923 msg->map = NULL; 2187}
1924 }
1925 2188
1926 return sk; 2189struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
2190{
2191 return msg->sk_redir;
1927} 2192}
1928 2193
1929static const struct bpf_func_proto bpf_msg_redirect_map_proto = { 2194static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
@@ -2186,7 +2451,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
2186 return ret; 2451 return ret;
2187} 2452}
2188 2453
2189const struct bpf_func_proto bpf_skb_vlan_push_proto = { 2454static const struct bpf_func_proto bpf_skb_vlan_push_proto = {
2190 .func = bpf_skb_vlan_push, 2455 .func = bpf_skb_vlan_push,
2191 .gpl_only = false, 2456 .gpl_only = false,
2192 .ret_type = RET_INTEGER, 2457 .ret_type = RET_INTEGER,
@@ -2194,7 +2459,6 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = {
2194 .arg2_type = ARG_ANYTHING, 2459 .arg2_type = ARG_ANYTHING,
2195 .arg3_type = ARG_ANYTHING, 2460 .arg3_type = ARG_ANYTHING,
2196}; 2461};
2197EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
2198 2462
2199BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) 2463BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
2200{ 2464{
@@ -2208,13 +2472,12 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
2208 return ret; 2472 return ret;
2209} 2473}
2210 2474
2211const struct bpf_func_proto bpf_skb_vlan_pop_proto = { 2475static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
2212 .func = bpf_skb_vlan_pop, 2476 .func = bpf_skb_vlan_pop,
2213 .gpl_only = false, 2477 .gpl_only = false,
2214 .ret_type = RET_INTEGER, 2478 .ret_type = RET_INTEGER,
2215 .arg1_type = ARG_PTR_TO_CTX, 2479 .arg1_type = ARG_PTR_TO_CTX,
2216}; 2480};
2217EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);
2218 2481
2219static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) 2482static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
2220{ 2483{
@@ -2699,8 +2962,9 @@ static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
2699 2962
2700BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) 2963BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
2701{ 2964{
2965 void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
2702 unsigned long metalen = xdp_get_metalen(xdp); 2966 unsigned long metalen = xdp_get_metalen(xdp);
2703 void *data_start = xdp->data_hard_start + metalen; 2967 void *data_start = xdp_frame_end + metalen;
2704 void *data = xdp->data + offset; 2968 void *data = xdp->data + offset;
2705 2969
2706 if (unlikely(data < data_start || 2970 if (unlikely(data < data_start ||
@@ -2724,14 +2988,39 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
2724 .arg2_type = ARG_ANYTHING, 2988 .arg2_type = ARG_ANYTHING,
2725}; 2989};
2726 2990
2991BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
2992{
2993 void *data_end = xdp->data_end + offset;
2994
2995 /* only shrinking is allowed for now. */
2996 if (unlikely(offset >= 0))
2997 return -EINVAL;
2998
2999 if (unlikely(data_end < xdp->data + ETH_HLEN))
3000 return -EINVAL;
3001
3002 xdp->data_end = data_end;
3003
3004 return 0;
3005}
3006
3007static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
3008 .func = bpf_xdp_adjust_tail,
3009 .gpl_only = false,
3010 .ret_type = RET_INTEGER,
3011 .arg1_type = ARG_PTR_TO_CTX,
3012 .arg2_type = ARG_ANYTHING,
3013};
3014
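A hedged usage sketch (not from this patch): an XDP program can now trim the tail of a frame, the building block for producing truncated replies such as ICMP errors; only shrinking (a negative offset) is accepted for now:

#include <linux/bpf.h>
#include "bpf_helpers.h"	/* assumed libbpf helper header */

SEC("xdp")
int xdp_trim(struct xdp_md *ctx)
{
	int len  = ctx->data_end - ctx->data;
	int keep = 128;			/* arbitrary cap for this sketch */

	/* Negative offset shrinks the frame; growing is rejected. */
	if (len > keep && bpf_xdp_adjust_tail(ctx, keep - len) < 0)
		return XDP_ABORTED;

	return XDP_PASS;
}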
2727BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) 3015BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
2728{ 3016{
3017 void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
2729 void *meta = xdp->data_meta + offset; 3018 void *meta = xdp->data_meta + offset;
2730 unsigned long metalen = xdp->data - meta; 3019 unsigned long metalen = xdp->data - meta;
2731 3020
2732 if (xdp_data_meta_unsupported(xdp)) 3021 if (xdp_data_meta_unsupported(xdp))
2733 return -ENOTSUPP; 3022 return -ENOTSUPP;
2734 if (unlikely(meta < xdp->data_hard_start || 3023 if (unlikely(meta < xdp_frame_end ||
2735 meta > xdp->data)) 3024 meta > xdp->data))
2736 return -EINVAL; 3025 return -EINVAL;
2737 if (unlikely((metalen & (sizeof(__u32) - 1)) || 3026 if (unlikely((metalen & (sizeof(__u32) - 1)) ||
@@ -2756,16 +3045,20 @@ static int __bpf_tx_xdp(struct net_device *dev,
2756 struct xdp_buff *xdp, 3045 struct xdp_buff *xdp,
2757 u32 index) 3046 u32 index)
2758{ 3047{
2759 int err; 3048 struct xdp_frame *xdpf;
3049 int sent;
2760 3050
2761 if (!dev->netdev_ops->ndo_xdp_xmit) { 3051 if (!dev->netdev_ops->ndo_xdp_xmit) {
2762 return -EOPNOTSUPP; 3052 return -EOPNOTSUPP;
2763 } 3053 }
2764 3054
2765 err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); 3055 xdpf = convert_to_xdp_frame(xdp);
2766 if (err) 3056 if (unlikely(!xdpf))
2767 return err; 3057 return -EOVERFLOW;
2768 dev->netdev_ops->ndo_xdp_flush(dev); 3058
3059 sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH);
3060 if (sent <= 0)
3061 return sent;
2769 return 0; 3062 return 0;
2770} 3063}
2771 3064
@@ -2776,24 +3069,33 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
2776{ 3069{
2777 int err; 3070 int err;
2778 3071
2779 if (map->map_type == BPF_MAP_TYPE_DEVMAP) { 3072 switch (map->map_type) {
2780 struct net_device *dev = fwd; 3073 case BPF_MAP_TYPE_DEVMAP: {
2781 3074 struct bpf_dtab_netdev *dst = fwd;
2782 if (!dev->netdev_ops->ndo_xdp_xmit)
2783 return -EOPNOTSUPP;
2784 3075
2785 err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); 3076 err = dev_map_enqueue(dst, xdp, dev_rx);
2786 if (err) 3077 if (err)
2787 return err; 3078 return err;
2788 __dev_map_insert_ctx(map, index); 3079 __dev_map_insert_ctx(map, index);
2789 3080 break;
2790 } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { 3081 }
3082 case BPF_MAP_TYPE_CPUMAP: {
2791 struct bpf_cpu_map_entry *rcpu = fwd; 3083 struct bpf_cpu_map_entry *rcpu = fwd;
2792 3084
2793 err = cpu_map_enqueue(rcpu, xdp, dev_rx); 3085 err = cpu_map_enqueue(rcpu, xdp, dev_rx);
2794 if (err) 3086 if (err)
2795 return err; 3087 return err;
2796 __cpu_map_insert_ctx(map, index); 3088 __cpu_map_insert_ctx(map, index);
3089 break;
3090 }
3091 case BPF_MAP_TYPE_XSKMAP: {
3092 struct xdp_sock *xs = fwd;
3093
3094 err = __xsk_map_redirect(map, xdp, xs);
3095 return err;
3096 }
3097 default:
3098 break;
2797 } 3099 }
2798 return 0; 3100 return 0;
2799} 3101}
@@ -2812,6 +3114,9 @@ void xdp_do_flush_map(void)
2812 case BPF_MAP_TYPE_CPUMAP: 3114 case BPF_MAP_TYPE_CPUMAP:
2813 __cpu_map_flush(map); 3115 __cpu_map_flush(map);
2814 break; 3116 break;
3117 case BPF_MAP_TYPE_XSKMAP:
3118 __xsk_map_flush(map);
3119 break;
2815 default: 3120 default:
2816 break; 3121 break;
2817 } 3122 }
@@ -2826,6 +3131,8 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
2826 return __dev_map_lookup_elem(map, index); 3131 return __dev_map_lookup_elem(map, index);
2827 case BPF_MAP_TYPE_CPUMAP: 3132 case BPF_MAP_TYPE_CPUMAP:
2828 return __cpu_map_lookup_elem(map, index); 3133 return __cpu_map_lookup_elem(map, index);
3134 case BPF_MAP_TYPE_XSKMAP:
3135 return __xsk_map_lookup_elem(map, index);
2829 default: 3136 default:
2830 return NULL; 3137 return NULL;
2831 } 3138 }
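With XSKMAP handled in the redirect plumbing above, the XDP side of an AF_XDP setup reduces to a plain bpf_redirect_map() into the new map type; a hedged sketch (map population and socket binding happen in userspace):

#include <linux/bpf.h>
#include "bpf_helpers.h"	/* assumed libbpf helper header */

struct bpf_map_def SEC("maps") xsks_map = {
	.type		= BPF_MAP_TYPE_XSKMAP,
	.key_size	= sizeof(__u32),
	.value_size	= sizeof(__u32),
	.max_entries	= 64,
};

SEC("xdp")
int xdp_sock_prog(struct xdp_md *ctx)
{
	/* Key the map by RX queue; an AF_XDP socket bound to that queue
	 * receives the frame, otherwise the redirect fails and the frame
	 * is dropped. Flags must be 0 here.
	 */
	return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, 0);
}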
@@ -2923,13 +3230,14 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd)
2923 3230
2924static int xdp_do_generic_redirect_map(struct net_device *dev, 3231static int xdp_do_generic_redirect_map(struct net_device *dev,
2925 struct sk_buff *skb, 3232 struct sk_buff *skb,
3233 struct xdp_buff *xdp,
2926 struct bpf_prog *xdp_prog) 3234 struct bpf_prog *xdp_prog)
2927{ 3235{
2928 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3236 struct redirect_info *ri = this_cpu_ptr(&redirect_info);
2929 unsigned long map_owner = ri->map_owner; 3237 unsigned long map_owner = ri->map_owner;
2930 struct bpf_map *map = ri->map; 3238 struct bpf_map *map = ri->map;
2931 struct net_device *fwd = NULL;
2932 u32 index = ri->ifindex; 3239 u32 index = ri->ifindex;
3240 void *fwd = NULL;
2933 int err = 0; 3241 int err = 0;
2934 3242
2935 ri->ifindex = 0; 3243 ri->ifindex = 0;
@@ -2951,6 +3259,14 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
2951 if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) 3259 if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd))))
2952 goto err; 3260 goto err;
2953 skb->dev = fwd; 3261 skb->dev = fwd;
3262 generic_xdp_tx(skb, xdp_prog);
3263 } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
3264 struct xdp_sock *xs = fwd;
3265
3266 err = xsk_generic_rcv(xs, xdp);
3267 if (err)
3268 goto err;
3269 consume_skb(skb);
2954 } else { 3270 } else {
2955 /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ 3271 /* TODO: Handle BPF_MAP_TYPE_CPUMAP */
2956 err = -EBADRQC; 3272 err = -EBADRQC;
@@ -2965,7 +3281,7 @@ err:
2965} 3281}
2966 3282
2967int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, 3283int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
2968 struct bpf_prog *xdp_prog) 3284 struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
2969{ 3285{
2970 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3286 struct redirect_info *ri = this_cpu_ptr(&redirect_info);
2971 u32 index = ri->ifindex; 3287 u32 index = ri->ifindex;
@@ -2973,7 +3289,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
2973 int err = 0; 3289 int err = 0;
2974 3290
2975 if (ri->map) 3291 if (ri->map)
2976 return xdp_do_generic_redirect_map(dev, skb, xdp_prog); 3292 return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog);
2977 3293
2978 ri->ifindex = 0; 3294 ri->ifindex = 0;
2979 fwd = dev_get_by_index_rcu(dev_net(dev), index); 3295 fwd = dev_get_by_index_rcu(dev_net(dev), index);
@@ -2987,6 +3303,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
2987 3303
2988 skb->dev = fwd; 3304 skb->dev = fwd;
2989 _trace_xdp_redirect(dev, xdp_prog, index); 3305 _trace_xdp_redirect(dev, xdp_prog, index);
3306 generic_xdp_tx(skb, xdp_prog);
2990 return 0; 3307 return 0;
2991err: 3308err:
2992 _trace_xdp_redirect_err(dev, xdp_prog, index, err); 3309 _trace_xdp_redirect_err(dev, xdp_prog, index, err);
@@ -3045,27 +3362,6 @@ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
3045 .arg3_type = ARG_ANYTHING, 3362 .arg3_type = ARG_ANYTHING,
3046}; 3363};
3047 3364
3048bool bpf_helper_changes_pkt_data(void *func)
3049{
3050 if (func == bpf_skb_vlan_push ||
3051 func == bpf_skb_vlan_pop ||
3052 func == bpf_skb_store_bytes ||
3053 func == bpf_skb_change_proto ||
3054 func == bpf_skb_change_head ||
3055 func == bpf_skb_change_tail ||
3056 func == bpf_skb_adjust_room ||
3057 func == bpf_skb_pull_data ||
3058 func == bpf_clone_redirect ||
3059 func == bpf_l3_csum_replace ||
3060 func == bpf_l4_csum_replace ||
3061 func == bpf_xdp_adjust_head ||
3062 func == bpf_xdp_adjust_meta ||
3063 func == bpf_msg_pull_data)
3064 return true;
3065
3066 return false;
3067}
3068
3069static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, 3365static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
3070 unsigned long off, unsigned long len) 3366 unsigned long off, unsigned long len)
3071{ 3367{
@@ -3148,6 +3444,7 @@ set_compat:
3148 to->tunnel_id = be64_to_cpu(info->key.tun_id); 3444 to->tunnel_id = be64_to_cpu(info->key.tun_id);
3149 to->tunnel_tos = info->key.tos; 3445 to->tunnel_tos = info->key.tos;
3150 to->tunnel_ttl = info->key.ttl; 3446 to->tunnel_ttl = info->key.ttl;
3447 to->tunnel_ext = 0;
3151 3448
3152 if (flags & BPF_F_TUNINFO_IPV6) { 3449 if (flags & BPF_F_TUNINFO_IPV6) {
3153 memcpy(to->remote_ipv6, &info->key.u.ipv6.src, 3450 memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
@@ -3155,6 +3452,8 @@ set_compat:
3155 to->tunnel_label = be32_to_cpu(info->key.label); 3452 to->tunnel_label = be32_to_cpu(info->key.label);
3156 } else { 3453 } else {
3157 to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); 3454 to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
3455 memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
3456 to->tunnel_label = 0;
3158 } 3457 }
3159 3458
3160 if (unlikely(size != sizeof(struct bpf_tunnel_key))) 3459 if (unlikely(size != sizeof(struct bpf_tunnel_key)))
@@ -3364,6 +3663,27 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
3364 .arg3_type = ARG_ANYTHING, 3663 .arg3_type = ARG_ANYTHING,
3365}; 3664};
3366 3665
3666#ifdef CONFIG_SOCK_CGROUP_DATA
3667BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
3668{
3669 struct sock *sk = skb_to_full_sk(skb);
3670 struct cgroup *cgrp;
3671
3672 if (!sk || !sk_fullsock(sk))
3673 return 0;
3674
3675 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
3676 return cgrp->kn->id.id;
3677}
3678
3679static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
3680 .func = bpf_skb_cgroup_id,
3681 .gpl_only = false,
3682 .ret_type = RET_INTEGER,
3683 .arg1_type = ARG_PTR_TO_CTX,
3684};
3685#endif
3686
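A hedged sketch of how a tc/cls_act program might use the new helper; the value compared against is the cgroup's kernfs id (e.g. obtained in userspace with name_to_handle_at() on the cgroup v2 directory), and the constant below is only a placeholder:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include "bpf_helpers.h"	/* assumed libbpf helper header */

#define TARGET_CGROUP_ID 0ULL	/* placeholder: substitute the real cgroup id */

SEC("classifier")
int drop_from_cgroup(struct __sk_buff *skb)
{
	if (TARGET_CGROUP_ID && bpf_skb_cgroup_id(skb) == TARGET_CGROUP_ID)
		return TC_ACT_SHOT;

	return TC_ACT_OK;
}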
3367static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, 3687static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
3368 unsigned long off, unsigned long len) 3688 unsigned long off, unsigned long len)
3369{ 3689{
@@ -3711,6 +4031,603 @@ static const struct bpf_func_proto bpf_bind_proto = {
3711 .arg3_type = ARG_CONST_SIZE, 4031 .arg3_type = ARG_CONST_SIZE,
3712}; 4032};
3713 4033
4034#ifdef CONFIG_XFRM
4035BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
4036 struct bpf_xfrm_state *, to, u32, size, u64, flags)
4037{
4038 const struct sec_path *sp = skb_sec_path(skb);
4039 const struct xfrm_state *x;
4040
4041 if (!sp || unlikely(index >= sp->len || flags))
4042 goto err_clear;
4043
4044 x = sp->xvec[index];
4045
4046 if (unlikely(size != sizeof(struct bpf_xfrm_state)))
4047 goto err_clear;
4048
4049 to->reqid = x->props.reqid;
4050 to->spi = x->id.spi;
4051 to->family = x->props.family;
4052 to->ext = 0;
4053
4054 if (to->family == AF_INET6) {
4055 memcpy(to->remote_ipv6, x->props.saddr.a6,
4056 sizeof(to->remote_ipv6));
4057 } else {
4058 to->remote_ipv4 = x->props.saddr.a4;
4059 memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
4060 }
4061
4062 return 0;
4063err_clear:
4064 memset(to, 0, size);
4065 return -EINVAL;
4066}
4067
4068static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
4069 .func = bpf_skb_get_xfrm_state,
4070 .gpl_only = false,
4071 .ret_type = RET_INTEGER,
4072 .arg1_type = ARG_PTR_TO_CTX,
4073 .arg2_type = ARG_ANYTHING,
4074 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
4075 .arg4_type = ARG_CONST_SIZE,
4076 .arg5_type = ARG_ANYTHING,
4077};
4078#endif
4079
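A hedged usage sketch (not from this patch): a cls_act program can now inspect the IPsec state a packet was decrypted with, for instance to propagate the SA's reqid into the fwmark:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include "bpf_helpers.h"	/* assumed libbpf helper header */

SEC("classifier")
int mark_by_reqid(struct __sk_buff *skb)
{
	struct bpf_xfrm_state xs = {};

	/* The index selects an entry on the skb's sec_path; flags must
	 * currently be 0 and size must match the struct exactly.
	 */
	if (bpf_skb_get_xfrm_state(skb, 0, &xs, sizeof(xs), 0) == 0)
		skb->mark = xs.reqid;

	return TC_ACT_OK;
}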
4080#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
4081static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
4082 const struct neighbour *neigh,
4083 const struct net_device *dev)
4084{
4085 memcpy(params->dmac, neigh->ha, ETH_ALEN);
4086 memcpy(params->smac, dev->dev_addr, ETH_ALEN);
4087 params->h_vlan_TCI = 0;
4088 params->h_vlan_proto = 0;
4089
4090 return dev->ifindex;
4091}
4092#endif
4093
4094#if IS_ENABLED(CONFIG_INET)
4095static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
4096 u32 flags, bool check_mtu)
4097{
4098 struct in_device *in_dev;
4099 struct neighbour *neigh;
4100 struct net_device *dev;
4101 struct fib_result res;
4102 struct fib_nh *nh;
4103 struct flowi4 fl4;
4104 int err;
4105 u32 mtu;
4106
4107 dev = dev_get_by_index_rcu(net, params->ifindex);
4108 if (unlikely(!dev))
4109 return -ENODEV;
4110
4111 /* verify forwarding is enabled on this interface */
4112 in_dev = __in_dev_get_rcu(dev);
4113 if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
4114 return 0;
4115
4116 if (flags & BPF_FIB_LOOKUP_OUTPUT) {
4117 fl4.flowi4_iif = 1;
4118 fl4.flowi4_oif = params->ifindex;
4119 } else {
4120 fl4.flowi4_iif = params->ifindex;
4121 fl4.flowi4_oif = 0;
4122 }
4123 fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
4124 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
4125 fl4.flowi4_flags = 0;
4126
4127 fl4.flowi4_proto = params->l4_protocol;
4128 fl4.daddr = params->ipv4_dst;
4129 fl4.saddr = params->ipv4_src;
4130 fl4.fl4_sport = params->sport;
4131 fl4.fl4_dport = params->dport;
4132
4133 if (flags & BPF_FIB_LOOKUP_DIRECT) {
4134 u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
4135 struct fib_table *tb;
4136
4137 tb = fib_get_table(net, tbid);
4138 if (unlikely(!tb))
4139 return 0;
4140
4141 err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
4142 } else {
4143 fl4.flowi4_mark = 0;
4144 fl4.flowi4_secid = 0;
4145 fl4.flowi4_tun_key.tun_id = 0;
4146 fl4.flowi4_uid = sock_net_uid(net, NULL);
4147
4148 err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
4149 }
4150
4151 if (err || res.type != RTN_UNICAST)
4152 return 0;
4153
4154 if (res.fi->fib_nhs > 1)
4155 fib_select_path(net, &res, &fl4, NULL);
4156
4157 if (check_mtu) {
4158 mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
4159 if (params->tot_len > mtu)
4160 return 0;
4161 }
4162
4163 nh = &res.fi->fib_nh[res.nh_sel];
4164
4165 /* do not handle lwt encaps right now */
4166 if (nh->nh_lwtstate)
4167 return 0;
4168
4169 dev = nh->nh_dev;
4170 if (unlikely(!dev))
4171 return 0;
4172
4173 if (nh->nh_gw)
4174 params->ipv4_dst = nh->nh_gw;
4175
4176 params->rt_metric = res.fi->fib_priority;
4177
4178 /* xdp and cls_bpf programs are run in RCU-bh so
4179 * rcu_read_lock_bh is not needed here
4180 */
4181 neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
4182 if (neigh)
4183 return bpf_fib_set_fwd_params(params, neigh, dev);
4184
4185 return 0;
4186}
4187#endif
4188
4189#if IS_ENABLED(CONFIG_IPV6)
4190static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
4191 u32 flags, bool check_mtu)
4192{
4193 struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
4194 struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
4195 struct neighbour *neigh;
4196 struct net_device *dev;
4197 struct inet6_dev *idev;
4198 struct fib6_info *f6i;
4199 struct flowi6 fl6;
4200 int strict = 0;
4201 int oif;
4202 u32 mtu;
4203
4204 /* link local addresses are never forwarded */
4205 if (rt6_need_strict(dst) || rt6_need_strict(src))
4206 return 0;
4207
4208 dev = dev_get_by_index_rcu(net, params->ifindex);
4209 if (unlikely(!dev))
4210 return -ENODEV;
4211
4212 idev = __in6_dev_get_safely(dev);
4213 if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
4214 return 0;
4215
4216 if (flags & BPF_FIB_LOOKUP_OUTPUT) {
4217 fl6.flowi6_iif = 1;
4218 oif = fl6.flowi6_oif = params->ifindex;
4219 } else {
4220 oif = fl6.flowi6_iif = params->ifindex;
4221 fl6.flowi6_oif = 0;
4222 strict = RT6_LOOKUP_F_HAS_SADDR;
4223 }
4224 fl6.flowlabel = params->flowinfo;
4225 fl6.flowi6_scope = 0;
4226 fl6.flowi6_flags = 0;
4227 fl6.mp_hash = 0;
4228
4229 fl6.flowi6_proto = params->l4_protocol;
4230 fl6.daddr = *dst;
4231 fl6.saddr = *src;
4232 fl6.fl6_sport = params->sport;
4233 fl6.fl6_dport = params->dport;
4234
4235 if (flags & BPF_FIB_LOOKUP_DIRECT) {
4236 u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
4237 struct fib6_table *tb;
4238
4239 tb = ipv6_stub->fib6_get_table(net, tbid);
4240 if (unlikely(!tb))
4241 return 0;
4242
4243 f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
4244 } else {
4245 fl6.flowi6_mark = 0;
4246 fl6.flowi6_secid = 0;
4247 fl6.flowi6_tun_key.tun_id = 0;
4248 fl6.flowi6_uid = sock_net_uid(net, NULL);
4249
4250 f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
4251 }
4252
4253 if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
4254 return 0;
4255
4256 if (unlikely(f6i->fib6_flags & RTF_REJECT ||
4257 f6i->fib6_type != RTN_UNICAST))
4258 return 0;
4259
4260 if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
4261 f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
4262 fl6.flowi6_oif, NULL,
4263 strict);
4264
4265 if (check_mtu) {
4266 mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
4267 if (params->tot_len > mtu)
4268 return 0;
4269 }
4270
4271 if (f6i->fib6_nh.nh_lwtstate)
4272 return 0;
4273
4274 if (f6i->fib6_flags & RTF_GATEWAY)
4275 *dst = f6i->fib6_nh.nh_gw;
4276
4277 dev = f6i->fib6_nh.nh_dev;
4278 params->rt_metric = f6i->fib6_metric;
4279
4280 /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
4281 * not needed here. Can not use __ipv6_neigh_lookup_noref here
4282 * because we need to get nd_tbl via the stub
4283 */
4284 neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
4285 ndisc_hashfn, dst, dev);
4286 if (neigh)
4287 return bpf_fib_set_fwd_params(params, neigh, dev);
4288
4289 return 0;
4290}
4291#endif
4292
4293BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
4294 struct bpf_fib_lookup *, params, int, plen, u32, flags)
4295{
4296 if (plen < sizeof(*params))
4297 return -EINVAL;
4298
4299 if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
4300 return -EINVAL;
4301
4302 switch (params->family) {
4303#if IS_ENABLED(CONFIG_INET)
4304 case AF_INET:
4305 return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
4306 flags, true);
4307#endif
4308#if IS_ENABLED(CONFIG_IPV6)
4309 case AF_INET6:
4310 return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
4311 flags, true);
4312#endif
4313 }
4314 return -EAFNOSUPPORT;
4315}
4316
4317static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
4318 .func = bpf_xdp_fib_lookup,
4319 .gpl_only = true,
4320 .ret_type = RET_INTEGER,
4321 .arg1_type = ARG_PTR_TO_CTX,
4322 .arg2_type = ARG_PTR_TO_MEM,
4323 .arg3_type = ARG_CONST_SIZE,
4324 .arg4_type = ARG_ANYTHING,
4325};
4326
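Putting the XDP flavour to use, a hedged sketch loosely following the xdp_fwd sample (not the patch itself): the program fills in a struct bpf_fib_lookup from the packet, and with the code above a positive return value is the nexthop ifindex with dmac/smac already resolved. The header names below ("bpf_helpers.h", "bpf_endian.h") are the usual selftests/libbpf wrappers and are assumed, and the helper is gpl_only:

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include "bpf_helpers.h"
#include "bpf_endian.h"

#ifndef AF_INET
#define AF_INET 2
#endif

SEC("xdp")
int xdp_fwd(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct iphdr *iph  = data + sizeof(*eth);
	struct bpf_fib_lookup fib = {};
	int rc;

	if ((void *)(iph + 1) > data_end || eth->h_proto != bpf_htons(ETH_P_IP))
		return XDP_PASS;

	fib.family	= AF_INET;
	fib.tos		= iph->tos;
	fib.l4_protocol	= iph->protocol;
	fib.tot_len	= bpf_ntohs(iph->tot_len);
	fib.ipv4_src	= iph->saddr;
	fib.ipv4_dst	= iph->daddr;
	fib.ifindex	= ctx->ingress_ifindex;

	rc = bpf_fib_lookup(ctx, &fib, sizeof(fib), 0);
	if (rc <= 0)
		return XDP_PASS;	/* no usable unicast route: punt to the stack */

	/* Rewrite the Ethernet header and bounce out the nexthop device. */
	__builtin_memcpy(eth->h_dest, fib.dmac, ETH_ALEN);
	__builtin_memcpy(eth->h_source, fib.smac, ETH_ALEN);
	return bpf_redirect(fib.ifindex, 0);
}

char _license[] SEC("license") = "GPL";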
4327BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
4328 struct bpf_fib_lookup *, params, int, plen, u32, flags)
4329{
4330 struct net *net = dev_net(skb->dev);
4331 int index = -EAFNOSUPPORT;
4332
4333 if (plen < sizeof(*params))
4334 return -EINVAL;
4335
4336 if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
4337 return -EINVAL;
4338
4339 switch (params->family) {
4340#if IS_ENABLED(CONFIG_INET)
4341 case AF_INET:
4342 index = bpf_ipv4_fib_lookup(net, params, flags, false);
4343 break;
4344#endif
4345#if IS_ENABLED(CONFIG_IPV6)
4346 case AF_INET6:
4347 index = bpf_ipv6_fib_lookup(net, params, flags, false);
4348 break;
4349#endif
4350 }
4351
4352 if (index > 0) {
4353 struct net_device *dev;
4354
4355 dev = dev_get_by_index_rcu(net, index);
4356 if (!is_skb_forwardable(dev, skb))
4357 index = 0;
4358 }
4359
4360 return index;
4361}
4362
4363static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
4364 .func = bpf_skb_fib_lookup,
4365 .gpl_only = true,
4366 .ret_type = RET_INTEGER,
4367 .arg1_type = ARG_PTR_TO_CTX,
4368 .arg2_type = ARG_PTR_TO_MEM,
4369 .arg3_type = ARG_CONST_SIZE,
4370 .arg4_type = ARG_ANYTHING,
4371};
4372
4373#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
4374static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
4375{
4376 int err;
4377 struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr;
4378
4379 if (!seg6_validate_srh(srh, len))
4380 return -EINVAL;
4381
4382 switch (type) {
4383 case BPF_LWT_ENCAP_SEG6_INLINE:
4384 if (skb->protocol != htons(ETH_P_IPV6))
4385 return -EBADMSG;
4386
4387 err = seg6_do_srh_inline(skb, srh);
4388 break;
4389 case BPF_LWT_ENCAP_SEG6:
4390 skb_reset_inner_headers(skb);
4391 skb->encapsulation = 1;
4392 err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6);
4393 break;
4394 default:
4395 return -EINVAL;
4396 }
4397
4398 bpf_compute_data_pointers(skb);
4399 if (err)
4400 return err;
4401
4402 ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
4403 skb_set_transport_header(skb, sizeof(struct ipv6hdr));
4404
4405 return seg6_lookup_nexthop(skb, NULL, 0);
4406}
4407#endif /* CONFIG_IPV6_SEG6_BPF */
4408
4409BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
4410 u32, len)
4411{
4412 switch (type) {
4413#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
4414 case BPF_LWT_ENCAP_SEG6:
4415 case BPF_LWT_ENCAP_SEG6_INLINE:
4416 return bpf_push_seg6_encap(skb, type, hdr, len);
4417#endif
4418 default:
4419 return -EINVAL;
4420 }
4421}
4422
4423static const struct bpf_func_proto bpf_lwt_push_encap_proto = {
4424 .func = bpf_lwt_push_encap,
4425 .gpl_only = false,
4426 .ret_type = RET_INTEGER,
4427 .arg1_type = ARG_PTR_TO_CTX,
4428 .arg2_type = ARG_ANYTHING,
4429 .arg3_type = ARG_PTR_TO_MEM,
4430 .arg4_type = ARG_CONST_SIZE
4431};
4432
4433BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
4434 const void *, from, u32, len)
4435{
4436#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
4437 struct seg6_bpf_srh_state *srh_state =
4438 this_cpu_ptr(&seg6_bpf_srh_states);
4439 void *srh_tlvs, *srh_end, *ptr;
4440 struct ipv6_sr_hdr *srh;
4441 int srhoff = 0;
4442
4443 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
4444 return -EINVAL;
4445
4446 srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
4447 srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
4448 srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);
4449
4450 ptr = skb->data + offset;
4451 if (ptr >= srh_tlvs && ptr + len <= srh_end)
4452 srh_state->valid = 0;
4453 else if (ptr < (void *)&srh->flags ||
4454 ptr + len > (void *)&srh->segments)
4455 return -EFAULT;
4456
4457 if (unlikely(bpf_try_make_writable(skb, offset + len)))
4458 return -EFAULT;
4459
4460 memcpy(skb->data + offset, from, len);
4461 return 0;
4462#else /* CONFIG_IPV6_SEG6_BPF */
4463 return -EOPNOTSUPP;
4464#endif
4465}
4466
4467static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
4468 .func = bpf_lwt_seg6_store_bytes,
4469 .gpl_only = false,
4470 .ret_type = RET_INTEGER,
4471 .arg1_type = ARG_PTR_TO_CTX,
4472 .arg2_type = ARG_ANYTHING,
4473 .arg3_type = ARG_PTR_TO_MEM,
4474 .arg4_type = ARG_CONST_SIZE
4475};
4476
4477BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
4478 u32, action, void *, param, u32, param_len)
4479{
4480#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
4481 struct seg6_bpf_srh_state *srh_state =
4482 this_cpu_ptr(&seg6_bpf_srh_states);
4483 struct ipv6_sr_hdr *srh;
4484 int srhoff = 0;
4485 int err;
4486
4487 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
4488 return -EINVAL;
4489 srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
4490
4491 if (!srh_state->valid) {
4492 if (unlikely((srh_state->hdrlen & 7) != 0))
4493 return -EBADMSG;
4494
4495 srh->hdrlen = (u8)(srh_state->hdrlen >> 3);
4496 if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)))
4497 return -EBADMSG;
4498
4499 srh_state->valid = 1;
4500 }
4501
4502 switch (action) {
4503 case SEG6_LOCAL_ACTION_END_X:
4504 if (param_len != sizeof(struct in6_addr))
4505 return -EINVAL;
4506 return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0);
4507 case SEG6_LOCAL_ACTION_END_T:
4508 if (param_len != sizeof(int))
4509 return -EINVAL;
4510 return seg6_lookup_nexthop(skb, NULL, *(int *)param);
4511 case SEG6_LOCAL_ACTION_END_B6:
4512 err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE,
4513 param, param_len);
4514 if (!err)
4515 srh_state->hdrlen =
4516 ((struct ipv6_sr_hdr *)param)->hdrlen << 3;
4517 return err;
4518 case SEG6_LOCAL_ACTION_END_B6_ENCAP:
4519 err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6,
4520 param, param_len);
4521 if (!err)
4522 srh_state->hdrlen =
4523 ((struct ipv6_sr_hdr *)param)->hdrlen << 3;
4524 return err;
4525 default:
4526 return -EINVAL;
4527 }
4528#else /* CONFIG_IPV6_SEG6_BPF */
4529 return -EOPNOTSUPP;
4530#endif
4531}
4532
4533static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
4534 .func = bpf_lwt_seg6_action,
4535 .gpl_only = false,
4536 .ret_type = RET_INTEGER,
4537 .arg1_type = ARG_PTR_TO_CTX,
4538 .arg2_type = ARG_ANYTHING,
4539 .arg3_type = ARG_PTR_TO_MEM,
4540 .arg4_type = ARG_CONST_SIZE
4541};
4542
4543BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
4544 s32, len)
4545{
4546#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
4547 struct seg6_bpf_srh_state *srh_state =
4548 this_cpu_ptr(&seg6_bpf_srh_states);
4549 void *srh_end, *srh_tlvs, *ptr;
4550 struct ipv6_sr_hdr *srh;
4551 struct ipv6hdr *hdr;
4552 int srhoff = 0;
4553 int ret;
4554
4555 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
4556 return -EINVAL;
4557 srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
4558
4559 srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) +
4560 ((srh->first_segment + 1) << 4));
4561 srh_end = (void *)((unsigned char *)srh + sizeof(*srh) +
4562 srh_state->hdrlen);
4563 ptr = skb->data + offset;
4564
4565 if (unlikely(ptr < srh_tlvs || ptr > srh_end))
4566 return -EFAULT;
4567 if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end))
4568 return -EFAULT;
4569
4570 if (len > 0) {
4571 ret = skb_cow_head(skb, len);
4572 if (unlikely(ret < 0))
4573 return ret;
4574
4575 ret = bpf_skb_net_hdr_push(skb, offset, len);
4576 } else {
4577 ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len);
4578 }
4579
4580 bpf_compute_data_pointers(skb);
4581 if (unlikely(ret < 0))
4582 return ret;
4583
4584 hdr = (struct ipv6hdr *)skb->data;
4585 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
4586
4587 srh_state->hdrlen += len;
4588 srh_state->valid = 0;
4589 return 0;
4590#else /* CONFIG_IPV6_SEG6_BPF */
4591 return -EOPNOTSUPP;
4592#endif
4593}
4594
4595static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
4596 .func = bpf_lwt_seg6_adjust_srh,
4597 .gpl_only = false,
4598 .ret_type = RET_INTEGER,
4599 .arg1_type = ARG_PTR_TO_CTX,
4600 .arg2_type = ARG_ANYTHING,
4601 .arg3_type = ARG_ANYTHING,
4602};
4603
4604bool bpf_helper_changes_pkt_data(void *func)
4605{
4606 if (func == bpf_skb_vlan_push ||
4607 func == bpf_skb_vlan_pop ||
4608 func == bpf_skb_store_bytes ||
4609 func == bpf_skb_change_proto ||
4610 func == bpf_skb_change_head ||
4611 func == bpf_skb_change_tail ||
4612 func == bpf_skb_adjust_room ||
4613 func == bpf_skb_pull_data ||
4614 func == bpf_clone_redirect ||
4615 func == bpf_l3_csum_replace ||
4616 func == bpf_l4_csum_replace ||
4617 func == bpf_xdp_adjust_head ||
4618 func == bpf_xdp_adjust_meta ||
4619 func == bpf_msg_pull_data ||
4620 func == bpf_xdp_adjust_tail ||
4621 func == bpf_lwt_push_encap ||
4622 func == bpf_lwt_seg6_store_bytes ||
4623 func == bpf_lwt_seg6_adjust_srh ||
4624 func == bpf_lwt_seg6_action
4625 )
4626 return true;
4627
4628 return false;
4629}
4630
3714static const struct bpf_func_proto * 4631static const struct bpf_func_proto *
3715bpf_base_func_proto(enum bpf_func_id func_id) 4632bpf_base_func_proto(enum bpf_func_id func_id)
3716{ 4633{
@@ -3781,6 +4698,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
3781 switch (func_id) { 4698 switch (func_id) {
3782 case BPF_FUNC_skb_load_bytes: 4699 case BPF_FUNC_skb_load_bytes:
3783 return &bpf_skb_load_bytes_proto; 4700 return &bpf_skb_load_bytes_proto;
4701 case BPF_FUNC_skb_load_bytes_relative:
4702 return &bpf_skb_load_bytes_relative_proto;
3784 case BPF_FUNC_get_socket_cookie: 4703 case BPF_FUNC_get_socket_cookie:
3785 return &bpf_get_socket_cookie_proto; 4704 return &bpf_get_socket_cookie_proto;
3786 case BPF_FUNC_get_socket_uid: 4705 case BPF_FUNC_get_socket_uid:
@@ -3798,6 +4717,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
3798 return &bpf_skb_store_bytes_proto; 4717 return &bpf_skb_store_bytes_proto;
3799 case BPF_FUNC_skb_load_bytes: 4718 case BPF_FUNC_skb_load_bytes:
3800 return &bpf_skb_load_bytes_proto; 4719 return &bpf_skb_load_bytes_proto;
4720 case BPF_FUNC_skb_load_bytes_relative:
4721 return &bpf_skb_load_bytes_relative_proto;
3801 case BPF_FUNC_skb_pull_data: 4722 case BPF_FUNC_skb_pull_data:
3802 return &bpf_skb_pull_data_proto; 4723 return &bpf_skb_pull_data_proto;
3803 case BPF_FUNC_csum_diff: 4724 case BPF_FUNC_csum_diff:
@@ -3852,6 +4773,16 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
3852 return &bpf_get_socket_cookie_proto; 4773 return &bpf_get_socket_cookie_proto;
3853 case BPF_FUNC_get_socket_uid: 4774 case BPF_FUNC_get_socket_uid:
3854 return &bpf_get_socket_uid_proto; 4775 return &bpf_get_socket_uid_proto;
4776 case BPF_FUNC_fib_lookup:
4777 return &bpf_skb_fib_lookup_proto;
4778#ifdef CONFIG_XFRM
4779 case BPF_FUNC_skb_get_xfrm_state:
4780 return &bpf_skb_get_xfrm_state_proto;
4781#endif
4782#ifdef CONFIG_SOCK_CGROUP_DATA
4783 case BPF_FUNC_skb_cgroup_id:
4784 return &bpf_skb_cgroup_id_proto;
4785#endif
3855 default: 4786 default:
3856 return bpf_base_func_proto(func_id); 4787 return bpf_base_func_proto(func_id);
3857 } 4788 }
@@ -3875,33 +4806,10 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
3875 return &bpf_xdp_redirect_proto; 4806 return &bpf_xdp_redirect_proto;
3876 case BPF_FUNC_redirect_map: 4807 case BPF_FUNC_redirect_map:
3877 return &bpf_xdp_redirect_map_proto; 4808 return &bpf_xdp_redirect_map_proto;
3878 default: 4809 case BPF_FUNC_xdp_adjust_tail:
3879 return bpf_base_func_proto(func_id); 4810 return &bpf_xdp_adjust_tail_proto;
3880 } 4811 case BPF_FUNC_fib_lookup:
3881} 4812 return &bpf_xdp_fib_lookup_proto;
3882
3883static const struct bpf_func_proto *
3884lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
3885{
3886 switch (func_id) {
3887 case BPF_FUNC_skb_load_bytes:
3888 return &bpf_skb_load_bytes_proto;
3889 case BPF_FUNC_skb_pull_data:
3890 return &bpf_skb_pull_data_proto;
3891 case BPF_FUNC_csum_diff:
3892 return &bpf_csum_diff_proto;
3893 case BPF_FUNC_get_cgroup_classid:
3894 return &bpf_get_cgroup_classid_proto;
3895 case BPF_FUNC_get_route_realm:
3896 return &bpf_get_route_realm_proto;
3897 case BPF_FUNC_get_hash_recalc:
3898 return &bpf_get_hash_recalc_proto;
3899 case BPF_FUNC_perf_event_output:
3900 return &bpf_skb_event_output_proto;
3901 case BPF_FUNC_get_smp_processor_id:
3902 return &bpf_get_smp_processor_id_proto;
3903 case BPF_FUNC_skb_under_cgroup:
3904 return &bpf_skb_under_cgroup_proto;
3905 default: 4813 default:
3906 return bpf_base_func_proto(func_id); 4814 return bpf_base_func_proto(func_id);
3907 } 4815 }
@@ -3919,6 +4827,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
3919 return &bpf_sock_ops_cb_flags_set_proto; 4827 return &bpf_sock_ops_cb_flags_set_proto;
3920 case BPF_FUNC_sock_map_update: 4828 case BPF_FUNC_sock_map_update:
3921 return &bpf_sock_map_update_proto; 4829 return &bpf_sock_map_update_proto;
4830 case BPF_FUNC_sock_hash_update:
4831 return &bpf_sock_hash_update_proto;
3922 default: 4832 default:
3923 return bpf_base_func_proto(func_id); 4833 return bpf_base_func_proto(func_id);
3924 } 4834 }
@@ -3930,6 +4840,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
3930 switch (func_id) { 4840 switch (func_id) {
3931 case BPF_FUNC_msg_redirect_map: 4841 case BPF_FUNC_msg_redirect_map:
3932 return &bpf_msg_redirect_map_proto; 4842 return &bpf_msg_redirect_map_proto;
4843 case BPF_FUNC_msg_redirect_hash:
4844 return &bpf_msg_redirect_hash_proto;
3933 case BPF_FUNC_msg_apply_bytes: 4845 case BPF_FUNC_msg_apply_bytes:
3934 return &bpf_msg_apply_bytes_proto; 4846 return &bpf_msg_apply_bytes_proto;
3935 case BPF_FUNC_msg_cork_bytes: 4847 case BPF_FUNC_msg_cork_bytes:
@@ -3961,12 +4873,52 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
3961 return &bpf_get_socket_uid_proto; 4873 return &bpf_get_socket_uid_proto;
3962 case BPF_FUNC_sk_redirect_map: 4874 case BPF_FUNC_sk_redirect_map:
3963 return &bpf_sk_redirect_map_proto; 4875 return &bpf_sk_redirect_map_proto;
4876 case BPF_FUNC_sk_redirect_hash:
4877 return &bpf_sk_redirect_hash_proto;
3964 default: 4878 default:
3965 return bpf_base_func_proto(func_id); 4879 return bpf_base_func_proto(func_id);
3966 } 4880 }
3967} 4881}
3968 4882
3969static const struct bpf_func_proto * 4883static const struct bpf_func_proto *
4884lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
4885{
4886 switch (func_id) {
4887 case BPF_FUNC_skb_load_bytes:
4888 return &bpf_skb_load_bytes_proto;
4889 case BPF_FUNC_skb_pull_data:
4890 return &bpf_skb_pull_data_proto;
4891 case BPF_FUNC_csum_diff:
4892 return &bpf_csum_diff_proto;
4893 case BPF_FUNC_get_cgroup_classid:
4894 return &bpf_get_cgroup_classid_proto;
4895 case BPF_FUNC_get_route_realm:
4896 return &bpf_get_route_realm_proto;
4897 case BPF_FUNC_get_hash_recalc:
4898 return &bpf_get_hash_recalc_proto;
4899 case BPF_FUNC_perf_event_output:
4900 return &bpf_skb_event_output_proto;
4901 case BPF_FUNC_get_smp_processor_id:
4902 return &bpf_get_smp_processor_id_proto;
4903 case BPF_FUNC_skb_under_cgroup:
4904 return &bpf_skb_under_cgroup_proto;
4905 default:
4906 return bpf_base_func_proto(func_id);
4907 }
4908}
4909
4910static const struct bpf_func_proto *
4911lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
4912{
4913 switch (func_id) {
4914 case BPF_FUNC_lwt_push_encap:
4915 return &bpf_lwt_push_encap_proto;
4916 default:
4917 return lwt_out_func_proto(func_id, prog);
4918 }
4919}
4920
4921static const struct bpf_func_proto *
3970lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 4922lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
3971{ 4923{
3972 switch (func_id) { 4924 switch (func_id) {
@@ -3997,7 +4949,22 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
3997 case BPF_FUNC_set_hash_invalid: 4949 case BPF_FUNC_set_hash_invalid:
3998 return &bpf_set_hash_invalid_proto; 4950 return &bpf_set_hash_invalid_proto;
3999 default: 4951 default:
4000 return lwt_inout_func_proto(func_id, prog); 4952 return lwt_out_func_proto(func_id, prog);
4953 }
4954}
4955
4956static const struct bpf_func_proto *
4957lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
4958{
4959 switch (func_id) {
4960 case BPF_FUNC_lwt_seg6_store_bytes:
4961 return &bpf_lwt_seg6_store_bytes_proto;
4962 case BPF_FUNC_lwt_seg6_action:
4963 return &bpf_lwt_seg6_action_proto;
4964 case BPF_FUNC_lwt_seg6_adjust_srh:
4965 return &bpf_lwt_seg6_adjust_srh_proto;
4966 default:
4967 return lwt_out_func_proto(func_id, prog);
4001 } 4968 }
4002} 4969}
4003 4970
@@ -4105,7 +5072,6 @@ static bool lwt_is_valid_access(int off, int size,
4105 return bpf_skb_is_valid_access(off, size, type, prog, info); 5072 return bpf_skb_is_valid_access(off, size, type, prog, info);
4106} 5073}
4107 5074
4108
4109/* Attach type specific accesses */ 5075/* Attach type specific accesses */
4110static bool __sock_filter_check_attach_type(int off, 5076static bool __sock_filter_check_attach_type(int off,
4111 enum bpf_access_type access_type, 5077 enum bpf_access_type access_type,
@@ -4221,6 +5187,41 @@ static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
4221 return insn - insn_buf; 5187 return insn - insn_buf;
4222} 5188}
4223 5189
5190static int bpf_gen_ld_abs(const struct bpf_insn *orig,
5191 struct bpf_insn *insn_buf)
5192{
5193 bool indirect = BPF_MODE(orig->code) == BPF_IND;
5194 struct bpf_insn *insn = insn_buf;
5195
5196 /* We're guaranteed here that CTX is in R6. */
5197 *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);
5198 if (!indirect) {
5199 *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm);
5200 } else {
5201 *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg);
5202 if (orig->imm)
5203 *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm);
5204 }
5205
5206 switch (BPF_SIZE(orig->code)) {
5207 case BPF_B:
5208 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache);
5209 break;
5210 case BPF_H:
5211 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache);
5212 break;
5213 case BPF_W:
5214 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache);
5215 break;
5216 }
5217
5218 *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2);
5219 *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
5220 *insn++ = BPF_EXIT_INSN();
5221
5222 return insn - insn_buf;
5223}
5224
4224static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, 5225static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
4225 const struct bpf_prog *prog) 5226 const struct bpf_prog *prog)
4226{ 5227{
@@ -4279,8 +5280,15 @@ static bool xdp_is_valid_access(int off, int size,
4279 const struct bpf_prog *prog, 5280 const struct bpf_prog *prog,
4280 struct bpf_insn_access_aux *info) 5281 struct bpf_insn_access_aux *info)
4281{ 5282{
4282 if (type == BPF_WRITE) 5283 if (type == BPF_WRITE) {
5284 if (bpf_prog_is_dev_bound(prog->aux)) {
5285 switch (off) {
5286 case offsetof(struct xdp_md, rx_queue_index):
5287 return __is_valid_xdp_access(off, size);
5288 }
5289 }
4283 return false; 5290 return false;
5291 }
4284 5292
4285 switch (off) { 5293 switch (off) {
4286 case offsetof(struct xdp_md, data): 5294 case offsetof(struct xdp_md, data):
@@ -4327,6 +5335,7 @@ static bool sock_addr_is_valid_access(int off, int size,
4327 switch (prog->expected_attach_type) { 5335 switch (prog->expected_attach_type) {
4328 case BPF_CGROUP_INET4_BIND: 5336 case BPF_CGROUP_INET4_BIND:
4329 case BPF_CGROUP_INET4_CONNECT: 5337 case BPF_CGROUP_INET4_CONNECT:
5338 case BPF_CGROUP_UDP4_SENDMSG:
4330 break; 5339 break;
4331 default: 5340 default:
4332 return false; 5341 return false;
@@ -4336,6 +5345,24 @@ static bool sock_addr_is_valid_access(int off, int size,
4336 switch (prog->expected_attach_type) { 5345 switch (prog->expected_attach_type) {
4337 case BPF_CGROUP_INET6_BIND: 5346 case BPF_CGROUP_INET6_BIND:
4338 case BPF_CGROUP_INET6_CONNECT: 5347 case BPF_CGROUP_INET6_CONNECT:
5348 case BPF_CGROUP_UDP6_SENDMSG:
5349 break;
5350 default:
5351 return false;
5352 }
5353 break;
5354 case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
5355 switch (prog->expected_attach_type) {
5356 case BPF_CGROUP_UDP4_SENDMSG:
5357 break;
5358 default:
5359 return false;
5360 }
5361 break;
5362 case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
5363 msg_src_ip6[3]):
5364 switch (prog->expected_attach_type) {
5365 case BPF_CGROUP_UDP6_SENDMSG:
4339 break; 5366 break;
4340 default: 5367 default:
4341 return false; 5368 return false;
@@ -4346,6 +5373,9 @@ static bool sock_addr_is_valid_access(int off, int size,
4346 switch (off) { 5373 switch (off) {
4347 case bpf_ctx_range(struct bpf_sock_addr, user_ip4): 5374 case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
4348 case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): 5375 case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
5376 case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
5377 case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
5378 msg_src_ip6[3]):
4349 /* Only narrow read access allowed for now. */ 5379 /* Only narrow read access allowed for now. */
4350 if (type == BPF_READ) { 5380 if (type == BPF_READ) {
4351 bpf_ctx_record_field_size(info, size_default); 5381 bpf_ctx_record_field_size(info, size_default);
@@ -4465,18 +5495,23 @@ static bool sk_msg_is_valid_access(int off, int size,
4465 switch (off) { 5495 switch (off) {
4466 case offsetof(struct sk_msg_md, data): 5496 case offsetof(struct sk_msg_md, data):
4467 info->reg_type = PTR_TO_PACKET; 5497 info->reg_type = PTR_TO_PACKET;
5498 if (size != sizeof(__u64))
5499 return false;
4468 break; 5500 break;
4469 case offsetof(struct sk_msg_md, data_end): 5501 case offsetof(struct sk_msg_md, data_end):
4470 info->reg_type = PTR_TO_PACKET_END; 5502 info->reg_type = PTR_TO_PACKET_END;
5503 if (size != sizeof(__u64))
5504 return false;
4471 break; 5505 break;
5506 default:
5507 if (size != sizeof(__u32))
5508 return false;
4472 } 5509 }
4473 5510
4474 if (off < 0 || off >= sizeof(struct sk_msg_md)) 5511 if (off < 0 || off >= sizeof(struct sk_msg_md))
4475 return false; 5512 return false;
4476 if (off % size != 0) 5513 if (off % size != 0)
4477 return false; 5514 return false;
4478 if (size != sizeof(__u64))
4479 return false;
4480 5515
4481 return true; 5516 return true;
4482} 5517}
@@ -5095,6 +6130,23 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
5095 *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 6130 *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
5096 SK_FL_PROTO_SHIFT); 6131 SK_FL_PROTO_SHIFT);
5097 break; 6132 break;
6133
6134 case offsetof(struct bpf_sock_addr, msg_src_ip4):
6135 /* Treat t_ctx as struct in_addr for msg_src_ip4. */
6136 SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
6137 struct bpf_sock_addr_kern, struct in_addr, t_ctx,
6138 s_addr, BPF_SIZE(si->code), 0, tmp_reg);
6139 break;
6140
6141 case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
6142 msg_src_ip6[3]):
6143 off = si->off;
6144 off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
6145 /* Treat t_ctx as struct in6_addr for msg_src_ip6. */
6146 SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
6147 struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
6148 s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
6149 break;
5098 } 6150 }
5099 6151
5100 return insn - insn_buf; 6152 return insn - insn_buf;
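A rough illustration of how the msg_src_ip4/msg_src_ip6 context fields and the UDP4_SENDMSG attach type handled above are meant to be used from BPF. The sketch below is not part of the patch; the section name and helper headers follow the usual selftests conventions, and the rewritten address is purely illustrative.

/* Minimal sketch of a BPF_CGROUP_UDP4_SENDMSG program (illustrative only). */
#include <linux/bpf.h>
#include "bpf_helpers.h"	/* assumed: provides SEC() */
#include "bpf_endian.h"		/* assumed: provides bpf_htonl() */

SEC("cgroup/sendmsg4")
int pin_src_addr(struct bpf_sock_addr *ctx)
{
	/* Reads of msg_src_ip4 are narrow-only for now and writes must be
	 * full width, as enforced in sock_addr_is_valid_access() above.
	 */
	ctx->msg_src_ip4 = bpf_htonl(0x0a000001);	/* 10.0.0.1, illustrative */
	return 1;	/* let the sendmsg() proceed */
}

char _license[] SEC("license") = "GPL";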
@@ -5152,7 +6204,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
5152 break; 6204 break;
5153 6205
5154 case offsetof(struct bpf_sock_ops, local_ip4): 6206 case offsetof(struct bpf_sock_ops, local_ip4):
5155 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4); 6207 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
6208 skc_rcv_saddr) != 4);
5156 6209
5157 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 6210 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
5158 struct bpf_sock_ops_kern, sk), 6211 struct bpf_sock_ops_kern, sk),
@@ -5469,6 +6522,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
5469 struct bpf_prog *prog, u32 *target_size) 6522 struct bpf_prog *prog, u32 *target_size)
5470{ 6523{
5471 struct bpf_insn *insn = insn_buf; 6524 struct bpf_insn *insn = insn_buf;
6525#if IS_ENABLED(CONFIG_IPV6)
6526 int off;
6527#endif
5472 6528
5473 switch (si->off) { 6529 switch (si->off) {
5474 case offsetof(struct sk_msg_md, data): 6530 case offsetof(struct sk_msg_md, data):
@@ -5481,6 +6537,107 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
5481 si->dst_reg, si->src_reg, 6537 si->dst_reg, si->src_reg,
5482 offsetof(struct sk_msg_buff, data_end)); 6538 offsetof(struct sk_msg_buff, data_end));
5483 break; 6539 break;
6540 case offsetof(struct sk_msg_md, family):
6541 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
6542
6543 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
6544 struct sk_msg_buff, sk),
6545 si->dst_reg, si->src_reg,
6546 offsetof(struct sk_msg_buff, sk));
6547 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
6548 offsetof(struct sock_common, skc_family));
6549 break;
6550
6551 case offsetof(struct sk_msg_md, remote_ip4):
6552 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
6553
6554 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
6555 struct sk_msg_buff, sk),
6556 si->dst_reg, si->src_reg,
6557 offsetof(struct sk_msg_buff, sk));
6558 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
6559 offsetof(struct sock_common, skc_daddr));
6560 break;
6561
6562 case offsetof(struct sk_msg_md, local_ip4):
6563 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
6564 skc_rcv_saddr) != 4);
6565
6566 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
6567 struct sk_msg_buff, sk),
6568 si->dst_reg, si->src_reg,
6569 offsetof(struct sk_msg_buff, sk));
6570 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
6571 offsetof(struct sock_common,
6572 skc_rcv_saddr));
6573 break;
6574
6575 case offsetof(struct sk_msg_md, remote_ip6[0]) ...
6576 offsetof(struct sk_msg_md, remote_ip6[3]):
6577#if IS_ENABLED(CONFIG_IPV6)
6578 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
6579 skc_v6_daddr.s6_addr32[0]) != 4);
6580
6581 off = si->off;
6582 off -= offsetof(struct sk_msg_md, remote_ip6[0]);
6583 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
6584 struct sk_msg_buff, sk),
6585 si->dst_reg, si->src_reg,
6586 offsetof(struct sk_msg_buff, sk));
6587 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
6588 offsetof(struct sock_common,
6589 skc_v6_daddr.s6_addr32[0]) +
6590 off);
6591#else
6592 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
6593#endif
6594 break;
6595
6596 case offsetof(struct sk_msg_md, local_ip6[0]) ...
6597 offsetof(struct sk_msg_md, local_ip6[3]):
6598#if IS_ENABLED(CONFIG_IPV6)
6599 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
6600 skc_v6_rcv_saddr.s6_addr32[0]) != 4);
6601
6602 off = si->off;
6603 off -= offsetof(struct sk_msg_md, local_ip6[0]);
6604 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
6605 struct sk_msg_buff, sk),
6606 si->dst_reg, si->src_reg,
6607 offsetof(struct sk_msg_buff, sk));
6608 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
6609 offsetof(struct sock_common,
6610 skc_v6_rcv_saddr.s6_addr32[0]) +
6611 off);
6612#else
6613 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
6614#endif
6615 break;
6616
6617 case offsetof(struct sk_msg_md, remote_port):
6618 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
6619
6620 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
6621 struct sk_msg_buff, sk),
6622 si->dst_reg, si->src_reg,
6623 offsetof(struct sk_msg_buff, sk));
6624 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
6625 offsetof(struct sock_common, skc_dport));
6626#ifndef __BIG_ENDIAN_BITFIELD
6627 *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
6628#endif
6629 break;
6630
6631 case offsetof(struct sk_msg_md, local_port):
6632 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
6633
6634 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
6635 struct sk_msg_buff, sk),
6636 si->dst_reg, si->src_reg,
6637 offsetof(struct sk_msg_buff, sk));
6638 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
6639 offsetof(struct sock_common, skc_num));
6640 break;
5484 } 6641 }
5485 6642
5486 return insn - insn_buf; 6643 return insn - insn_buf;
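The sk_msg_md metadata fields converted above (family, addresses, ports) become readable from SK_MSG programs. As a loose sketch under the same selftests-style conventions, a policy program could key on them as follows; the program name and the AF_INET literal are illustrative.

/* Minimal SK_MSG sketch reading the new socket metadata (illustrative only). */
#include <linux/bpf.h>
#include "bpf_helpers.h"	/* assumed: provides SEC() */

SEC("sk_msg")
int msg_policy(struct sk_msg_md *msg)
{
	/* family is loaded from sock_common via the ctx conversion above;
	 * 2 == AF_INET.
	 */
	if (msg->family != 2)
		return SK_DROP;
	return SK_PASS;
}

char _license[] SEC("license") = "GPL";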
@@ -5490,6 +6647,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = {
5490 .get_func_proto = sk_filter_func_proto, 6647 .get_func_proto = sk_filter_func_proto,
5491 .is_valid_access = sk_filter_is_valid_access, 6648 .is_valid_access = sk_filter_is_valid_access,
5492 .convert_ctx_access = bpf_convert_ctx_access, 6649 .convert_ctx_access = bpf_convert_ctx_access,
6650 .gen_ld_abs = bpf_gen_ld_abs,
5493}; 6651};
5494 6652
5495const struct bpf_prog_ops sk_filter_prog_ops = { 6653const struct bpf_prog_ops sk_filter_prog_ops = {
@@ -5501,6 +6659,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
5501 .is_valid_access = tc_cls_act_is_valid_access, 6659 .is_valid_access = tc_cls_act_is_valid_access,
5502 .convert_ctx_access = tc_cls_act_convert_ctx_access, 6660 .convert_ctx_access = tc_cls_act_convert_ctx_access,
5503 .gen_prologue = tc_cls_act_prologue, 6661 .gen_prologue = tc_cls_act_prologue,
6662 .gen_ld_abs = bpf_gen_ld_abs,
5504}; 6663};
5505 6664
5506const struct bpf_prog_ops tc_cls_act_prog_ops = { 6665const struct bpf_prog_ops tc_cls_act_prog_ops = {
@@ -5527,13 +6686,23 @@ const struct bpf_prog_ops cg_skb_prog_ops = {
5527 .test_run = bpf_prog_test_run_skb, 6686 .test_run = bpf_prog_test_run_skb,
5528}; 6687};
5529 6688
5530const struct bpf_verifier_ops lwt_inout_verifier_ops = { 6689const struct bpf_verifier_ops lwt_in_verifier_ops = {
5531 .get_func_proto = lwt_inout_func_proto, 6690 .get_func_proto = lwt_in_func_proto,
5532 .is_valid_access = lwt_is_valid_access, 6691 .is_valid_access = lwt_is_valid_access,
5533 .convert_ctx_access = bpf_convert_ctx_access, 6692 .convert_ctx_access = bpf_convert_ctx_access,
5534}; 6693};
5535 6694
5536const struct bpf_prog_ops lwt_inout_prog_ops = { 6695const struct bpf_prog_ops lwt_in_prog_ops = {
6696 .test_run = bpf_prog_test_run_skb,
6697};
6698
6699const struct bpf_verifier_ops lwt_out_verifier_ops = {
6700 .get_func_proto = lwt_out_func_proto,
6701 .is_valid_access = lwt_is_valid_access,
6702 .convert_ctx_access = bpf_convert_ctx_access,
6703};
6704
6705const struct bpf_prog_ops lwt_out_prog_ops = {
5537 .test_run = bpf_prog_test_run_skb, 6706 .test_run = bpf_prog_test_run_skb,
5538}; 6707};
5539 6708
@@ -5548,6 +6717,16 @@ const struct bpf_prog_ops lwt_xmit_prog_ops = {
5548 .test_run = bpf_prog_test_run_skb, 6717 .test_run = bpf_prog_test_run_skb,
5549}; 6718};
5550 6719
6720const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
6721 .get_func_proto = lwt_seg6local_func_proto,
6722 .is_valid_access = lwt_is_valid_access,
6723 .convert_ctx_access = bpf_convert_ctx_access,
6724};
6725
6726const struct bpf_prog_ops lwt_seg6local_prog_ops = {
6727 .test_run = bpf_prog_test_run_skb,
6728};
6729
5551const struct bpf_verifier_ops cg_sock_verifier_ops = { 6730const struct bpf_verifier_ops cg_sock_verifier_ops = {
5552 .get_func_proto = sock_filter_func_proto, 6731 .get_func_proto = sock_filter_func_proto,
5553 .is_valid_access = sock_filter_is_valid_access, 6732 .is_valid_access = sock_filter_is_valid_access,
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index d29f09bc5ff9..53f96e4f7bf5 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1253,7 +1253,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
1253EXPORT_SYMBOL(skb_get_hash_perturb); 1253EXPORT_SYMBOL(skb_get_hash_perturb);
1254 1254
1255u32 __skb_get_poff(const struct sk_buff *skb, void *data, 1255u32 __skb_get_poff(const struct sk_buff *skb, void *data,
1256 const struct flow_keys *keys, int hlen) 1256 const struct flow_keys_basic *keys, int hlen)
1257{ 1257{
1258 u32 poff = keys->control.thoff; 1258 u32 poff = keys->control.thoff;
1259 1259
@@ -1314,9 +1314,9 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data,
1314 */ 1314 */
1315u32 skb_get_poff(const struct sk_buff *skb) 1315u32 skb_get_poff(const struct sk_buff *skb)
1316{ 1316{
1317 struct flow_keys keys; 1317 struct flow_keys_basic keys;
1318 1318
1319 if (!skb_flow_dissect_flow_keys(skb, &keys, 0)) 1319 if (!skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0))
1320 return 0; 1320 return 0;
1321 1321
1322 return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); 1322 return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
@@ -1334,7 +1334,7 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
1334 keys->ports.src = fl6->fl6_sport; 1334 keys->ports.src = fl6->fl6_sport;
1335 keys->ports.dst = fl6->fl6_dport; 1335 keys->ports.dst = fl6->fl6_dport;
1336 keys->keyid.keyid = fl6->fl6_gre_key; 1336 keys->keyid.keyid = fl6->fl6_gre_key;
1337 keys->tags.flow_label = (__force u32)fl6->flowlabel; 1337 keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
1338 keys->basic.ip_proto = fl6->flowi6_proto; 1338 keys->basic.ip_proto = fl6->flowi6_proto;
1339 1339
1340 return flow_hash_from_keys(keys); 1340 return flow_hash_from_keys(keys);
@@ -1403,7 +1403,7 @@ static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = {
1403 }, 1403 },
1404}; 1404};
1405 1405
1406static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { 1406static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = {
1407 { 1407 {
1408 .key_id = FLOW_DISSECTOR_KEY_CONTROL, 1408 .key_id = FLOW_DISSECTOR_KEY_CONTROL,
1409 .offset = offsetof(struct flow_keys, control), 1409 .offset = offsetof(struct flow_keys, control),
@@ -1417,7 +1417,8 @@ static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
1417struct flow_dissector flow_keys_dissector __read_mostly; 1417struct flow_dissector flow_keys_dissector __read_mostly;
1418EXPORT_SYMBOL(flow_keys_dissector); 1418EXPORT_SYMBOL(flow_keys_dissector);
1419 1419
1420struct flow_dissector flow_keys_buf_dissector __read_mostly; 1420struct flow_dissector flow_keys_basic_dissector __read_mostly;
1421EXPORT_SYMBOL(flow_keys_basic_dissector);
1421 1422
1422static int __init init_default_flow_dissectors(void) 1423static int __init init_default_flow_dissectors(void)
1423{ 1424{
@@ -1427,9 +1428,9 @@ static int __init init_default_flow_dissectors(void)
1427 skb_flow_dissector_init(&flow_keys_dissector_symmetric, 1428 skb_flow_dissector_init(&flow_keys_dissector_symmetric,
1428 flow_keys_dissector_symmetric_keys, 1429 flow_keys_dissector_symmetric_keys,
1429 ARRAY_SIZE(flow_keys_dissector_symmetric_keys)); 1430 ARRAY_SIZE(flow_keys_dissector_symmetric_keys));
1430 skb_flow_dissector_init(&flow_keys_buf_dissector, 1431 skb_flow_dissector_init(&flow_keys_basic_dissector,
1431 flow_keys_buf_dissector_keys, 1432 flow_keys_basic_dissector_keys,
1432 ARRAY_SIZE(flow_keys_buf_dissector_keys)); 1433 ARRAY_SIZE(flow_keys_basic_dissector_keys));
1433 return 0; 1434 return 0;
1434} 1435}
1435 1436
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 1fb43bff417d..a7a9c3d738ba 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -820,7 +820,8 @@ static void neigh_periodic_work(struct work_struct *work)
820 write_lock(&n->lock); 820 write_lock(&n->lock);
821 821
822 state = n->nud_state; 822 state = n->nud_state;
823 if (state & (NUD_PERMANENT | NUD_IN_TIMER)) { 823 if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) ||
824 (n->flags & NTF_EXT_LEARNED)) {
824 write_unlock(&n->lock); 825 write_unlock(&n->lock);
825 goto next_elt; 826 goto next_elt;
826 } 827 }
@@ -1136,6 +1137,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
1136 if (neigh->dead) 1137 if (neigh->dead)
1137 goto out; 1138 goto out;
1138 1139
1140 neigh_update_ext_learned(neigh, flags, &notify);
1141
1139 if (!(new & NUD_VALID)) { 1142 if (!(new & NUD_VALID)) {
1140 neigh_del_timer(neigh); 1143 neigh_del_timer(neigh);
1141 if (old & NUD_CONNECTED) 1144 if (old & NUD_CONNECTED)
@@ -1781,6 +1784,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
1781 flags &= ~NEIGH_UPDATE_F_OVERRIDE; 1784 flags &= ~NEIGH_UPDATE_F_OVERRIDE;
1782 } 1785 }
1783 1786
1787 if (ndm->ndm_flags & NTF_EXT_LEARNED)
1788 flags |= NEIGH_UPDATE_F_EXT_LEARNED;
1789
1784 if (ndm->ndm_flags & NTF_USE) { 1790 if (ndm->ndm_flags & NTF_USE) {
1785 neigh_event_send(neigh, NULL); 1791 neigh_event_send(neigh, NULL);
1786 err = 0; 1792 err = 0;
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 380934580fa1..419af6dfe29f 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -35,10 +35,6 @@
35#include <trace/events/tcp.h> 35#include <trace/events/tcp.h>
36#include <trace/events/fib.h> 36#include <trace/events/fib.h>
37#include <trace/events/qdisc.h> 37#include <trace/events/qdisc.h>
38#if IS_ENABLED(CONFIG_IPV6)
39#include <trace/events/fib6.h>
40EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
41#endif
42#if IS_ENABLED(CONFIG_BRIDGE) 38#if IS_ENABLED(CONFIG_BRIDGE)
43#include <trace/events/bridge.h> 39#include <trace/events/bridge.h>
44EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add); 40EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
new file mode 100644
index 000000000000..68bf07206744
--- /dev/null
+++ b/net/core/page_pool.c
@@ -0,0 +1,317 @@
1/* SPDX-License-Identifier: GPL-2.0
2 *
3 * page_pool.c
4 * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
5 * Copyright (C) 2016 Red Hat, Inc.
6 */
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/slab.h>
10
11#include <net/page_pool.h>
12#include <linux/dma-direction.h>
13#include <linux/dma-mapping.h>
14#include <linux/page-flags.h>
15#include <linux/mm.h> /* for __put_page() */
16
17static int page_pool_init(struct page_pool *pool,
18 const struct page_pool_params *params)
19{
20 unsigned int ring_qsize = 1024; /* Default */
21
22 memcpy(&pool->p, params, sizeof(pool->p));
23
24 /* Validate only known flags were used */
25 if (pool->p.flags & ~(PP_FLAG_ALL))
26 return -EINVAL;
27
28 if (pool->p.pool_size)
29 ring_qsize = pool->p.pool_size;
30
31 /* Sanity limit mem that can be pinned down */
32 if (ring_qsize > 32768)
33 return -E2BIG;
34
35 /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
 36 * DMA_BIDIRECTIONAL allows the page to also be used for DMA sending,
37 * which is the XDP_TX use-case.
38 */
39 if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
40 (pool->p.dma_dir != DMA_BIDIRECTIONAL))
41 return -EINVAL;
42
43 if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
44 return -ENOMEM;
45
46 return 0;
47}
48
49struct page_pool *page_pool_create(const struct page_pool_params *params)
50{
51 struct page_pool *pool;
52 int err = 0;
53
54 pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
55 if (!pool)
56 return ERR_PTR(-ENOMEM);
57
58 err = page_pool_init(pool, params);
59 if (err < 0) {
60 pr_warn("%s() gave up with errno %d\n", __func__, err);
61 kfree(pool);
62 return ERR_PTR(err);
63 }
64 return pool;
65}
66EXPORT_SYMBOL(page_pool_create);
67
68/* fast path */
69static struct page *__page_pool_get_cached(struct page_pool *pool)
70{
71 struct ptr_ring *r = &pool->ring;
72 struct page *page;
73
74 /* Quicker fallback, avoid locks when ring is empty */
75 if (__ptr_ring_empty(r))
76 return NULL;
77
78 /* Test for safe-context, caller should provide this guarantee */
79 if (likely(in_serving_softirq())) {
80 if (likely(pool->alloc.count)) {
81 /* Fast-path */
82 page = pool->alloc.cache[--pool->alloc.count];
83 return page;
84 }
85 /* Slower-path: Alloc array empty, time to refill
86 *
87 * Open-coded bulk ptr_ring consumer.
88 *
89 * Discussion: the ring consumer lock is not really
90 * needed due to the softirq/NAPI protection, but
 91 * we will later need the ability to reclaim pages on
 92 * the ring. Thus, the locks are kept.
93 */
94 spin_lock(&r->consumer_lock);
95 while ((page = __ptr_ring_consume(r))) {
96 if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
97 break;
98 pool->alloc.cache[pool->alloc.count++] = page;
99 }
100 spin_unlock(&r->consumer_lock);
101 return page;
102 }
103
104 /* Slow-path: Get page from locked ring queue */
105 page = ptr_ring_consume(&pool->ring);
106 return page;
107}
108
109/* slow path */
110noinline
111static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
112 gfp_t _gfp)
113{
114 struct page *page;
115 gfp_t gfp = _gfp;
116 dma_addr_t dma;
117
118 /* We could always set __GFP_COMP, and avoid this branch, as
119 * prep_new_page() can handle order-0 with __GFP_COMP.
120 */
121 if (pool->p.order)
122 gfp |= __GFP_COMP;
123
124 /* FUTURE development:
125 *
126 * Current slow-path essentially falls back to single page
127 * allocations, which doesn't improve performance. This code
 128 * needs bulk allocation support from the page allocator code.
129 */
130
131 /* Cache was empty, do real allocation */
132 page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
133 if (!page)
134 return NULL;
135
136 if (!(pool->p.flags & PP_FLAG_DMA_MAP))
137 goto skip_dma_map;
138
139 /* Setup DMA mapping: use page->private for DMA-addr
140 * This mapping is kept for lifetime of page, until leaving pool.
141 */
142 dma = dma_map_page(pool->p.dev, page, 0,
143 (PAGE_SIZE << pool->p.order),
144 pool->p.dma_dir);
145 if (dma_mapping_error(pool->p.dev, dma)) {
146 put_page(page);
147 return NULL;
148 }
149 set_page_private(page, dma); /* page->private = dma; */
150
151skip_dma_map:
 152 /* When a page has just been alloc'ed, it should/must have refcnt 1. */
153 return page;
154}
155
 156/* Use page_pool to replace alloc_pages() API calls, while providing
 157 * a synchronization guarantee for the allocation side.
158 */
159struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
160{
161 struct page *page;
162
163 /* Fast-path: Get a page from cache */
164 page = __page_pool_get_cached(pool);
165 if (page)
166 return page;
167
168 /* Slow-path: cache empty, do real allocation */
169 page = __page_pool_alloc_pages_slow(pool, gfp);
170 return page;
171}
172EXPORT_SYMBOL(page_pool_alloc_pages);
173
174/* Cleanup page_pool state from page */
175static void __page_pool_clean_page(struct page_pool *pool,
176 struct page *page)
177{
178 if (!(pool->p.flags & PP_FLAG_DMA_MAP))
179 return;
180
181 /* DMA unmap */
182 dma_unmap_page(pool->p.dev, page_private(page),
183 PAGE_SIZE << pool->p.order, pool->p.dma_dir);
184 set_page_private(page, 0);
185}
186
187/* Return a page to the page allocator, cleaning up our state */
188static void __page_pool_return_page(struct page_pool *pool, struct page *page)
189{
190 __page_pool_clean_page(pool, page);
191 put_page(page);
192 /* An optimization would be to call __free_pages(page, pool->p.order)
193 * knowing page is not part of page-cache (thus avoiding a
194 * __page_cache_release() call).
195 */
196}
197
198static bool __page_pool_recycle_into_ring(struct page_pool *pool,
199 struct page *page)
200{
201 int ret;
202 /* BH protection not needed if current is serving softirq */
203 if (in_serving_softirq())
204 ret = ptr_ring_produce(&pool->ring, page);
205 else
206 ret = ptr_ring_produce_bh(&pool->ring, page);
207
208 return (ret == 0) ? true : false;
209}
210
211/* Only allow direct recycling in special circumstances, into the
212 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
213 *
214 * Caller must provide appropriate safe context.
215 */
216static bool __page_pool_recycle_direct(struct page *page,
217 struct page_pool *pool)
218{
219 if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
220 return false;
221
222 /* Caller MUST have verified/know (page_ref_count(page) == 1) */
223 pool->alloc.cache[pool->alloc.count++] = page;
224 return true;
225}
226
227void __page_pool_put_page(struct page_pool *pool,
228 struct page *page, bool allow_direct)
229{
230 /* This allocator is optimized for the XDP mode that uses
 231 * one-frame-per-page, but has fallbacks that act like the
232 * regular page allocator APIs.
233 *
234 * refcnt == 1 means page_pool owns page, and can recycle it.
235 */
236 if (likely(page_ref_count(page) == 1)) {
237 /* Read barrier done in page_ref_count / READ_ONCE */
238
239 if (allow_direct && in_serving_softirq())
240 if (__page_pool_recycle_direct(page, pool))
241 return;
242
243 if (!__page_pool_recycle_into_ring(pool, page)) {
244 /* Cache full, fallback to free pages */
245 __page_pool_return_page(pool, page);
246 }
247 return;
248 }
 249 /* Fallback/non-XDP mode: the API user has an elevated refcnt.
250 *
251 * Many drivers split up the page into fragments, and some
252 * want to keep doing this to save memory and do refcnt based
253 * recycling. Support this use case too, to ease drivers
254 * switching between XDP/non-XDP.
255 *
 256 * In case page_pool maintains the DMA mapping, the API user must
 257 * call page_pool_put_page() once. In this elevated refcnt
 258 * case, the DMA is unmapped/released, as the driver is likely
259 * doing refcnt based recycle tricks, meaning another process
260 * will be invoking put_page.
261 */
262 __page_pool_clean_page(pool, page);
263 put_page(page);
264}
265EXPORT_SYMBOL(__page_pool_put_page);
266
267static void __page_pool_empty_ring(struct page_pool *pool)
268{
269 struct page *page;
270
271 /* Empty recycle ring */
272 while ((page = ptr_ring_consume(&pool->ring))) {
273 /* Verify the refcnt invariant of cached pages */
274 if (!(page_ref_count(page) == 1))
275 pr_crit("%s() page_pool refcnt %d violation\n",
276 __func__, page_ref_count(page));
277
278 __page_pool_return_page(pool, page);
279 }
280}
281
282static void __page_pool_destroy_rcu(struct rcu_head *rcu)
283{
284 struct page_pool *pool;
285
286 pool = container_of(rcu, struct page_pool, rcu);
287
288 WARN(pool->alloc.count, "API usage violation");
289
290 __page_pool_empty_ring(pool);
291 ptr_ring_cleanup(&pool->ring, NULL);
292 kfree(pool);
293}
294
295/* Cleanup and release resources */
296void page_pool_destroy(struct page_pool *pool)
297{
298 struct page *page;
299
300 /* Empty alloc cache, assume caller made sure this is
 301 * no longer in use, and page_pool_alloc_pages() cannot be
 302 * called concurrently.
303 */
304 while (pool->alloc.count) {
305 page = pool->alloc.cache[--pool->alloc.count];
306 __page_pool_return_page(pool, page);
307 }
308
309 /* No more consumers should exist, but producers could still
310 * be in-flight.
311 */
312 __page_pool_empty_ring(pool);
313
314 /* An xdp_mem_allocator can still ref page_pool pointer */
315 call_rcu(&pool->rcu, __page_pool_destroy_rcu);
316}
317EXPORT_SYMBOL(page_pool_destroy);
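A driver-side sketch of how the API above fits together; the rx-queue structure, ring size and error handling are illustrative and not taken from this patch.

#include <net/page_pool.h>

struct my_rx_queue {			/* hypothetical driver state */
	struct page_pool *page_pool;
};

static int my_rq_create_pool(struct my_rx_queue *rq, struct device *dev)
{
	struct page_pool_params pp = {
		.order		= 0,
		.flags		= PP_FLAG_DMA_MAP,	/* pool keeps the DMA mapping */
		.pool_size	= 256,			/* illustrative ptr_ring size */
		.nid		= NUMA_NO_NODE,
		.dev		= dev,
		.dma_dir	= DMA_FROM_DEVICE,
	};

	rq->page_pool = page_pool_create(&pp);
	if (IS_ERR(rq->page_pool))
		return PTR_ERR(rq->page_pool);
	return 0;
}

static struct page *my_rq_refill(struct my_rx_queue *rq)
{
	/* RX refill from NAPI context; the DMA address sits in page_private().
	 * On XDP_DROP the page can go straight back via
	 * page_pool_put_page(rq->page_pool, page, true).
	 */
	return page_pool_alloc_pages(rq->page_pool, GFP_ATOMIC);
}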
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 45936922d7e2..5ef61222fdef 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -59,6 +59,9 @@
59#include <net/rtnetlink.h> 59#include <net/rtnetlink.h>
60#include <net/net_namespace.h> 60#include <net/net_namespace.h>
61 61
62#define RTNL_MAX_TYPE 48
63#define RTNL_SLAVE_MAX_TYPE 36
64
62struct rtnl_link { 65struct rtnl_link {
63 rtnl_doit_func doit; 66 rtnl_doit_func doit;
64 rtnl_dumpit_func dumpit; 67 rtnl_dumpit_func dumpit;
@@ -389,6 +392,11 @@ int rtnl_link_register(struct rtnl_link_ops *ops)
389{ 392{
390 int err; 393 int err;
391 394
395 /* Sanity-check max sizes to avoid stack buffer overflow. */
396 if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE ||
397 ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE))
398 return -EINVAL;
399
392 rtnl_lock(); 400 rtnl_lock();
393 err = __rtnl_link_register(ops); 401 err = __rtnl_link_register(ops);
394 rtnl_unlock(); 402 rtnl_unlock();
@@ -785,13 +793,15 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
785 long expires, u32 error) 793 long expires, u32 error)
786{ 794{
787 struct rta_cacheinfo ci = { 795 struct rta_cacheinfo ci = {
788 .rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse),
789 .rta_used = dst->__use,
790 .rta_clntref = atomic_read(&(dst->__refcnt)),
791 .rta_error = error, 796 .rta_error = error,
792 .rta_id = id, 797 .rta_id = id,
793 }; 798 };
794 799
800 if (dst) {
801 ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse);
802 ci.rta_used = dst->__use;
803 ci.rta_clntref = atomic_read(&dst->__refcnt);
804 }
795 if (expires) { 805 if (expires) {
796 unsigned long clock; 806 unsigned long clock;
797 807
@@ -2256,6 +2266,10 @@ static int do_setlink(const struct sk_buff *skb,
2256 const struct net_device_ops *ops = dev->netdev_ops; 2266 const struct net_device_ops *ops = dev->netdev_ops;
2257 int err; 2267 int err;
2258 2268
2269 err = validate_linkmsg(dev, tb);
2270 if (err < 0)
2271 return err;
2272
2259 if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) { 2273 if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) {
2260 struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev), 2274 struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev),
2261 tb, CAP_NET_ADMIN); 2275 tb, CAP_NET_ADMIN);
@@ -2619,10 +2633,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
2619 goto errout; 2633 goto errout;
2620 } 2634 }
2621 2635
2622 err = validate_linkmsg(dev, tb);
2623 if (err < 0)
2624 goto errout;
2625
2626 err = do_setlink(skb, dev, ifm, extack, tb, ifname, 0); 2636 err = do_setlink(skb, dev, ifm, extack, tb, ifname, 0);
2627errout: 2637errout:
2628 return err; 2638 return err;
@@ -2900,13 +2910,16 @@ replay:
2900 } 2910 }
2901 2911
2902 if (1) { 2912 if (1) {
2903 struct nlattr *attr[ops ? ops->maxtype + 1 : 1]; 2913 struct nlattr *attr[RTNL_MAX_TYPE + 1];
2904 struct nlattr *slave_attr[m_ops ? m_ops->slave_maxtype + 1 : 1]; 2914 struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
2905 struct nlattr **data = NULL; 2915 struct nlattr **data = NULL;
2906 struct nlattr **slave_data = NULL; 2916 struct nlattr **slave_data = NULL;
2907 struct net *dest_net, *link_net = NULL; 2917 struct net *dest_net, *link_net = NULL;
2908 2918
2909 if (ops) { 2919 if (ops) {
2920 if (ops->maxtype > RTNL_MAX_TYPE)
2921 return -EINVAL;
2922
2910 if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { 2923 if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
2911 err = nla_parse_nested(attr, ops->maxtype, 2924 err = nla_parse_nested(attr, ops->maxtype,
2912 linkinfo[IFLA_INFO_DATA], 2925 linkinfo[IFLA_INFO_DATA],
@@ -2923,6 +2936,9 @@ replay:
2923 } 2936 }
2924 2937
2925 if (m_ops) { 2938 if (m_ops) {
2939 if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)
2940 return -EINVAL;
2941
2926 if (m_ops->slave_maxtype && 2942 if (m_ops->slave_maxtype &&
2927 linkinfo[IFLA_INFO_SLAVE_DATA]) { 2943 linkinfo[IFLA_INFO_SLAVE_DATA]) {
2928 err = nla_parse_nested(slave_attr, 2944 err = nla_parse_nested(slave_attr,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 345b51837ca8..c642304f178c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1305,7 +1305,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
1305 skb->inner_mac_header += off; 1305 skb->inner_mac_header += off;
1306} 1306}
1307 1307
1308static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) 1308void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
1309{ 1309{
1310 __copy_skb_header(new, old); 1310 __copy_skb_header(new, old);
1311 1311
@@ -1313,6 +1313,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
1313 skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; 1313 skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
1314 skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; 1314 skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
1315} 1315}
1316EXPORT_SYMBOL(skb_copy_header);
1316 1317
1317static inline int skb_alloc_rx_flag(const struct sk_buff *skb) 1318static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
1318{ 1319{
@@ -1355,7 +1356,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
1355 1356
1356 BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); 1357 BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
1357 1358
1358 copy_skb_header(n, skb); 1359 skb_copy_header(n, skb);
1359 return n; 1360 return n;
1360} 1361}
1361EXPORT_SYMBOL(skb_copy); 1362EXPORT_SYMBOL(skb_copy);
@@ -1419,7 +1420,7 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
1419 skb_clone_fraglist(n); 1420 skb_clone_fraglist(n);
1420 } 1421 }
1421 1422
1422 copy_skb_header(n, skb); 1423 skb_copy_header(n, skb);
1423out: 1424out:
1424 return n; 1425 return n;
1425} 1426}
@@ -1599,7 +1600,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
1599 BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, 1600 BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
1600 skb->len + head_copy_len)); 1601 skb->len + head_copy_len));
1601 1602
1602 copy_skb_header(n, skb); 1603 skb_copy_header(n, skb);
1603 1604
1604 skb_headers_offset_update(n, newheadroom - oldheadroom); 1605 skb_headers_offset_update(n, newheadroom - oldheadroom);
1605 1606
@@ -1839,6 +1840,20 @@ done:
1839} 1840}
1840EXPORT_SYMBOL(___pskb_trim); 1841EXPORT_SYMBOL(___pskb_trim);
1841 1842
 1843/* Note: use pskb_trim_rcsum() instead of calling this directly
1844 */
1845int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
1846{
1847 if (skb->ip_summed == CHECKSUM_COMPLETE) {
1848 int delta = skb->len - len;
1849
1850 skb->csum = csum_sub(skb->csum,
1851 skb_checksum(skb, len, delta, 0));
1852 }
1853 return __pskb_trim(skb, len);
1854}
1855EXPORT_SYMBOL(pskb_trim_rcsum_slow);
1856
1842/** 1857/**
1843 * __pskb_pull_tail - advance tail of skb header 1858 * __pskb_pull_tail - advance tail of skb header
1844 * @skb: buffer to reallocate 1859 * @skb: buffer to reallocate
@@ -4926,6 +4941,8 @@ static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
4926 thlen = tcp_hdrlen(skb); 4941 thlen = tcp_hdrlen(skb);
4927 } else if (unlikely(skb_is_gso_sctp(skb))) { 4942 } else if (unlikely(skb_is_gso_sctp(skb))) {
4928 thlen = sizeof(struct sctphdr); 4943 thlen = sizeof(struct sctphdr);
4944 } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
4945 thlen = sizeof(struct udphdr);
4929 } 4946 }
4930 /* UFO sets gso_size to the size of the fragmentation 4947 /* UFO sets gso_size to the size of the fragmentation
4931 * payload, i.e. the size of the L4 (UDP) header is already 4948 * payload, i.e. the size of the L4 (UDP) header is already
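The SKB_GSO_UDP_L4 handling added to skb_gso_transport_seglen() above is part of the UDP GSO support in this same series; from userspace the feature is opted into per socket, roughly as in the sketch below. The fallback define and the segment size are assumptions, not taken from this hunk.

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/udp.h>

#ifndef UDP_SEGMENT
#define UDP_SEGMENT 103		/* assumed value from the UDP GSO series */
#endif

static int enable_udp_gso(int fd)
{
	int gso_size = 1400;	/* illustrative segment size */

	return setsockopt(fd, IPPROTO_UDP, UDP_SEGMENT,
			  &gso_size, sizeof(gso_size));
}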
diff --git a/net/core/sock.c b/net/core/sock.c
index 2aed99a541d5..f333d75ef1a9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -226,7 +226,8 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
226 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ 226 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
227 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ 227 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
228 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ 228 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
229 x "AF_QIPCRTR", x "AF_SMC" , x "AF_MAX" 229 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
230 x "AF_MAX"
230 231
231static const char *const af_family_key_strings[AF_MAX+1] = { 232static const char *const af_family_key_strings[AF_MAX+1] = {
232 _sock_locks("sk_lock-") 233 _sock_locks("sk_lock-")
@@ -262,7 +263,8 @@ static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
262 "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" , 263 "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" ,
263 "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" , 264 "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" ,
264 "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" , 265 "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" ,
265 "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_MAX" 266 "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_XDP" ,
267 "rlock-AF_MAX"
266}; 268};
267static const char *const af_family_wlock_key_strings[AF_MAX+1] = { 269static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
268 "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" , 270 "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" ,
@@ -279,7 +281,8 @@ static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
279 "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" , 281 "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" ,
280 "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" , 282 "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" ,
281 "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" , 283 "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" ,
282 "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_MAX" 284 "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_XDP" ,
285 "wlock-AF_MAX"
283}; 286};
284static const char *const af_family_elock_key_strings[AF_MAX+1] = { 287static const char *const af_family_elock_key_strings[AF_MAX+1] = {
285 "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" , 288 "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" ,
@@ -296,7 +299,8 @@ static const char *const af_family_elock_key_strings[AF_MAX+1] = {
296 "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" , 299 "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" ,
297 "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" , 300 "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" ,
298 "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" , 301 "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" ,
299 "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_MAX" 302 "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_XDP" ,
303 "elock-AF_MAX"
300}; 304};
301 305
302/* 306/*
@@ -323,8 +327,8 @@ EXPORT_SYMBOL(sysctl_optmem_max);
323 327
324int sysctl_tstamp_allow_data __read_mostly = 1; 328int sysctl_tstamp_allow_data __read_mostly = 1;
325 329
326struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE; 330DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
327EXPORT_SYMBOL_GPL(memalloc_socks); 331EXPORT_SYMBOL_GPL(memalloc_socks_key);
328 332
329/** 333/**
330 * sk_set_memalloc - sets %SOCK_MEMALLOC 334 * sk_set_memalloc - sets %SOCK_MEMALLOC
@@ -338,7 +342,7 @@ void sk_set_memalloc(struct sock *sk)
338{ 342{
339 sock_set_flag(sk, SOCK_MEMALLOC); 343 sock_set_flag(sk, SOCK_MEMALLOC);
340 sk->sk_allocation |= __GFP_MEMALLOC; 344 sk->sk_allocation |= __GFP_MEMALLOC;
341 static_key_slow_inc(&memalloc_socks); 345 static_branch_inc(&memalloc_socks_key);
342} 346}
343EXPORT_SYMBOL_GPL(sk_set_memalloc); 347EXPORT_SYMBOL_GPL(sk_set_memalloc);
344 348
@@ -346,7 +350,7 @@ void sk_clear_memalloc(struct sock *sk)
346{ 350{
347 sock_reset_flag(sk, SOCK_MEMALLOC); 351 sock_reset_flag(sk, SOCK_MEMALLOC);
348 sk->sk_allocation &= ~__GFP_MEMALLOC; 352 sk->sk_allocation &= ~__GFP_MEMALLOC;
349 static_key_slow_dec(&memalloc_socks); 353 static_branch_dec(&memalloc_socks_key);
350 354
351 /* 355 /*
352 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward 356 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
@@ -724,9 +728,22 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
724 sock_valbool_flag(sk, SOCK_DBG, valbool); 728 sock_valbool_flag(sk, SOCK_DBG, valbool);
725 break; 729 break;
726 case SO_REUSEADDR: 730 case SO_REUSEADDR:
727 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 731 val = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
732 if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) &&
733 inet_sk(sk)->inet_num &&
734 (sk->sk_reuse != val)) {
735 ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN;
736 break;
737 }
738 sk->sk_reuse = val;
728 break; 739 break;
729 case SO_REUSEPORT: 740 case SO_REUSEPORT:
741 if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) &&
742 inet_sk(sk)->inet_num &&
743 (sk->sk_reuseport != valbool)) {
744 ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN;
745 break;
746 }
730 sk->sk_reuseport = valbool; 747 sk->sk_reuseport = valbool;
731 break; 748 break;
732 case SO_TYPE: 749 case SO_TYPE:
@@ -905,7 +922,10 @@ set_rcvbuf:
905 case SO_RCVLOWAT: 922 case SO_RCVLOWAT:
906 if (val < 0) 923 if (val < 0)
907 val = INT_MAX; 924 val = INT_MAX;
908 sk->sk_rcvlowat = val ? : 1; 925 if (sock->ops->set_rcvlowat)
926 ret = sock->ops->set_rcvlowat(sk, val);
927 else
928 sk->sk_rcvlowat = val ? : 1;
909 break; 929 break;
910 930
911 case SO_RCVTIMEO: 931 case SO_RCVTIMEO:
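With the SO_REUSEADDR/SO_REUSEPORT change above, flipping either flag once the socket already has a local port fails (-EUCLEAN, or -EISCONN on an established TCP socket), so applications must keep the usual set-before-bind ordering. A minimal userspace sketch, with an illustrative port number:

#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

static int open_listener(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(8080),		/* illustrative */
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};
	int one = 1;
	int fd;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0)
		return -1;
	/* Must come before bind(); toggling it afterwards now fails. */
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0 ||
	    bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}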
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 097a0f74e004..9d1f22072d5d 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -5,6 +5,10 @@
5 */ 5 */
6#include <linux/types.h> 6#include <linux/types.h>
7#include <linux/mm.h> 7#include <linux/mm.h>
8#include <linux/slab.h>
9#include <linux/idr.h>
10#include <linux/rhashtable.h>
11#include <net/page_pool.h>
8 12
9#include <net/xdp.h> 13#include <net/xdp.h>
10 14
@@ -13,6 +17,105 @@
13#define REG_STATE_UNREGISTERED 0x2 17#define REG_STATE_UNREGISTERED 0x2
14#define REG_STATE_UNUSED 0x3 18#define REG_STATE_UNUSED 0x3
15 19
20static DEFINE_IDA(mem_id_pool);
21static DEFINE_MUTEX(mem_id_lock);
22#define MEM_ID_MAX 0xFFFE
23#define MEM_ID_MIN 1
24static int mem_id_next = MEM_ID_MIN;
25
26static bool mem_id_init; /* false */
27static struct rhashtable *mem_id_ht;
28
29struct xdp_mem_allocator {
30 struct xdp_mem_info mem;
31 union {
32 void *allocator;
33 struct page_pool *page_pool;
34 struct zero_copy_allocator *zc_alloc;
35 };
36 struct rhash_head node;
37 struct rcu_head rcu;
38};
39
40static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
41{
42 const u32 *k = data;
43 const u32 key = *k;
44
45 BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id)
46 != sizeof(u32));
47
48 /* Use cyclic increasing ID as direct hash key, see rht_bucket_index */
49 return key << RHT_HASH_RESERVED_SPACE;
50}
51
52static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg,
53 const void *ptr)
54{
55 const struct xdp_mem_allocator *xa = ptr;
56 u32 mem_id = *(u32 *)arg->key;
57
58 return xa->mem.id != mem_id;
59}
60
61static const struct rhashtable_params mem_id_rht_params = {
62 .nelem_hint = 64,
63 .head_offset = offsetof(struct xdp_mem_allocator, node),
64 .key_offset = offsetof(struct xdp_mem_allocator, mem.id),
65 .key_len = FIELD_SIZEOF(struct xdp_mem_allocator, mem.id),
66 .max_size = MEM_ID_MAX,
67 .min_size = 8,
68 .automatic_shrinking = true,
69 .hashfn = xdp_mem_id_hashfn,
70 .obj_cmpfn = xdp_mem_id_cmp,
71};
72
73static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
74{
75 struct xdp_mem_allocator *xa;
76
77 xa = container_of(rcu, struct xdp_mem_allocator, rcu);
78
79 /* Allow this ID to be reused */
80 ida_simple_remove(&mem_id_pool, xa->mem.id);
81
82 /* Notice, driver is expected to free the *allocator,
83 * e.g. page_pool, and MUST also use RCU free.
84 */
85
86 /* Poison memory */
87 xa->mem.id = 0xFFFF;
88 xa->mem.type = 0xF0F0;
89 xa->allocator = (void *)0xDEAD9001;
90
91 kfree(xa);
92}
93
94static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
95{
96 struct xdp_mem_allocator *xa;
97 int id = xdp_rxq->mem.id;
98 int err;
99
100 if (id == 0)
101 return;
102
103 mutex_lock(&mem_id_lock);
104
105 xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
106 if (!xa) {
107 mutex_unlock(&mem_id_lock);
108 return;
109 }
110
111 err = rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params);
112 WARN_ON(err);
113
114 call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
115
116 mutex_unlock(&mem_id_lock);
117}
118
16void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) 119void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
17{ 120{
18 /* Simplify driver cleanup code paths, allow unreg "unused" */ 121 /* Simplify driver cleanup code paths, allow unreg "unused" */
@@ -21,8 +124,14 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
21 124
22 WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); 125 WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");
23 126
127 __xdp_rxq_info_unreg_mem_model(xdp_rxq);
128
24 xdp_rxq->reg_state = REG_STATE_UNREGISTERED; 129 xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
25 xdp_rxq->dev = NULL; 130 xdp_rxq->dev = NULL;
131
132 /* Reset mem info to defaults */
133 xdp_rxq->mem.id = 0;
134 xdp_rxq->mem.type = 0;
26} 135}
27EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); 136EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
28 137
@@ -71,3 +180,193 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
71 return (xdp_rxq->reg_state == REG_STATE_REGISTERED); 180 return (xdp_rxq->reg_state == REG_STATE_REGISTERED);
72} 181}
73EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg); 182EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);
183
184static int __mem_id_init_hash_table(void)
185{
186 struct rhashtable *rht;
187 int ret;
188
189 if (unlikely(mem_id_init))
190 return 0;
191
192 rht = kzalloc(sizeof(*rht), GFP_KERNEL);
193 if (!rht)
194 return -ENOMEM;
195
196 ret = rhashtable_init(rht, &mem_id_rht_params);
197 if (ret < 0) {
198 kfree(rht);
199 return ret;
200 }
201 mem_id_ht = rht;
202 smp_mb(); /* mutex lock should provide enough pairing */
203 mem_id_init = true;
204
205 return 0;
206}
207
208/* Allocate a cyclic ID that maps to allocator pointer.
209 * See: https://www.kernel.org/doc/html/latest/core-api/idr.html
210 *
211 * Caller must lock mem_id_lock.
212 */
213static int __mem_id_cyclic_get(gfp_t gfp)
214{
215 int retries = 1;
216 int id;
217
218again:
219 id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp);
220 if (id < 0) {
221 if (id == -ENOSPC) {
222 /* Cyclic allocator, reset next id */
223 if (retries--) {
224 mem_id_next = MEM_ID_MIN;
225 goto again;
226 }
227 }
228 return id; /* errno */
229 }
230 mem_id_next = id + 1;
231
232 return id;
233}
234
235static bool __is_supported_mem_type(enum xdp_mem_type type)
236{
237 if (type == MEM_TYPE_PAGE_POOL)
238 return is_page_pool_compiled_in();
239
240 if (type >= MEM_TYPE_MAX)
241 return false;
242
243 return true;
244}
245
246int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
247 enum xdp_mem_type type, void *allocator)
248{
249 struct xdp_mem_allocator *xdp_alloc;
250 gfp_t gfp = GFP_KERNEL;
251 int id, errno, ret;
252 void *ptr;
253
254 if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
255 WARN(1, "Missing register, driver bug");
256 return -EFAULT;
257 }
258
259 if (!__is_supported_mem_type(type))
260 return -EOPNOTSUPP;
261
262 xdp_rxq->mem.type = type;
263
264 if (!allocator) {
265 if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY)
266 return -EINVAL; /* Setup time check page_pool req */
267 return 0;
268 }
269
270 /* Delay init of rhashtable to save memory if feature isn't used */
271 if (!mem_id_init) {
272 mutex_lock(&mem_id_lock);
273 ret = __mem_id_init_hash_table();
274 mutex_unlock(&mem_id_lock);
275 if (ret < 0) {
276 WARN_ON(1);
277 return ret;
278 }
279 }
280
281 xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp);
282 if (!xdp_alloc)
283 return -ENOMEM;
284
285 mutex_lock(&mem_id_lock);
286 id = __mem_id_cyclic_get(gfp);
287 if (id < 0) {
288 errno = id;
289 goto err;
290 }
291 xdp_rxq->mem.id = id;
292 xdp_alloc->mem = xdp_rxq->mem;
293 xdp_alloc->allocator = allocator;
294
295 /* Insert allocator into ID lookup table */
296 ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
297 if (IS_ERR(ptr)) {
298 errno = PTR_ERR(ptr);
299 goto err;
300 }
301
302 mutex_unlock(&mem_id_lock);
303
304 return 0;
305err:
306 mutex_unlock(&mem_id_lock);
307 kfree(xdp_alloc);
308 return errno;
309}
310EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
311
312/* XDP RX runs under NAPI protection, and in different delivery error
313 * scenarios (e.g. queue full), it is possible to return the xdp_frame
 314 * while still leveraging this protection. The @napi_direct boolean
 315 * is used for those call sites, allowing for faster recycling
316 * of xdp_frames/pages in those cases.
317 */
318static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
319 unsigned long handle)
320{
321 struct xdp_mem_allocator *xa;
322 struct page *page;
323
324 switch (mem->type) {
325 case MEM_TYPE_PAGE_POOL:
326 rcu_read_lock();
327 /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
328 xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
329 page = virt_to_head_page(data);
330 if (xa)
331 page_pool_put_page(xa->page_pool, page, napi_direct);
332 else
333 put_page(page);
334 rcu_read_unlock();
335 break;
336 case MEM_TYPE_PAGE_SHARED:
337 page_frag_free(data);
338 break;
339 case MEM_TYPE_PAGE_ORDER0:
 340 page = virt_to_page(data); /* Assumes order-0 page */
341 put_page(page);
342 break;
343 case MEM_TYPE_ZERO_COPY:
344 /* NB! Only valid from an xdp_buff! */
345 rcu_read_lock();
346 /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
347 xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
348 xa->zc_alloc->free(xa->zc_alloc, handle);
349 rcu_read_unlock();
350 default:
351 /* Not possible, checked in xdp_rxq_info_reg_mem_model() */
352 break;
353 }
354}
355
356void xdp_return_frame(struct xdp_frame *xdpf)
357{
358 __xdp_return(xdpf->data, &xdpf->mem, false, 0);
359}
360EXPORT_SYMBOL_GPL(xdp_return_frame);
361
362void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
363{
364 __xdp_return(xdpf->data, &xdpf->mem, true, 0);
365}
366EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
367
368void xdp_return_buff(struct xdp_buff *xdp)
369{
370 __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle);
371}
372EXPORT_SYMBOL_GPL(xdp_return_buff);
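For reference, a minimal driver-side sketch of how the registration above is typically used: an RX queue creates a page_pool, registers it as the queue's memory model, and later frees frames through xdp_return_frame(). The struct mydrv_rxq and all mydrv_* names are hypothetical, and the page_pool_params fields shown are only the common ones.

static int mydrv_rxq_setup_xdp(struct mydrv_rxq *rxq,
			       struct net_device *ndev, u32 queue_index)
{
	struct page_pool_params pp_params = {
		.order		= 0,		/* order-0 pages */
		.pool_size	= 256,		/* roughly one RX ring of pages */
		.nid		= NUMA_NO_NODE,
		.dev		= ndev->dev.parent,
	};
	int err;

	rxq->page_pool = page_pool_create(&pp_params);
	if (IS_ERR(rxq->page_pool))
		return PTR_ERR(rxq->page_pool);

	err = xdp_rxq_info_reg(&rxq->xdp_rxq, ndev, queue_index);
	if (err)
		return err;

	/* allocates a mem.id and inserts the allocator into mem_id_ht */
	return xdp_rxq_info_reg_mem_model(&rxq->xdp_rxq,
					  MEM_TYPE_PAGE_POOL, rxq->page_pool);
}

Frames produced from such a queue carry mem.type == MEM_TYPE_PAGE_POOL in xdp_frame->mem, so __xdp_return() above recycles their pages through the page_pool instead of falling back to put_page().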
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index bae7d78aa068..d2f4e0c1faaf 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -176,6 +176,7 @@ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = {
176 [DCB_ATTR_IEEE_MAXRATE] = {.len = sizeof(struct ieee_maxrate)}, 176 [DCB_ATTR_IEEE_MAXRATE] = {.len = sizeof(struct ieee_maxrate)},
177 [DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)}, 177 [DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)},
178 [DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)}, 178 [DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)},
179 [DCB_ATTR_DCB_BUFFER] = {.len = sizeof(struct dcbnl_buffer)},
179}; 180};
180 181
181/* DCB number of traffic classes nested attributes. */ 182/* DCB number of traffic classes nested attributes. */
@@ -1094,6 +1095,16 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
1094 return -EMSGSIZE; 1095 return -EMSGSIZE;
1095 } 1096 }
1096 1097
1098 if (ops->dcbnl_getbuffer) {
1099 struct dcbnl_buffer buffer;
1100
1101 memset(&buffer, 0, sizeof(buffer));
1102 err = ops->dcbnl_getbuffer(netdev, &buffer);
1103 if (!err &&
1104 nla_put(skb, DCB_ATTR_DCB_BUFFER, sizeof(buffer), &buffer))
1105 return -EMSGSIZE;
1106 }
1107
1097 app = nla_nest_start(skb, DCB_ATTR_IEEE_APP_TABLE); 1108 app = nla_nest_start(skb, DCB_ATTR_IEEE_APP_TABLE);
1098 if (!app) 1109 if (!app)
1099 return -EMSGSIZE; 1110 return -EMSGSIZE;
@@ -1453,6 +1464,15 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh,
1453 goto err; 1464 goto err;
1454 } 1465 }
1455 1466
1467 if (ieee[DCB_ATTR_DCB_BUFFER] && ops->dcbnl_setbuffer) {
1468 struct dcbnl_buffer *buffer =
1469 nla_data(ieee[DCB_ATTR_DCB_BUFFER]);
1470
1471 err = ops->dcbnl_setbuffer(netdev, buffer);
1472 if (err)
1473 goto err;
1474 }
1475
1456 if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { 1476 if (ieee[DCB_ATTR_IEEE_APP_TABLE]) {
1457 struct nlattr *attr; 1477 struct nlattr *attr;
1458 int rem; 1478 int rem;
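For context, a rough sketch of the driver side that the new DCB_ATTR_DCB_BUFFER plumbing calls into. Only the dcbnl_getbuffer/dcbnl_setbuffer callback names come from the dcbnl_rtnl_ops usage above; the mydrv_* names, the cached-configuration field, and the hardware-apply helper are hypothetical.

static int mydrv_dcbnl_getbuffer(struct net_device *dev,
				 struct dcbnl_buffer *buf)
{
	struct mydrv_priv *priv = netdev_priv(dev);

	/* report the configuration last programmed into the hardware */
	*buf = priv->dcb_buffer;
	return 0;
}

static int mydrv_dcbnl_setbuffer(struct net_device *dev,
				 struct dcbnl_buffer *buf)
{
	struct mydrv_priv *priv = netdev_priv(dev);
	int err;

	err = mydrv_hw_apply_buffer_config(priv, buf);	/* illustrative */
	if (err)
		return err;

	priv->dcb_buffer = *buf;
	return 0;
}

static const struct dcbnl_rtnl_ops mydrv_dcbnl_ops = {
	.dcbnl_getbuffer	= mydrv_dcbnl_getbuffer,
	.dcbnl_setbuffer	= mydrv_dcbnl_setbuffer,
};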
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 37ccbe62eb1a..ba6fc3c1186b 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -53,7 +53,6 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
53 if (timeo < rto) 53 if (timeo < rto)
54 timeo = rto; 54 timeo = rto;
55 55
56 tw->tw_timeout = DCCP_TIMEWAIT_LEN;
57 if (state == DCCP_TIME_WAIT) 56 if (state == DCCP_TIME_WAIT)
58 timeo = DCCP_TIMEWAIT_LEN; 57 timeo = DCCP_TIMEWAIT_LEN;
59 58
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index c795c3f509c9..72236695db3d 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -121,13 +121,16 @@ static int dn_fib_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
121 121
122static int dn_fib_rule_configure(struct fib_rule *rule, struct sk_buff *skb, 122static int dn_fib_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
123 struct fib_rule_hdr *frh, 123 struct fib_rule_hdr *frh,
124 struct nlattr **tb) 124 struct nlattr **tb,
125 struct netlink_ext_ack *extack)
125{ 126{
126 int err = -EINVAL; 127 int err = -EINVAL;
127 struct dn_fib_rule *r = (struct dn_fib_rule *)rule; 128 struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
128 129
129 if (frh->tos) 130 if (frh->tos) {
131 NL_SET_ERR_MSG(extack, "Invalid tos value");
130 goto errout; 132 goto errout;
133 }
131 134
132 if (rule->table == RT_TABLE_UNSPEC) { 135 if (rule->table == RT_TABLE_UNSPEC) {
133 if (rule->action == FR_ACT_TO_TBL) { 136 if (rule->action == FR_ACT_TO_TBL) {
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index bbf2c82cf7b2..4183e4ba27a5 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -9,7 +9,7 @@ config NET_DSA
9 depends on HAVE_NET_DSA && MAY_USE_DEVLINK 9 depends on HAVE_NET_DSA && MAY_USE_DEVLINK
10 depends on BRIDGE || BRIDGE=n 10 depends on BRIDGE || BRIDGE=n
11 select NET_SWITCHDEV 11 select NET_SWITCHDEV
12 select PHYLIB 12 select PHYLINK
13 ---help--- 13 ---help---
14 Say Y if you want to enable support for the hardware switches supported 14 Say Y if you want to enable support for the hardware switches supported
15 by the Distributed Switch Architecture. 15 by the Distributed Switch Architecture.
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 47725250b4ca..dc5d9af3dc80 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -272,7 +272,28 @@ static int dsa_port_setup(struct dsa_port *dp)
272 case DSA_PORT_TYPE_UNUSED: 272 case DSA_PORT_TYPE_UNUSED:
273 break; 273 break;
274 case DSA_PORT_TYPE_CPU: 274 case DSA_PORT_TYPE_CPU:
275 /* dp->index is used now as port_number. However
276 * CPU ports should have separate numbering
277 * independent from front panel port numbers.
278 */
279 devlink_port_attrs_set(&dp->devlink_port,
280 DEVLINK_PORT_FLAVOUR_CPU,
281 dp->index, false, 0);
282 err = dsa_port_link_register_of(dp);
283 if (err) {
284 dev_err(ds->dev, "failed to setup link for port %d.%d\n",
285 ds->index, dp->index);
286 return err;
287 }
288 break;
275 case DSA_PORT_TYPE_DSA: 289 case DSA_PORT_TYPE_DSA:
290 /* dp->index is used now as port_number. However
291 * DSA ports should have separate numbering
292 * independent from front panel port numbers.
293 */
294 devlink_port_attrs_set(&dp->devlink_port,
295 DEVLINK_PORT_FLAVOUR_DSA,
296 dp->index, false, 0);
276 err = dsa_port_link_register_of(dp); 297 err = dsa_port_link_register_of(dp);
277 if (err) { 298 if (err) {
278 dev_err(ds->dev, "failed to setup link for port %d.%d\n", 299 dev_err(ds->dev, "failed to setup link for port %d.%d\n",
@@ -281,6 +302,9 @@ static int dsa_port_setup(struct dsa_port *dp)
281 } 302 }
282 break; 303 break;
283 case DSA_PORT_TYPE_USER: 304 case DSA_PORT_TYPE_USER:
305 devlink_port_attrs_set(&dp->devlink_port,
306 DEVLINK_PORT_FLAVOUR_PHYSICAL,
307 dp->index, false, 0);
284 err = dsa_slave_create(dp); 308 err = dsa_slave_create(dp);
285 if (err) 309 if (err)
286 dev_err(ds->dev, "failed to create slave for port %d.%d\n", 310 dev_err(ds->dev, "failed to create slave for port %d.%d\n",
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 053731473c99..3964c6f7a7c0 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -75,15 +75,6 @@ struct dsa_slave_priv {
75 /* DSA port data, such as switch, port index, etc. */ 75 /* DSA port data, such as switch, port index, etc. */
76 struct dsa_port *dp; 76 struct dsa_port *dp;
77 77
78 /*
79 * The phylib phy_device pointer for the PHY connected
80 * to this port.
81 */
82 phy_interface_t phy_interface;
83 int old_link;
84 int old_pause;
85 int old_duplex;
86
87#ifdef CONFIG_NET_POLL_CONTROLLER 78#ifdef CONFIG_NET_POLL_CONTROLLER
88 struct netpoll *netpoll; 79 struct netpoll *netpoll;
89#endif 80#endif
diff --git a/net/dsa/master.c b/net/dsa/master.c
index 90e6df0351eb..c90ee3227dea 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -22,7 +22,7 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev,
22 int port = cpu_dp->index; 22 int port = cpu_dp->index;
23 int count = 0; 23 int count = 0;
24 24
25 if (ops && ops->get_sset_count && ops->get_ethtool_stats) { 25 if (ops->get_sset_count && ops->get_ethtool_stats) {
26 count = ops->get_sset_count(dev, ETH_SS_STATS); 26 count = ops->get_sset_count(dev, ETH_SS_STATS);
27 ops->get_ethtool_stats(dev, stats, data); 27 ops->get_ethtool_stats(dev, stats, data);
28 } 28 }
@@ -31,6 +31,32 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev,
31 ds->ops->get_ethtool_stats(ds, port, data + count); 31 ds->ops->get_ethtool_stats(ds, port, data + count);
32} 32}
33 33
34static void dsa_master_get_ethtool_phy_stats(struct net_device *dev,
35 struct ethtool_stats *stats,
36 uint64_t *data)
37{
38 struct dsa_port *cpu_dp = dev->dsa_ptr;
39 const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
40 struct dsa_switch *ds = cpu_dp->ds;
41 int port = cpu_dp->index;
42 int count = 0;
43
44 if (dev->phydev && !ops->get_ethtool_phy_stats) {
45 count = phy_ethtool_get_sset_count(dev->phydev);
46 if (count >= 0)
47 phy_ethtool_get_stats(dev->phydev, stats, data);
48 } else if (ops->get_sset_count && ops->get_ethtool_phy_stats) {
49 count = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
50 ops->get_ethtool_phy_stats(dev, stats, data);
51 }
52
53 if (count < 0)
54 count = 0;
55
56 if (ds->ops->get_ethtool_phy_stats)
57 ds->ops->get_ethtool_phy_stats(ds, port, data + count);
58}
59
34static int dsa_master_get_sset_count(struct net_device *dev, int sset) 60static int dsa_master_get_sset_count(struct net_device *dev, int sset)
35{ 61{
36 struct dsa_port *cpu_dp = dev->dsa_ptr; 62 struct dsa_port *cpu_dp = dev->dsa_ptr;
@@ -38,11 +64,17 @@ static int dsa_master_get_sset_count(struct net_device *dev, int sset)
38 struct dsa_switch *ds = cpu_dp->ds; 64 struct dsa_switch *ds = cpu_dp->ds;
39 int count = 0; 65 int count = 0;
40 66
41 if (ops && ops->get_sset_count) 67 if (sset == ETH_SS_PHY_STATS && dev->phydev &&
42 count += ops->get_sset_count(dev, sset); 68 !ops->get_ethtool_phy_stats)
69 count = phy_ethtool_get_sset_count(dev->phydev);
70 else if (ops->get_sset_count)
71 count = ops->get_sset_count(dev, sset);
72
73 if (count < 0)
74 count = 0;
43 75
44 if (sset == ETH_SS_STATS && ds->ops->get_sset_count) 76 if (ds->ops->get_sset_count)
45 count += ds->ops->get_sset_count(ds, cpu_dp->index); 77 count += ds->ops->get_sset_count(ds, cpu_dp->index, sset);
46 78
47 return count; 79 return count;
48} 80}
@@ -64,19 +96,28 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
64 /* We do not want to be NULL-terminated, since this is a prefix */ 96 /* We do not want to be NULL-terminated, since this is a prefix */
65 pfx[sizeof(pfx) - 1] = '_'; 97 pfx[sizeof(pfx) - 1] = '_';
66 98
67 if (ops && ops->get_sset_count && ops->get_strings) { 99 if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
68 mcount = ops->get_sset_count(dev, ETH_SS_STATS); 100 !ops->get_ethtool_phy_stats) {
101 mcount = phy_ethtool_get_sset_count(dev->phydev);
102 if (mcount < 0)
103 mcount = 0;
104 else
105 phy_ethtool_get_strings(dev->phydev, data);
106 } else if (ops->get_sset_count && ops->get_strings) {
107 mcount = ops->get_sset_count(dev, stringset);
108 if (mcount < 0)
109 mcount = 0;
69 ops->get_strings(dev, stringset, data); 110 ops->get_strings(dev, stringset, data);
70 } 111 }
71 112
72 if (stringset == ETH_SS_STATS && ds->ops->get_strings) { 113 if (ds->ops->get_strings) {
73 ndata = data + mcount * len; 114 ndata = data + mcount * len;
74 /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle 115 /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle
75 * the output after to prepend our CPU port prefix we 116 * the output after to prepend our CPU port prefix we
76 * constructed earlier 117 * constructed earlier
77 */ 118 */
78 ds->ops->get_strings(ds, port, ndata); 119 ds->ops->get_strings(ds, port, stringset, ndata);
79 count = ds->ops->get_sset_count(ds, port); 120 count = ds->ops->get_sset_count(ds, port, stringset);
80 for (i = 0; i < count; i++) { 121 for (i = 0; i < count; i++) {
81 memmove(ndata + (i * len + sizeof(pfx)), 122 memmove(ndata + (i * len + sizeof(pfx)),
82 ndata + i * len, len - sizeof(pfx)); 123 ndata + i * len, len - sizeof(pfx));
@@ -102,6 +143,7 @@ static int dsa_master_ethtool_setup(struct net_device *dev)
102 ops->get_sset_count = dsa_master_get_sset_count; 143 ops->get_sset_count = dsa_master_get_sset_count;
103 ops->get_ethtool_stats = dsa_master_get_ethtool_stats; 144 ops->get_ethtool_stats = dsa_master_get_ethtool_stats;
104 ops->get_strings = dsa_master_get_strings; 145 ops->get_strings = dsa_master_get_strings;
146 ops->get_ethtool_phy_stats = dsa_master_get_ethtool_phy_stats;
105 147
106 dev->ethtool_ops = ops; 148 dev->ethtool_ops = ops;
107 149
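The hunks above change ds->ops->get_sset_count() and ds->ops->get_strings() to take the stringset, so a switch driver can distinguish ETH_SS_STATS from ETH_SS_PHY_STATS. A minimal sketch of the updated driver callbacks, with mydrv_hw_stats as a hypothetical table of counter names:

static int mydrv_get_sset_count(struct dsa_switch *ds, int port, int sset)
{
	if (sset != ETH_SS_STATS)
		return 0;

	return ARRAY_SIZE(mydrv_hw_stats);
}

static void mydrv_get_strings(struct dsa_switch *ds, int port,
			      u32 stringset, uint8_t *data)
{
	int i;

	if (stringset != ETH_SS_STATS)
		return;

	for (i = 0; i < ARRAY_SIZE(mydrv_hw_stats); i++)
		strncpy(data + i * ETH_GSTRING_LEN,
			mydrv_hw_stats[i].name, ETH_GSTRING_LEN);
}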
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 7acc1169d75e..ed0595459df1 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -252,6 +252,9 @@ int dsa_port_vlan_add(struct dsa_port *dp,
252 .vlan = vlan, 252 .vlan = vlan,
253 }; 253 };
254 254
255 if (netif_is_bridge_master(vlan->obj.orig_dev))
256 return -EOPNOTSUPP;
257
255 if (br_vlan_enabled(dp->bridge_dev)) 258 if (br_vlan_enabled(dp->bridge_dev))
256 return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info); 259 return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info);
257 260
@@ -267,31 +270,47 @@ int dsa_port_vlan_del(struct dsa_port *dp,
267 .vlan = vlan, 270 .vlan = vlan,
268 }; 271 };
269 272
273 if (netif_is_bridge_master(vlan->obj.orig_dev))
274 return -EOPNOTSUPP;
275
270 if (br_vlan_enabled(dp->bridge_dev)) 276 if (br_vlan_enabled(dp->bridge_dev))
271 return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); 277 return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info);
272 278
273 return 0; 279 return 0;
274} 280}
275 281
276static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable) 282static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp)
277{ 283{
278 struct device_node *port_dn = dp->dn;
279 struct device_node *phy_dn; 284 struct device_node *phy_dn;
280 struct dsa_switch *ds = dp->ds;
281 struct phy_device *phydev; 285 struct phy_device *phydev;
282 int port = dp->index;
283 int err = 0;
284 286
285 phy_dn = of_parse_phandle(port_dn, "phy-handle", 0); 287 phy_dn = of_parse_phandle(dp->dn, "phy-handle", 0);
286 if (!phy_dn) 288 if (!phy_dn)
287 return 0; 289 return NULL;
288 290
289 phydev = of_phy_find_device(phy_dn); 291 phydev = of_phy_find_device(phy_dn);
290 if (!phydev) { 292 if (!phydev) {
291 err = -EPROBE_DEFER; 293 of_node_put(phy_dn);
292 goto err_put_of; 294 return ERR_PTR(-EPROBE_DEFER);
293 } 295 }
294 296
297 return phydev;
298}
299
300static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
301{
302 struct dsa_switch *ds = dp->ds;
303 struct phy_device *phydev;
304 int port = dp->index;
305 int err = 0;
306
307 phydev = dsa_port_get_phy_device(dp);
308 if (!phydev)
309 return 0;
310
311 if (IS_ERR(phydev))
312 return PTR_ERR(phydev);
313
295 if (enable) { 314 if (enable) {
296 err = genphy_config_init(phydev); 315 err = genphy_config_init(phydev);
297 if (err < 0) 316 if (err < 0)
@@ -317,8 +336,6 @@ static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
317 336
318err_put_dev: 337err_put_dev:
319 put_device(&phydev->mdio.dev); 338 put_device(&phydev->mdio.dev);
320err_put_of:
321 of_node_put(phy_dn);
322 return err; 339 return err;
323} 340}
324 341
@@ -372,3 +389,60 @@ void dsa_port_link_unregister_of(struct dsa_port *dp)
372 else 389 else
373 dsa_port_setup_phy_of(dp, false); 390 dsa_port_setup_phy_of(dp, false);
374} 391}
392
393int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data)
394{
395 struct phy_device *phydev;
396 int ret = -EOPNOTSUPP;
397
398 if (of_phy_is_fixed_link(dp->dn))
399 return ret;
400
401 phydev = dsa_port_get_phy_device(dp);
402 if (IS_ERR_OR_NULL(phydev))
403 return ret;
404
405 ret = phy_ethtool_get_strings(phydev, data);
406 put_device(&phydev->mdio.dev);
407
408 return ret;
409}
410EXPORT_SYMBOL_GPL(dsa_port_get_phy_strings);
411
412int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data)
413{
414 struct phy_device *phydev;
415 int ret = -EOPNOTSUPP;
416
417 if (of_phy_is_fixed_link(dp->dn))
418 return ret;
419
420 phydev = dsa_port_get_phy_device(dp);
421 if (IS_ERR_OR_NULL(phydev))
422 return ret;
423
424 ret = phy_ethtool_get_stats(phydev, NULL, data);
425 put_device(&phydev->mdio.dev);
426
427 return ret;
428}
429EXPORT_SYMBOL_GPL(dsa_port_get_ethtool_phy_stats);
430
431int dsa_port_get_phy_sset_count(struct dsa_port *dp)
432{
433 struct phy_device *phydev;
434 int ret = -EOPNOTSUPP;
435
436 if (of_phy_is_fixed_link(dp->dn))
437 return ret;
438
439 phydev = dsa_port_get_phy_device(dp);
440 if (IS_ERR_OR_NULL(phydev))
441 return ret;
442
443 ret = phy_ethtool_get_sset_count(phydev);
444 put_device(&phydev->mdio.dev);
445
446 return ret;
447}
448EXPORT_SYMBOL_GPL(dsa_port_get_phy_sset_count);
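The three exported helpers above give switch drivers an easy way to service ETH_SS_PHY_STATS for ports whose PHYs hang off the switch itself. Continuing the hypothetical mydrv_* driver from the earlier sketch, the new get_ethtool_phy_stats operation (consumed by dsa_master_get_ethtool_phy_stats() in the previous file) can simply forward to them, and the ETH_SS_PHY_STATS branches of get_sset_count()/get_strings() would likewise call dsa_port_get_phy_sset_count() and dsa_port_get_phy_strings():

static void mydrv_get_ethtool_phy_stats(struct dsa_switch *ds, int port,
					uint64_t *data)
{
	dsa_port_get_ethtool_phy_stats(dsa_to_port(ds, port), data);
}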
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 18561af7a8f1..1e3b6a6d8a40 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -13,6 +13,7 @@
13#include <linux/netdevice.h> 13#include <linux/netdevice.h>
14#include <linux/phy.h> 14#include <linux/phy.h>
15#include <linux/phy_fixed.h> 15#include <linux/phy_fixed.h>
16#include <linux/phylink.h>
16#include <linux/of_net.h> 17#include <linux/of_net.h>
17#include <linux/of_mdio.h> 18#include <linux/of_mdio.h>
18#include <linux/mdio.h> 19#include <linux/mdio.h>
@@ -97,8 +98,7 @@ static int dsa_slave_open(struct net_device *dev)
97 if (err) 98 if (err)
98 goto clear_promisc; 99 goto clear_promisc;
99 100
100 if (dev->phydev) 101 phylink_start(dp->pl);
101 phy_start(dev->phydev);
102 102
103 return 0; 103 return 0;
104 104
@@ -120,8 +120,7 @@ static int dsa_slave_close(struct net_device *dev)
120 struct net_device *master = dsa_slave_to_master(dev); 120 struct net_device *master = dsa_slave_to_master(dev);
121 struct dsa_port *dp = dsa_slave_to_port(dev); 121 struct dsa_port *dp = dsa_slave_to_port(dev);
122 122
123 if (dev->phydev) 123 phylink_stop(dp->pl);
124 phy_stop(dev->phydev);
125 124
126 dsa_port_disable(dp, dev->phydev); 125 dsa_port_disable(dp, dev->phydev);
127 126
@@ -272,10 +271,7 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
272 break; 271 break;
273 } 272 }
274 273
275 if (!dev->phydev) 274 return phylink_mii_ioctl(p->dp->pl, ifr, cmd);
276 return -ENODEV;
277
278 return phy_mii_ioctl(dev->phydev, ifr, cmd);
279} 275}
280 276
281static int dsa_slave_port_attr_set(struct net_device *dev, 277static int dsa_slave_port_attr_set(struct net_device *dev,
@@ -498,14 +494,11 @@ dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
498 ds->ops->get_regs(ds, dp->index, regs, _p); 494 ds->ops->get_regs(ds, dp->index, regs, _p);
499} 495}
500 496
501static u32 dsa_slave_get_link(struct net_device *dev) 497static int dsa_slave_nway_reset(struct net_device *dev)
502{ 498{
503 if (!dev->phydev) 499 struct dsa_port *dp = dsa_slave_to_port(dev);
504 return -ENODEV;
505
506 genphy_update_link(dev->phydev);
507 500
508 return dev->phydev->link; 501 return phylink_ethtool_nway_reset(dp->pl);
509} 502}
510 503
511static int dsa_slave_get_eeprom_len(struct net_device *dev) 504static int dsa_slave_get_eeprom_len(struct net_device *dev)
@@ -560,7 +553,8 @@ static void dsa_slave_get_strings(struct net_device *dev,
560 strncpy(data + 2 * len, "rx_packets", len); 553 strncpy(data + 2 * len, "rx_packets", len);
561 strncpy(data + 3 * len, "rx_bytes", len); 554 strncpy(data + 3 * len, "rx_bytes", len);
562 if (ds->ops->get_strings) 555 if (ds->ops->get_strings)
563 ds->ops->get_strings(ds, dp->index, data + 4 * len); 556 ds->ops->get_strings(ds, dp->index, stringset,
557 data + 4 * len);
564 } 558 }
565} 559}
566 560
@@ -605,7 +599,7 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
605 599
606 count = 4; 600 count = 4;
607 if (ds->ops->get_sset_count) 601 if (ds->ops->get_sset_count)
608 count += ds->ops->get_sset_count(ds, dp->index); 602 count += ds->ops->get_sset_count(ds, dp->index, sset);
609 603
610 return count; 604 return count;
611 } 605 }
@@ -618,6 +612,8 @@ static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
618 struct dsa_port *dp = dsa_slave_to_port(dev); 612 struct dsa_port *dp = dsa_slave_to_port(dev);
619 struct dsa_switch *ds = dp->ds; 613 struct dsa_switch *ds = dp->ds;
620 614
615 phylink_ethtool_get_wol(dp->pl, w);
616
621 if (ds->ops->get_wol) 617 if (ds->ops->get_wol)
622 ds->ops->get_wol(ds, dp->index, w); 618 ds->ops->get_wol(ds, dp->index, w);
623} 619}
@@ -628,6 +624,8 @@ static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
628 struct dsa_switch *ds = dp->ds; 624 struct dsa_switch *ds = dp->ds;
629 int ret = -EOPNOTSUPP; 625 int ret = -EOPNOTSUPP;
630 626
627 phylink_ethtool_set_wol(dp->pl, w);
628
631 if (ds->ops->set_wol) 629 if (ds->ops->set_wol)
632 ret = ds->ops->set_wol(ds, dp->index, w); 630 ret = ds->ops->set_wol(ds, dp->index, w);
633 631
@@ -651,13 +649,7 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
651 if (ret) 649 if (ret)
652 return ret; 650 return ret;
653 651
654 if (e->eee_enabled) { 652 return phylink_ethtool_set_eee(dp->pl, e);
655 ret = phy_init_eee(dev->phydev, 0);
656 if (ret)
657 return ret;
658 }
659
660 return phy_ethtool_set_eee(dev->phydev, e);
661} 653}
662 654
663static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) 655static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
@@ -677,7 +669,23 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
677 if (ret) 669 if (ret)
678 return ret; 670 return ret;
679 671
680 return phy_ethtool_get_eee(dev->phydev, e); 672 return phylink_ethtool_get_eee(dp->pl, e);
673}
674
675static int dsa_slave_get_link_ksettings(struct net_device *dev,
676 struct ethtool_link_ksettings *cmd)
677{
678 struct dsa_port *dp = dsa_slave_to_port(dev);
679
680 return phylink_ethtool_ksettings_get(dp->pl, cmd);
681}
682
683static int dsa_slave_set_link_ksettings(struct net_device *dev,
684 const struct ethtool_link_ksettings *cmd)
685{
686 struct dsa_port *dp = dsa_slave_to_port(dev);
687
688 return phylink_ethtool_ksettings_set(dp->pl, cmd);
681} 689}
682 690
683#ifdef CONFIG_NET_POLL_CONTROLLER 691#ifdef CONFIG_NET_POLL_CONTROLLER
@@ -980,8 +988,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
980 .get_drvinfo = dsa_slave_get_drvinfo, 988 .get_drvinfo = dsa_slave_get_drvinfo,
981 .get_regs_len = dsa_slave_get_regs_len, 989 .get_regs_len = dsa_slave_get_regs_len,
982 .get_regs = dsa_slave_get_regs, 990 .get_regs = dsa_slave_get_regs,
983 .nway_reset = phy_ethtool_nway_reset, 991 .nway_reset = dsa_slave_nway_reset,
984 .get_link = dsa_slave_get_link, 992 .get_link = ethtool_op_get_link,
985 .get_eeprom_len = dsa_slave_get_eeprom_len, 993 .get_eeprom_len = dsa_slave_get_eeprom_len,
986 .get_eeprom = dsa_slave_get_eeprom, 994 .get_eeprom = dsa_slave_get_eeprom,
987 .set_eeprom = dsa_slave_set_eeprom, 995 .set_eeprom = dsa_slave_set_eeprom,
@@ -992,8 +1000,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
992 .get_wol = dsa_slave_get_wol, 1000 .get_wol = dsa_slave_get_wol,
993 .set_eee = dsa_slave_set_eee, 1001 .set_eee = dsa_slave_set_eee,
994 .get_eee = dsa_slave_get_eee, 1002 .get_eee = dsa_slave_get_eee,
995 .get_link_ksettings = phy_ethtool_get_link_ksettings, 1003 .get_link_ksettings = dsa_slave_get_link_ksettings,
996 .set_link_ksettings = phy_ethtool_set_link_ksettings, 1004 .set_link_ksettings = dsa_slave_set_link_ksettings,
997 .get_rxnfc = dsa_slave_get_rxnfc, 1005 .get_rxnfc = dsa_slave_get_rxnfc,
998 .set_rxnfc = dsa_slave_set_rxnfc, 1006 .set_rxnfc = dsa_slave_set_rxnfc,
999 .get_ts_info = dsa_slave_get_ts_info, 1007 .get_ts_info = dsa_slave_get_ts_info,
@@ -1052,56 +1060,122 @@ static struct device_type dsa_type = {
1052 .name = "dsa", 1060 .name = "dsa",
1053}; 1061};
1054 1062
1055static void dsa_slave_adjust_link(struct net_device *dev) 1063static void dsa_slave_phylink_validate(struct net_device *dev,
1064 unsigned long *supported,
1065 struct phylink_link_state *state)
1056{ 1066{
1057 struct dsa_port *dp = dsa_slave_to_port(dev); 1067 struct dsa_port *dp = dsa_slave_to_port(dev);
1058 struct dsa_slave_priv *p = netdev_priv(dev);
1059 struct dsa_switch *ds = dp->ds; 1068 struct dsa_switch *ds = dp->ds;
1060 unsigned int status_changed = 0;
1061 1069
1062 if (p->old_link != dev->phydev->link) { 1070 if (!ds->ops->phylink_validate)
1063 status_changed = 1; 1071 return;
1064 p->old_link = dev->phydev->link;
1065 }
1066 1072
1067 if (p->old_duplex != dev->phydev->duplex) { 1073 ds->ops->phylink_validate(ds, dp->index, supported, state);
1068 status_changed = 1; 1074}
1069 p->old_duplex = dev->phydev->duplex;
1070 }
1071 1075
1072 if (p->old_pause != dev->phydev->pause) { 1076static int dsa_slave_phylink_mac_link_state(struct net_device *dev,
1073 status_changed = 1; 1077 struct phylink_link_state *state)
1074 p->old_pause = dev->phydev->pause; 1078{
1075 } 1079 struct dsa_port *dp = dsa_slave_to_port(dev);
1080 struct dsa_switch *ds = dp->ds;
1081
1082 /* Only called for SGMII and 802.3z */
1083 if (!ds->ops->phylink_mac_link_state)
1084 return -EOPNOTSUPP;
1085
1086 return ds->ops->phylink_mac_link_state(ds, dp->index, state);
1087}
1088
1089static void dsa_slave_phylink_mac_config(struct net_device *dev,
1090 unsigned int mode,
1091 const struct phylink_link_state *state)
1092{
1093 struct dsa_port *dp = dsa_slave_to_port(dev);
1094 struct dsa_switch *ds = dp->ds;
1095
1096 if (!ds->ops->phylink_mac_config)
1097 return;
1098
1099 ds->ops->phylink_mac_config(ds, dp->index, mode, state);
1100}
1076 1101
1077 if (ds->ops->adjust_link && status_changed) 1102static void dsa_slave_phylink_mac_an_restart(struct net_device *dev)
1078 ds->ops->adjust_link(ds, dp->index, dev->phydev); 1103{
1104 struct dsa_port *dp = dsa_slave_to_port(dev);
1105 struct dsa_switch *ds = dp->ds;
1079 1106
1080 if (status_changed) 1107 if (!ds->ops->phylink_mac_an_restart)
1081 phy_print_status(dev->phydev); 1108 return;
1109
1110 ds->ops->phylink_mac_an_restart(ds, dp->index);
1082} 1111}
1083 1112
1084static int dsa_slave_fixed_link_update(struct net_device *dev, 1113static void dsa_slave_phylink_mac_link_down(struct net_device *dev,
1085 struct fixed_phy_status *status) 1114 unsigned int mode,
1115 phy_interface_t interface)
1086{ 1116{
1087 struct dsa_switch *ds; 1117 struct dsa_port *dp = dsa_slave_to_port(dev);
1088 struct dsa_port *dp; 1118 struct dsa_switch *ds = dp->ds;
1089 1119
1090 if (dev) { 1120 if (!ds->ops->phylink_mac_link_down) {
1091 dp = dsa_slave_to_port(dev); 1121 if (ds->ops->adjust_link && dev->phydev)
1092 ds = dp->ds; 1122 ds->ops->adjust_link(ds, dp->index, dev->phydev);
1093 if (ds->ops->fixed_link_update) 1123 return;
1094 ds->ops->fixed_link_update(ds, dp->index, status);
1095 } 1124 }
1096 1125
1097 return 0; 1126 ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface);
1127}
1128
1129static void dsa_slave_phylink_mac_link_up(struct net_device *dev,
1130 unsigned int mode,
1131 phy_interface_t interface,
1132 struct phy_device *phydev)
1133{
1134 struct dsa_port *dp = dsa_slave_to_port(dev);
1135 struct dsa_switch *ds = dp->ds;
1136
1137 if (!ds->ops->phylink_mac_link_up) {
1138 if (ds->ops->adjust_link && dev->phydev)
1139 ds->ops->adjust_link(ds, dp->index, dev->phydev);
1140 return;
1141 }
1142
1143 ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev);
1144}
1145
1146static const struct phylink_mac_ops dsa_slave_phylink_mac_ops = {
1147 .validate = dsa_slave_phylink_validate,
1148 .mac_link_state = dsa_slave_phylink_mac_link_state,
1149 .mac_config = dsa_slave_phylink_mac_config,
1150 .mac_an_restart = dsa_slave_phylink_mac_an_restart,
1151 .mac_link_down = dsa_slave_phylink_mac_link_down,
1152 .mac_link_up = dsa_slave_phylink_mac_link_up,
1153};
1154
1155void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up)
1156{
1157 const struct dsa_port *dp = dsa_to_port(ds, port);
1158
1159 phylink_mac_change(dp->pl, up);
1160}
1161EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change);
1162
1163static void dsa_slave_phylink_fixed_state(struct net_device *dev,
1164 struct phylink_link_state *state)
1165{
1166 struct dsa_port *dp = dsa_slave_to_port(dev);
1167 struct dsa_switch *ds = dp->ds;
1168
1169 /* No need to check that this operation is valid, the callback would
1170 * not be called if it was not.
1171 */
1172 ds->ops->phylink_fixed_state(ds, dp->index, state);
1098} 1173}
1099 1174
1100/* slave device setup *******************************************************/ 1175/* slave device setup *******************************************************/
1101static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr) 1176static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr)
1102{ 1177{
1103 struct dsa_port *dp = dsa_slave_to_port(slave_dev); 1178 struct dsa_port *dp = dsa_slave_to_port(slave_dev);
1104 struct dsa_slave_priv *p = netdev_priv(slave_dev);
1105 struct dsa_switch *ds = dp->ds; 1179 struct dsa_switch *ds = dp->ds;
1106 1180
1107 slave_dev->phydev = mdiobus_get_phy(ds->slave_mii_bus, addr); 1181 slave_dev->phydev = mdiobus_get_phy(ds->slave_mii_bus, addr);
@@ -1110,75 +1184,54 @@ static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr)
1110 return -ENODEV; 1184 return -ENODEV;
1111 } 1185 }
1112 1186
1113 /* Use already configured phy mode */ 1187 return phylink_connect_phy(dp->pl, slave_dev->phydev);
1114 if (p->phy_interface == PHY_INTERFACE_MODE_NA)
1115 p->phy_interface = slave_dev->phydev->interface;
1116
1117 return phy_connect_direct(slave_dev, slave_dev->phydev,
1118 dsa_slave_adjust_link, p->phy_interface);
1119} 1188}
1120 1189
1121static int dsa_slave_phy_setup(struct net_device *slave_dev) 1190static int dsa_slave_phy_setup(struct net_device *slave_dev)
1122{ 1191{
1123 struct dsa_port *dp = dsa_slave_to_port(slave_dev); 1192 struct dsa_port *dp = dsa_slave_to_port(slave_dev);
1124 struct dsa_slave_priv *p = netdev_priv(slave_dev);
1125 struct device_node *port_dn = dp->dn; 1193 struct device_node *port_dn = dp->dn;
1126 struct dsa_switch *ds = dp->ds; 1194 struct dsa_switch *ds = dp->ds;
1127 struct device_node *phy_dn;
1128 bool phy_is_fixed = false;
1129 u32 phy_flags = 0; 1195 u32 phy_flags = 0;
1130 int mode, ret; 1196 int mode, ret;
1131 1197
1132 mode = of_get_phy_mode(port_dn); 1198 mode = of_get_phy_mode(port_dn);
1133 if (mode < 0) 1199 if (mode < 0)
1134 mode = PHY_INTERFACE_MODE_NA; 1200 mode = PHY_INTERFACE_MODE_NA;
1135 p->phy_interface = mode;
1136 1201
1137 phy_dn = of_parse_phandle(port_dn, "phy-handle", 0); 1202 dp->pl = phylink_create(slave_dev, of_fwnode_handle(port_dn), mode,
1138 if (!phy_dn && of_phy_is_fixed_link(port_dn)) { 1203 &dsa_slave_phylink_mac_ops);
1139 /* In the case of a fixed PHY, the DT node associated 1204 if (IS_ERR(dp->pl)) {
1140 * to the fixed PHY is the Port DT node 1205 netdev_err(slave_dev,
1141 */ 1206 "error creating PHYLINK: %ld\n", PTR_ERR(dp->pl));
1142 ret = of_phy_register_fixed_link(port_dn); 1207 return PTR_ERR(dp->pl);
1143 if (ret) {
1144 netdev_err(slave_dev, "failed to register fixed PHY: %d\n", ret);
1145 return ret;
1146 }
1147 phy_is_fixed = true;
1148 phy_dn = of_node_get(port_dn);
1149 } 1208 }
1150 1209
1210 /* Register only if the switch provides such a callback, since this
1211 * callback takes precedence over polling the link GPIO in PHYLINK
1212 * (see phylink_get_fixed_state).
1213 */
1214 if (ds->ops->phylink_fixed_state)
1215 phylink_fixed_state_cb(dp->pl, dsa_slave_phylink_fixed_state);
1216
1151 if (ds->ops->get_phy_flags) 1217 if (ds->ops->get_phy_flags)
1152 phy_flags = ds->ops->get_phy_flags(ds, dp->index); 1218 phy_flags = ds->ops->get_phy_flags(ds, dp->index);
1153 1219
1154 if (phy_dn) { 1220 ret = phylink_of_phy_connect(dp->pl, port_dn, phy_flags);
1155 slave_dev->phydev = of_phy_connect(slave_dev, phy_dn, 1221 if (ret == -ENODEV) {
1156 dsa_slave_adjust_link, 1222 /* We could not connect to a designated PHY or SFP, so use the
1157 phy_flags, 1223 * switch internal MDIO bus instead
1158 p->phy_interface); 1224 */
1159 of_node_put(phy_dn);
1160 }
1161
1162 if (slave_dev->phydev && phy_is_fixed)
1163 fixed_phy_set_link_update(slave_dev->phydev,
1164 dsa_slave_fixed_link_update);
1165
1166 /* We could not connect to a designated PHY, so use the switch internal
1167 * MDIO bus instead
1168 */
1169 if (!slave_dev->phydev) {
1170 ret = dsa_slave_phy_connect(slave_dev, dp->index); 1225 ret = dsa_slave_phy_connect(slave_dev, dp->index);
1171 if (ret) { 1226 if (ret) {
1172 netdev_err(slave_dev, "failed to connect to port %d: %d\n", 1227 netdev_err(slave_dev,
1228 "failed to connect to port %d: %d\n",
1173 dp->index, ret); 1229 dp->index, ret);
1174 if (phy_is_fixed) 1230 phylink_destroy(dp->pl);
1175 of_phy_deregister_fixed_link(port_dn);
1176 return ret; 1231 return ret;
1177 } 1232 }
1178 } 1233 }
1179 1234
1180 phy_attached_info(slave_dev->phydev);
1181
1182 return 0; 1235 return 0;
1183} 1236}
1184 1237
@@ -1193,29 +1246,26 @@ static void dsa_slave_set_lockdep_class_one(struct net_device *dev,
1193 1246
1194int dsa_slave_suspend(struct net_device *slave_dev) 1247int dsa_slave_suspend(struct net_device *slave_dev)
1195{ 1248{
1196 struct dsa_slave_priv *p = netdev_priv(slave_dev); 1249 struct dsa_port *dp = dsa_slave_to_port(slave_dev);
1197 1250
1198 netif_device_detach(slave_dev); 1251 netif_device_detach(slave_dev);
1199 1252
1200 if (slave_dev->phydev) { 1253 rtnl_lock();
1201 phy_stop(slave_dev->phydev); 1254 phylink_stop(dp->pl);
1202 p->old_pause = -1; 1255 rtnl_unlock();
1203 p->old_link = -1;
1204 p->old_duplex = -1;
1205 phy_suspend(slave_dev->phydev);
1206 }
1207 1256
1208 return 0; 1257 return 0;
1209} 1258}
1210 1259
1211int dsa_slave_resume(struct net_device *slave_dev) 1260int dsa_slave_resume(struct net_device *slave_dev)
1212{ 1261{
1262 struct dsa_port *dp = dsa_slave_to_port(slave_dev);
1263
1213 netif_device_attach(slave_dev); 1264 netif_device_attach(slave_dev);
1214 1265
1215 if (slave_dev->phydev) { 1266 rtnl_lock();
1216 phy_resume(slave_dev->phydev); 1267 phylink_start(dp->pl);
1217 phy_start(slave_dev->phydev); 1268 rtnl_unlock();
1218 }
1219 1269
1220 return 0; 1270 return 0;
1221} 1271}
@@ -1280,11 +1330,6 @@ int dsa_slave_create(struct dsa_port *port)
1280 p->dp = port; 1330 p->dp = port;
1281 INIT_LIST_HEAD(&p->mall_tc_list); 1331 INIT_LIST_HEAD(&p->mall_tc_list);
1282 p->xmit = cpu_dp->tag_ops->xmit; 1332 p->xmit = cpu_dp->tag_ops->xmit;
1283
1284 p->old_pause = -1;
1285 p->old_link = -1;
1286 p->old_duplex = -1;
1287
1288 port->slave = slave_dev; 1333 port->slave = slave_dev;
1289 1334
1290 netif_carrier_off(slave_dev); 1335 netif_carrier_off(slave_dev);
@@ -1307,9 +1352,10 @@ int dsa_slave_create(struct dsa_port *port)
1307 return 0; 1352 return 0;
1308 1353
1309out_phy: 1354out_phy:
1310 phy_disconnect(slave_dev->phydev); 1355 rtnl_lock();
1311 if (of_phy_is_fixed_link(port->dn)) 1356 phylink_disconnect_phy(p->dp->pl);
1312 of_phy_deregister_fixed_link(port->dn); 1357 rtnl_unlock();
1358 phylink_destroy(p->dp->pl);
1313out_free: 1359out_free:
1314 free_percpu(p->stats64); 1360 free_percpu(p->stats64);
1315 free_netdev(slave_dev); 1361 free_netdev(slave_dev);
@@ -1321,17 +1367,15 @@ void dsa_slave_destroy(struct net_device *slave_dev)
1321{ 1367{
1322 struct dsa_port *dp = dsa_slave_to_port(slave_dev); 1368 struct dsa_port *dp = dsa_slave_to_port(slave_dev);
1323 struct dsa_slave_priv *p = netdev_priv(slave_dev); 1369 struct dsa_slave_priv *p = netdev_priv(slave_dev);
1324 struct device_node *port_dn = dp->dn;
1325 1370
1326 netif_carrier_off(slave_dev); 1371 netif_carrier_off(slave_dev);
1327 if (slave_dev->phydev) { 1372 rtnl_lock();
1328 phy_disconnect(slave_dev->phydev); 1373 phylink_disconnect_phy(dp->pl);
1374 rtnl_unlock();
1329 1375
1330 if (of_phy_is_fixed_link(port_dn))
1331 of_phy_deregister_fixed_link(port_dn);
1332 }
1333 dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER); 1376 dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER);
1334 unregister_netdev(slave_dev); 1377 unregister_netdev(slave_dev);
1378 phylink_destroy(dp->pl);
1335 free_percpu(p->stats64); 1379 free_percpu(p->stats64);
1336 free_netdev(slave_dev); 1380 free_netdev(slave_dev);
1337} 1381}
@@ -1394,6 +1438,9 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work)
1394 switch (switchdev_work->event) { 1438 switch (switchdev_work->event) {
1395 case SWITCHDEV_FDB_ADD_TO_DEVICE: 1439 case SWITCHDEV_FDB_ADD_TO_DEVICE:
1396 fdb_info = &switchdev_work->fdb_info; 1440 fdb_info = &switchdev_work->fdb_info;
1441 if (!fdb_info->added_by_user)
1442 break;
1443
1397 err = dsa_port_fdb_add(dp, fdb_info->addr, fdb_info->vid); 1444 err = dsa_port_fdb_add(dp, fdb_info->addr, fdb_info->vid);
1398 if (err) { 1445 if (err) {
1399 netdev_dbg(dev, "fdb add failed err=%d\n", err); 1446 netdev_dbg(dev, "fdb add failed err=%d\n", err);
@@ -1405,6 +1452,9 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work)
1405 1452
1406 case SWITCHDEV_FDB_DEL_TO_DEVICE: 1453 case SWITCHDEV_FDB_DEL_TO_DEVICE:
1407 fdb_info = &switchdev_work->fdb_info; 1454 fdb_info = &switchdev_work->fdb_info;
1455 if (!fdb_info->added_by_user)
1456 break;
1457
1408 err = dsa_port_fdb_del(dp, fdb_info->addr, fdb_info->vid); 1458 err = dsa_port_fdb_del(dp, fdb_info->addr, fdb_info->vid);
1409 if (err) { 1459 if (err) {
1410 netdev_dbg(dev, "fdb del failed err=%d\n", err); 1460 netdev_dbg(dev, "fdb del failed err=%d\n", err);
@@ -1457,8 +1507,7 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused,
1457 switch (event) { 1507 switch (event) {
1458 case SWITCHDEV_FDB_ADD_TO_DEVICE: /* fall through */ 1508 case SWITCHDEV_FDB_ADD_TO_DEVICE: /* fall through */
1459 case SWITCHDEV_FDB_DEL_TO_DEVICE: 1509 case SWITCHDEV_FDB_DEL_TO_DEVICE:
1460 if (dsa_slave_switchdev_fdb_work_init(switchdev_work, 1510 if (dsa_slave_switchdev_fdb_work_init(switchdev_work, ptr))
1461 ptr))
1462 goto err_fdb_work_init; 1511 goto err_fdb_work_init;
1463 dev_hold(dev); 1512 dev_hold(dev);
1464 break; 1513 break;
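With the slave device now driven by PHYLINK, a switch driver participates through the new dsa_switch_ops phylink callbacks (the wrappers above fall back to adjust_link only when those are absent). A sketch of what a driver might provide; all mydrv_* symbols are hypothetical and the supported-mode list is purely illustrative:

static void mydrv_phylink_validate(struct dsa_switch *ds, int port,
				   unsigned long *supported,
				   struct phylink_link_state *state)
{
	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };

	phylink_set(mask, Autoneg);
	phylink_set(mask, 10baseT_Full);
	phylink_set(mask, 100baseT_Full);
	phylink_set(mask, 1000baseT_Full);

	bitmap_and(supported, supported, mask,
		   __ETHTOOL_LINK_MODE_MASK_NBITS);
	bitmap_and(state->advertising, state->advertising, mask,
		   __ETHTOOL_LINK_MODE_MASK_NBITS);
}

static void mydrv_phylink_mac_config(struct dsa_switch *ds, int port,
				     unsigned int mode,
				     const struct phylink_link_state *state)
{
	/* program the port MAC for state->interface/speed/duplex/pause */
}

static void mydrv_phylink_mac_link_down(struct dsa_switch *ds, int port,
					unsigned int mode,
					phy_interface_t interface)
{
	/* force the port's MAC link down */
}

static void mydrv_phylink_mac_link_up(struct dsa_switch *ds, int port,
				      unsigned int mode,
				      phy_interface_t interface,
				      struct phy_device *phydev)
{
	/* force the port's MAC link up */
}

static const struct dsa_switch_ops mydrv_switch_ops = {
	/* ...the usual setup/port callbacks go here as well... */
	.phylink_validate	= mydrv_phylink_validate,
	.phylink_mac_config	= mydrv_phylink_mac_config,
	.phylink_mac_link_down	= mydrv_phylink_mac_link_down,
	.phylink_mac_link_up	= mydrv_phylink_mac_link_up,
};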
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index eaeba9b99a73..ee28440f57c5 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -128,15 +128,15 @@ u32 eth_get_headlen(void *data, unsigned int len)
128{ 128{
129 const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG; 129 const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
130 const struct ethhdr *eth = (const struct ethhdr *)data; 130 const struct ethhdr *eth = (const struct ethhdr *)data;
131 struct flow_keys keys; 131 struct flow_keys_basic keys;
132 132
133 /* this should never happen, but better safe than sorry */ 133 /* this should never happen, but better safe than sorry */
134 if (unlikely(len < sizeof(*eth))) 134 if (unlikely(len < sizeof(*eth)))
135 return len; 135 return len;
136 136
137 /* parse any remaining L2/L3 headers, check for L4 */ 137 /* parse any remaining L2/L3 headers, check for L4 */
138 if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto, 138 if (!skb_flow_dissect_flow_keys_basic(NULL, &keys, data, eth->h_proto,
139 sizeof(*eth), len, flags)) 139 sizeof(*eth), len, flags))
140 return max_t(u32, keys.control.thoff, sizeof(*eth)); 140 return max_t(u32, keys.control.thoff, sizeof(*eth));
141 141
142 /* parse for any L4 headers */ 142 /* parse for any L4 headers */
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index a07b7dd06def..eec9569ffa5c 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -13,7 +13,10 @@ obj-y := route.o inetpeer.o protocol.o \
13 tcp_offload.o datagram.o raw.o udp.o udplite.o \ 13 tcp_offload.o datagram.o raw.o udp.o udplite.o \
14 udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ 14 udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
15 fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ 15 fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
16 inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o 16 inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
17 metrics.o netlink.o
18
19obj-$(CONFIG_BPFILTER) += bpfilter/
17 20
18obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o 21obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
19obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o 22obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8a59428e63ab..15e125558c76 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -994,7 +994,9 @@ const struct proto_ops inet_stream_ops = {
994 .getsockopt = sock_common_getsockopt, 994 .getsockopt = sock_common_getsockopt,
995 .sendmsg = inet_sendmsg, 995 .sendmsg = inet_sendmsg,
996 .recvmsg = inet_recvmsg, 996 .recvmsg = inet_recvmsg,
997 .mmap = sock_no_mmap, 997#ifdef CONFIG_MMU
998 .mmap = tcp_mmap,
999#endif
998 .sendpage = inet_sendpage, 1000 .sendpage = inet_sendpage,
999 .splice_read = tcp_splice_read, 1001 .splice_read = tcp_splice_read,
1000 .read_sock = tcp_read_sock, 1002 .read_sock = tcp_read_sock,
@@ -1006,6 +1008,7 @@ const struct proto_ops inet_stream_ops = {
1006 .compat_getsockopt = compat_sock_common_getsockopt, 1008 .compat_getsockopt = compat_sock_common_getsockopt,
1007 .compat_ioctl = inet_compat_ioctl, 1009 .compat_ioctl = inet_compat_ioctl,
1008#endif 1010#endif
1011 .set_rcvlowat = tcp_set_rcvlowat,
1009}; 1012};
1010EXPORT_SYMBOL(inet_stream_ops); 1013EXPORT_SYMBOL(inet_stream_ops);
1011 1014
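The two hooks added here are what userspace touches for the new receive features: SO_RCVLOWAT is now honoured by TCP via tcp_set_rcvlowat(), and mmap() on a TCP socket reaches tcp_mmap() as part of zero-copy receive. A rough userspace sketch, assuming a kernel and uapi headers that provide TCP_ZEROCOPY_RECEIVE (the local struct below mirrors the 4.18 uapi layout; error handling is trimmed):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <linux/types.h>

#ifndef TCP_ZEROCOPY_RECEIVE
#define TCP_ZEROCOPY_RECEIVE 35
#endif

struct zc_receive {			/* mirrors struct tcp_zerocopy_receive */
	__u64 address;			/* in: address of the mmap()ed region */
	__u32 length;			/* in: bytes wanted, out: bytes mapped */
	__u32 recv_skip_hint;		/* out: bytes to consume with recv() */
};

static int read_chunk_zerocopy(int fd, size_t chunk)
{
	struct zc_receive zc;
	socklen_t zc_len = sizeof(zc);
	int lowat = chunk;
	void *addr;

	/* only wake up the reader once a full chunk is queued */
	if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat)) < 0)
		return -1;

	addr = mmap(NULL, chunk, PROT_READ, MAP_SHARED, fd, 0);
	if (addr == MAP_FAILED)
		return -1;

	memset(&zc, 0, sizeof(zc));
	zc.address = (__u64)(unsigned long)addr;
	zc.length = chunk;

	/* map the queued payload pages into the region instead of copying */
	if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len) == 0)
		printf("mapped %u bytes, %u left for recv()\n",
		       zc.length, zc.recv_skip_hint);

	munmap(addr, chunk);
	return 0;
}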
diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
new file mode 100644
index 000000000000..ce262d76cc48
--- /dev/null
+++ b/net/ipv4/bpfilter/Makefile
@@ -0,0 +1,2 @@
1obj-$(CONFIG_BPFILTER) += sockopt.o
2
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
new file mode 100644
index 000000000000..5e04ed25bc0e
--- /dev/null
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -0,0 +1,43 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <linux/uaccess.h>
3#include <linux/bpfilter.h>
4#include <uapi/linux/bpf.h>
5#include <linux/wait.h>
6#include <linux/kmod.h>
7
8int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
9 char __user *optval,
10 unsigned int optlen, bool is_set);
11EXPORT_SYMBOL_GPL(bpfilter_process_sockopt);
12
13static int bpfilter_mbox_request(struct sock *sk, int optname,
14 char __user *optval,
15 unsigned int optlen, bool is_set)
16{
17 if (!bpfilter_process_sockopt) {
18 int err = request_module("bpfilter");
19
20 if (err)
21 return err;
22 if (!bpfilter_process_sockopt)
23 return -ECHILD;
24 }
25 return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set);
26}
27
28int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
29 unsigned int optlen)
30{
31 return bpfilter_mbox_request(sk, optname, optval, optlen, true);
32}
33
34int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
35 int __user *optlen)
36{
37 int len;
38
39 if (get_user(len, optlen))
40 return -EFAULT;
41
42 return bpfilter_mbox_request(sk, optname, optval, len, false);
43}
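For orientation, the mailbox above is intended to be invoked from the IPv4 setsockopt/getsockopt paths when the option number falls in the iptables range, so the bpfilter user-mode helper can be pulled in on demand via request_module(). A simplified sketch of such a call site; the option-range macros are placeholders, not the real uapi names:

#ifdef CONFIG_BPFILTER
static int hypothetical_ip_setsockopt_hook(struct sock *sk, int optname,
					   char __user *optval,
					   unsigned int optlen)
{
	/* HYPOTHETICAL_IPT_FIRST/HYPOTHETICAL_IPT_LAST stand in for the
	 * iptables sockopt range (IPT_SO_SET_* and friends).
	 */
	if (optname >= HYPOTHETICAL_IPT_FIRST &&
	    optname <= HYPOTHETICAL_IPT_LAST)
		return bpfilter_ip_set_sockopt(sk, optname, optval, optlen);

	return -ENOPROTOOPT;
}
#endif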
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 40f001782c1b..d7585ab1a77a 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -99,6 +99,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
99 [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, 99 [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
100 [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, 100 [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
101 [IFA_FLAGS] = { .type = NLA_U32 }, 101 [IFA_FLAGS] = { .type = NLA_U32 },
102 [IFA_RT_PRIORITY] = { .type = NLA_U32 },
102}; 103};
103 104
104#define IN4_ADDR_HSIZE_SHIFT 8 105#define IN4_ADDR_HSIZE_SHIFT 8
@@ -835,6 +836,9 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
835 else 836 else
836 memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); 837 memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
837 838
839 if (tb[IFA_RT_PRIORITY])
840 ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
841
838 if (tb[IFA_CACHEINFO]) { 842 if (tb[IFA_CACHEINFO]) {
839 struct ifa_cacheinfo *ci; 843 struct ifa_cacheinfo *ci;
840 844
@@ -906,12 +910,20 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
906 return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, 910 return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid,
907 extack); 911 extack);
908 } else { 912 } else {
913 u32 new_metric = ifa->ifa_rt_priority;
914
909 inet_free_ifa(ifa); 915 inet_free_ifa(ifa);
910 916
911 if (nlh->nlmsg_flags & NLM_F_EXCL || 917 if (nlh->nlmsg_flags & NLM_F_EXCL ||
912 !(nlh->nlmsg_flags & NLM_F_REPLACE)) 918 !(nlh->nlmsg_flags & NLM_F_REPLACE))
913 return -EEXIST; 919 return -EEXIST;
914 ifa = ifa_existing; 920 ifa = ifa_existing;
921
922 if (ifa->ifa_rt_priority != new_metric) {
923 fib_modify_prefix_metric(ifa, new_metric);
924 ifa->ifa_rt_priority = new_metric;
925 }
926
915 set_ifa_lifetime(ifa, valid_lft, prefered_lft); 927 set_ifa_lifetime(ifa, valid_lft, prefered_lft);
916 cancel_delayed_work(&check_lifetime_work); 928 cancel_delayed_work(&check_lifetime_work);
917 queue_delayed_work(system_power_efficient_wq, 929 queue_delayed_work(system_power_efficient_wq,
@@ -1549,6 +1561,7 @@ static size_t inet_nlmsg_size(void)
1549 + nla_total_size(4) /* IFA_BROADCAST */ 1561 + nla_total_size(4) /* IFA_BROADCAST */
1550 + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ 1562 + nla_total_size(IFNAMSIZ) /* IFA_LABEL */
1551 + nla_total_size(4) /* IFA_FLAGS */ 1563 + nla_total_size(4) /* IFA_FLAGS */
1564 + nla_total_size(4) /* IFA_RT_PRIORITY */
1552 + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ 1565 + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */
1553} 1566}
1554 1567
@@ -1618,6 +1631,8 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
1618 (ifa->ifa_label[0] && 1631 (ifa->ifa_label[0] &&
1619 nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || 1632 nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
1620 nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) || 1633 nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) ||
1634 (ifa->ifa_rt_priority &&
1635 nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) ||
1621 put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp, 1636 put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp,
1622 preferred, valid)) 1637 preferred, valid))
1623 goto nla_put_failure; 1638 goto nla_put_failure;
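Userspace opts into the new per-address route metric by attaching IFA_RT_PRIORITY (an NLA_U32, per the policy above) to an RTM_NEWADDR request; with NLM_F_REPLACE on an existing address, only the prefix-route metric is swapped via fib_modify_prefix_metric(). A bare-bones rtnetlink sketch, assuming uapi headers new enough to define the attribute (the fallback value matches the 4.18 uapi) and placeholder device/address values; error handling is trimmed and CAP_NET_ADMIN is required:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <stdint.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_addr.h>

#ifndef IFA_RT_PRIORITY
#define IFA_RT_PRIORITY 9	/* value in the 4.18 uapi */
#endif

static void put_attr(struct nlmsghdr *nlh, unsigned short type,
		     const void *data, unsigned short len)
{
	struct rtattr *rta;

	rta = (struct rtattr *)((char *)nlh + NLMSG_ALIGN(nlh->nlmsg_len));
	rta->rta_type = type;
	rta->rta_len = RTA_LENGTH(len);
	memcpy(RTA_DATA(rta), data, len);
	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct ifaddrmsg ifa;
		char attrs[128];
	} req;
	struct sockaddr_nl dst = { .nl_family = AF_NETLINK };
	uint32_t metric = 100;
	struct in_addr addr;
	int fd;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return 1;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifa));
	req.nlh.nlmsg_type = RTM_NEWADDR;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_REPLACE | NLM_F_ACK;
	req.ifa.ifa_family = AF_INET;
	req.ifa.ifa_prefixlen = 24;
	req.ifa.ifa_index = if_nametoindex("eth0");	/* placeholder */

	inet_pton(AF_INET, "192.0.2.1", &addr);		/* placeholder */
	put_attr(&req.nlh, IFA_LOCAL, &addr, sizeof(addr));
	put_attr(&req.nlh, IFA_ADDRESS, &addr, sizeof(addr));
	put_attr(&req.nlh, IFA_RT_PRIORITY, &metric, sizeof(metric));

	if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("sendto");
	close(fd);
	return 0;
}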
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index e66172aaf241..63aa39b3af03 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -354,8 +354,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
354 fl4.fl4_dport = 0; 354 fl4.fl4_dport = 0;
355 } 355 }
356 356
357 trace_fib_validate_source(dev, &fl4);
358
359 if (fib_lookup(net, &fl4, &res, 0)) 357 if (fib_lookup(net, &fl4, &res, 0))
360 goto last_resort; 358 goto last_resort;
361 if (res.type != RTN_UNICAST && 359 if (res.type != RTN_UNICAST &&
@@ -650,6 +648,9 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
650 [RTA_UID] = { .type = NLA_U32 }, 648 [RTA_UID] = { .type = NLA_U32 },
651 [RTA_MARK] = { .type = NLA_U32 }, 649 [RTA_MARK] = { .type = NLA_U32 },
652 [RTA_TABLE] = { .type = NLA_U32 }, 650 [RTA_TABLE] = { .type = NLA_U32 },
651 [RTA_IP_PROTO] = { .type = NLA_U8 },
652 [RTA_SPORT] = { .type = NLA_U16 },
653 [RTA_DPORT] = { .type = NLA_U16 },
653}; 654};
654 655
655static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, 656static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
@@ -846,7 +847,8 @@ out_err:
846 * to fib engine. It is legal, because all events occur 847 * to fib engine. It is legal, because all events occur
847 * only when netlink is already locked. 848 * only when netlink is already locked.
848 */ 849 */
849static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) 850static void fib_magic(int cmd, int type, __be32 dst, int dst_len,
851 struct in_ifaddr *ifa, u32 rt_priority)
850{ 852{
851 struct net *net = dev_net(ifa->ifa_dev->dev); 853 struct net *net = dev_net(ifa->ifa_dev->dev);
852 u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev); 854 u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev);
@@ -856,6 +858,7 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
856 .fc_type = type, 858 .fc_type = type,
857 .fc_dst = dst, 859 .fc_dst = dst,
858 .fc_dst_len = dst_len, 860 .fc_dst_len = dst_len,
861 .fc_priority = rt_priority,
859 .fc_prefsrc = ifa->ifa_local, 862 .fc_prefsrc = ifa->ifa_local,
860 .fc_oif = ifa->ifa_dev->dev->ifindex, 863 .fc_oif = ifa->ifa_dev->dev->ifindex,
861 .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND, 864 .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
@@ -901,31 +904,57 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
901 } 904 }
902 } 905 }
903 906
904 fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); 907 fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0);
905 908
906 if (!(dev->flags & IFF_UP)) 909 if (!(dev->flags & IFF_UP))
907 return; 910 return;
908 911
909 /* Add broadcast address, if it is explicitly assigned. */ 912 /* Add broadcast address, if it is explicitly assigned. */
910 if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) 913 if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
911 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); 914 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
915 prim, 0);
912 916
913 if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && 917 if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
914 (prefix != addr || ifa->ifa_prefixlen < 32)) { 918 (prefix != addr || ifa->ifa_prefixlen < 32)) {
915 if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) 919 if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
916 fib_magic(RTM_NEWROUTE, 920 fib_magic(RTM_NEWROUTE,
917 dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, 921 dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
918 prefix, ifa->ifa_prefixlen, prim); 922 prefix, ifa->ifa_prefixlen, prim,
923 ifa->ifa_rt_priority);
919 924
920 /* Add network specific broadcasts, when it takes a sense */ 925 /* Add network specific broadcasts, when it takes a sense */
921 if (ifa->ifa_prefixlen < 31) { 926 if (ifa->ifa_prefixlen < 31) {
922 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim); 927 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32,
928 prim, 0);
923 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, 929 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
924 32, prim); 930 32, prim, 0);
925 } 931 }
926 } 932 }
927} 933}
928 934
935void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
936{
937 __be32 prefix = ifa->ifa_address & ifa->ifa_mask;
938 struct in_device *in_dev = ifa->ifa_dev;
939 struct net_device *dev = in_dev->dev;
940
941 if (!(dev->flags & IFF_UP) ||
942 ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
943 ipv4_is_zeronet(prefix) ||
944 prefix == ifa->ifa_local || ifa->ifa_prefixlen == 32)
945 return;
946
947 /* add the new */
948 fib_magic(RTM_NEWROUTE,
949 dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
950 prefix, ifa->ifa_prefixlen, ifa, new_metric);
951
952 /* delete the old */
953 fib_magic(RTM_DELROUTE,
954 dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
955 prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority);
956}
957
929/* Delete primary or secondary address. 958/* Delete primary or secondary address.
930 * Optionally, on secondary address promotion consider the addresses 959 * Optionally, on secondary address promotion consider the addresses
931 * from subnet iprim as deleted, even if they are in device list. 960 * from subnet iprim as deleted, even if they are in device list.
@@ -967,7 +996,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
967 if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) 996 if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
968 fib_magic(RTM_DELROUTE, 997 fib_magic(RTM_DELROUTE,
969 dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, 998 dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
970 any, ifa->ifa_prefixlen, prim); 999 any, ifa->ifa_prefixlen, prim, 0);
971 subnet = 1; 1000 subnet = 1;
972 } 1001 }
973 1002
@@ -1051,17 +1080,20 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
1051 1080
1052no_promotions: 1081no_promotions:
1053 if (!(ok & BRD_OK)) 1082 if (!(ok & BRD_OK))
1054 fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); 1083 fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
1084 prim, 0);
1055 if (subnet && ifa->ifa_prefixlen < 31) { 1085 if (subnet && ifa->ifa_prefixlen < 31) {
1056 if (!(ok & BRD1_OK)) 1086 if (!(ok & BRD1_OK))
1057 fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); 1087 fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32,
1088 prim, 0);
1058 if (!(ok & BRD0_OK)) 1089 if (!(ok & BRD0_OK))
1059 fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); 1090 fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32,
1091 prim, 0);
1060 } 1092 }
1061 if (!(ok & LOCAL_OK)) { 1093 if (!(ok & LOCAL_OK)) {
1062 unsigned int addr_type; 1094 unsigned int addr_type;
1063 1095
1064 fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); 1096 fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0);
1065 1097
1066 /* Check, that this local address finally disappeared. */ 1098 /* Check, that this local address finally disappeared. */
1067 addr_type = inet_addr_type_dev_table(dev_net(dev), dev, 1099 addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 737d11bc8838..f8eb78d042a4 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -213,14 +213,17 @@ static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
213 213
214static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, 214static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
215 struct fib_rule_hdr *frh, 215 struct fib_rule_hdr *frh,
216 struct nlattr **tb) 216 struct nlattr **tb,
217 struct netlink_ext_ack *extack)
217{ 218{
218 struct net *net = sock_net(skb->sk); 219 struct net *net = sock_net(skb->sk);
219 int err = -EINVAL; 220 int err = -EINVAL;
220 struct fib4_rule *rule4 = (struct fib4_rule *) rule; 221 struct fib4_rule *rule4 = (struct fib4_rule *) rule;
221 222
222 if (frh->tos & ~IPTOS_TOS_MASK) 223 if (frh->tos & ~IPTOS_TOS_MASK) {
224 NL_SET_ERR_MSG(extack, "Invalid tos");
223 goto errout; 225 goto errout;
226 }
224 227
225 /* split local/main if they are not already split */ 228 /* split local/main if they are not already split */
226 err = fib_unmerge(net); 229 err = fib_unmerge(net);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c27122f01b87..f3c89ccf14c5 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -717,6 +717,8 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
717 nla_strlcpy(tmp, nla, sizeof(tmp)); 717 nla_strlcpy(tmp, nla, sizeof(tmp));
718 val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); 718 val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
719 } else { 719 } else {
720 if (nla_len(nla) != sizeof(u32))
721 return false;
720 val = nla_get_u32(nla); 722 val = nla_get_u32(nla);
721 } 723 }
722 724
@@ -1019,47 +1021,8 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
1019static int 1021static int
1020fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) 1022fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
1021{ 1023{
1022 bool ecn_ca = false; 1024 return ip_metrics_convert(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len,
1023 struct nlattr *nla; 1025 fi->fib_metrics->metrics);
1024 int remaining;
1025
1026 if (!cfg->fc_mx)
1027 return 0;
1028
1029 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1030 int type = nla_type(nla);
1031 u32 val;
1032
1033 if (!type)
1034 continue;
1035 if (type > RTAX_MAX)
1036 return -EINVAL;
1037
1038 if (type == RTAX_CC_ALGO) {
1039 char tmp[TCP_CA_NAME_MAX];
1040
1041 nla_strlcpy(tmp, nla, sizeof(tmp));
1042 val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
1043 if (val == TCP_CA_UNSPEC)
1044 return -EINVAL;
1045 } else {
1046 val = nla_get_u32(nla);
1047 }
1048 if (type == RTAX_ADVMSS && val > 65535 - 40)
1049 val = 65535 - 40;
1050 if (type == RTAX_MTU && val > 65535 - 15)
1051 val = 65535 - 15;
1052 if (type == RTAX_HOPLIMIT && val > 255)
1053 val = 255;
1054 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1055 return -EINVAL;
1056 fi->fib_metrics->metrics[type - 1] = val;
1057 }
1058
1059 if (ecn_ca)
1060 fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1061
1062 return 0;
1063} 1026}
1064 1027
1065struct fib_info *fib_create_info(struct fib_config *cfg, 1028struct fib_info *fib_create_info(struct fib_config *cfg,
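fib_convert_metrics() now just defers to the shared ip_metrics_convert() helper (net/ipv4/metrics.c, added through the Makefile hunk earlier in this series). A sketch of what that helper plausibly contains, reconstructed from the body removed above rather than copied from the new file:

int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len,
		       u32 *metrics)
{
	bool ecn_ca = false;
	struct nlattr *nla;
	int remaining;

	if (!fc_mx)
		return 0;

	nla_for_each_attr(nla, fc_mx, fc_mx_len, remaining) {
		int type = nla_type(nla);
		u32 val;

		if (!type)
			continue;
		if (type > RTAX_MAX)
			return -EINVAL;

		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];

			nla_strlcpy(tmp, nla, sizeof(tmp));
			val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
			if (val == TCP_CA_UNSPEC)
				return -EINVAL;
		} else {
			val = nla_get_u32(nla);
		}
		if (type == RTAX_ADVMSS && val > 65535 - 40)
			val = 65535 - 40;
		if (type == RTAX_MTU && val > 65535 - 15)
			val = 65535 - 15;
		if (type == RTAX_HOPLIMIT && val > 255)
			val = 255;
		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
			return -EINVAL;
		metrics[type - 1] = val;
	}

	if (ecn_ca)
		metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;

	return 0;
}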
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 99c23a0cb8ca..5bc0c89e81e4 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1326,14 +1326,14 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 	unsigned long index;
 	t_key cindex;
 
-	trace_fib_table_lookup(tb->tb_id, flp);
-
 	pn = t->kv;
 	cindex = 0;
 
 	n = get_child_rcu(pn, cindex);
-	if (!n)
+	if (!n) {
+		trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN);
 		return -EAGAIN;
+	}
 
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 	this_cpu_inc(stats->gets);
@@ -1416,8 +1416,11 @@ backtrace:
 			 * nothing for us to do as we do not have any
 			 * further nodes to parse.
 			 */
-			if (IS_TRIE(pn))
+			if (IS_TRIE(pn)) {
+				trace_fib_table_lookup(tb->tb_id, flp,
+						       NULL, -EAGAIN);
 				return -EAGAIN;
+			}
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			this_cpu_inc(stats->backtrack);
 #endif
@@ -1459,6 +1462,7 @@ found:
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			this_cpu_inc(stats->semantic_match_passed);
 #endif
+			trace_fib_table_lookup(tb->tb_id, flp, NULL, err);
 			return err;
 		}
 		if (fi->fib_flags & RTNH_F_DEAD)
@@ -1494,7 +1498,7 @@ found:
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			this_cpu_inc(stats->semantic_match_passed);
 #endif
-			trace_fib_table_lookup_nh(nh);
+			trace_fib_table_lookup(tb->tb_id, flp, nh, err);
 
 			return err;
 		}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 881ac6d046f2..33a88e045efd 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -27,11 +27,6 @@
 #include <net/sock_reuseport.h>
 #include <net/addrconf.h>
 
-#ifdef INET_CSK_DEBUG
-const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
-EXPORT_SYMBOL(inet_csk_timer_bug_msg);
-#endif
-
 #if IS_ENABLED(CONFIG_IPV6)
 /* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
  * only, and any IPv4 addresses if not IPv6 only
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f200b304f76c..2d8efeecf619 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -578,6 +578,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 	int tunnel_hlen;
 	int version;
 	__be16 df;
+	int nhoff;
+	int thoff;
 
 	tun_info = skb_tunnel_info(skb);
 	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
@@ -605,6 +607,16 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 		truncate = true;
 	}
 
+	nhoff = skb_network_header(skb) - skb_mac_header(skb);
+	if (skb->protocol == htons(ETH_P_IP) &&
+	    (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
+		truncate = true;
+
+	thoff = skb_transport_header(skb) - skb_mac_header(skb);
+	if (skb->protocol == htons(ETH_P_IPV6) &&
+	    (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
+		truncate = true;
+
 	if (version == 1) {
 		erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
 				    ntohl(md->u.index), truncate, true);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d54abc097800..af5a830ff6ad 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -878,11 +878,14 @@ static int __ip_append_data(struct sock *sk,
 	struct rtable *rt = (struct rtable *)cork->dst;
 	unsigned int wmem_alloc_delta = 0;
 	u32 tskey = 0;
+	bool paged;
 
 	skb = skb_peek_tail(queue);
 
 	exthdrlen = !skb ? rt->dst.header_len : 0;
-	mtu = cork->fragsize;
+	mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
+	paged = !!cork->gso_size;
+
 	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
 		tskey = sk->sk_tskey++;
@@ -906,8 +909,8 @@ static int __ip_append_data(struct sock *sk,
 	if (transhdrlen &&
 	    length + fragheaderlen <= mtu &&
 	    rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
-	    !(flags & MSG_MORE) &&
-	    !exthdrlen)
+	    (!(flags & MSG_MORE) || cork->gso_size) &&
+	    (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
 		csummode = CHECKSUM_PARTIAL;
 
 	cork->length += length;
@@ -933,6 +936,7 @@ static int __ip_append_data(struct sock *sk,
933 unsigned int fraglen; 936 unsigned int fraglen;
934 unsigned int fraggap; 937 unsigned int fraggap;
935 unsigned int alloclen; 938 unsigned int alloclen;
939 unsigned int pagedlen = 0;
936 struct sk_buff *skb_prev; 940 struct sk_buff *skb_prev;
937alloc_new_skb: 941alloc_new_skb:
938 skb_prev = skb; 942 skb_prev = skb;
@@ -953,8 +957,12 @@ alloc_new_skb:
953 if ((flags & MSG_MORE) && 957 if ((flags & MSG_MORE) &&
954 !(rt->dst.dev->features&NETIF_F_SG)) 958 !(rt->dst.dev->features&NETIF_F_SG))
955 alloclen = mtu; 959 alloclen = mtu;
956 else 960 else if (!paged)
957 alloclen = fraglen; 961 alloclen = fraglen;
962 else {
963 alloclen = min_t(int, fraglen, MAX_HEADER);
964 pagedlen = fraglen - alloclen;
965 }
958 966
959 alloclen += exthdrlen; 967 alloclen += exthdrlen;
960 968
@@ -998,7 +1006,7 @@ alloc_new_skb:
998 /* 1006 /*
999 * Find where to start putting bytes. 1007 * Find where to start putting bytes.
1000 */ 1008 */
1001 data = skb_put(skb, fraglen + exthdrlen); 1009 data = skb_put(skb, fraglen + exthdrlen - pagedlen);
1002 skb_set_network_header(skb, exthdrlen); 1010 skb_set_network_header(skb, exthdrlen);
1003 skb->transport_header = (skb->network_header + 1011 skb->transport_header = (skb->network_header +
1004 fragheaderlen); 1012 fragheaderlen);
@@ -1014,7 +1022,7 @@ alloc_new_skb:
1014 pskb_trim_unique(skb_prev, maxfraglen); 1022 pskb_trim_unique(skb_prev, maxfraglen);
1015 } 1023 }
1016 1024
1017 copy = datalen - transhdrlen - fraggap; 1025 copy = datalen - transhdrlen - fraggap - pagedlen;
1018 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { 1026 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1019 err = -EFAULT; 1027 err = -EFAULT;
1020 kfree_skb(skb); 1028 kfree_skb(skb);
@@ -1022,7 +1030,7 @@ alloc_new_skb:
1022 } 1030 }
1023 1031
1024 offset += copy; 1032 offset += copy;
1025 length -= datalen - fraggap; 1033 length -= copy + transhdrlen;
1026 transhdrlen = 0; 1034 transhdrlen = 0;
1027 exthdrlen = 0; 1035 exthdrlen = 0;
1028 csummode = CHECKSUM_NONE; 1036 csummode = CHECKSUM_NONE;
@@ -1136,6 +1144,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1136 *rtp = NULL; 1144 *rtp = NULL;
1137 cork->fragsize = ip_sk_use_pmtu(sk) ? 1145 cork->fragsize = ip_sk_use_pmtu(sk) ?
1138 dst_mtu(&rt->dst) : rt->dst.dev->mtu; 1146 dst_mtu(&rt->dst) : rt->dst.dev->mtu;
1147
1148 cork->gso_size = sk->sk_type == SOCK_DGRAM ? ipc->gso_size : 0;
1139 cork->dst = &rt->dst; 1149 cork->dst = &rt->dst;
1140 cork->length = 0; 1150 cork->length = 0;
1141 cork->ttl = ipc->ttl; 1151 cork->ttl = ipc->ttl;
@@ -1215,7 +1225,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1215 return -EOPNOTSUPP; 1225 return -EOPNOTSUPP;
1216 1226
1217 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1227 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1218 mtu = cork->fragsize; 1228 mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
1219 1229
1220 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 1230 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1221 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 1231 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
@@ -1471,9 +1481,8 @@ struct sk_buff *ip_make_skb(struct sock *sk,
1471 int len, int odd, struct sk_buff *skb), 1481 int len, int odd, struct sk_buff *skb),
1472 void *from, int length, int transhdrlen, 1482 void *from, int length, int transhdrlen,
1473 struct ipcm_cookie *ipc, struct rtable **rtp, 1483 struct ipcm_cookie *ipc, struct rtable **rtp,
1474 unsigned int flags) 1484 struct inet_cork *cork, unsigned int flags)
1475{ 1485{
1476 struct inet_cork cork;
1477 struct sk_buff_head queue; 1486 struct sk_buff_head queue;
1478 int err; 1487 int err;
1479 1488
@@ -1482,22 +1491,22 @@ struct sk_buff *ip_make_skb(struct sock *sk,
1482 1491
1483 __skb_queue_head_init(&queue); 1492 __skb_queue_head_init(&queue);
1484 1493
1485 cork.flags = 0; 1494 cork->flags = 0;
1486 cork.addr = 0; 1495 cork->addr = 0;
1487 cork.opt = NULL; 1496 cork->opt = NULL;
1488 err = ip_setup_cork(sk, &cork, ipc, rtp); 1497 err = ip_setup_cork(sk, cork, ipc, rtp);
1489 if (err) 1498 if (err)
1490 return ERR_PTR(err); 1499 return ERR_PTR(err);
1491 1500
1492 err = __ip_append_data(sk, fl4, &queue, &cork, 1501 err = __ip_append_data(sk, fl4, &queue, cork,
1493 &current->task_frag, getfrag, 1502 &current->task_frag, getfrag,
1494 from, length, transhdrlen, flags); 1503 from, length, transhdrlen, flags);
1495 if (err) { 1504 if (err) {
1496 __ip_flush_pending_frames(sk, &queue, &cork); 1505 __ip_flush_pending_frames(sk, &queue, cork);
1497 return ERR_PTR(err); 1506 return ERR_PTR(err);
1498 } 1507 }
1499 1508
1500 return __ip_make_skb(sk, fl4, &queue, &cork); 1509 return __ip_make_skb(sk, fl4, &queue, cork);
1501} 1510}
1502 1511
1503/* 1512/*
@@ -1553,7 +1562,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
1553 oif = skb->skb_iif; 1562 oif = skb->skb_iif;
1554 1563
1555 flowi4_init_output(&fl4, oif, 1564 flowi4_init_output(&fl4, oif,
1556 IP4_REPLY_MARK(net, skb->mark), 1565 IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
1557 RT_TOS(arg->tos), 1566 RT_TOS(arg->tos),
1558 RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, 1567 RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
1559 ip_reply_arg_flowi_flags(arg), 1568 ip_reply_arg_flowi_flags(arg),
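
Note: the cork->gso_size plumbing in the ip_output.c hunks above is the IPv4 transmit half of UDP GSO. A minimal userspace sketch of how a sender might opt in is below; it is illustrative only, assuming the per-socket UDP_SEGMENT option and a 1400-byte segment size, neither of which is defined by these hunks themselves.

#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef UDP_SEGMENT
#define UDP_SEGMENT 103		/* from linux/udp.h if the headers lack it */
#endif

/* Send one large buffer; the stack splits it into gso_size-byte datagrams,
 * letting __ip_append_data() build a single paged skb instead of many.
 */
int send_gso_burst(int fd, const struct sockaddr_in *dst,
		   const char *buf, size_t len)
{
	int gso_size = 1400;	/* assumed per-segment payload size */

	if (setsockopt(fd, IPPROTO_UDP, UDP_SEGMENT,
		       &gso_size, sizeof(gso_size)) < 0)
		return -1;

	return sendto(fd, buf, len, 0,
		      (const struct sockaddr *)dst, sizeof(*dst));
}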
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 57bbb060faaf..fc32fdbeefa6 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -47,6 +47,8 @@
47#include <linux/errqueue.h> 47#include <linux/errqueue.h>
48#include <linux/uaccess.h> 48#include <linux/uaccess.h>
49 49
50#include <linux/bpfilter.h>
51
50/* 52/*
51 * SOL_IP control messages. 53 * SOL_IP control messages.
52 */ 54 */
@@ -1242,6 +1244,11 @@ int ip_setsockopt(struct sock *sk, int level,
 		return -ENOPROTOOPT;
 
 	err = do_ip_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_BPFILTER
+	if (optname >= BPFILTER_IPT_SO_SET_REPLACE &&
+	    optname < BPFILTER_IPT_SET_MAX)
+		err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen);
+#endif
 #ifdef CONFIG_NETFILTER
 	/* we need to exclude all possible ENOPROTOOPTs except default case */
 	if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
@@ -1550,6 +1557,11 @@ int ip_getsockopt(struct sock *sk, int level,
1550 int err; 1557 int err;
1551 1558
1552 err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0); 1559 err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
1560#ifdef CONFIG_BPFILTER
1561 if (optname >= BPFILTER_IPT_SO_GET_INFO &&
1562 optname < BPFILTER_IPT_GET_MAX)
1563 err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
1564#endif
1553#ifdef CONFIG_NETFILTER 1565#ifdef CONFIG_NETFILTER
1554 /* we need to exclude all possible ENOPROTOOPTs except default case */ 1566 /* we need to exclude all possible ENOPROTOOPTs except default case */
1555 if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && 1567 if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
@@ -1582,6 +1594,11 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname,
1582 err = do_ip_getsockopt(sk, level, optname, optval, optlen, 1594 err = do_ip_getsockopt(sk, level, optname, optval, optlen,
1583 MSG_CMSG_COMPAT); 1595 MSG_CMSG_COMPAT);
1584 1596
1597#ifdef CONFIG_BPFILTER
1598 if (optname >= BPFILTER_IPT_SO_GET_INFO &&
1599 optname < BPFILTER_IPT_GET_MAX)
1600 err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
1601#endif
1585#ifdef CONFIG_NETFILTER 1602#ifdef CONFIG_NETFILTER
1586 /* we need to exclude all possible ENOPROTOOPTs except default case */ 1603 /* we need to exclude all possible ENOPROTOOPTs except default case */
1587 if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && 1604 if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
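
Note: the three ip_sockglue.c hunks above divert the classic iptables get/set sockopt ranges to the new bpfilter user-mode helper when CONFIG_BPFILTER is enabled; the intent appears to be that existing iptables-style callers keep working unchanged. A purely illustrative userspace sketch (the ipt_replace contents are elided, and the helper name push_ruleset is made up here):

#include <netinet/in.h>
#include <sys/socket.h>
#include <linux/netfilter_ipv4/ip_tables.h>

/* Push a (fully populated) ruleset blob. With CONFIG_BPFILTER, optnames in
 * [BPFILTER_IPT_SO_SET_REPLACE, BPFILTER_IPT_SET_MAX) are serviced by
 * bpfilter_ip_set_sockopt(); otherwise the netfilter code handles them.
 */
static int push_ruleset(int fd, struct ipt_replace *repl, socklen_t len)
{
	return setsockopt(fd, IPPROTO_IP, IPT_SO_SET_REPLACE, repl, len);
}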
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 2f39479be92f..dde671e97829 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -423,17 +423,17 @@ void __init ip_tunnel_core_init(void)
 	lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6);
 }
 
-struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE;
+DEFINE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt);
 EXPORT_SYMBOL(ip_tunnel_metadata_cnt);
 
 void ip_tunnel_need_metadata(void)
 {
-	static_key_slow_inc(&ip_tunnel_metadata_cnt);
+	static_branch_inc(&ip_tunnel_metadata_cnt);
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata);
 
 void ip_tunnel_unneed_metadata(void)
 {
-	static_key_slow_dec(&ip_tunnel_metadata_cnt);
+	static_branch_dec(&ip_tunnel_metadata_cnt);
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
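
Note: the ip_tunnel_core.c hunk above moves ip_tunnel_metadata_cnt from the old struct static_key API to the newer static-branch wrappers. A self-contained sketch of that pattern follows; it is a generic example, not the tunnel code's actual consumer (which tests the key from a header elsewhere), and the example_* names are invented:

#include <linux/jump_label.h>

/* false by default: the branch below is patched to a NOP until enabled */
DEFINE_STATIC_KEY_FALSE(example_feature_key);

void example_enable_feature(void)
{
	static_branch_inc(&example_feature_key);	/* refcounted enable */
}

void example_disable_feature(void)
{
	static_branch_dec(&example_feature_key);
}

void example_hot_path(void)
{
	if (static_branch_unlikely(&example_feature_key)) {
		/* rarely-enabled slow path */
	}
}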
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index bbcbcc113d19..88212615bf4c 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -28,6 +28,9 @@
28 * 28 *
29 * Multiple Nameservers in /proc/net/pnp 29 * Multiple Nameservers in /proc/net/pnp
30 * -- Josef Siemes <jsiemes@web.de>, Aug 2002 30 * -- Josef Siemes <jsiemes@web.de>, Aug 2002
31 *
32 * NTP servers in /proc/net/ipconfig/ntp_servers
33 * -- Chris Novakovic <chris@chrisn.me.uk>, April 2018
31 */ 34 */
32 35
33#include <linux/types.h> 36#include <linux/types.h>
@@ -93,6 +96,7 @@
93#define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */ 96#define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */
94#define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers 97#define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers
95 - '3' from resolv.h */ 98 - '3' from resolv.h */
99#define CONF_NTP_SERVERS_MAX 3 /* Maximum number of NTP servers */
96 100
97#define NONE cpu_to_be32(INADDR_NONE) 101#define NONE cpu_to_be32(INADDR_NONE)
98#define ANY cpu_to_be32(INADDR_ANY) 102#define ANY cpu_to_be32(INADDR_ANY)
@@ -152,6 +156,7 @@ static int ic_proto_used; /* Protocol used, if any */
152#define ic_proto_used 0 156#define ic_proto_used 0
153#endif 157#endif
154static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */ 158static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */
159static __be32 ic_ntp_servers[CONF_NTP_SERVERS_MAX]; /* NTP server IP addresses */
155static u8 ic_domain[64]; /* DNS (not NIS) domain name */ 160static u8 ic_domain[64]; /* DNS (not NIS) domain name */
156 161
157/* 162/*
@@ -576,6 +581,15 @@ static inline void __init ic_nameservers_predef(void)
576 ic_nameservers[i] = NONE; 581 ic_nameservers[i] = NONE;
577} 582}
578 583
584/* Predefine NTP servers */
585static inline void __init ic_ntp_servers_predef(void)
586{
587 int i;
588
589 for (i = 0; i < CONF_NTP_SERVERS_MAX; i++)
590 ic_ntp_servers[i] = NONE;
591}
592
579/* 593/*
580 * DHCP/BOOTP support. 594 * DHCP/BOOTP support.
581 */ 595 */
@@ -671,6 +685,7 @@ ic_dhcp_init_options(u8 *options, struct ic_device *d)
671 17, /* Boot path */ 685 17, /* Boot path */
672 26, /* MTU */ 686 26, /* MTU */
673 40, /* NIS domain name */ 687 40, /* NIS domain name */
688 42, /* NTP servers */
674 }; 689 };
675 690
676 *e++ = 55; /* Parameter request list */ 691 *e++ = 55; /* Parameter request list */
@@ -721,9 +736,11 @@ static void __init ic_bootp_init_ext(u8 *e)
721 *e++ = 3; /* Default gateway request */ 736 *e++ = 3; /* Default gateway request */
722 *e++ = 4; 737 *e++ = 4;
723 e += 4; 738 e += 4;
724 *e++ = 5; /* Name server request */ 739#if CONF_NAMESERVERS_MAX > 0
725 *e++ = 8; 740 *e++ = 6; /* (DNS) name server request */
726 e += 8; 741 *e++ = 4 * CONF_NAMESERVERS_MAX;
742 e += 4 * CONF_NAMESERVERS_MAX;
743#endif
727 *e++ = 12; /* Host name request */ 744 *e++ = 12; /* Host name request */
728 *e++ = 32; 745 *e++ = 32;
729 e += 32; 746 e += 32;
@@ -748,7 +765,13 @@ static void __init ic_bootp_init_ext(u8 *e)
748 */ 765 */
749static inline void __init ic_bootp_init(void) 766static inline void __init ic_bootp_init(void)
750{ 767{
768 /* Re-initialise all name servers and NTP servers to NONE, in case any
769 * were set via the "ip=" or "nfsaddrs=" kernel command line parameters:
770 * any IP addresses specified there will already have been decoded but
771 * are no longer needed
772 */
751 ic_nameservers_predef(); 773 ic_nameservers_predef();
774 ic_ntp_servers_predef();
752 775
753 dev_add_pack(&bootp_packet_type); 776 dev_add_pack(&bootp_packet_type);
754} 777}
@@ -912,6 +935,15 @@ static void __init ic_do_bootp_ext(u8 *ext)
912 ic_bootp_string(utsname()->domainname, ext+1, *ext, 935 ic_bootp_string(utsname()->domainname, ext+1, *ext,
913 __NEW_UTS_LEN); 936 __NEW_UTS_LEN);
914 break; 937 break;
938 case 42: /* NTP servers */
939 servers = *ext / 4;
940 if (servers > CONF_NTP_SERVERS_MAX)
941 servers = CONF_NTP_SERVERS_MAX;
942 for (i = 0; i < servers; i++) {
943 if (ic_ntp_servers[i] == NONE)
944 memcpy(&ic_ntp_servers[i], ext+1+4*i, 4);
945 }
946 break;
915 } 947 }
916} 948}
917 949
@@ -1257,7 +1289,10 @@ static int __init ic_dynamic(void)
1257#endif /* IPCONFIG_DYNAMIC */ 1289#endif /* IPCONFIG_DYNAMIC */
1258 1290
1259#ifdef CONFIG_PROC_FS 1291#ifdef CONFIG_PROC_FS
1292/* proc_dir_entry for /proc/net/ipconfig */
1293static struct proc_dir_entry *ipconfig_dir;
1260 1294
1295/* Name servers: */
1261static int pnp_seq_show(struct seq_file *seq, void *v) 1296static int pnp_seq_show(struct seq_file *seq, void *v)
1262{ 1297{
1263 int i; 1298 int i;
@@ -1282,6 +1317,62 @@ static int pnp_seq_show(struct seq_file *seq, void *v)
1282 &ic_servaddr); 1317 &ic_servaddr);
1283 return 0; 1318 return 0;
1284} 1319}
1320
1321/* Create the /proc/net/ipconfig directory */
1322static int __init ipconfig_proc_net_init(void)
1323{
1324 ipconfig_dir = proc_net_mkdir(&init_net, "ipconfig", init_net.proc_net);
1325 if (!ipconfig_dir)
1326 return -ENOMEM;
1327
1328 return 0;
1329}
1330
1331/* Create a new file under /proc/net/ipconfig */
1332static int ipconfig_proc_net_create(const char *name,
1333 const struct file_operations *fops)
1334{
1335 char *pname;
1336 struct proc_dir_entry *p;
1337
1338 if (!ipconfig_dir)
1339 return -ENOMEM;
1340
1341 pname = kasprintf(GFP_KERNEL, "%s%s", "ipconfig/", name);
1342 if (!pname)
1343 return -ENOMEM;
1344
1345 p = proc_create(pname, 0444, init_net.proc_net, fops);
1346 kfree(pname);
1347 if (!p)
1348 return -ENOMEM;
1349
1350 return 0;
1351}
1352
1353/* Write NTP server IP addresses to /proc/net/ipconfig/ntp_servers */
1354static int ntp_servers_seq_show(struct seq_file *seq, void *v)
1355{
1356 int i;
1357
1358 for (i = 0; i < CONF_NTP_SERVERS_MAX; i++) {
1359 if (ic_ntp_servers[i] != NONE)
1360 seq_printf(seq, "%pI4\n", &ic_ntp_servers[i]);
1361 }
1362 return 0;
1363}
1364
1365static int ntp_servers_seq_open(struct inode *inode, struct file *file)
1366{
1367 return single_open(file, ntp_servers_seq_show, NULL);
1368}
1369
1370static const struct file_operations ntp_servers_seq_fops = {
1371 .open = ntp_servers_seq_open,
1372 .read = seq_read,
1373 .llseek = seq_lseek,
1374 .release = single_release,
1375};
1285#endif /* CONFIG_PROC_FS */ 1376#endif /* CONFIG_PROC_FS */
1286 1377
1287/* 1378/*
@@ -1356,8 +1447,20 @@ static int __init ip_auto_config(void)
1356 int err; 1447 int err;
1357 unsigned int i; 1448 unsigned int i;
1358 1449
1450 /* Initialise all name servers and NTP servers to NONE (but only if the
1451 * "ip=" or "nfsaddrs=" kernel command line parameters weren't decoded,
1452 * otherwise we'll overwrite the IP addresses specified there)
1453 */
1454 if (ic_set_manually == 0) {
1455 ic_nameservers_predef();
1456 ic_ntp_servers_predef();
1457 }
1458
1359#ifdef CONFIG_PROC_FS 1459#ifdef CONFIG_PROC_FS
1360 proc_create_single("pnp", 0444, init_net.proc_net, pnp_seq_show); 1460 proc_create_single("pnp", 0444, init_net.proc_net, pnp_seq_show);
1461
1462 if (ipconfig_proc_net_init() == 0)
1463 ipconfig_proc_net_create("ntp_servers", &ntp_servers_seq_fops);
1361#endif /* CONFIG_PROC_FS */ 1464#endif /* CONFIG_PROC_FS */
1362 1465
1363 if (!ic_enable) 1466 if (!ic_enable)
@@ -1469,16 +1572,32 @@ static int __init ip_auto_config(void)
1469 &ic_servaddr, &root_server_addr, root_server_path); 1572 &ic_servaddr, &root_server_addr, root_server_path);
1470 if (ic_dev_mtu) 1573 if (ic_dev_mtu)
1471 pr_cont(", mtu=%d", ic_dev_mtu); 1574 pr_cont(", mtu=%d", ic_dev_mtu);
1472 for (i = 0; i < CONF_NAMESERVERS_MAX; i++) 1575 /* Name servers (if any): */
1576 for (i = 0; i < CONF_NAMESERVERS_MAX; i++) {
1473 if (ic_nameservers[i] != NONE) { 1577 if (ic_nameservers[i] != NONE) {
1474 pr_cont(" nameserver%u=%pI4", 1578 if (i == 0)
1475 i, &ic_nameservers[i]); 1579 pr_info(" nameserver%u=%pI4",
1476 break; 1580 i, &ic_nameservers[i]);
1581 else
1582 pr_cont(", nameserver%u=%pI4",
1583 i, &ic_nameservers[i]);
1477 } 1584 }
1478 for (i++; i < CONF_NAMESERVERS_MAX; i++) 1585 if (i + 1 == CONF_NAMESERVERS_MAX)
1479 if (ic_nameservers[i] != NONE) 1586 pr_cont("\n");
1480 pr_cont(", nameserver%u=%pI4", i, &ic_nameservers[i]); 1587 }
1481 pr_cont("\n"); 1588 /* NTP servers (if any): */
1589 for (i = 0; i < CONF_NTP_SERVERS_MAX; i++) {
1590 if (ic_ntp_servers[i] != NONE) {
1591 if (i == 0)
1592 pr_info(" ntpserver%u=%pI4",
1593 i, &ic_ntp_servers[i]);
1594 else
1595 pr_cont(", ntpserver%u=%pI4",
1596 i, &ic_ntp_servers[i]);
1597 }
1598 if (i + 1 == CONF_NTP_SERVERS_MAX)
1599 pr_cont("\n");
1600 }
1482#endif /* !SILENT */ 1601#endif /* !SILENT */
1483 1602
1484 /* 1603 /*
@@ -1576,7 +1695,9 @@ static int __init ip_auto_config_setup(char *addrs)
1576 return 1; 1695 return 1;
1577 } 1696 }
1578 1697
1698 /* Initialise all name servers and NTP servers to NONE */
1579 ic_nameservers_predef(); 1699 ic_nameservers_predef();
1700 ic_ntp_servers_predef();
1580 1701
1581 /* Parse string for static IP assignment. */ 1702 /* Parse string for static IP assignment. */
1582 ip = addrs; 1703 ip = addrs;
@@ -1635,6 +1756,13 @@ static int __init ip_auto_config_setup(char *addrs)
1635 ic_nameservers[1] = NONE; 1756 ic_nameservers[1] = NONE;
1636 } 1757 }
1637 break; 1758 break;
1759 case 9:
1760 if (CONF_NTP_SERVERS_MAX >= 1) {
1761 ic_ntp_servers[0] = in_aton(ip);
1762 if (ic_ntp_servers[0] == ANY)
1763 ic_ntp_servers[0] = NONE;
1764 }
1765 break;
1638 } 1766 }
1639 } 1767 }
1640 ip = cp; 1768 ip = cp;
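
Note: with the new "case 9" above, the ip= (and nfsaddrs=) boot parameter gains a colon-separated field for an NTP server, and the accepted addresses are exported through the new /proc/net/ipconfig/ntp_servers file. A hedged example of the extended syntax, with placeholder addresses:

    ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:<dns0-ip>:<dns1-ip>:<ntp0-ip>

    # illustrative values only:
    ip=10.0.0.2:10.0.0.1:10.0.0.1:255.255.255.0:client:eth0:off:10.0.0.53:10.0.0.54:10.0.0.123

    # after boot, read the configured NTP server(s) back:
    cat /proc/net/ipconfig/ntp_servers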
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 37c4f885ff7b..9f79b9803a16 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -201,7 +201,8 @@ static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
 };
 
 static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
-			       struct fib_rule_hdr *frh, struct nlattr **tb)
+			       struct fib_rule_hdr *frh, struct nlattr **tb,
+			       struct netlink_ext_ack *extack)
 {
 	return 0;
 }
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 30221701614c..cafb0506c8c9 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -35,17 +35,19 @@ mr_table_alloc(struct net *net, u32 id,
 			       struct net *net))
 {
 	struct mr_table *mrt;
+	int err;
 
 	mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
 	if (!mrt)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	mrt->id = id;
 	write_pnet(&mrt->net, net);
 
 	mrt->ops = *ops;
-	if (rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params)) {
+	err = rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params);
+	if (err) {
 		kfree(mrt);
-		return NULL;
+		return ERR_PTR(err);
 	}
 	INIT_LIST_HEAD(&mrt->mfc_cache_list);
 	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
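
Note: since mr_table_alloc() now reports failures as ERR_PTR() values instead of NULL, its callers have to test with IS_ERR()/PTR_ERR(). A minimal caller-side sketch under that assumption; the function name and the final argument are illustrative, not the real ipmr/ip6mr call sites:

static int example_new_table(struct net *net, u32 id,
			     const struct mr_table_ops *ops)
{
	struct mr_table *mrt;

	/* mr_table_alloc() now returns ERR_PTR() codes instead of NULL */
	mrt = mr_table_alloc(net, id, ops, example_table_set);
	if (IS_ERR(mrt))
		return PTR_ERR(mrt);	/* -ENOMEM or the rhltable_init() error */

	return 0;
}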
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
new file mode 100644
index 000000000000..04311f7067e2
--- /dev/null
+++ b/net/ipv4/metrics.c
@@ -0,0 +1,55 @@
1#include <linux/netlink.h>
2#include <linux/rtnetlink.h>
3#include <linux/types.h>
4#include <net/ip.h>
5#include <net/net_namespace.h>
6#include <net/tcp.h>
7
8int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len,
9 u32 *metrics)
10{
11 bool ecn_ca = false;
12 struct nlattr *nla;
13 int remaining;
14
15 if (!fc_mx)
16 return 0;
17
18 nla_for_each_attr(nla, fc_mx, fc_mx_len, remaining) {
19 int type = nla_type(nla);
20 u32 val;
21
22 if (!type)
23 continue;
24 if (type > RTAX_MAX)
25 return -EINVAL;
26
27 if (type == RTAX_CC_ALGO) {
28 char tmp[TCP_CA_NAME_MAX];
29
30 nla_strlcpy(tmp, nla, sizeof(tmp));
31 val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
32 if (val == TCP_CA_UNSPEC)
33 return -EINVAL;
34 } else {
35 if (nla_len(nla) != sizeof(u32))
36 return -EINVAL;
37 val = nla_get_u32(nla);
38 }
39 if (type == RTAX_ADVMSS && val > 65535 - 40)
40 val = 65535 - 40;
41 if (type == RTAX_MTU && val > 65535 - 15)
42 val = 65535 - 15;
43 if (type == RTAX_HOPLIMIT && val > 255)
44 val = 255;
45 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
46 return -EINVAL;
47 metrics[type - 1] = val;
48 }
49
50 if (ecn_ca)
51 metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
52
53 return 0;
54}
55EXPORT_SYMBOL_GPL(ip_metrics_convert);
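
Note: the new net/ipv4/metrics.c helper centralises the RTA_METRICS parsing that fib_convert_metrics() (earlier in this diff) previously open-coded. A hedged sketch of a caller, with illustrative names; the in-tree caller is the fib code, not this wrapper:

static int example_convert_route_metrics(struct net *net,
					 struct nlattr *fc_mx, int fc_mx_len,
					 u32 *metrics /* RTAX_MAX entries */)
{
	int err;

	/* validates each nested attribute (type range, u32 length,
	 * RTAX_CC_ALGO lookup, value clamping) and fills metrics[]
	 */
	err = ip_metrics_convert(net, fc_mx, fc_mx_len, metrics);
	if (err)
		return err;	/* -EINVAL on a malformed attribute */

	return 0;
}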
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 280048e1e395..bbfc356cb1b5 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -29,7 +29,10 @@ config NF_SOCKET_IPV4
29 tristate "IPv4 socket lookup support" 29 tristate "IPv4 socket lookup support"
30 help 30 help
31 This option enables the IPv4 socket lookup infrastructure. This is 31 This option enables the IPv4 socket lookup infrastructure. This is
32 is required by the iptables socket match. 32 is required by the {ip,nf}tables socket match.
33
34config NF_TPROXY_IPV4
35 tristate "IPv4 tproxy support"
33 36
34if NF_TABLES 37if NF_TABLES
35 38
@@ -129,10 +132,7 @@ config NFT_CHAIN_NAT_IPV4
129 source and destination ports. 132 source and destination ports.
130 133
131config NF_NAT_MASQUERADE_IPV4 134config NF_NAT_MASQUERADE_IPV4
132 tristate "IPv4 masquerade support" 135 bool
133 help
134 This is the kernel functionality to provide NAT in the masquerade
135 flavour (automatic source address selection).
136 136
137config NFT_MASQ_IPV4 137config NFT_MASQ_IPV4
138 tristate "IPv4 masquerading support for nf_tables" 138 tristate "IPv4 masquerading support for nf_tables"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 0e5edd0c7926..8394c17c269f 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -10,12 +10,14 @@ nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
10obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o 10obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
11 11
12nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o 12nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
13nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
13obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o 14obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
14 15
15# defrag 16# defrag
16obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o 17obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
17 18
18obj-$(CONFIG_NF_SOCKET_IPV4) += nf_socket_ipv4.o 19obj-$(CONFIG_NF_SOCKET_IPV4) += nf_socket_ipv4.o
20obj-$(CONFIG_NF_TPROXY_IPV4) += nf_tproxy_ipv4.o
19 21
20# logging 22# logging
21obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o 23obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o
@@ -32,9 +34,6 @@ nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o
32$(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h 34$(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h
33obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o 35obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
34 36
35obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
36
37
38# NAT protocols (nf_nat) 37# NAT protocols (nf_nat)
39obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o 38obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
40 39
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index e85f35b89c49..38ab97b0a2ec 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -301,7 +301,7 @@ ipt_do_table(struct sk_buff *skb,
301 counter = xt_get_this_cpu_counter(&e->counters); 301 counter = xt_get_this_cpu_counter(&e->counters);
302 ADD_COUNTER(*counter, skb->len, 1); 302 ADD_COUNTER(*counter, skb->len, 1);
303 303
304 t = ipt_get_target(e); 304 t = ipt_get_target_c(e);
305 WARN_ON(!t->u.kernel.target); 305 WARN_ON(!t->u.kernel.target);
306 306
307#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) 307#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
@@ -1783,6 +1783,8 @@ int ipt_register_table(struct net *net, const struct xt_table *table,
1783 1783
1784 /* set res now, will see skbs right after nf_register_net_hooks */ 1784 /* set res now, will see skbs right after nf_register_net_hooks */
1785 WRITE_ONCE(*res, new_table); 1785 WRITE_ONCE(*res, new_table);
1786 if (!ops)
1787 return 0;
1786 1788
1787 ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); 1789 ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
1788 if (ret != 0) { 1790 if (ret != 0) {
@@ -1800,7 +1802,8 @@ out_free:
1800void ipt_unregister_table(struct net *net, struct xt_table *table, 1802void ipt_unregister_table(struct net *net, struct xt_table *table,
1801 const struct nf_hook_ops *ops) 1803 const struct nf_hook_ops *ops)
1802{ 1804{
1803 nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); 1805 if (ops)
1806 nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
1804 __ipt_unregister_table(net, table); 1807 __ipt_unregister_table(net, table);
1805} 1808}
1806 1809
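
Note: allowing a NULL ops in ipt_register_table()/ipt_unregister_table() above splits table registration from hook registration. A hedged sketch of the resulting call pattern; example_xt_table, example_ops and repl are placeholders, and iptable_nat later in this same diff is the real in-tree user:

	struct xt_table *table;
	int ret;

	/* register the table with no hooks attached (ops == NULL) ... */
	ret = ipt_register_table(net, &example_xt_table, repl, NULL, &table);
	if (ret < 0)
		return ret;

	/* ... then attach hooks in a separate, caller-controlled step */
	ret = nf_register_net_hooks(net, example_ops, ARRAY_SIZE(example_ops));
	if (ret < 0)
		ipt_unregister_table(net, table, NULL);

	return ret;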
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index a03e4e7ef5f9..ce1512b02cb2 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -47,7 +47,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par)
47static unsigned int 47static unsigned int
48masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) 48masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
49{ 49{
50 struct nf_nat_range range; 50 struct nf_nat_range2 range;
51 const struct nf_nat_ipv4_multi_range_compat *mr; 51 const struct nf_nat_ipv4_multi_range_compat *mr;
52 52
53 mr = par->targinfo; 53 mr = par->targinfo;
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 0f7255cc65ee..a317445448bf 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -33,75 +33,63 @@ static const struct xt_table nf_nat_ipv4_table = {
33 33
34static unsigned int iptable_nat_do_chain(void *priv, 34static unsigned int iptable_nat_do_chain(void *priv,
35 struct sk_buff *skb, 35 struct sk_buff *skb,
36 const struct nf_hook_state *state,
37 struct nf_conn *ct)
38{
39 return ipt_do_table(skb, state, state->net->ipv4.nat_table);
40}
41
42static unsigned int iptable_nat_ipv4_fn(void *priv,
43 struct sk_buff *skb,
44 const struct nf_hook_state *state)
45{
46 return nf_nat_ipv4_fn(priv, skb, state, iptable_nat_do_chain);
47}
48
49static unsigned int iptable_nat_ipv4_in(void *priv,
50 struct sk_buff *skb,
51 const struct nf_hook_state *state)
52{
53 return nf_nat_ipv4_in(priv, skb, state, iptable_nat_do_chain);
54}
55
56static unsigned int iptable_nat_ipv4_out(void *priv,
57 struct sk_buff *skb,
58 const struct nf_hook_state *state) 36 const struct nf_hook_state *state)
59{ 37{
60 return nf_nat_ipv4_out(priv, skb, state, iptable_nat_do_chain); 38 return ipt_do_table(skb, state, state->net->ipv4.nat_table);
61}
62
63static unsigned int iptable_nat_ipv4_local_fn(void *priv,
64 struct sk_buff *skb,
65 const struct nf_hook_state *state)
66{
67 return nf_nat_ipv4_local_fn(priv, skb, state, iptable_nat_do_chain);
68} 39}
69 40
70static const struct nf_hook_ops nf_nat_ipv4_ops[] = { 41static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
71 /* Before packet filtering, change destination */
72 { 42 {
73 .hook = iptable_nat_ipv4_in, 43 .hook = iptable_nat_do_chain,
74 .pf = NFPROTO_IPV4, 44 .pf = NFPROTO_IPV4,
75 .nat_hook = true,
76 .hooknum = NF_INET_PRE_ROUTING, 45 .hooknum = NF_INET_PRE_ROUTING,
77 .priority = NF_IP_PRI_NAT_DST, 46 .priority = NF_IP_PRI_NAT_DST,
78 }, 47 },
79 /* After packet filtering, change source */
80 { 48 {
81 .hook = iptable_nat_ipv4_out, 49 .hook = iptable_nat_do_chain,
82 .pf = NFPROTO_IPV4, 50 .pf = NFPROTO_IPV4,
83 .nat_hook = true,
84 .hooknum = NF_INET_POST_ROUTING, 51 .hooknum = NF_INET_POST_ROUTING,
85 .priority = NF_IP_PRI_NAT_SRC, 52 .priority = NF_IP_PRI_NAT_SRC,
86 }, 53 },
87 /* Before packet filtering, change destination */
88 { 54 {
89 .hook = iptable_nat_ipv4_local_fn, 55 .hook = iptable_nat_do_chain,
90 .pf = NFPROTO_IPV4, 56 .pf = NFPROTO_IPV4,
91 .nat_hook = true,
92 .hooknum = NF_INET_LOCAL_OUT, 57 .hooknum = NF_INET_LOCAL_OUT,
93 .priority = NF_IP_PRI_NAT_DST, 58 .priority = NF_IP_PRI_NAT_DST,
94 }, 59 },
95 /* After packet filtering, change source */
96 { 60 {
97 .hook = iptable_nat_ipv4_fn, 61 .hook = iptable_nat_do_chain,
98 .pf = NFPROTO_IPV4, 62 .pf = NFPROTO_IPV4,
99 .nat_hook = true,
100 .hooknum = NF_INET_LOCAL_IN, 63 .hooknum = NF_INET_LOCAL_IN,
101 .priority = NF_IP_PRI_NAT_SRC, 64 .priority = NF_IP_PRI_NAT_SRC,
102 }, 65 },
103}; 66};
104 67
68static int ipt_nat_register_lookups(struct net *net)
69{
70 int i, ret;
71
72 for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++) {
73 ret = nf_nat_l3proto_ipv4_register_fn(net, &nf_nat_ipv4_ops[i]);
74 if (ret) {
75 while (i)
76 nf_nat_l3proto_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[--i]);
77
78 return ret;
79 }
80 }
81
82 return 0;
83}
84
85static void ipt_nat_unregister_lookups(struct net *net)
86{
87 int i;
88
89 for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++)
90 nf_nat_l3proto_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[i]);
91}
92
105static int __net_init iptable_nat_table_init(struct net *net) 93static int __net_init iptable_nat_table_init(struct net *net)
106{ 94{
107 struct ipt_replace *repl; 95 struct ipt_replace *repl;
@@ -114,7 +102,18 @@ static int __net_init iptable_nat_table_init(struct net *net)
114 if (repl == NULL) 102 if (repl == NULL)
115 return -ENOMEM; 103 return -ENOMEM;
116 ret = ipt_register_table(net, &nf_nat_ipv4_table, repl, 104 ret = ipt_register_table(net, &nf_nat_ipv4_table, repl,
117 nf_nat_ipv4_ops, &net->ipv4.nat_table); 105 NULL, &net->ipv4.nat_table);
106 if (ret < 0) {
107 kfree(repl);
108 return ret;
109 }
110
111 ret = ipt_nat_register_lookups(net);
112 if (ret < 0) {
113 ipt_unregister_table(net, net->ipv4.nat_table, NULL);
114 net->ipv4.nat_table = NULL;
115 }
116
118 kfree(repl); 117 kfree(repl);
119 return ret; 118 return ret;
120} 119}
@@ -123,7 +122,8 @@ static void __net_exit iptable_nat_net_exit(struct net *net)
123{ 122{
124 if (!net->ipv4.nat_table) 123 if (!net->ipv4.nat_table)
125 return; 124 return;
126 ipt_unregister_table(net, net->ipv4.nat_table, nf_nat_ipv4_ops); 125 ipt_nat_unregister_lookups(net);
126 ipt_unregister_table(net, net->ipv4.nat_table, NULL);
127 net->ipv4.nat_table = NULL; 127 net->ipv4.nat_table = NULL;
128} 128}
129 129
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index 0cd46bffa469..e1e56d7123d2 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -2,265 +2,12 @@
2#include <linux/init.h> 2#include <linux/init.h>
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/netfilter.h> 4#include <linux/netfilter.h>
5#include <linux/rhashtable.h>
6#include <linux/ip.h>
7#include <linux/netdevice.h>
8#include <net/ip.h>
9#include <net/neighbour.h>
10#include <net/netfilter/nf_flow_table.h> 5#include <net/netfilter/nf_flow_table.h>
11#include <net/netfilter/nf_tables.h> 6#include <net/netfilter/nf_tables.h>
12/* For layer 4 checksum field offset. */
13#include <linux/tcp.h>
14#include <linux/udp.h>
15
16static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
17 __be32 addr, __be32 new_addr)
18{
19 struct tcphdr *tcph;
20
21 if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
22 skb_try_make_writable(skb, thoff + sizeof(*tcph)))
23 return -1;
24
25 tcph = (void *)(skb_network_header(skb) + thoff);
26 inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
27
28 return 0;
29}
30
31static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
32 __be32 addr, __be32 new_addr)
33{
34 struct udphdr *udph;
35
36 if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
37 skb_try_make_writable(skb, thoff + sizeof(*udph)))
38 return -1;
39
40 udph = (void *)(skb_network_header(skb) + thoff);
41 if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
42 inet_proto_csum_replace4(&udph->check, skb, addr,
43 new_addr, true);
44 if (!udph->check)
45 udph->check = CSUM_MANGLED_0;
46 }
47
48 return 0;
49}
50
51static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
52 unsigned int thoff, __be32 addr,
53 __be32 new_addr)
54{
55 switch (iph->protocol) {
56 case IPPROTO_TCP:
57 if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0)
58 return NF_DROP;
59 break;
60 case IPPROTO_UDP:
61 if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0)
62 return NF_DROP;
63 break;
64 }
65
66 return 0;
67}
68
69static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
70 struct iphdr *iph, unsigned int thoff,
71 enum flow_offload_tuple_dir dir)
72{
73 __be32 addr, new_addr;
74
75 switch (dir) {
76 case FLOW_OFFLOAD_DIR_ORIGINAL:
77 addr = iph->saddr;
78 new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
79 iph->saddr = new_addr;
80 break;
81 case FLOW_OFFLOAD_DIR_REPLY:
82 addr = iph->daddr;
83 new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
84 iph->daddr = new_addr;
85 break;
86 default:
87 return -1;
88 }
89 csum_replace4(&iph->check, addr, new_addr);
90
91 return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
92}
93
94static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
95 struct iphdr *iph, unsigned int thoff,
96 enum flow_offload_tuple_dir dir)
97{
98 __be32 addr, new_addr;
99
100 switch (dir) {
101 case FLOW_OFFLOAD_DIR_ORIGINAL:
102 addr = iph->daddr;
103 new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
104 iph->daddr = new_addr;
105 break;
106 case FLOW_OFFLOAD_DIR_REPLY:
107 addr = iph->saddr;
108 new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
109 iph->saddr = new_addr;
110 break;
111 default:
112 return -1;
113 }
114 csum_replace4(&iph->check, addr, new_addr);
115
116 return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
117}
118
119static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
120 enum flow_offload_tuple_dir dir)
121{
122 struct iphdr *iph = ip_hdr(skb);
123 unsigned int thoff = iph->ihl * 4;
124
125 if (flow->flags & FLOW_OFFLOAD_SNAT &&
126 (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
127 nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
128 return -1;
129 if (flow->flags & FLOW_OFFLOAD_DNAT &&
130 (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
131 nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
132 return -1;
133
134 return 0;
135}
136
137static bool ip_has_options(unsigned int thoff)
138{
139 return thoff != sizeof(struct iphdr);
140}
141
142static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
143 struct flow_offload_tuple *tuple)
144{
145 struct flow_ports *ports;
146 unsigned int thoff;
147 struct iphdr *iph;
148
149 if (!pskb_may_pull(skb, sizeof(*iph)))
150 return -1;
151
152 iph = ip_hdr(skb);
153 thoff = iph->ihl * 4;
154
155 if (ip_is_fragment(iph) ||
156 unlikely(ip_has_options(thoff)))
157 return -1;
158
159 if (iph->protocol != IPPROTO_TCP &&
160 iph->protocol != IPPROTO_UDP)
161 return -1;
162
163 thoff = iph->ihl * 4;
164 if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
165 return -1;
166
167 ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
168
169 tuple->src_v4.s_addr = iph->saddr;
170 tuple->dst_v4.s_addr = iph->daddr;
171 tuple->src_port = ports->source;
172 tuple->dst_port = ports->dest;
173 tuple->l3proto = AF_INET;
174 tuple->l4proto = iph->protocol;
175 tuple->iifidx = dev->ifindex;
176
177 return 0;
178}
179
180/* Based on ip_exceeds_mtu(). */
181static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
182{
183 if (skb->len <= mtu)
184 return false;
185
186 if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)
187 return false;
188
189 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
190 return false;
191
192 return true;
193}
194
195static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rtable *rt)
196{
197 u32 mtu;
198
199 mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
200 if (__nf_flow_exceeds_mtu(skb, mtu))
201 return true;
202
203 return false;
204}
205
206unsigned int
207nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
208 const struct nf_hook_state *state)
209{
210 struct flow_offload_tuple_rhash *tuplehash;
211 struct nf_flowtable *flow_table = priv;
212 struct flow_offload_tuple tuple = {};
213 enum flow_offload_tuple_dir dir;
214 struct flow_offload *flow;
215 struct net_device *outdev;
216 const struct rtable *rt;
217 struct iphdr *iph;
218 __be32 nexthop;
219
220 if (skb->protocol != htons(ETH_P_IP))
221 return NF_ACCEPT;
222
223 if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
224 return NF_ACCEPT;
225
226 tuplehash = flow_offload_lookup(flow_table, &tuple);
227 if (tuplehash == NULL)
228 return NF_ACCEPT;
229
230 outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
231 if (!outdev)
232 return NF_ACCEPT;
233
234 dir = tuplehash->tuple.dir;
235 flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
236
237 rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
238 if (unlikely(nf_flow_exceeds_mtu(skb, rt)))
239 return NF_ACCEPT;
240
241 if (skb_try_make_writable(skb, sizeof(*iph)))
242 return NF_DROP;
243
244 if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
245 nf_flow_nat_ip(flow, skb, dir) < 0)
246 return NF_DROP;
247
248 flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
249 iph = ip_hdr(skb);
250 ip_decrease_ttl(iph);
251
252 skb->dev = outdev;
253 nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
254 neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
255
256 return NF_STOLEN;
257}
258EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
259 7
260static struct nf_flowtable_type flowtable_ipv4 = { 8static struct nf_flowtable_type flowtable_ipv4 = {
261 .family = NFPROTO_IPV4, 9 .family = NFPROTO_IPV4,
262 .params = &nf_flow_offload_rhash_params, 10 .init = nf_flow_table_init,
263 .gc = nf_flow_offload_work_gc,
264 .free = nf_flow_table_free, 11 .free = nf_flow_table_free,
265 .hook = nf_flow_offload_ip_hook, 12 .hook = nf_flow_offload_ip_hook,
266 .owner = THIS_MODULE, 13 .owner = THIS_MODULE,
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index ac8342dcb55e..4e6b53ab6c33 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -395,7 +395,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
395static void ip_nat_q931_expect(struct nf_conn *new, 395static void ip_nat_q931_expect(struct nf_conn *new,
396 struct nf_conntrack_expect *this) 396 struct nf_conntrack_expect *this)
397{ 397{
398 struct nf_nat_range range; 398 struct nf_nat_range2 range;
399 399
400 if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */ 400 if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */
401 nf_nat_follow_master(new, this); 401 nf_nat_follow_master(new, this);
@@ -497,7 +497,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
497static void ip_nat_callforwarding_expect(struct nf_conn *new, 497static void ip_nat_callforwarding_expect(struct nf_conn *new,
498 struct nf_conntrack_expect *this) 498 struct nf_conntrack_expect *this)
499{ 499{
500 struct nf_nat_range range; 500 struct nf_nat_range2 range;
501 501
502 /* This must be a fresh one. */ 502 /* This must be a fresh one. */
503 BUG_ON(new->status & IPS_NAT_DONE_MASK); 503 BUG_ON(new->status & IPS_NAT_DONE_MASK);
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index f7ff6a364d7b..6115bf1ff6f0 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -63,7 +63,7 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
63#endif /* CONFIG_XFRM */ 63#endif /* CONFIG_XFRM */
64 64
65static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t, 65static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t,
66 const struct nf_nat_range *range) 66 const struct nf_nat_range2 *range)
67{ 67{
68 return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && 68 return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
69 ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); 69 ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);
@@ -143,7 +143,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
143 143
144#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 144#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
145static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], 145static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
146 struct nf_nat_range *range) 146 struct nf_nat_range2 *range)
147{ 147{
148 if (tb[CTA_NAT_V4_MINIP]) { 148 if (tb[CTA_NAT_V4_MINIP]) {
149 range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); 149 range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
@@ -241,34 +241,18 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,
241} 241}
242EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); 242EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
243 243
244unsigned int 244static unsigned int
245nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, 245nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
246 const struct nf_hook_state *state, 246 const struct nf_hook_state *state)
247 unsigned int (*do_chain)(void *priv,
248 struct sk_buff *skb,
249 const struct nf_hook_state *state,
250 struct nf_conn *ct))
251{ 247{
252 struct nf_conn *ct; 248 struct nf_conn *ct;
253 enum ip_conntrack_info ctinfo; 249 enum ip_conntrack_info ctinfo;
254 struct nf_conn_nat *nat;
255 /* maniptype == SRC for postrouting. */
256 enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
257 250
258 ct = nf_ct_get(skb, &ctinfo); 251 ct = nf_ct_get(skb, &ctinfo);
259 /* Can't track? It's not due to stress, or conntrack would
260 * have dropped it. Hence it's the user's responsibilty to
261 * packet filter it out, or implement conntrack/NAT for that
262 * protocol. 8) --RR
263 */
264 if (!ct) 252 if (!ct)
265 return NF_ACCEPT; 253 return NF_ACCEPT;
266 254
267 nat = nfct_nat(ct); 255 if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) {
268
269 switch (ctinfo) {
270 case IP_CT_RELATED:
271 case IP_CT_RELATED_REPLY:
272 if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { 256 if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
273 if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, 257 if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
274 state->hook)) 258 state->hook))
@@ -276,78 +260,30 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
276 else 260 else
277 return NF_ACCEPT; 261 return NF_ACCEPT;
278 } 262 }
279 /* Only ICMPs can be IP_CT_IS_REPLY: */
280 /* fall through */
281 case IP_CT_NEW:
282 /* Seen it before? This can happen for loopback, retrans,
283 * or local packets.
284 */
285 if (!nf_nat_initialized(ct, maniptype)) {
286 unsigned int ret;
287
288 ret = do_chain(priv, skb, state, ct);
289 if (ret != NF_ACCEPT)
290 return ret;
291
292 if (nf_nat_initialized(ct, HOOK2MANIP(state->hook)))
293 break;
294
295 ret = nf_nat_alloc_null_binding(ct, state->hook);
296 if (ret != NF_ACCEPT)
297 return ret;
298 } else {
299 pr_debug("Already setup manip %s for ct %p\n",
300 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
301 ct);
302 if (nf_nat_oif_changed(state->hook, ctinfo, nat,
303 state->out))
304 goto oif_changed;
305 }
306 break;
307
308 default:
309 /* ESTABLISHED */
310 WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
311 ctinfo != IP_CT_ESTABLISHED_REPLY);
312 if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
313 goto oif_changed;
314 } 263 }
315 264
316 return nf_nat_packet(ct, ctinfo, state->hook, skb); 265 return nf_nat_inet_fn(priv, skb, state);
317
318oif_changed:
319 nf_ct_kill_acct(ct, ctinfo, skb);
320 return NF_DROP;
321} 266}
322EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn); 267EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn);
323 268
324unsigned int 269static unsigned int
325nf_nat_ipv4_in(void *priv, struct sk_buff *skb, 270nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
326 const struct nf_hook_state *state, 271 const struct nf_hook_state *state)
327 unsigned int (*do_chain)(void *priv,
328 struct sk_buff *skb,
329 const struct nf_hook_state *state,
330 struct nf_conn *ct))
331{ 272{
332 unsigned int ret; 273 unsigned int ret;
333 __be32 daddr = ip_hdr(skb)->daddr; 274 __be32 daddr = ip_hdr(skb)->daddr;
334 275
335 ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); 276 ret = nf_nat_ipv4_fn(priv, skb, state);
336 if (ret != NF_DROP && ret != NF_STOLEN && 277 if (ret != NF_DROP && ret != NF_STOLEN &&
337 daddr != ip_hdr(skb)->daddr) 278 daddr != ip_hdr(skb)->daddr)
338 skb_dst_drop(skb); 279 skb_dst_drop(skb);
339 280
340 return ret; 281 return ret;
341} 282}
342EXPORT_SYMBOL_GPL(nf_nat_ipv4_in);
343 283
344unsigned int 284static unsigned int
345nf_nat_ipv4_out(void *priv, struct sk_buff *skb, 285nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
346 const struct nf_hook_state *state, 286 const struct nf_hook_state *state)
347 unsigned int (*do_chain)(void *priv,
348 struct sk_buff *skb,
349 const struct nf_hook_state *state,
350 struct nf_conn *ct))
351{ 287{
352#ifdef CONFIG_XFRM 288#ifdef CONFIG_XFRM
353 const struct nf_conn *ct; 289 const struct nf_conn *ct;
@@ -356,7 +292,7 @@ nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
356#endif 292#endif
357 unsigned int ret; 293 unsigned int ret;
358 294
359 ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); 295 ret = nf_nat_ipv4_fn(priv, skb, state);
360#ifdef CONFIG_XFRM 296#ifdef CONFIG_XFRM
361 if (ret != NF_DROP && ret != NF_STOLEN && 297 if (ret != NF_DROP && ret != NF_STOLEN &&
362 !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && 298 !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
@@ -376,22 +312,17 @@ nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
376#endif 312#endif
377 return ret; 313 return ret;
378} 314}
379EXPORT_SYMBOL_GPL(nf_nat_ipv4_out);
380 315
381unsigned int 316static unsigned int
382nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, 317nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
383 const struct nf_hook_state *state, 318 const struct nf_hook_state *state)
384 unsigned int (*do_chain)(void *priv,
385 struct sk_buff *skb,
386 const struct nf_hook_state *state,
387 struct nf_conn *ct))
388{ 319{
389 const struct nf_conn *ct; 320 const struct nf_conn *ct;
390 enum ip_conntrack_info ctinfo; 321 enum ip_conntrack_info ctinfo;
391 unsigned int ret; 322 unsigned int ret;
392 int err; 323 int err;
393 324
394 ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); 325 ret = nf_nat_ipv4_fn(priv, skb, state);
395 if (ret != NF_DROP && ret != NF_STOLEN && 326 if (ret != NF_DROP && ret != NF_STOLEN &&
396 (ct = nf_ct_get(skb, &ctinfo)) != NULL) { 327 (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
397 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 328 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
@@ -415,7 +346,49 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
415 } 346 }
416 return ret; 347 return ret;
417} 348}
418EXPORT_SYMBOL_GPL(nf_nat_ipv4_local_fn); 349
350static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
351 /* Before packet filtering, change destination */
352 {
353 .hook = nf_nat_ipv4_in,
354 .pf = NFPROTO_IPV4,
355 .hooknum = NF_INET_PRE_ROUTING,
356 .priority = NF_IP_PRI_NAT_DST,
357 },
358 /* After packet filtering, change source */
359 {
360 .hook = nf_nat_ipv4_out,
361 .pf = NFPROTO_IPV4,
362 .hooknum = NF_INET_POST_ROUTING,
363 .priority = NF_IP_PRI_NAT_SRC,
364 },
365 /* Before packet filtering, change destination */
366 {
367 .hook = nf_nat_ipv4_local_fn,
368 .pf = NFPROTO_IPV4,
369 .hooknum = NF_INET_LOCAL_OUT,
370 .priority = NF_IP_PRI_NAT_DST,
371 },
372 /* After packet filtering, change source */
373 {
374 .hook = nf_nat_ipv4_fn,
375 .pf = NFPROTO_IPV4,
376 .hooknum = NF_INET_LOCAL_IN,
377 .priority = NF_IP_PRI_NAT_SRC,
378 },
379};
380
381int nf_nat_l3proto_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops)
382{
383 return nf_nat_register_fn(net, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
384}
385EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_register_fn);
386
387void nf_nat_l3proto_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
388{
389 nf_nat_unregister_fn(net, ops, ARRAY_SIZE(nf_nat_ipv4_ops));
390}
391EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_unregister_fn);
419 392
420static int __init nf_nat_l3proto_ipv4_init(void) 393static int __init nf_nat_l3proto_ipv4_init(void)
421{ 394{
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
index 0c366aad89cb..ad3aeff152ed 100644
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -7,7 +7,6 @@
7 */ 7 */
8 8
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/module.h>
11#include <linux/atomic.h> 10#include <linux/atomic.h>
12#include <linux/inetdevice.h> 11#include <linux/inetdevice.h>
13#include <linux/ip.h> 12#include <linux/ip.h>
@@ -24,13 +23,13 @@
24 23
25unsigned int 24unsigned int
26nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, 25nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
27 const struct nf_nat_range *range, 26 const struct nf_nat_range2 *range,
28 const struct net_device *out) 27 const struct net_device *out)
29{ 28{
30 struct nf_conn *ct; 29 struct nf_conn *ct;
31 struct nf_conn_nat *nat; 30 struct nf_conn_nat *nat;
32 enum ip_conntrack_info ctinfo; 31 enum ip_conntrack_info ctinfo;
33 struct nf_nat_range newrange; 32 struct nf_nat_range2 newrange;
34 const struct rtable *rt; 33 const struct rtable *rt;
35 __be32 newsrc, nh; 34 __be32 newsrc, nh;
36 35
@@ -157,6 +156,3 @@ void nf_nat_masquerade_ipv4_unregister_notifier(void)
157 unregister_inetaddr_notifier(&masq_inet_notifier); 156 unregister_inetaddr_notifier(&masq_inet_notifier);
158} 157}
159EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier); 158EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier);
160
161MODULE_LICENSE("GPL");
162MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 8a69363b4884..5d259a12e25f 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -48,7 +48,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
48 struct nf_conntrack_tuple t = {}; 48 struct nf_conntrack_tuple t = {};
49 const struct nf_ct_pptp_master *ct_pptp_info; 49 const struct nf_ct_pptp_master *ct_pptp_info;
50 const struct nf_nat_pptp *nat_pptp_info; 50 const struct nf_nat_pptp *nat_pptp_info;
51 struct nf_nat_range range; 51 struct nf_nat_range2 range;
52 struct nf_conn_nat *nat; 52 struct nf_conn_nat *nat;
53 53
54 nat = nf_ct_nat_ext_add(ct); 54 nat = nf_ct_nat_ext_add(ct);
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index edf05002d674..00fda6331ce5 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -41,7 +41,7 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
41static void 41static void
42gre_unique_tuple(const struct nf_nat_l3proto *l3proto, 42gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
43 struct nf_conntrack_tuple *tuple, 43 struct nf_conntrack_tuple *tuple,
44 const struct nf_nat_range *range, 44 const struct nf_nat_range2 *range,
45 enum nf_nat_manip_type maniptype, 45 enum nf_nat_manip_type maniptype,
46 const struct nf_conn *ct) 46 const struct nf_conn *ct)
47{ 47{
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 7b98baa13ede..6d7cf1d79baf 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -30,7 +30,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,
30static void 30static void
31icmp_unique_tuple(const struct nf_nat_l3proto *l3proto, 31icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
32 struct nf_conntrack_tuple *tuple, 32 struct nf_conntrack_tuple *tuple,
33 const struct nf_nat_range *range, 33 const struct nf_nat_range2 *range,
34 enum nf_nat_manip_type maniptype, 34 enum nf_nat_manip_type maniptype,
35 const struct nf_conn *ct) 35 const struct nf_conn *ct)
36{ 36{
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
new file mode 100644
index 000000000000..805e83ec3ad9
--- /dev/null
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -0,0 +1,147 @@
1/*
2 * Copyright (C) 2007-2008 BalaBit IT Ltd.
3 * Author: Krisztian Kovacs
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 */
10
11#include <net/netfilter/nf_tproxy.h>
12#include <linux/module.h>
13#include <linux/skbuff.h>
14#include <net/sock.h>
15#include <net/inet_sock.h>
16#include <linux/ip.h>
17#include <net/checksum.h>
18#include <net/udp.h>
19#include <net/tcp.h>
20#include <linux/inetdevice.h>
21
22struct sock *
23nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
24 __be32 laddr, __be16 lport, struct sock *sk)
25{
26 const struct iphdr *iph = ip_hdr(skb);
27 struct tcphdr _hdr, *hp;
28
29 hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
30 if (hp == NULL) {
31 inet_twsk_put(inet_twsk(sk));
32 return NULL;
33 }
34
35 if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
36 /* SYN to a TIME_WAIT socket, we'd rather redirect it
37 * to a listener socket if there's one */
38 struct sock *sk2;
39
40 sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
41 iph->saddr, laddr ? laddr : iph->daddr,
42 hp->source, lport ? lport : hp->dest,
43 skb->dev, NF_TPROXY_LOOKUP_LISTENER);
44 if (sk2) {
45 inet_twsk_deschedule_put(inet_twsk(sk));
46 sk = sk2;
47 }
48 }
49
50 return sk;
51}
52EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait4);
53
54__be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
55{
56 struct in_device *indev;
57 __be32 laddr;
58
59 if (user_laddr)
60 return user_laddr;
61
62 laddr = 0;
63 indev = __in_dev_get_rcu(skb->dev);
64 for_primary_ifa(indev) {
65 laddr = ifa->ifa_local;
66 break;
67 } endfor_ifa(indev);
68
69 return laddr ? laddr : daddr;
70}
71EXPORT_SYMBOL_GPL(nf_tproxy_laddr4);
72
73struct sock *
74nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
75 const u8 protocol,
76 const __be32 saddr, const __be32 daddr,
77 const __be16 sport, const __be16 dport,
78 const struct net_device *in,
79 const enum nf_tproxy_lookup_t lookup_type)
80{
81 struct sock *sk;
82 struct tcphdr *tcph;
83
84 switch (protocol) {
85 case IPPROTO_TCP:
86 switch (lookup_type) {
87 case NF_TPROXY_LOOKUP_LISTENER:
88 tcph = hp;
89 sk = inet_lookup_listener(net, &tcp_hashinfo, skb,
90 ip_hdrlen(skb) +
91 __tcp_hdrlen(tcph),
92 saddr, sport,
93 daddr, dport,
94 in->ifindex, 0);
95
96 if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
97 sk = NULL;
98 /* NOTE: we return listeners even if bound to
99 * 0.0.0.0, those are filtered out in
100 * xt_socket, since xt_TPROXY needs 0 bound
101 * listeners too
102 */
103 break;
104 case NF_TPROXY_LOOKUP_ESTABLISHED:
105 sk = inet_lookup_established(net, &tcp_hashinfo,
106 saddr, sport, daddr, dport,
107 in->ifindex);
108 break;
109 default:
110 BUG();
111 }
112 break;
113 case IPPROTO_UDP:
114 sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
115 in->ifindex);
116 if (sk) {
117 int connected = (sk->sk_state == TCP_ESTABLISHED);
118 int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);
119
120 /* NOTE: we return listeners even if bound to
121 * 0.0.0.0, those are filtered out in
122 * xt_socket, since xt_TPROXY needs 0 bound
123 * listeners too
124 */
125 if ((lookup_type == NF_TPROXY_LOOKUP_ESTABLISHED &&
126 (!connected || wildcard)) ||
127 (lookup_type == NF_TPROXY_LOOKUP_LISTENER && connected)) {
128 sock_put(sk);
129 sk = NULL;
130 }
131 }
132 break;
133 default:
134 WARN_ON(1);
135 sk = NULL;
136 }
137
138 pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n",
139 protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk);
140
141 return sk;
142}
143EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v4);
144
145MODULE_LICENSE("GPL");
146MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
147MODULE_DESCRIPTION("Netfilter IPv4 transparent proxy support");
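The helpers exported by this new file are the pieces a transparent-proxy hook combines to divert a packet to a local socket. A rough, hypothetical sketch of that flow for IPv4 follows; the function and variable names are illustrative (the real users are xt_TPROXY and the nft tproxy expression), and socket assignment and refcount handling are elided.

#include <linux/ip.h>
#include <linux/udp.h>
#include <linux/skbuff.h>
#include <net/tcp.h>
#include <net/netfilter/nf_tproxy.h>

/* Hypothetical hook body: find the local socket a transparently proxied
 * packet should be steered to, given an optional configured laddr/lport.
 */
static struct sock *my_tproxy_lookup(struct net *net, struct sk_buff *skb,
				     __be32 user_laddr, __be16 user_lport)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct udphdr _hdr, *hp;
	struct sock *sk;
	__be32 laddr;
	__be16 lport;

	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
	if (!hp)
		return NULL;

	/* Local address to redirect to: the configured one, a primary
	 * address of the input interface, or the original destination.
	 */
	laddr = nf_tproxy_laddr4(skb, user_laddr, iph->daddr);
	lport = user_lport ? user_lport : hp->dest;

	/* Prefer an established socket for the original 5-tuple... */
	sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
				   iph->saddr, iph->daddr,
				   hp->source, hp->dest,
				   skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);

	/* ...let a SYN aimed at a TIME_WAIT socket fall through to a
	 * listener, otherwise look for a listener on the redirect address.
	 */
	if (sk && sk->sk_state == TCP_TIME_WAIT)
		sk = nf_tproxy_handle_time_wait4(net, skb, laddr, lport, sk);
	else if (!sk)
		sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
					   iph->saddr, laddr,
					   hp->source, lport,
					   skb->dev, NF_TPROXY_LOOKUP_LISTENER);
	return sk;
}
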
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index b5464a3f253b..a3c4ea303e3e 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -27,9 +27,8 @@
27#include <net/ip.h> 27#include <net/ip.h>
28 28
29static unsigned int nft_nat_do_chain(void *priv, 29static unsigned int nft_nat_do_chain(void *priv,
30 struct sk_buff *skb, 30 struct sk_buff *skb,
31 const struct nf_hook_state *state, 31 const struct nf_hook_state *state)
32 struct nf_conn *ct)
33{ 32{
34 struct nft_pktinfo pkt; 33 struct nft_pktinfo pkt;
35 34
@@ -39,42 +38,14 @@ static unsigned int nft_nat_do_chain(void *priv,
39 return nft_do_chain(&pkt, priv); 38 return nft_do_chain(&pkt, priv);
40} 39}
41 40
42static unsigned int nft_nat_ipv4_fn(void *priv, 41static int nft_nat_ipv4_reg(struct net *net, const struct nf_hook_ops *ops)
43 struct sk_buff *skb,
44 const struct nf_hook_state *state)
45{
46 return nf_nat_ipv4_fn(priv, skb, state, nft_nat_do_chain);
47}
48
49static unsigned int nft_nat_ipv4_in(void *priv,
50 struct sk_buff *skb,
51 const struct nf_hook_state *state)
52{
53 return nf_nat_ipv4_in(priv, skb, state, nft_nat_do_chain);
54}
55
56static unsigned int nft_nat_ipv4_out(void *priv,
57 struct sk_buff *skb,
58 const struct nf_hook_state *state)
59{
60 return nf_nat_ipv4_out(priv, skb, state, nft_nat_do_chain);
61}
62
63static unsigned int nft_nat_ipv4_local_fn(void *priv,
64 struct sk_buff *skb,
65 const struct nf_hook_state *state)
66{
67 return nf_nat_ipv4_local_fn(priv, skb, state, nft_nat_do_chain);
68}
69
70static int nft_nat_ipv4_init(struct nft_ctx *ctx)
71{ 42{
72 return nf_ct_netns_get(ctx->net, ctx->family); 43 return nf_nat_l3proto_ipv4_register_fn(net, ops);
73} 44}
74 45
75static void nft_nat_ipv4_free(struct nft_ctx *ctx) 46static void nft_nat_ipv4_unreg(struct net *net, const struct nf_hook_ops *ops)
76{ 47{
77 nf_ct_netns_put(ctx->net, ctx->family); 48 nf_nat_l3proto_ipv4_unregister_fn(net, ops);
78} 49}
79 50
80static const struct nft_chain_type nft_chain_nat_ipv4 = { 51static const struct nft_chain_type nft_chain_nat_ipv4 = {
@@ -87,13 +58,13 @@ static const struct nft_chain_type nft_chain_nat_ipv4 = {
87 (1 << NF_INET_LOCAL_OUT) | 58 (1 << NF_INET_LOCAL_OUT) |
88 (1 << NF_INET_LOCAL_IN), 59 (1 << NF_INET_LOCAL_IN),
89 .hooks = { 60 .hooks = {
90 [NF_INET_PRE_ROUTING] = nft_nat_ipv4_in, 61 [NF_INET_PRE_ROUTING] = nft_nat_do_chain,
91 [NF_INET_POST_ROUTING] = nft_nat_ipv4_out, 62 [NF_INET_POST_ROUTING] = nft_nat_do_chain,
92 [NF_INET_LOCAL_OUT] = nft_nat_ipv4_local_fn, 63 [NF_INET_LOCAL_OUT] = nft_nat_do_chain,
93 [NF_INET_LOCAL_IN] = nft_nat_ipv4_fn, 64 [NF_INET_LOCAL_IN] = nft_nat_do_chain,
94 }, 65 },
95 .init = nft_nat_ipv4_init, 66 .ops_register = nft_nat_ipv4_reg,
96 .free = nft_nat_ipv4_free, 67 .ops_unregister = nft_nat_ipv4_unreg,
97}; 68};
98 69
99static int __init nft_chain_nat_init(void) 70static int __init nft_chain_nat_init(void)
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index f18677277119..f1193e1e928a 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -21,7 +21,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
21 const struct nft_pktinfo *pkt) 21 const struct nft_pktinfo *pkt)
22{ 22{
23 struct nft_masq *priv = nft_expr_priv(expr); 23 struct nft_masq *priv = nft_expr_priv(expr);
24 struct nf_nat_range range; 24 struct nf_nat_range2 range;
25 25
26 memset(&range, 0, sizeof(range)); 26 memset(&range, 0, sizeof(range));
27 range.flags = priv->flags; 27 range.flags = priv->flags;
diff --git a/net/ipv4/netlink.c b/net/ipv4/netlink.c
new file mode 100644
index 000000000000..f86bb4f06609
--- /dev/null
+++ b/net/ipv4/netlink.c
@@ -0,0 +1,23 @@
1#include <linux/netlink.h>
2#include <linux/rtnetlink.h>
3#include <linux/types.h>
4#include <net/net_namespace.h>
5#include <net/netlink.h>
6#include <net/ip.h>
7
8int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto,
9 struct netlink_ext_ack *extack)
10{
11 *ip_proto = nla_get_u8(attr);
12
13 switch (*ip_proto) {
14 case IPPROTO_TCP:
15 case IPPROTO_UDP:
16 case IPPROTO_ICMP:
17 return 0;
18 default:
19 NL_SET_ERR_MSG(extack, "Unsupported ip proto");
20 return -EOPNOTSUPP;
21 }
22}
23EXPORT_SYMBOL_GPL(rtm_getroute_parse_ip_proto);
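rtm_getroute_parse_ip_proto() validates the new RTA_IP_PROTO attribute that inet_rtm_getroute() (further below, in net/ipv4/route.c) consumes together with RTA_SPORT/RTA_DPORT; only TCP, UDP and ICMP are accepted. The following is a minimal userspace sketch of a query exercising these attributes over raw rtnetlink, assuming kernel UAPI headers recent enough to define them; the destination address, protocol and port are placeholders.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

struct getroute_req {
	struct nlmsghdr nlh;
	struct rtmsg rtm;
	char buf[256];			/* room for the attributes */
};

static void add_attr(struct nlmsghdr *nlh, unsigned short type,
		     const void *data, unsigned short len)
{
	struct rtattr *rta = (struct rtattr *)((char *)nlh +
					       NLMSG_ALIGN(nlh->nlmsg_len));

	rta->rta_type = type;
	rta->rta_len = RTA_LENGTH(len);
	memcpy(RTA_DATA(rta), data, len);
	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

int main(void)
{
	struct getroute_req req = {
		.nlh.nlmsg_len	 = NLMSG_LENGTH(sizeof(struct rtmsg)),
		.nlh.nlmsg_type	 = RTM_GETROUTE,
		.nlh.nlmsg_flags = NLM_F_REQUEST,
		.rtm.rtm_family	 = AF_INET,
	};
	uint8_t proto = IPPROTO_TCP;	/* checked by rtm_getroute_parse_ip_proto() */
	uint16_t dport = htons(443);	/* placeholder port */
	struct in_addr dst;
	char reply[4096];
	int fd;

	inet_pton(AF_INET, "192.0.2.1", &dst);	/* placeholder destination */

	add_attr(&req.nlh, RTA_DST, &dst, sizeof(dst));
	add_attr(&req.nlh, RTA_IP_PROTO, &proto, sizeof(proto));
	add_attr(&req.nlh, RTA_DPORT, &dport, sizeof(dport));

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0 || send(fd, &req, req.nlh.nlmsg_len, 0) < 0 ||
	    recv(fd, reply, sizeof(reply), 0) < 0) {
		perror("RTM_GETROUTE");
		return 1;
	}
	/* The reply is the RTM_NEWROUTE message built by rt_fill_info(). */
	close(fd);
	return 0;
}
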
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 573e43c8ed87..77350c1256ce 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -284,6 +284,9 @@ static const struct snmp_mib snmp4_net_list[] = {
284 SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), 284 SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
285 SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL), 285 SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL),
286 SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), 286 SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
287 SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED),
288 SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE),
289 SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
287 SNMP_MIB_SENTINEL 290 SNMP_MIB_SENTINEL
288}; 291};
289 292
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 75fb8864be67..bf4e4adc2d00 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1341,6 +1341,37 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1341 return NULL; 1341 return NULL;
1342} 1342}
1343 1343
1344/* MTU selection:
1345 * 1. mtu on route is locked - use it
1346 * 2. mtu from nexthop exception
1347 * 3. mtu from egress device
1348 */
1349
1350u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1351{
1352 struct fib_info *fi = res->fi;
1353 struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
1354 struct net_device *dev = nh->nh_dev;
1355 u32 mtu = 0;
1356
1357 if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1358 fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1359 mtu = fi->fib_mtu;
1360
1361 if (likely(!mtu)) {
1362 struct fib_nh_exception *fnhe;
1363
1364 fnhe = find_exception(nh, daddr);
1365 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1366 mtu = fnhe->fnhe_pmtu;
1367 }
1368
1369 if (likely(!mtu))
1370 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1371
1372 return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
1373}
1374
1344static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, 1375static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1345 __be32 daddr, const bool do_cache) 1376 __be32 daddr, const bool do_cache)
1346{ 1377{
@@ -2563,11 +2594,10 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2563EXPORT_SYMBOL_GPL(ip_route_output_flow); 2594EXPORT_SYMBOL_GPL(ip_route_output_flow);
2564 2595
2565/* called with rcu_read_lock held */ 2596/* called with rcu_read_lock held */
2566static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, 2597static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2567 struct flowi4 *fl4, struct sk_buff *skb, u32 portid, 2598 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2568 u32 seq) 2599 struct sk_buff *skb, u32 portid, u32 seq)
2569{ 2600{
2570 struct rtable *rt = skb_rtable(skb);
2571 struct rtmsg *r; 2601 struct rtmsg *r;
2572 struct nlmsghdr *nlh; 2602 struct nlmsghdr *nlh;
2573 unsigned long expires = 0; 2603 unsigned long expires = 0;
@@ -2663,7 +2693,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2663 } 2693 }
2664 } else 2694 } else
2665#endif 2695#endif
2666 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex)) 2696 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2667 goto nla_put_failure; 2697 goto nla_put_failure;
2668 } 2698 }
2669 2699
@@ -2678,43 +2708,93 @@ nla_put_failure:
2678 return -EMSGSIZE; 2708 return -EMSGSIZE;
2679} 2709}
2680 2710
2711static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2712 u8 ip_proto, __be16 sport,
2713 __be16 dport)
2714{
2715 struct sk_buff *skb;
2716 struct iphdr *iph;
2717
2718 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2719 if (!skb)
2720 return NULL;
2721
2722 /* Reserve room for dummy headers, this skb can pass
2723 * through good chunk of routing engine.
2724 */
2725 skb_reset_mac_header(skb);
2726 skb_reset_network_header(skb);
2727 skb->protocol = htons(ETH_P_IP);
2728 iph = skb_put(skb, sizeof(struct iphdr));
2729 iph->protocol = ip_proto;
2730 iph->saddr = src;
2731 iph->daddr = dst;
2732 iph->version = 0x4;
2733 iph->frag_off = 0;
2734 iph->ihl = 0x5;
2735 skb_set_transport_header(skb, skb->len);
2736
2737 switch (iph->protocol) {
2738 case IPPROTO_UDP: {
2739 struct udphdr *udph;
2740
2741 udph = skb_put_zero(skb, sizeof(struct udphdr));
2742 udph->source = sport;
2743 udph->dest = dport;
2744 udph->len = sizeof(struct udphdr);
2745 udph->check = 0;
2746 break;
2747 }
2748 case IPPROTO_TCP: {
2749 struct tcphdr *tcph;
2750
2751 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2752 tcph->source = sport;
2753 tcph->dest = dport;
2754 tcph->doff = sizeof(struct tcphdr) / 4;
2755 tcph->rst = 1;
2756 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2757 src, dst, 0);
2758 break;
2759 }
2760 case IPPROTO_ICMP: {
2761 struct icmphdr *icmph;
2762
2763 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2764 icmph->type = ICMP_ECHO;
2765 icmph->code = 0;
2766 }
2767 }
2768
2769 return skb;
2770}
2771
2681static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 2772static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2682 struct netlink_ext_ack *extack) 2773 struct netlink_ext_ack *extack)
2683{ 2774{
2684 struct net *net = sock_net(in_skb->sk); 2775 struct net *net = sock_net(in_skb->sk);
2685 struct rtmsg *rtm;
2686 struct nlattr *tb[RTA_MAX+1]; 2776 struct nlattr *tb[RTA_MAX+1];
2777 u32 table_id = RT_TABLE_MAIN;
2778 __be16 sport = 0, dport = 0;
2687 struct fib_result res = {}; 2779 struct fib_result res = {};
2780 u8 ip_proto = IPPROTO_UDP;
2688 struct rtable *rt = NULL; 2781 struct rtable *rt = NULL;
2782 struct sk_buff *skb;
2783 struct rtmsg *rtm;
2689 struct flowi4 fl4; 2784 struct flowi4 fl4;
2690 __be32 dst = 0; 2785 __be32 dst = 0;
2691 __be32 src = 0; 2786 __be32 src = 0;
2787 kuid_t uid;
2692 u32 iif; 2788 u32 iif;
2693 int err; 2789 int err;
2694 int mark; 2790 int mark;
2695 struct sk_buff *skb;
2696 u32 table_id = RT_TABLE_MAIN;
2697 kuid_t uid;
2698 2791
2699 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, 2792 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2700 extack); 2793 extack);
2701 if (err < 0) 2794 if (err < 0)
2702 goto errout; 2795 return err;
2703 2796
2704 rtm = nlmsg_data(nlh); 2797 rtm = nlmsg_data(nlh);
2705
2706 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2707 if (!skb) {
2708 err = -ENOBUFS;
2709 goto errout;
2710 }
2711
2712 /* Reserve room for dummy headers, this skb can pass
2713 through good chunk of routing engine.
2714 */
2715 skb_reset_mac_header(skb);
2716 skb_reset_network_header(skb);
2717
2718 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; 2798 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2719 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; 2799 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2720 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; 2800 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
@@ -2724,14 +2804,22 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2724 else 2804 else
2725 uid = (iif ? INVALID_UID : current_uid()); 2805 uid = (iif ? INVALID_UID : current_uid());
2726 2806
2727 /* Bugfix: need to give ip_route_input enough of an IP header to 2807 if (tb[RTA_IP_PROTO]) {
2728 * not gag. 2808 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
2729 */ 2809 &ip_proto, extack);
2730 ip_hdr(skb)->protocol = IPPROTO_UDP; 2810 if (err)
2731 ip_hdr(skb)->saddr = src; 2811 return err;
2732 ip_hdr(skb)->daddr = dst; 2812 }
2733 2813
2734 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); 2814 if (tb[RTA_SPORT])
2815 sport = nla_get_be16(tb[RTA_SPORT]);
2816
2817 if (tb[RTA_DPORT])
2818 dport = nla_get_be16(tb[RTA_DPORT]);
2819
2820 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
2821 if (!skb)
2822 return -ENOBUFS;
2735 2823
2736 memset(&fl4, 0, sizeof(fl4)); 2824 memset(&fl4, 0, sizeof(fl4));
2737 fl4.daddr = dst; 2825 fl4.daddr = dst;
@@ -2740,6 +2828,11 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2740 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; 2828 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2741 fl4.flowi4_mark = mark; 2829 fl4.flowi4_mark = mark;
2742 fl4.flowi4_uid = uid; 2830 fl4.flowi4_uid = uid;
2831 if (sport)
2832 fl4.fl4_sport = sport;
2833 if (dport)
2834 fl4.fl4_dport = dport;
2835 fl4.flowi4_proto = ip_proto;
2743 2836
2744 rcu_read_lock(); 2837 rcu_read_lock();
2745 2838
@@ -2749,10 +2842,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2749 dev = dev_get_by_index_rcu(net, iif); 2842 dev = dev_get_by_index_rcu(net, iif);
2750 if (!dev) { 2843 if (!dev) {
2751 err = -ENODEV; 2844 err = -ENODEV;
2752 goto errout_free; 2845 goto errout_rcu;
2753 } 2846 }
2754 2847
2755 skb->protocol = htons(ETH_P_IP); 2848 fl4.flowi4_iif = iif; /* for rt_fill_info */
2756 skb->dev = dev; 2849 skb->dev = dev;
2757 skb->mark = mark; 2850 skb->mark = mark;
2758 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos, 2851 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
@@ -2772,7 +2865,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2772 } 2865 }
2773 2866
2774 if (err) 2867 if (err)
2775 goto errout_free; 2868 goto errout_rcu;
2776 2869
2777 if (rtm->rtm_flags & RTM_F_NOTIFY) 2870 if (rtm->rtm_flags & RTM_F_NOTIFY)
2778 rt->rt_flags |= RTCF_NOTIFY; 2871 rt->rt_flags |= RTCF_NOTIFY;
@@ -2780,34 +2873,40 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2780 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) 2873 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2781 table_id = res.table ? res.table->tb_id : 0; 2874 table_id = res.table ? res.table->tb_id : 0;
2782 2875
2876 /* reset skb for netlink reply msg */
2877 skb_trim(skb, 0);
2878 skb_reset_network_header(skb);
2879 skb_reset_transport_header(skb);
2880 skb_reset_mac_header(skb);
2881
2783 if (rtm->rtm_flags & RTM_F_FIB_MATCH) { 2882 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2784 if (!res.fi) { 2883 if (!res.fi) {
2785 err = fib_props[res.type].error; 2884 err = fib_props[res.type].error;
2786 if (!err) 2885 if (!err)
2787 err = -EHOSTUNREACH; 2886 err = -EHOSTUNREACH;
2788 goto errout_free; 2887 goto errout_rcu;
2789 } 2888 }
2790 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, 2889 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2791 nlh->nlmsg_seq, RTM_NEWROUTE, table_id, 2890 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2792 rt->rt_type, res.prefix, res.prefixlen, 2891 rt->rt_type, res.prefix, res.prefixlen,
2793 fl4.flowi4_tos, res.fi, 0); 2892 fl4.flowi4_tos, res.fi, 0);
2794 } else { 2893 } else {
2795 err = rt_fill_info(net, dst, src, table_id, &fl4, skb, 2894 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
2796 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); 2895 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2797 } 2896 }
2798 if (err < 0) 2897 if (err < 0)
2799 goto errout_free; 2898 goto errout_rcu;
2800 2899
2801 rcu_read_unlock(); 2900 rcu_read_unlock();
2802 2901
2803 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 2902 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2804errout:
2805 return err;
2806 2903
2807errout_free: 2904errout_free:
2905 return err;
2906errout_rcu:
2808 rcu_read_unlock(); 2907 rcu_read_unlock();
2809 kfree_skb(skb); 2908 kfree_skb(skb);
2810 goto errout; 2909 goto errout_free;
2811} 2910}
2812 2911
2813void ip_rt_multicast_event(struct in_device *in_dev) 2912void ip_rt_multicast_event(struct in_device *in_dev)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4b195bac8ac0..d06247ba08b2 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -30,6 +30,7 @@
30 30
31static int zero; 31static int zero;
32static int one = 1; 32static int one = 1;
33static int two = 2;
33static int four = 4; 34static int four = 4;
34static int thousand = 1000; 35static int thousand = 1000;
35static int gso_max_segs = GSO_MAX_SEGS; 36static int gso_max_segs = GSO_MAX_SEGS;
@@ -46,6 +47,7 @@ static int tcp_syn_retries_min = 1;
46static int tcp_syn_retries_max = MAX_TCP_SYNCNT; 47static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
47static int ip_ping_group_range_min[] = { 0, 0 }; 48static int ip_ping_group_range_min[] = { 0, 0 };
48static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; 49static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
50static int comp_sack_nr_max = 255;
49 51
50/* obsolete */ 52/* obsolete */
51static int sysctl_tcp_low_latency __read_mostly; 53static int sysctl_tcp_low_latency __read_mostly;
@@ -844,7 +846,9 @@ static struct ctl_table ipv4_net_table[] = {
844 .data = &init_net.ipv4.sysctl_tcp_tw_reuse, 846 .data = &init_net.ipv4.sysctl_tcp_tw_reuse,
845 .maxlen = sizeof(int), 847 .maxlen = sizeof(int),
846 .mode = 0644, 848 .mode = 0644,
847 .proc_handler = proc_dointvec 849 .proc_handler = proc_dointvec_minmax,
850 .extra1 = &zero,
851 .extra2 = &two,
848 }, 852 },
849 { 853 {
850 .procname = "tcp_max_tw_buckets", 854 .procname = "tcp_max_tw_buckets",
@@ -1152,6 +1156,22 @@ static struct ctl_table ipv4_net_table[] = {
1152 .extra1 = &one, 1156 .extra1 = &one,
1153 }, 1157 },
1154 { 1158 {
1159 .procname = "tcp_comp_sack_delay_ns",
1160 .data = &init_net.ipv4.sysctl_tcp_comp_sack_delay_ns,
1161 .maxlen = sizeof(unsigned long),
1162 .mode = 0644,
1163 .proc_handler = proc_doulongvec_minmax,
1164 },
1165 {
1166 .procname = "tcp_comp_sack_nr",
1167 .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr,
1168 .maxlen = sizeof(int),
1169 .mode = 0644,
1170 .proc_handler = proc_dointvec_minmax,
1171 .extra1 = &zero,
1172 .extra2 = &comp_sack_nr_max,
1173 },
1174 {
1155 .procname = "udp_rmem_min", 1175 .procname = "udp_rmem_min",
1156 .data = &init_net.ipv4.sysctl_udp_rmem_min, 1176 .data = &init_net.ipv4.sysctl_udp_rmem_min,
1157 .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min), 1177 .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min),
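These additions surface as net.ipv4.tcp_comp_sack_delay_ns (an unsigned long, in nanoseconds) and net.ipv4.tcp_comp_sack_nr (0..255), and tcp_tw_reuse is now clamped to the range 0..2. A trivial sketch of tuning the SACK-compression limit through procfs; the value 44 is an arbitrary example.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Equivalent to: sysctl -w net.ipv4.tcp_comp_sack_nr=44 */
	const char *path = "/proc/sys/net/ipv4/tcp_comp_sack_nr";
	const char *val = "44\n";
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0) {
		perror(path);
		return 1;
	}
	close(fd);
	return 0;
}
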
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dec47e6789e7..2741953adaba 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1691,6 +1691,139 @@ int tcp_peek_len(struct socket *sock)
1691} 1691}
1692EXPORT_SYMBOL(tcp_peek_len); 1692EXPORT_SYMBOL(tcp_peek_len);
1693 1693
1694/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
1695int tcp_set_rcvlowat(struct sock *sk, int val)
1696{
1697 sk->sk_rcvlowat = val ? : 1;
1698
1699 /* Check if we need to signal EPOLLIN right now */
1700 tcp_data_ready(sk);
1701
1702 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1703 return 0;
1704
1705 /* val comes from user space and might be close to INT_MAX */
1706 val <<= 1;
1707 if (val < 0)
1708 val = INT_MAX;
1709
1710 val = min(val, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
1711 if (val > sk->sk_rcvbuf) {
1712 sk->sk_rcvbuf = val;
1713 tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
1714 }
1715 return 0;
1716}
1717EXPORT_SYMBOL(tcp_set_rcvlowat);
1718
1719#ifdef CONFIG_MMU
1720static const struct vm_operations_struct tcp_vm_ops = {
1721};
1722
1723int tcp_mmap(struct file *file, struct socket *sock,
1724 struct vm_area_struct *vma)
1725{
1726 if (vma->vm_flags & (VM_WRITE | VM_EXEC))
1727 return -EPERM;
1728 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
1729
1730 /* Instruct vm_insert_page() to not down_read(mmap_sem) */
1731 vma->vm_flags |= VM_MIXEDMAP;
1732
1733 vma->vm_ops = &tcp_vm_ops;
1734 return 0;
1735}
1736EXPORT_SYMBOL(tcp_mmap);
1737
1738static int tcp_zerocopy_receive(struct sock *sk,
1739 struct tcp_zerocopy_receive *zc)
1740{
1741 unsigned long address = (unsigned long)zc->address;
1742 const skb_frag_t *frags = NULL;
1743 u32 length = 0, seq, offset;
1744 struct vm_area_struct *vma;
1745 struct sk_buff *skb = NULL;
1746 struct tcp_sock *tp;
1747 int ret;
1748
1749 if (address & (PAGE_SIZE - 1) || address != zc->address)
1750 return -EINVAL;
1751
1752 if (sk->sk_state == TCP_LISTEN)
1753 return -ENOTCONN;
1754
1755 sock_rps_record_flow(sk);
1756
1757 down_read(&current->mm->mmap_sem);
1758
1759 ret = -EINVAL;
1760 vma = find_vma(current->mm, address);
1761 if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
1762 goto out;
1763 zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
1764
1765 tp = tcp_sk(sk);
1766 seq = tp->copied_seq;
1767 zc->length = min_t(u32, zc->length, tcp_inq(sk));
1768 zc->length &= ~(PAGE_SIZE - 1);
1769
1770 zap_page_range(vma, address, zc->length);
1771
1772 zc->recv_skip_hint = 0;
1773 ret = 0;
1774 while (length + PAGE_SIZE <= zc->length) {
1775 if (zc->recv_skip_hint < PAGE_SIZE) {
1776 if (skb) {
1777 skb = skb->next;
1778 offset = seq - TCP_SKB_CB(skb)->seq;
1779 } else {
1780 skb = tcp_recv_skb(sk, seq, &offset);
1781 }
1782
1783 zc->recv_skip_hint = skb->len - offset;
1784 offset -= skb_headlen(skb);
1785 if ((int)offset < 0 || skb_has_frag_list(skb))
1786 break;
1787 frags = skb_shinfo(skb)->frags;
1788 while (offset) {
1789 if (frags->size > offset)
1790 goto out;
1791 offset -= frags->size;
1792 frags++;
1793 }
1794 }
1795 if (frags->size != PAGE_SIZE || frags->page_offset)
1796 break;
1797 ret = vm_insert_page(vma, address + length,
1798 skb_frag_page(frags));
1799 if (ret)
1800 break;
1801 length += PAGE_SIZE;
1802 seq += PAGE_SIZE;
1803 zc->recv_skip_hint -= PAGE_SIZE;
1804 frags++;
1805 }
1806out:
1807 up_read(&current->mm->mmap_sem);
1808 if (length) {
1809 tp->copied_seq = seq;
1810 tcp_rcv_space_adjust(sk);
1811
1812 /* Clean up data we have read: This will do ACK frames. */
1813 tcp_recv_skb(sk, seq, &offset);
1814 tcp_cleanup_rbuf(sk, length);
1815 ret = 0;
1816 if (length == zc->length)
1817 zc->recv_skip_hint = 0;
1818 } else {
1819 if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
1820 ret = -EIO;
1821 }
1822 zc->length = length;
1823 return ret;
1824}
1825#endif
1826
1694static void tcp_update_recv_tstamps(struct sk_buff *skb, 1827static void tcp_update_recv_tstamps(struct sk_buff *skb,
1695 struct scm_timestamping *tss) 1828 struct scm_timestamping *tss)
1696{ 1829{
@@ -1746,6 +1879,22 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
1746 } 1879 }
1747} 1880}
1748 1881
1882static int tcp_inq_hint(struct sock *sk)
1883{
1884 const struct tcp_sock *tp = tcp_sk(sk);
1885 u32 copied_seq = READ_ONCE(tp->copied_seq);
1886 u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
1887 int inq;
1888
1889 inq = rcv_nxt - copied_seq;
1890 if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
1891 lock_sock(sk);
1892 inq = tp->rcv_nxt - tp->copied_seq;
1893 release_sock(sk);
1894 }
1895 return inq;
1896}
1897
1749/* 1898/*
1750 * This routine copies from a sock struct into the user buffer. 1899 * This routine copies from a sock struct into the user buffer.
1751 * 1900 *
@@ -1762,13 +1911,14 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
1762 u32 peek_seq; 1911 u32 peek_seq;
1763 u32 *seq; 1912 u32 *seq;
1764 unsigned long used; 1913 unsigned long used;
1765 int err; 1914 int err, inq;
1766 int target; /* Read at least this many bytes */ 1915 int target; /* Read at least this many bytes */
1767 long timeo; 1916 long timeo;
1768 struct sk_buff *skb, *last; 1917 struct sk_buff *skb, *last;
1769 u32 urg_hole = 0; 1918 u32 urg_hole = 0;
1770 struct scm_timestamping tss; 1919 struct scm_timestamping tss;
1771 bool has_tss = false; 1920 bool has_tss = false;
1921 bool has_cmsg;
1772 1922
1773 if (unlikely(flags & MSG_ERRQUEUE)) 1923 if (unlikely(flags & MSG_ERRQUEUE))
1774 return inet_recv_error(sk, msg, len, addr_len); 1924 return inet_recv_error(sk, msg, len, addr_len);
@@ -1783,6 +1933,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
1783 if (sk->sk_state == TCP_LISTEN) 1933 if (sk->sk_state == TCP_LISTEN)
1784 goto out; 1934 goto out;
1785 1935
1936 has_cmsg = tp->recvmsg_inq;
1786 timeo = sock_rcvtimeo(sk, nonblock); 1937 timeo = sock_rcvtimeo(sk, nonblock);
1787 1938
1788 /* Urgent data needs to be handled specially. */ 1939 /* Urgent data needs to be handled specially. */
@@ -1969,6 +2120,7 @@ skip_copy:
1969 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2120 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1970 tcp_update_recv_tstamps(skb, &tss); 2121 tcp_update_recv_tstamps(skb, &tss);
1971 has_tss = true; 2122 has_tss = true;
2123 has_cmsg = true;
1972 } 2124 }
1973 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) 2125 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1974 goto found_fin_ok; 2126 goto found_fin_ok;
@@ -1988,13 +2140,20 @@ skip_copy:
1988 * on connected socket. I was just happy when found this 8) --ANK 2140 * on connected socket. I was just happy when found this 8) --ANK
1989 */ 2141 */
1990 2142
1991 if (has_tss)
1992 tcp_recv_timestamp(msg, sk, &tss);
1993
1994 /* Clean up data we have read: This will do ACK frames. */ 2143 /* Clean up data we have read: This will do ACK frames. */
1995 tcp_cleanup_rbuf(sk, copied); 2144 tcp_cleanup_rbuf(sk, copied);
1996 2145
1997 release_sock(sk); 2146 release_sock(sk);
2147
2148 if (has_cmsg) {
2149 if (has_tss)
2150 tcp_recv_timestamp(msg, sk, &tss);
2151 if (tp->recvmsg_inq) {
2152 inq = tcp_inq_hint(sk);
2153 put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
2154 }
2155 }
2156
1998 return copied; 2157 return copied;
1999 2158
2000out: 2159out:
@@ -2411,6 +2570,7 @@ int tcp_disconnect(struct sock *sk, int flags)
2411 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 2570 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2412 tp->snd_cwnd_cnt = 0; 2571 tp->snd_cwnd_cnt = 0;
2413 tp->window_clamp = 0; 2572 tp->window_clamp = 0;
2573 tp->delivered_ce = 0;
2414 tcp_set_ca_state(sk, TCP_CA_Open); 2574 tcp_set_ca_state(sk, TCP_CA_Open);
2415 tp->is_sack_reneg = 0; 2575 tp->is_sack_reneg = 0;
2416 tcp_clear_retrans(tp); 2576 tcp_clear_retrans(tp);
@@ -2424,6 +2584,7 @@ int tcp_disconnect(struct sock *sk, int flags)
2424 dst_release(sk->sk_rx_dst); 2584 dst_release(sk->sk_rx_dst);
2425 sk->sk_rx_dst = NULL; 2585 sk->sk_rx_dst = NULL;
2426 tcp_saved_syn_free(tp); 2586 tcp_saved_syn_free(tp);
2587 tp->compressed_ack = 0;
2427 2588
2428 /* Clean up fastopen related fields */ 2589 /* Clean up fastopen related fields */
2429 tcp_free_fastopen_req(tp); 2590 tcp_free_fastopen_req(tp);
@@ -2862,6 +3023,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2862 tp->notsent_lowat = val; 3023 tp->notsent_lowat = val;
2863 sk->sk_write_space(sk); 3024 sk->sk_write_space(sk);
2864 break; 3025 break;
3026 case TCP_INQ:
3027 if (val > 1 || val < 0)
3028 err = -EINVAL;
3029 else
3030 tp->recvmsg_inq = val;
3031 break;
2865 default: 3032 default:
2866 err = -ENOPROTOOPT; 3033 err = -ENOPROTOOPT;
2867 break; 3034 break;
@@ -3020,6 +3187,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
3020 rate64 = tcp_compute_delivery_rate(tp); 3187 rate64 = tcp_compute_delivery_rate(tp);
3021 if (rate64) 3188 if (rate64)
3022 info->tcpi_delivery_rate = rate64; 3189 info->tcpi_delivery_rate = rate64;
3190 info->tcpi_delivered = tp->delivered;
3191 info->tcpi_delivered_ce = tp->delivered_ce;
3023 unlock_sock_fast(sk, slow); 3192 unlock_sock_fast(sk, slow);
3024} 3193}
3025EXPORT_SYMBOL_GPL(tcp_get_info); 3194EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3033,7 +3202,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
3033 u32 rate; 3202 u32 rate;
3034 3203
3035 stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + 3204 stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
3036 5 * nla_total_size(sizeof(u32)) + 3205 7 * nla_total_size(sizeof(u32)) +
3037 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); 3206 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
3038 if (!stats) 3207 if (!stats)
3039 return NULL; 3208 return NULL;
@@ -3064,9 +3233,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
3064 nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); 3233 nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
3065 nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); 3234 nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
3066 nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); 3235 nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
3236 nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
3237 nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
3067 3238
3068 nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); 3239 nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
3069 nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); 3240 nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
3241
3070 return stats; 3242 return stats;
3071} 3243}
3072 3244
@@ -3282,6 +3454,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
3282 case TCP_NOTSENT_LOWAT: 3454 case TCP_NOTSENT_LOWAT:
3283 val = tp->notsent_lowat; 3455 val = tp->notsent_lowat;
3284 break; 3456 break;
3457 case TCP_INQ:
3458 val = tp->recvmsg_inq;
3459 break;
3285 case TCP_SAVE_SYN: 3460 case TCP_SAVE_SYN:
3286 val = tp->save_syn; 3461 val = tp->save_syn;
3287 break; 3462 break;
@@ -3318,6 +3493,25 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
3318 } 3493 }
3319 return 0; 3494 return 0;
3320 } 3495 }
3496#ifdef CONFIG_MMU
3497 case TCP_ZEROCOPY_RECEIVE: {
3498 struct tcp_zerocopy_receive zc;
3499 int err;
3500
3501 if (get_user(len, optlen))
3502 return -EFAULT;
3503 if (len != sizeof(zc))
3504 return -EINVAL;
3505 if (copy_from_user(&zc, optval, len))
3506 return -EFAULT;
3507 lock_sock(sk);
3508 err = tcp_zerocopy_receive(sk, &zc);
3509 release_sock(sk);
3510 if (!err && copy_to_user(optval, &zc, len))
3511 err = -EFAULT;
3512 return err;
3513 }
3514#endif
3321 default: 3515 default:
3322 return -ENOPROTOOPT; 3516 return -ENOPROTOOPT;
3323 } 3517 }
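Beyond zero-copy receive, the tcp.c changes add the TCP_INQ socket option: once enabled, each recvmsg() returns a TCP_CM_INQ control message carrying the number of bytes still queued for reading (see tcp_inq_hint() above). A small userspace sketch follows; the fallback defines cover libc headers that predate the option, with values taken from include/uapi/linux/tcp.h.

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef TCP_INQ
#define TCP_INQ		36	/* from include/uapi/linux/tcp.h */
#define TCP_CM_INQ	TCP_INQ
#endif

/* Enable the in-queue hint once on a connected TCP socket. */
static int enable_inq(int fd)
{
	int one = 1;

	return setsockopt(fd, IPPROTO_TCP, TCP_INQ, &one, sizeof(one));
}

/* Read data and report how many bytes are still queued in the kernel;
 * *inq is set to -1 if no TCP_CM_INQ cmsg was attached.
 */
static ssize_t recv_with_inq(int fd, void *buf, size_t len, int *inq)
{
	char cbuf[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= cbuf,
		.msg_controllen	= sizeof(cbuf),
	};
	struct cmsghdr *cm;
	ssize_t n;

	*inq = -1;
	n = recvmsg(fd, &msg, 0);
	if (n < 0)
		return n;

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
		if (cm->cmsg_level == IPPROTO_TCP && cm->cmsg_type == TCP_CM_INQ)
			memcpy(inq, CMSG_DATA(cm), sizeof(*inq));
	return n;
}
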
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e51c644484dc..355d3dffd021 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -111,6 +111,25 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
111#define REXMIT_LOST 1 /* retransmit packets marked lost */ 111#define REXMIT_LOST 1 /* retransmit packets marked lost */
112#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ 112#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
113 113
114#if IS_ENABLED(CONFIG_TLS_DEVICE)
115static DEFINE_STATIC_KEY_FALSE(clean_acked_data_enabled);
116
117void clean_acked_data_enable(struct inet_connection_sock *icsk,
118 void (*cad)(struct sock *sk, u32 ack_seq))
119{
120 icsk->icsk_clean_acked = cad;
121 static_branch_inc(&clean_acked_data_enabled);
122}
123EXPORT_SYMBOL_GPL(clean_acked_data_enable);
124
125void clean_acked_data_disable(struct inet_connection_sock *icsk)
126{
127 static_branch_dec(&clean_acked_data_enabled);
128 icsk->icsk_clean_acked = NULL;
129}
130EXPORT_SYMBOL_GPL(clean_acked_data_disable);
131#endif
132
114static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb, 133static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
115 unsigned int len) 134 unsigned int len)
116{ 135{
@@ -184,21 +203,23 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
184 } 203 }
185} 204}
186 205
187static void tcp_incr_quickack(struct sock *sk) 206static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
188{ 207{
189 struct inet_connection_sock *icsk = inet_csk(sk); 208 struct inet_connection_sock *icsk = inet_csk(sk);
190 unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); 209 unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
191 210
192 if (quickacks == 0) 211 if (quickacks == 0)
193 quickacks = 2; 212 quickacks = 2;
213 quickacks = min(quickacks, max_quickacks);
194 if (quickacks > icsk->icsk_ack.quick) 214 if (quickacks > icsk->icsk_ack.quick)
195 icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); 215 icsk->icsk_ack.quick = quickacks;
196} 216}
197 217
198static void tcp_enter_quickack_mode(struct sock *sk) 218static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
199{ 219{
200 struct inet_connection_sock *icsk = inet_csk(sk); 220 struct inet_connection_sock *icsk = inet_csk(sk);
201 tcp_incr_quickack(sk); 221
222 tcp_incr_quickack(sk, max_quickacks);
202 icsk->icsk_ack.pingpong = 0; 223 icsk->icsk_ack.pingpong = 0;
203 icsk->icsk_ack.ato = TCP_ATO_MIN; 224 icsk->icsk_ack.ato = TCP_ATO_MIN;
204} 225}
@@ -233,8 +254,10 @@ static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
233 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; 254 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
234} 255}
235 256
236static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) 257static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
237{ 258{
259 struct tcp_sock *tp = tcp_sk(sk);
260
238 switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { 261 switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
239 case INET_ECN_NOT_ECT: 262 case INET_ECN_NOT_ECT:
240 /* Funny extension: if ECT is not set on a segment, 263 /* Funny extension: if ECT is not set on a segment,
@@ -242,31 +265,31 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
242 * it is probably a retransmit. 265 * it is probably a retransmit.
243 */ 266 */
244 if (tp->ecn_flags & TCP_ECN_SEEN) 267 if (tp->ecn_flags & TCP_ECN_SEEN)
245 tcp_enter_quickack_mode((struct sock *)tp); 268 tcp_enter_quickack_mode(sk, 1);
246 break; 269 break;
247 case INET_ECN_CE: 270 case INET_ECN_CE:
248 if (tcp_ca_needs_ecn((struct sock *)tp)) 271 if (tcp_ca_needs_ecn(sk))
249 tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE); 272 tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
250 273
251 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { 274 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
252 /* Better not delay acks, sender can have a very low cwnd */ 275 /* Better not delay acks, sender can have a very low cwnd */
253 tcp_enter_quickack_mode((struct sock *)tp); 276 tcp_enter_quickack_mode(sk, 1);
254 tp->ecn_flags |= TCP_ECN_DEMAND_CWR; 277 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
255 } 278 }
256 tp->ecn_flags |= TCP_ECN_SEEN; 279 tp->ecn_flags |= TCP_ECN_SEEN;
257 break; 280 break;
258 default: 281 default:
259 if (tcp_ca_needs_ecn((struct sock *)tp)) 282 if (tcp_ca_needs_ecn(sk))
260 tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE); 283 tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
261 tp->ecn_flags |= TCP_ECN_SEEN; 284 tp->ecn_flags |= TCP_ECN_SEEN;
262 break; 285 break;
263 } 286 }
264} 287}
265 288
266static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) 289static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
267{ 290{
268 if (tp->ecn_flags & TCP_ECN_OK) 291 if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
269 __tcp_ecn_check_ce(tp, skb); 292 __tcp_ecn_check_ce(sk, skb);
270} 293}
271 294
272static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) 295static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
@@ -582,6 +605,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
582 u32 copied; 605 u32 copied;
583 int time; 606 int time;
584 607
608 trace_tcp_rcv_space_adjust(sk);
609
585 tcp_mstamp_refresh(tp); 610 tcp_mstamp_refresh(tp);
586 time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); 611 time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
587 if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) 612 if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
@@ -665,7 +690,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
665 /* The _first_ data packet received, initialize 690 /* The _first_ data packet received, initialize
666 * delayed ACK engine. 691 * delayed ACK engine.
667 */ 692 */
668 tcp_incr_quickack(sk); 693 tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
669 icsk->icsk_ack.ato = TCP_ATO_MIN; 694 icsk->icsk_ack.ato = TCP_ATO_MIN;
670 } else { 695 } else {
671 int m = now - icsk->icsk_ack.lrcvtime; 696 int m = now - icsk->icsk_ack.lrcvtime;
@@ -681,13 +706,13 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
681 /* Too long gap. Apparently sender failed to 706 /* Too long gap. Apparently sender failed to
682 * restart window, so that we send ACKs quickly. 707 * restart window, so that we send ACKs quickly.
683 */ 708 */
684 tcp_incr_quickack(sk); 709 tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
685 sk_mem_reclaim(sk); 710 sk_mem_reclaim(sk);
686 } 711 }
687 } 712 }
688 icsk->icsk_ack.lrcvtime = now; 713 icsk->icsk_ack.lrcvtime = now;
689 714
690 tcp_ecn_check_ce(tp, skb); 715 tcp_ecn_check_ce(sk, skb);
691 716
692 if (skb->len >= 128) 717 if (skb->len >= 128)
693 tcp_grow_window(sk, skb); 718 tcp_grow_window(sk, skb);
@@ -1896,19 +1921,54 @@ static inline void tcp_init_undo(struct tcp_sock *tp)
1896 tp->undo_retrans = tp->retrans_out ? : -1; 1921 tp->undo_retrans = tp->retrans_out ? : -1;
1897} 1922}
1898 1923
1899/* Enter Loss state. If we detect SACK reneging, forget all SACK information 1924static bool tcp_is_rack(const struct sock *sk)
1925{
1926 return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
1927}
1928
1929/* If we detect SACK reneging, forget all SACK information
1900 * and reset tags completely, otherwise preserve SACKs. If receiver 1930 * and reset tags completely, otherwise preserve SACKs. If receiver
1901 * dropped its ofo queue, we will know this due to reneging detection. 1931 * dropped its ofo queue, we will know this due to reneging detection.
1902 */ 1932 */
1933static void tcp_timeout_mark_lost(struct sock *sk)
1934{
1935 struct tcp_sock *tp = tcp_sk(sk);
1936 struct sk_buff *skb, *head;
1937 bool is_reneg; /* is receiver reneging on SACKs? */
1938
1939 head = tcp_rtx_queue_head(sk);
1940 is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
1941 if (is_reneg) {
1942 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
1943 tp->sacked_out = 0;
1944 /* Mark SACK reneging until we recover from this loss event. */
1945 tp->is_sack_reneg = 1;
1946 } else if (tcp_is_reno(tp)) {
1947 tcp_reset_reno_sack(tp);
1948 }
1949
1950 skb = head;
1951 skb_rbtree_walk_from(skb) {
1952 if (is_reneg)
1953 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
1954 else if (tcp_is_rack(sk) && skb != head &&
1955 tcp_rack_skb_timeout(tp, skb, 0) > 0)
1956 continue; /* Don't mark recently sent ones lost yet */
1957 tcp_mark_skb_lost(sk, skb);
1958 }
1959 tcp_verify_left_out(tp);
1960 tcp_clear_all_retrans_hints(tp);
1961}
1962
1963/* Enter Loss state. */
1903void tcp_enter_loss(struct sock *sk) 1964void tcp_enter_loss(struct sock *sk)
1904{ 1965{
1905 const struct inet_connection_sock *icsk = inet_csk(sk); 1966 const struct inet_connection_sock *icsk = inet_csk(sk);
1906 struct tcp_sock *tp = tcp_sk(sk); 1967 struct tcp_sock *tp = tcp_sk(sk);
1907 struct net *net = sock_net(sk); 1968 struct net *net = sock_net(sk);
1908 struct sk_buff *skb;
1909 bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; 1969 bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
1910 bool is_reneg; /* is receiver reneging on SACKs? */ 1970
1911 bool mark_lost; 1971 tcp_timeout_mark_lost(sk);
1912 1972
1913 /* Reduce ssthresh if it has not yet been made inside this window. */ 1973 /* Reduce ssthresh if it has not yet been made inside this window. */
1914 if (icsk->icsk_ca_state <= TCP_CA_Disorder || 1974 if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1920,40 +1980,10 @@ void tcp_enter_loss(struct sock *sk)
1920 tcp_ca_event(sk, CA_EVENT_LOSS); 1980 tcp_ca_event(sk, CA_EVENT_LOSS);
1921 tcp_init_undo(tp); 1981 tcp_init_undo(tp);
1922 } 1982 }
1923 tp->snd_cwnd = 1; 1983 tp->snd_cwnd = tcp_packets_in_flight(tp) + 1;
1924 tp->snd_cwnd_cnt = 0; 1984 tp->snd_cwnd_cnt = 0;
1925 tp->snd_cwnd_stamp = tcp_jiffies32; 1985 tp->snd_cwnd_stamp = tcp_jiffies32;
1926 1986
1927 tp->retrans_out = 0;
1928 tp->lost_out = 0;
1929
1930 if (tcp_is_reno(tp))
1931 tcp_reset_reno_sack(tp);
1932
1933 skb = tcp_rtx_queue_head(sk);
1934 is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
1935 if (is_reneg) {
1936 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
1937 tp->sacked_out = 0;
1938 /* Mark SACK reneging until we recover from this loss event. */
1939 tp->is_sack_reneg = 1;
1940 }
1941 tcp_clear_all_retrans_hints(tp);
1942
1943 skb_rbtree_walk_from(skb) {
1944 mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
1945 is_reneg);
1946 if (mark_lost)
1947 tcp_sum_lost(tp, skb);
1948 TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
1949 if (mark_lost) {
1950 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
1951 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1952 tp->lost_out += tcp_skb_pcount(skb);
1953 }
1954 }
1955 tcp_verify_left_out(tp);
1956
1957 /* Timeout in disordered state after receiving substantial DUPACKs 1987 /* Timeout in disordered state after receiving substantial DUPACKs
1958 * suggests that the degree of reordering is over-estimated. 1988 * suggests that the degree of reordering is over-estimated.
1959 */ 1989 */
@@ -2120,7 +2150,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
2120 return true; 2150 return true;
2121 2151
2122 /* Not-A-Trick#2 : Classic rule... */ 2152 /* Not-A-Trick#2 : Classic rule... */
2123 if (tcp_dupack_heuristics(tp) > tp->reordering) 2153 if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
2124 return true; 2154 return true;
2125 2155
2126 return false; 2156 return false;
@@ -2197,9 +2227,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
2197{ 2227{
2198 struct tcp_sock *tp = tcp_sk(sk); 2228 struct tcp_sock *tp = tcp_sk(sk);
2199 2229
2200 if (tcp_is_reno(tp)) { 2230 if (tcp_is_sack(tp)) {
2201 tcp_mark_head_lost(sk, 1, 1);
2202 } else {
2203 int sacked_upto = tp->sacked_out - tp->reordering; 2231 int sacked_upto = tp->sacked_out - tp->reordering;
2204 if (sacked_upto >= 0) 2232 if (sacked_upto >= 0)
2205 tcp_mark_head_lost(sk, sacked_upto, 0); 2233 tcp_mark_head_lost(sk, sacked_upto, 0);
@@ -2697,12 +2725,16 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
2697 return false; 2725 return false;
2698} 2726}
2699 2727
2700static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) 2728static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
2701{ 2729{
2702 struct tcp_sock *tp = tcp_sk(sk); 2730 struct tcp_sock *tp = tcp_sk(sk);
2703 2731
2704 /* Use RACK to detect loss */ 2732 if (tcp_rtx_queue_empty(sk))
2705 if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { 2733 return;
2734
2735 if (unlikely(tcp_is_reno(tp))) {
2736 tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
2737 } else if (tcp_is_rack(sk)) {
2706 u32 prior_retrans = tp->retrans_out; 2738 u32 prior_retrans = tp->retrans_out;
2707 2739
2708 tcp_rack_mark_lost(sk); 2740 tcp_rack_mark_lost(sk);
@@ -2798,11 +2830,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
2798 tcp_try_keep_open(sk); 2830 tcp_try_keep_open(sk);
2799 return; 2831 return;
2800 } 2832 }
2801 tcp_rack_identify_loss(sk, ack_flag); 2833 tcp_identify_packet_loss(sk, ack_flag);
2802 break; 2834 break;
2803 case TCP_CA_Loss: 2835 case TCP_CA_Loss:
2804 tcp_process_loss(sk, flag, is_dupack, rexmit); 2836 tcp_process_loss(sk, flag, is_dupack, rexmit);
2805 tcp_rack_identify_loss(sk, ack_flag); 2837 tcp_identify_packet_loss(sk, ack_flag);
2806 if (!(icsk->icsk_ca_state == TCP_CA_Open || 2838 if (!(icsk->icsk_ca_state == TCP_CA_Open ||
2807 (*ack_flag & FLAG_LOST_RETRANS))) 2839 (*ack_flag & FLAG_LOST_RETRANS)))
2808 return; 2840 return;
@@ -2819,7 +2851,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
2819 if (icsk->icsk_ca_state <= TCP_CA_Disorder) 2851 if (icsk->icsk_ca_state <= TCP_CA_Disorder)
2820 tcp_try_undo_dsack(sk); 2852 tcp_try_undo_dsack(sk);
2821 2853
2822 tcp_rack_identify_loss(sk, ack_flag); 2854 tcp_identify_packet_loss(sk, ack_flag);
2823 if (!tcp_time_to_recover(sk, flag)) { 2855 if (!tcp_time_to_recover(sk, flag)) {
2824 tcp_try_to_open(sk, flag); 2856 tcp_try_to_open(sk, flag);
2825 return; 2857 return;
@@ -2841,7 +2873,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
2841 fast_rexmit = 1; 2873 fast_rexmit = 1;
2842 } 2874 }
2843 2875
2844 if (do_lost) 2876 if (!tcp_is_rack(sk) && do_lost)
2845 tcp_update_scoreboard(sk, fast_rexmit); 2877 tcp_update_scoreboard(sk, fast_rexmit);
2846 *rexmit = REXMIT_LOST; 2878 *rexmit = REXMIT_LOST;
2847} 2879}
@@ -3496,6 +3528,22 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
3496 tcp_xmit_retransmit_queue(sk); 3528 tcp_xmit_retransmit_queue(sk);
3497} 3529}
3498 3530
3531/* Returns the number of packets newly acked or sacked by the current ACK */
3532static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
3533{
3534 const struct net *net = sock_net(sk);
3535 struct tcp_sock *tp = tcp_sk(sk);
3536 u32 delivered;
3537
3538 delivered = tp->delivered - prior_delivered;
3539 NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
3540 if (flag & FLAG_ECE) {
3541 tp->delivered_ce += delivered;
3542 NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
3543 }
3544 return delivered;
3545}
3546
3499/* This routine deals with incoming acks, but not outgoing ones. */ 3547/* This routine deals with incoming acks, but not outgoing ones. */
3500static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) 3548static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3501{ 3549{
@@ -3542,6 +3590,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3542 if (after(ack, prior_snd_una)) { 3590 if (after(ack, prior_snd_una)) {
3543 flag |= FLAG_SND_UNA_ADVANCED; 3591 flag |= FLAG_SND_UNA_ADVANCED;
3544 icsk->icsk_retransmits = 0; 3592 icsk->icsk_retransmits = 0;
3593
3594#if IS_ENABLED(CONFIG_TLS_DEVICE)
3595 if (static_branch_unlikely(&clean_acked_data_enabled))
3596 if (icsk->icsk_clean_acked)
3597 icsk->icsk_clean_acked(sk, ack);
3598#endif
3545 } 3599 }
3546 3600
3547 prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; 3601 prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
@@ -3619,7 +3673,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3619 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) 3673 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
3620 sk_dst_confirm(sk); 3674 sk_dst_confirm(sk);
3621 3675
3622 delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ 3676 delivered = tcp_newly_delivered(sk, delivered, flag);
3623 lost = tp->lost - lost; /* freshly marked lost */ 3677 lost = tp->lost - lost; /* freshly marked lost */
3624 rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); 3678 rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
3625 tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); 3679 tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
@@ -3629,9 +3683,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3629 3683
3630no_queue: 3684no_queue:
3631 /* If data was DSACKed, see if we can undo a cwnd reduction. */ 3685 /* If data was DSACKed, see if we can undo a cwnd reduction. */
3632 if (flag & FLAG_DSACKING_ACK) 3686 if (flag & FLAG_DSACKING_ACK) {
3633 tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, 3687 tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3634 &rexmit); 3688 &rexmit);
3689 tcp_newly_delivered(sk, delivered, flag);
3690 }
3635 /* If this ack opens up a zero window, clear backoff. It was 3691 /* If this ack opens up a zero window, clear backoff. It was
3636 * being used to time the probes, and is probably far higher than 3692 * being used to time the probes, and is probably far higher than
3637 * it needs to be for normal retransmission. 3693 * it needs to be for normal retransmission.
@@ -3655,6 +3711,7 @@ old_ack:
3655 &sack_state); 3711 &sack_state);
3656 tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, 3712 tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3657 &rexmit); 3713 &rexmit);
3714 tcp_newly_delivered(sk, delivered, flag);
3658 tcp_xmit_recovery(sk, rexmit); 3715 tcp_xmit_recovery(sk, rexmit);
3659 } 3716 }
3660 3717
@@ -4126,7 +4183,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
4126 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && 4183 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
4127 before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { 4184 before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4128 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); 4185 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
4129 tcp_enter_quickack_mode(sk); 4186 tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
4130 4187
4131 if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { 4188 if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
4132 u32 end_seq = TCP_SKB_CB(skb)->end_seq; 4189 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -4196,6 +4253,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
4196 * If the sack array is full, forget about the last one. 4253 * If the sack array is full, forget about the last one.
4197 */ 4254 */
4198 if (this_sack >= TCP_NUM_SACKS) { 4255 if (this_sack >= TCP_NUM_SACKS) {
4256 if (tp->compressed_ack)
4257 tcp_send_ack(sk);
4199 this_sack--; 4258 this_sack--;
4200 tp->rx_opt.num_sacks--; 4259 tp->rx_opt.num_sacks--;
4201 sp--; 4260 sp--;
@@ -4377,7 +4436,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4377 u32 seq, end_seq; 4436 u32 seq, end_seq;
4378 bool fragstolen; 4437 bool fragstolen;
4379 4438
4380 tcp_ecn_check_ce(tp, skb); 4439 tcp_ecn_check_ce(sk, skb);
4381 4440
4382 if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { 4441 if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
4383 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); 4442 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
@@ -4573,6 +4632,17 @@ err:
4573 4632
4574} 4633}
4575 4634
4635void tcp_data_ready(struct sock *sk)
4636{
4637 const struct tcp_sock *tp = tcp_sk(sk);
4638 int avail = tp->rcv_nxt - tp->copied_seq;
4639
4640 if (avail < sk->sk_rcvlowat && !sock_flag(sk, SOCK_DONE))
4641 return;
4642
4643 sk->sk_data_ready(sk);
4644}
4645
4576static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) 4646static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4577{ 4647{
4578 struct tcp_sock *tp = tcp_sk(sk); 4648 struct tcp_sock *tp = tcp_sk(sk);
@@ -4630,7 +4700,7 @@ queue_and_out:
4630 if (eaten > 0) 4700 if (eaten > 0)
4631 kfree_skb_partial(skb, fragstolen); 4701 kfree_skb_partial(skb, fragstolen);
4632 if (!sock_flag(sk, SOCK_DEAD)) 4702 if (!sock_flag(sk, SOCK_DEAD))
4633 sk->sk_data_ready(sk); 4703 tcp_data_ready(sk);
4634 return; 4704 return;
4635 } 4705 }
4636 4706
@@ -4640,7 +4710,7 @@ queue_and_out:
4640 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); 4710 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
4641 4711
4642out_of_window: 4712out_of_window:
4643 tcp_enter_quickack_mode(sk); 4713 tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
4644 inet_csk_schedule_ack(sk); 4714 inet_csk_schedule_ack(sk);
4645drop: 4715drop:
4646 tcp_drop(sk, skb); 4716 tcp_drop(sk, skb);
@@ -4651,8 +4721,6 @@ drop:
4651 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) 4721 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
4652 goto out_of_window; 4722 goto out_of_window;
4653 4723
4654 tcp_enter_quickack_mode(sk);
4655
4656 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { 4724 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4657 /* Partial packet, seq < rcv_next < end_seq */ 4725 /* Partial packet, seq < rcv_next < end_seq */
4658 SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", 4726 SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
@@ -5019,23 +5087,48 @@ static inline void tcp_data_snd_check(struct sock *sk)
5019static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) 5087static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
5020{ 5088{
5021 struct tcp_sock *tp = tcp_sk(sk); 5089 struct tcp_sock *tp = tcp_sk(sk);
5090 unsigned long rtt, delay;
5022 5091
5023 /* More than one full frame received... */ 5092 /* More than one full frame received... */
5024 if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && 5093 if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
5025 /* ... and right edge of window advances far enough. 5094 /* ... and right edge of window advances far enough.
5026 * (tcp_recvmsg() will send ACK otherwise). Or... 5095 * (tcp_recvmsg() will send ACK otherwise).
 5096	 * If the application uses SO_RCVLOWAT, we want to send an ack now if
5097 * we have not received enough bytes to satisfy the condition.
5027 */ 5098 */
5028 __tcp_select_window(sk) >= tp->rcv_wnd) || 5099 (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
5100 __tcp_select_window(sk) >= tp->rcv_wnd)) ||
5029 /* We ACK each frame or... */ 5101 /* We ACK each frame or... */
5030 tcp_in_quickack_mode(sk) || 5102 tcp_in_quickack_mode(sk)) {
5031 /* We have out of order data. */ 5103send_now:
5032 (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
5033 /* Then ack it now */
5034 tcp_send_ack(sk); 5104 tcp_send_ack(sk);
5035 } else { 5105 return;
5036 /* Else, send delayed ack. */ 5106 }
5107
5108 if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
5037 tcp_send_delayed_ack(sk); 5109 tcp_send_delayed_ack(sk);
5110 return;
5038 } 5111 }
5112
5113 if (!tcp_is_sack(tp) ||
5114 tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
5115 goto send_now;
5116 tp->compressed_ack++;
5117
5118 if (hrtimer_is_queued(&tp->compressed_ack_timer))
5119 return;
5120
 5121	/* compressed ack timer: 5% of rtt, but no more than tcp_comp_sack_delay_ns */
5122
5123 rtt = tp->rcv_rtt_est.rtt_us;
5124 if (tp->srtt_us && tp->srtt_us < rtt)
5125 rtt = tp->srtt_us;
5126
5127 delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
5128 rtt * (NSEC_PER_USEC >> 3)/20);
5129 sock_hold(sk);
5130 hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
5131 HRTIMER_MODE_REL_PINNED_SOFT);
5039} 5132}
5040 5133
5041static inline void tcp_ack_snd_check(struct sock *sk) 5134static inline void tcp_ack_snd_check(struct sock *sk)
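
The rewritten __tcp_ack_snd_check() above adds SACK compression: instead of acking every out-of-order segment, up to tcp_comp_sack_nr ACKs are withheld and an hrtimer fires after roughly 5% of the estimated RTT, capped by tcp_comp_sack_delay_ns. A standalone sketch of that delay computation, assuming (as the kernel does) that the RTT inputs are microseconds scaled by 8:

#include <stdio.h>

#define NSEC_PER_USEC 1000UL

/* rtt_scaled: RTT in usec, left-shifted by 3 (srtt_us / rcv_rtt convention) */
static unsigned long comp_sack_delay_ns(unsigned long rtt_scaled,
                                        unsigned long sysctl_delay_ns)
{
        /* rtt_scaled * 125 / 20 ns == 5% of the unscaled RTT, in ns */
        unsigned long delay = rtt_scaled * (NSEC_PER_USEC >> 3) / 20;

        return delay < sysctl_delay_ns ? delay : sysctl_delay_ns;
}

int main(void)
{
        /* 10 ms RTT, 1 ms cap (the default set later in this series) */
        printf("%lu ns\n", comp_sack_delay_ns(10000UL << 3, 1000000UL));
        return 0;
}

For a 10 ms RTT this prints 500000 ns, i.e. the 0.5 ms compression window wins over the 1 ms cap.
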
@@ -5299,11 +5392,11 @@ discard:
5299 * the rest is checked inline. Fast processing is turned on in 5392 * the rest is checked inline. Fast processing is turned on in
5300 * tcp_data_queue when everything is OK. 5393 * tcp_data_queue when everything is OK.
5301 */ 5394 */
5302void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, 5395void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
5303 const struct tcphdr *th)
5304{ 5396{
5305 unsigned int len = skb->len; 5397 const struct tcphdr *th = (const struct tcphdr *)skb->data;
5306 struct tcp_sock *tp = tcp_sk(sk); 5398 struct tcp_sock *tp = tcp_sk(sk);
5399 unsigned int len = skb->len;
5307 5400
5308 /* TCP congestion window tracking */ 5401 /* TCP congestion window tracking */
5309 trace_tcp_probe(sk, skb); 5402 trace_tcp_probe(sk, skb);
@@ -5428,7 +5521,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5428no_ack: 5521no_ack:
5429 if (eaten) 5522 if (eaten)
5430 kfree_skb_partial(skb, fragstolen); 5523 kfree_skb_partial(skb, fragstolen);
5431 sk->sk_data_ready(sk); 5524 tcp_data_ready(sk);
5432 return; 5525 return;
5433 } 5526 }
5434 } 5527 }
@@ -5550,9 +5643,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5550 return true; 5643 return true;
5551 } 5644 }
5552 tp->syn_data_acked = tp->syn_data; 5645 tp->syn_data_acked = tp->syn_data;
5553 if (tp->syn_data_acked) 5646 if (tp->syn_data_acked) {
5554 NET_INC_STATS(sock_net(sk), 5647 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
5555 LINUX_MIB_TCPFASTOPENACTIVE); 5648 /* SYN-data is counted as two separate packets in tcp_ack() */
5649 if (tp->delivered > 1)
5650 --tp->delivered;
5651 }
5556 5652
5557 tcp_fastopen_add_skb(sk, synack); 5653 tcp_fastopen_add_skb(sk, synack);
5558 5654
@@ -5698,7 +5794,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5698 * to stand against the temptation 8) --ANK 5794 * to stand against the temptation 8) --ANK
5699 */ 5795 */
5700 inet_csk_schedule_ack(sk); 5796 inet_csk_schedule_ack(sk);
5701 tcp_enter_quickack_mode(sk); 5797 tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
5702 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 5798 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
5703 TCP_DELACK_MAX, TCP_RTO_MAX); 5799 TCP_DELACK_MAX, TCP_RTO_MAX);
5704 5800
@@ -5884,6 +5980,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
5884 } 5980 }
5885 switch (sk->sk_state) { 5981 switch (sk->sk_state) {
5886 case TCP_SYN_RECV: 5982 case TCP_SYN_RECV:
5983 tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
5887 if (!tp->srtt_us) 5984 if (!tp->srtt_us)
5888 tcp_synack_rtt_meas(sk, req); 5985 tcp_synack_rtt_meas(sk, req);
5889 5986
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2c970626b398..fed3f1c66167 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -110,8 +110,38 @@ static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
110 110
111int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 111int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112{ 112{
113 const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 114 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 struct tcp_sock *tp = tcp_sk(sk); 115 struct tcp_sock *tp = tcp_sk(sk);
116 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
117
118 if (reuse == 2) {
119 /* Still does not detect *everything* that goes through
120 * lo, since we require a loopback src or dst address
121 * or direct binding to 'lo' interface.
122 */
123 bool loopback = false;
124 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
125 loopback = true;
126#if IS_ENABLED(CONFIG_IPV6)
127 if (tw->tw_family == AF_INET6) {
128 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
129 (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
130 (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
131 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132 (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
133 (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
134 loopback = true;
135 } else
136#endif
137 {
138 if (ipv4_is_loopback(tw->tw_daddr) ||
139 ipv4_is_loopback(tw->tw_rcv_saddr))
140 loopback = true;
141 }
142 if (!loopback)
143 reuse = 0;
144 }
115 145
116 /* With PAWS, it is safe from the viewpoint 146 /* With PAWS, it is safe from the viewpoint
117 of data integrity. Even without PAWS it is safe provided sequence 147 of data integrity. Even without PAWS it is safe provided sequence
@@ -125,8 +155,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
125 and use initial timestamp retrieved from peer table. 155 and use initial timestamp retrieved from peer table.
126 */ 156 */
127 if (tcptw->tw_ts_recent_stamp && 157 if (tcptw->tw_ts_recent_stamp &&
128 (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse && 158 (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
129 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; 159 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131 if (tp->write_seq == 0) 160 if (tp->write_seq == 0)
132 tp->write_seq = 1; 161 tp->write_seq = 1;
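
tcp_tw_reuse == 2 is a new mode: timewait reuse is allowed unconditionally only when both endpoints are loopback, so the relaxed default cannot surprise connections that actually cross the network. A userspace restatement of the address test in the hunk above — a sketch only, covering 127/8, ::1 and v4-mapped ::ffff:127.x.x.x:

#include <stdbool.h>
#include <stdio.h>
#include <arpa/inet.h>
#include <netinet/in.h>

static bool v6_is_loopbackish(const struct in6_addr *a)
{
        if (IN6_IS_ADDR_LOOPBACK(a))                      /* ::1 */
                return true;
        if (IN6_IS_ADDR_V4MAPPED(a) && a->s6_addr[12] == 127)
                return true;                              /* ::ffff:127.x.x.x */
        return false;
}

static bool v4_is_loopback(struct in_addr a)
{
        return (ntohl(a.s_addr) >> 24) == 127;            /* 127.0.0.0/8 */
}

int main(void)
{
        struct in6_addr a;
        struct in_addr b;

        inet_pton(AF_INET6, "::ffff:127.0.0.1", &a);
        inet_pton(AF_INET, "10.0.0.1", &b);
        printf("%d %d\n", v6_is_loopbackish(&a), v4_is_loopback(b)); /* 1 0 */
        return 0;
}
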
@@ -621,6 +650,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
621 struct sock *sk1 = NULL; 650 struct sock *sk1 = NULL;
622#endif 651#endif
623 struct net *net; 652 struct net *net;
653 struct sock *ctl_sk;
624 654
625 /* Never send a reset in response to a reset. */ 655 /* Never send a reset in response to a reset. */
626 if (th->rst) 656 if (th->rst)
@@ -723,11 +753,16 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
723 arg.tos = ip_hdr(skb)->tos; 753 arg.tos = ip_hdr(skb)->tos;
724 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 754 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
725 local_bh_disable(); 755 local_bh_disable();
726 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), 756 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
757 if (sk)
758 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
759 inet_twsk(sk)->tw_mark : sk->sk_mark;
760 ip_send_unicast_reply(ctl_sk,
727 skb, &TCP_SKB_CB(skb)->header.h4.opt, 761 skb, &TCP_SKB_CB(skb)->header.h4.opt,
728 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 762 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
729 &arg, arg.iov[0].iov_len); 763 &arg, arg.iov[0].iov_len);
730 764
765 ctl_sk->sk_mark = 0;
731 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 766 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
732 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 767 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
733 local_bh_enable(); 768 local_bh_enable();
@@ -759,6 +794,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
759 } rep; 794 } rep;
760 struct net *net = sock_net(sk); 795 struct net *net = sock_net(sk);
761 struct ip_reply_arg arg; 796 struct ip_reply_arg arg;
797 struct sock *ctl_sk;
762 798
763 memset(&rep.th, 0, sizeof(struct tcphdr)); 799 memset(&rep.th, 0, sizeof(struct tcphdr));
764 memset(&arg, 0, sizeof(arg)); 800 memset(&arg, 0, sizeof(arg));
@@ -809,11 +845,16 @@ static void tcp_v4_send_ack(const struct sock *sk,
809 arg.tos = tos; 845 arg.tos = tos;
810 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 846 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
811 local_bh_disable(); 847 local_bh_disable();
812 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), 848 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
849 if (sk)
850 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
851 inet_twsk(sk)->tw_mark : sk->sk_mark;
852 ip_send_unicast_reply(ctl_sk,
813 skb, &TCP_SKB_CB(skb)->header.h4.opt, 853 skb, &TCP_SKB_CB(skb)->header.h4.opt,
814 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 854 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
815 &arg, arg.iov[0].iov_len); 855 &arg, arg.iov[0].iov_len);
816 856
857 ctl_sk->sk_mark = 0;
817 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 858 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
818 local_bh_enable(); 859 local_bh_enable();
819} 860}
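
Both reply paths above (tcp_v4_send_reset() and tcp_v4_send_ack()) previously emitted packets from the per-cpu control socket with mark 0; they now temporarily inherit the originating socket's sk_mark (or the timewait socket's new tw_mark), so fwmark-based policy routing also applies to kernel-generated RSTs and ACKs. Setting the mark from an application is unchanged — a minimal sketch (needs CAP_NET_ADMIN):

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        unsigned int mark = 0x42;       /* arbitrary example mark */

        if (fd < 0)
                return 1;
        /* RSTs and timewait ACKs the kernel sends on behalf of this
         * connection now carry the same mark instead of mark 0.
         */
        if (setsockopt(fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)) < 0)
                perror("setsockopt(SO_MARK)");
        close(fd);
        return 0;
}
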
@@ -1474,7 +1515,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1474 sk->sk_rx_dst = NULL; 1515 sk->sk_rx_dst = NULL;
1475 } 1516 }
1476 } 1517 }
1477 tcp_rcv_established(sk, skb, tcp_hdr(skb)); 1518 tcp_rcv_established(sk, skb);
1478 return 0; 1519 return 0;
1479 } 1520 }
1480 1521
@@ -2481,7 +2522,7 @@ static int __net_init tcp_sk_init(struct net *net)
2481 net->ipv4.sysctl_tcp_orphan_retries = 0; 2522 net->ipv4.sysctl_tcp_orphan_retries = 0;
2482 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 2523 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2483 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 2524 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2484 net->ipv4.sysctl_tcp_tw_reuse = 0; 2525 net->ipv4.sysctl_tcp_tw_reuse = 2;
2485 2526
2486 cnt = tcp_hashinfo.ehash_mask + 1; 2527 cnt = tcp_hashinfo.ehash_mask + 1;
2487 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2; 2528 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
@@ -2524,6 +2565,8 @@ static int __net_init tcp_sk_init(struct net *net)
2524 init_net.ipv4.sysctl_tcp_wmem, 2565 init_net.ipv4.sysctl_tcp_wmem,
2525 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 2566 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2526 } 2567 }
2568 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2569 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2527 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 2570 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2528 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); 2571 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2529 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; 2572 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
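
The two per-netns knobs initialised here control the SACK compression added in tcp_input.c: at most a 1 ms delay and at most 44 withheld ACKs by default. A small sketch that reads them back through procfs (standard net.ipv4 sysctl paths assumed):

#include <stdio.h>

static void show(const char *path)
{
        char buf[64];
        FILE *f = fopen(path, "r");

        if (!f)
                return;
        if (fgets(buf, sizeof(buf), f))
                printf("%s = %s", path, buf);
        fclose(f);
}

int main(void)
{
        show("/proc/sys/net/ipv4/tcp_comp_sack_delay_ns");      /* 1000000 */
        show("/proc/sys/net/ipv4/tcp_comp_sack_nr");            /* 44 */
        return 0;
}
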
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 57b5468b5139..1dda1341a223 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -263,6 +263,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
263 struct inet_sock *inet = inet_sk(sk); 263 struct inet_sock *inet = inet_sk(sk);
264 264
265 tw->tw_transparent = inet->transparent; 265 tw->tw_transparent = inet->transparent;
266 tw->tw_mark = sk->sk_mark;
266 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; 267 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
267 tcptw->tw_rcv_nxt = tp->rcv_nxt; 268 tcptw->tw_rcv_nxt = tp->rcv_nxt;
268 tcptw->tw_snd_nxt = tp->snd_nxt; 269 tcptw->tw_snd_nxt = tp->snd_nxt;
@@ -306,7 +307,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
306 if (timeo < rto) 307 if (timeo < rto)
307 timeo = rto; 308 timeo = rto;
308 309
309 tw->tw_timeout = TCP_TIMEWAIT_LEN;
310 if (state == TCP_TIME_WAIT) 310 if (state == TCP_TIME_WAIT)
311 timeo = TCP_TIMEWAIT_LEN; 311 timeo = TCP_TIMEWAIT_LEN;
312 312
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d07e34f8e309..8e08b409c71e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -162,6 +162,15 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
162/* Account for an ACK we sent. */ 162/* Account for an ACK we sent. */
163static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) 163static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
164{ 164{
165 struct tcp_sock *tp = tcp_sk(sk);
166
167 if (unlikely(tp->compressed_ack)) {
168 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
169 tp->compressed_ack);
170 tp->compressed_ack = 0;
171 if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
172 __sock_put(sk);
173 }
165 tcp_dec_quickack_mode(sk, pkts); 174 tcp_dec_quickack_mode(sk, pkts);
166 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); 175 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
167} 176}
@@ -229,11 +238,9 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
229 } 238 }
230 } 239 }
231 240
232 if (mss > (1 << *rcv_wscale)) { 241 if (!init_rcv_wnd) /* Use default unless specified otherwise */
233 if (!init_rcv_wnd) /* Use default unless specified otherwise */ 242 init_rcv_wnd = tcp_default_init_rwnd(mss);
234 init_rcv_wnd = tcp_default_init_rwnd(mss); 243 *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
235 *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
236 }
237 244
238 /* Set the clamp no higher than max representable value */ 245 /* Set the clamp no higher than max representable value */
239 (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); 246 (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
@@ -585,14 +592,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
585 unsigned int remaining = MAX_TCP_OPTION_SPACE; 592 unsigned int remaining = MAX_TCP_OPTION_SPACE;
586 struct tcp_fastopen_request *fastopen = tp->fastopen_req; 593 struct tcp_fastopen_request *fastopen = tp->fastopen_req;
587 594
595 *md5 = NULL;
588#ifdef CONFIG_TCP_MD5SIG 596#ifdef CONFIG_TCP_MD5SIG
589 *md5 = tp->af_specific->md5_lookup(sk, sk); 597 if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
590 if (*md5) { 598 *md5 = tp->af_specific->md5_lookup(sk, sk);
591 opts->options |= OPTION_MD5; 599 if (*md5) {
592 remaining -= TCPOLEN_MD5SIG_ALIGNED; 600 opts->options |= OPTION_MD5;
601 remaining -= TCPOLEN_MD5SIG_ALIGNED;
602 }
593 } 603 }
594#else
595 *md5 = NULL;
596#endif 604#endif
597 605
598 /* We always get an MSS option. The option bytes which will be seen in 606 /* We always get an MSS option. The option bytes which will be seen in
@@ -720,14 +728,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
720 728
721 opts->options = 0; 729 opts->options = 0;
722 730
731 *md5 = NULL;
723#ifdef CONFIG_TCP_MD5SIG 732#ifdef CONFIG_TCP_MD5SIG
724 *md5 = tp->af_specific->md5_lookup(sk, sk); 733 if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
725 if (unlikely(*md5)) { 734 *md5 = tp->af_specific->md5_lookup(sk, sk);
726 opts->options |= OPTION_MD5; 735 if (*md5) {
727 size += TCPOLEN_MD5SIG_ALIGNED; 736 opts->options |= OPTION_MD5;
737 size += TCPOLEN_MD5SIG_ALIGNED;
738 }
728 } 739 }
729#else
730 *md5 = NULL;
731#endif 740#endif
732 741
733 if (likely(tp->rx_opt.tstamp_ok)) { 742 if (likely(tp->rx_opt.tstamp_ok)) {
@@ -772,7 +781,7 @@ struct tsq_tasklet {
772}; 781};
773static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); 782static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
774 783
775static void tcp_tsq_handler(struct sock *sk) 784static void tcp_tsq_write(struct sock *sk)
776{ 785{
777 if ((1 << sk->sk_state) & 786 if ((1 << sk->sk_state) &
778 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | 787 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
@@ -789,6 +798,16 @@ static void tcp_tsq_handler(struct sock *sk)
789 0, GFP_ATOMIC); 798 0, GFP_ATOMIC);
790 } 799 }
791} 800}
801
802static void tcp_tsq_handler(struct sock *sk)
803{
804 bh_lock_sock(sk);
805 if (!sock_owned_by_user(sk))
806 tcp_tsq_write(sk);
807 else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
808 sock_hold(sk);
809 bh_unlock_sock(sk);
810}
792/* 811/*
793 * One tasklet per cpu tries to send more skbs. 812 * One tasklet per cpu tries to send more skbs.
794 * We run in tasklet context but need to disable irqs when 813 * We run in tasklet context but need to disable irqs when
@@ -816,16 +835,7 @@ static void tcp_tasklet_func(unsigned long data)
816 smp_mb__before_atomic(); 835 smp_mb__before_atomic();
817 clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags); 836 clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
818 837
819 if (!sk->sk_lock.owned && 838 tcp_tsq_handler(sk);
820 test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
821 bh_lock_sock(sk);
822 if (!sock_owned_by_user(sk)) {
823 clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
824 tcp_tsq_handler(sk);
825 }
826 bh_unlock_sock(sk);
827 }
828
829 sk_free(sk); 839 sk_free(sk);
830 } 840 }
831} 841}
@@ -853,9 +863,10 @@ void tcp_release_cb(struct sock *sk)
853 nflags = flags & ~TCP_DEFERRED_ALL; 863 nflags = flags & ~TCP_DEFERRED_ALL;
854 } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags); 864 } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
855 865
856 if (flags & TCPF_TSQ_DEFERRED) 866 if (flags & TCPF_TSQ_DEFERRED) {
857 tcp_tsq_handler(sk); 867 tcp_tsq_write(sk);
858 868 __sock_put(sk);
869 }
859 /* Here begins the tricky part : 870 /* Here begins the tricky part :
860 * We are called from release_sock() with : 871 * We are called from release_sock() with :
861 * 1) BH disabled 872 * 1) BH disabled
@@ -929,7 +940,7 @@ void tcp_wfree(struct sk_buff *skb)
929 if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED)) 940 if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
930 goto out; 941 goto out;
931 942
932 nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED; 943 nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
933 nval = cmpxchg(&sk->sk_tsq_flags, oval, nval); 944 nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
934 if (nval != oval) 945 if (nval != oval)
935 continue; 946 continue;
@@ -948,37 +959,17 @@ out:
948 sk_free(sk); 959 sk_free(sk);
949} 960}
950 961
951/* Note: Called under hard irq. 962/* Note: Called under soft irq.
952 * We can not call TCP stack right away. 963 * We can call TCP stack right away, unless socket is owned by user.
953 */ 964 */
954enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) 965enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
955{ 966{
956 struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer); 967 struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
957 struct sock *sk = (struct sock *)tp; 968 struct sock *sk = (struct sock *)tp;
958 unsigned long nval, oval;
959 969
960 for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) { 970 tcp_tsq_handler(sk);
961 struct tsq_tasklet *tsq; 971 sock_put(sk);
962 bool empty;
963
964 if (oval & TSQF_QUEUED)
965 break;
966
967 nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
968 nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
969 if (nval != oval)
970 continue;
971 972
972 if (!refcount_inc_not_zero(&sk->sk_wmem_alloc))
973 break;
974 /* queue this socket to tasklet queue */
975 tsq = this_cpu_ptr(&tsq_tasklet);
976 empty = list_empty(&tsq->head);
977 list_add(&tp->tsq_node, &tsq->head);
978 if (empty)
979 tasklet_schedule(&tsq->tasklet);
980 break;
981 }
982 return HRTIMER_NORESTART; 973 return HRTIMER_NORESTART;
983} 974}
984 975
@@ -1011,7 +1002,8 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
1011 do_div(len_ns, rate); 1002 do_div(len_ns, rate);
1012 hrtimer_start(&tcp_sk(sk)->pacing_timer, 1003 hrtimer_start(&tcp_sk(sk)->pacing_timer,
1013 ktime_add_ns(ktime_get(), len_ns), 1004 ktime_add_ns(ktime_get(), len_ns),
1014 HRTIMER_MODE_ABS_PINNED); 1005 HRTIMER_MODE_ABS_PINNED_SOFT);
1006 sock_hold(sk);
1015} 1007}
1016 1008
1017static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) 1009static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
@@ -1078,7 +1070,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1078 1070
1079 /* if no packet is in qdisc/device queue, then allow XPS to select 1071 /* if no packet is in qdisc/device queue, then allow XPS to select
1080 * another queue. We can be called from tcp_tsq_handler() 1072 * another queue. We can be called from tcp_tsq_handler()
1081 * which holds one reference to sk_wmem_alloc. 1073 * which holds one reference to sk.
1082 * 1074 *
1083 * TODO: Ideally, in-flight pure ACK packets should not matter here. 1075 * TODO: Ideally, in-flight pure ACK packets should not matter here.
1084 * One way to get this would be to set skb->truesize = 2 on them. 1076 * One way to get this would be to set skb->truesize = 2 on them.
@@ -2185,7 +2177,7 @@ static int tcp_mtu_probe(struct sock *sk)
2185static bool tcp_pacing_check(const struct sock *sk) 2177static bool tcp_pacing_check(const struct sock *sk)
2186{ 2178{
2187 return tcp_needs_internal_pacing(sk) && 2179 return tcp_needs_internal_pacing(sk) &&
2188 hrtimer_active(&tcp_sk(sk)->pacing_timer); 2180 hrtimer_is_queued(&tcp_sk(sk)->pacing_timer);
2189} 2181}
2190 2182
2191/* TCP Small Queues : 2183/* TCP Small Queues :
@@ -2365,8 +2357,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2365 skb, limit, mss_now, gfp))) 2357 skb, limit, mss_now, gfp)))
2366 break; 2358 break;
2367 2359
2368 if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
2369 clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
2370 if (tcp_small_queue_check(sk, skb, 0)) 2360 if (tcp_small_queue_check(sk, skb, 0))
2371 break; 2361 break;
2372 2362
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 3a81720ac0c4..71593e4400ab 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -2,7 +2,7 @@
2#include <linux/tcp.h> 2#include <linux/tcp.h>
3#include <net/tcp.h> 3#include <net/tcp.h>
4 4
5static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) 5void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
6{ 6{
7 struct tcp_sock *tp = tcp_sk(sk); 7 struct tcp_sock *tp = tcp_sk(sk);
8 8
@@ -21,6 +21,38 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
21 return t1 > t2 || (t1 == t2 && after(seq1, seq2)); 21 return t1 > t2 || (t1 == t2 && after(seq1, seq2));
22} 22}
23 23
24static u32 tcp_rack_reo_wnd(const struct sock *sk)
25{
26 struct tcp_sock *tp = tcp_sk(sk);
27
28 if (!tp->rack.reord) {
29 /* If reordering has not been observed, be aggressive during
30 * the recovery or starting the recovery by DUPACK threshold.
31 */
32 if (inet_csk(sk)->icsk_ca_state >= TCP_CA_Recovery)
33 return 0;
34
35 if (tp->sacked_out >= tp->reordering &&
36 !(sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_NO_DUPTHRESH))
37 return 0;
38 }
39
40 /* To be more reordering resilient, allow min_rtt/4 settling delay.
41 * Use min_rtt instead of the smoothed RTT because reordering is
42 * often a path property and less related to queuing or delayed ACKs.
43 * Upon receiving DSACKs, linearly increase the window up to the
44 * smoothed RTT.
45 */
46 return min((tcp_min_rtt(tp) >> 2) * tp->rack.reo_wnd_steps,
47 tp->srtt_us >> 3);
48}
49
50s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd)
51{
52 return tp->rack.rtt_us + reo_wnd -
53 tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
54}
55
24/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): 56/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
25 * 57 *
26 * Marks a packet lost, if some packet sent later has been (s)acked. 58 * Marks a packet lost, if some packet sent later has been (s)acked.
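
tcp_rack_reo_wnd(), split out above, keeps the old policy: no settling delay while the DUPACK-threshold heuristics still apply, otherwise min_rtt/4 per reo_wnd_steps step, never more than the smoothed RTT. The clamp as standalone arithmetic — a sketch, remembering that the kernel keeps srtt_us scaled by 8 while tcp_min_rtt() is unscaled:

#include <stdio.h>

static unsigned int rack_reo_wnd_us(unsigned int min_rtt_us,
                                    unsigned int reo_wnd_steps,
                                    unsigned int srtt_us_scaled)
{
        unsigned int wnd = (min_rtt_us >> 2) * reo_wnd_steps;
        unsigned int srtt_us = srtt_us_scaled >> 3;

        return wnd < srtt_us ? wnd : srtt_us;
}

int main(void)
{
        /* min RTT 20 ms, one step, smoothed RTT 25 ms (scaled by 8) */
        printf("%u us\n", rack_reo_wnd_us(20000, 1, 25000 << 3));   /* 5000 */
        return 0;
}
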
@@ -44,23 +76,11 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
44static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) 76static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
45{ 77{
46 struct tcp_sock *tp = tcp_sk(sk); 78 struct tcp_sock *tp = tcp_sk(sk);
47 u32 min_rtt = tcp_min_rtt(tp);
48 struct sk_buff *skb, *n; 79 struct sk_buff *skb, *n;
49 u32 reo_wnd; 80 u32 reo_wnd;
50 81
51 *reo_timeout = 0; 82 *reo_timeout = 0;
52 /* To be more reordering resilient, allow min_rtt/4 settling delay 83 reo_wnd = tcp_rack_reo_wnd(sk);
53 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
54 * RTT because reordering is often a path property and less related
55 * to queuing or delayed ACKs.
56 */
57 reo_wnd = 1000;
58 if ((tp->rack.reord || inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery) &&
59 min_rtt != ~0U) {
60 reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd);
61 reo_wnd = min(reo_wnd, tp->srtt_us >> 3);
62 }
63
64 list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, 84 list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
65 tcp_tsorted_anchor) { 85 tcp_tsorted_anchor) {
66 struct tcp_skb_cb *scb = TCP_SKB_CB(skb); 86 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
@@ -78,10 +98,9 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
78 /* A packet is lost if it has not been s/acked beyond 98 /* A packet is lost if it has not been s/acked beyond
79 * the recent RTT plus the reordering window. 99 * the recent RTT plus the reordering window.
80 */ 100 */
81 remaining = tp->rack.rtt_us + reo_wnd - 101 remaining = tcp_rack_skb_timeout(tp, skb, reo_wnd);
82 tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
83 if (remaining <= 0) { 102 if (remaining <= 0) {
84 tcp_rack_mark_skb_lost(sk, skb); 103 tcp_mark_skb_lost(sk, skb);
85 list_del_init(&skb->tcp_tsorted_anchor); 104 list_del_init(&skb->tcp_tsorted_anchor);
86 } else { 105 } else {
87 /* Record maximum wait time */ 106 /* Record maximum wait time */
@@ -202,3 +221,30 @@ void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
202 tp->rack.reo_wnd_steps = 1; 221 tp->rack.reo_wnd_steps = 1;
203 } 222 }
204} 223}
224
 225/* RFC6582 NewReno recovery for non-SACK connections. It simply retransmits
226 * the next unacked packet upon receiving
227 * a) three or more DUPACKs to start the fast recovery
228 * b) an ACK acknowledging new data during the fast recovery.
229 */
230void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced)
231{
232 const u8 state = inet_csk(sk)->icsk_ca_state;
233 struct tcp_sock *tp = tcp_sk(sk);
234
235 if ((state < TCP_CA_Recovery && tp->sacked_out >= tp->reordering) ||
236 (state == TCP_CA_Recovery && snd_una_advanced)) {
237 struct sk_buff *skb = tcp_rtx_queue_head(sk);
238 u32 mss;
239
240 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
241 return;
242
243 mss = tcp_skb_mss(skb);
244 if (tcp_skb_pcount(skb) > 1 && skb->len > mss)
245 tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
246 mss, mss, GFP_ATOMIC);
247
248 tcp_skb_mark_lost_uncond_verify(tp, skb);
249 }
250}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index f7d944855f8e..3b3611729928 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -708,11 +708,36 @@ out:
708 sock_put(sk); 708 sock_put(sk);
709} 709}
710 710
711static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
712{
713 struct tcp_sock *tp = container_of(timer, struct tcp_sock, compressed_ack_timer);
714 struct sock *sk = (struct sock *)tp;
715
716 bh_lock_sock(sk);
717 if (!sock_owned_by_user(sk)) {
718 if (tp->compressed_ack)
719 tcp_send_ack(sk);
720 } else {
721 if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
722 &sk->sk_tsq_flags))
723 sock_hold(sk);
724 }
725 bh_unlock_sock(sk);
726
727 sock_put(sk);
728
729 return HRTIMER_NORESTART;
730}
731
711void tcp_init_xmit_timers(struct sock *sk) 732void tcp_init_xmit_timers(struct sock *sk)
712{ 733{
713 inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, 734 inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
714 &tcp_keepalive_timer); 735 &tcp_keepalive_timer);
715 hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC, 736 hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
716 HRTIMER_MODE_ABS_PINNED); 737 HRTIMER_MODE_ABS_PINNED_SOFT);
717 tcp_sk(sk)->pacing_timer.function = tcp_pace_kick; 738 tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;
739
740 hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC,
741 HRTIMER_MODE_REL_PINNED_SOFT);
742 tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick;
718} 743}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 675433eb53a8..3365362cac88 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -544,9 +544,7 @@ EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb);
544/* Must be called under rcu_read_lock(). 544/* Must be called under rcu_read_lock().
545 * Does increment socket refcount. 545 * Does increment socket refcount.
546 */ 546 */
547#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \ 547#if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
548 IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) || \
549 IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
550struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, 548struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
551 __be32 daddr, __be16 dport, int dif) 549 __be32 daddr, __be16 dport, int dif)
552{ 550{
@@ -757,7 +755,8 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,
757} 755}
758EXPORT_SYMBOL(udp_set_csum); 756EXPORT_SYMBOL(udp_set_csum);
759 757
760static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) 758static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
759 struct inet_cork *cork)
761{ 760{
762 struct sock *sk = skb->sk; 761 struct sock *sk = skb->sk;
763 struct inet_sock *inet = inet_sk(sk); 762 struct inet_sock *inet = inet_sk(sk);
@@ -777,6 +776,27 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
777 uh->len = htons(len); 776 uh->len = htons(len);
778 uh->check = 0; 777 uh->check = 0;
779 778
779 if (cork->gso_size) {
780 const int hlen = skb_network_header_len(skb) +
781 sizeof(struct udphdr);
782
783 if (hlen + cork->gso_size > cork->fragsize)
784 return -EINVAL;
785 if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
786 return -EINVAL;
787 if (sk->sk_no_check_tx)
788 return -EINVAL;
789 if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
790 dst_xfrm(skb_dst(skb)))
791 return -EIO;
792
793 skb_shinfo(skb)->gso_size = cork->gso_size;
794 skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
795 skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(uh),
796 cork->gso_size);
797 goto csum_partial;
798 }
799
780 if (is_udplite) /* UDP-Lite */ 800 if (is_udplite) /* UDP-Lite */
781 csum = udplite_csum(skb); 801 csum = udplite_csum(skb);
782 802
@@ -786,6 +806,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
786 goto send; 806 goto send;
787 807
788 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ 808 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
809csum_partial:
789 810
790 udp4_hwcsum(skb, fl4->saddr, fl4->daddr); 811 udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
791 goto send; 812 goto send;
@@ -828,7 +849,7 @@ int udp_push_pending_frames(struct sock *sk)
828 if (!skb) 849 if (!skb)
829 goto out; 850 goto out;
830 851
831 err = udp_send_skb(skb, fl4); 852 err = udp_send_skb(skb, fl4, &inet->cork.base);
832 853
833out: 854out:
834 up->len = 0; 855 up->len = 0;
@@ -837,10 +858,48 @@ out:
837} 858}
838EXPORT_SYMBOL(udp_push_pending_frames); 859EXPORT_SYMBOL(udp_push_pending_frames);
839 860
861static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size)
862{
863 switch (cmsg->cmsg_type) {
864 case UDP_SEGMENT:
865 if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u16)))
866 return -EINVAL;
867 *gso_size = *(__u16 *)CMSG_DATA(cmsg);
868 return 0;
869 default:
870 return -EINVAL;
871 }
872}
873
874int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size)
875{
876 struct cmsghdr *cmsg;
877 bool need_ip = false;
878 int err;
879
880 for_each_cmsghdr(cmsg, msg) {
881 if (!CMSG_OK(msg, cmsg))
882 return -EINVAL;
883
884 if (cmsg->cmsg_level != SOL_UDP) {
885 need_ip = true;
886 continue;
887 }
888
889 err = __udp_cmsg_send(cmsg, gso_size);
890 if (err)
891 return err;
892 }
893
894 return need_ip;
895}
896EXPORT_SYMBOL_GPL(udp_cmsg_send);
897
840int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) 898int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
841{ 899{
842 struct inet_sock *inet = inet_sk(sk); 900 struct inet_sock *inet = inet_sk(sk);
843 struct udp_sock *up = udp_sk(sk); 901 struct udp_sock *up = udp_sk(sk);
902 DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
844 struct flowi4 fl4_stack; 903 struct flowi4 fl4_stack;
845 struct flowi4 *fl4; 904 struct flowi4 *fl4;
846 int ulen = len; 905 int ulen = len;
@@ -895,8 +954,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
895 /* 954 /*
896 * Get and verify the address. 955 * Get and verify the address.
897 */ 956 */
898 if (msg->msg_name) { 957 if (usin) {
899 DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
900 if (msg->msg_namelen < sizeof(*usin)) 958 if (msg->msg_namelen < sizeof(*usin))
901 return -EINVAL; 959 return -EINVAL;
902 if (usin->sin_family != AF_INET) { 960 if (usin->sin_family != AF_INET) {
@@ -922,10 +980,14 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
922 ipc.sockc.tsflags = sk->sk_tsflags; 980 ipc.sockc.tsflags = sk->sk_tsflags;
923 ipc.addr = inet->inet_saddr; 981 ipc.addr = inet->inet_saddr;
924 ipc.oif = sk->sk_bound_dev_if; 982 ipc.oif = sk->sk_bound_dev_if;
983 ipc.gso_size = up->gso_size;
925 984
926 if (msg->msg_controllen) { 985 if (msg->msg_controllen) {
927 err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6); 986 err = udp_cmsg_send(sk, msg, &ipc.gso_size);
928 if (unlikely(err)) { 987 if (err > 0)
988 err = ip_cmsg_send(sk, msg, &ipc,
989 sk->sk_family == AF_INET6);
990 if (unlikely(err < 0)) {
929 kfree(ipc.opt); 991 kfree(ipc.opt);
930 return err; 992 return err;
931 } 993 }
@@ -946,6 +1008,22 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
946 rcu_read_unlock(); 1008 rcu_read_unlock();
947 } 1009 }
948 1010
1011 if (cgroup_bpf_enabled && !connected) {
1012 err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
1013 (struct sockaddr *)usin, &ipc.addr);
1014 if (err)
1015 goto out_free;
1016 if (usin) {
1017 if (usin->sin_port == 0) {
1018 /* BPF program set invalid port. Reject it. */
1019 err = -EINVAL;
1020 goto out_free;
1021 }
1022 daddr = usin->sin_addr.s_addr;
1023 dport = usin->sin_port;
1024 }
1025 }
1026
949 saddr = ipc.addr; 1027 saddr = ipc.addr;
950 ipc.addr = faddr = daddr; 1028 ipc.addr = faddr = daddr;
951 1029
@@ -1032,12 +1110,14 @@ back_from_confirm:
1032 1110
1033 /* Lockless fast path for the non-corking case. */ 1111 /* Lockless fast path for the non-corking case. */
1034 if (!corkreq) { 1112 if (!corkreq) {
1113 struct inet_cork cork;
1114
1035 skb = ip_make_skb(sk, fl4, getfrag, msg, ulen, 1115 skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
1036 sizeof(struct udphdr), &ipc, &rt, 1116 sizeof(struct udphdr), &ipc, &rt,
1037 msg->msg_flags); 1117 &cork, msg->msg_flags);
1038 err = PTR_ERR(skb); 1118 err = PTR_ERR(skb);
1039 if (!IS_ERR_OR_NULL(skb)) 1119 if (!IS_ERR_OR_NULL(skb))
1040 err = udp_send_skb(skb, fl4); 1120 err = udp_send_skb(skb, fl4, &cork);
1041 goto out; 1121 goto out;
1042 } 1122 }
1043 1123
@@ -1813,10 +1893,10 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1813 return 0; 1893 return 0;
1814} 1894}
1815 1895
1816static struct static_key udp_encap_needed __read_mostly; 1896static DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
1817void udp_encap_enable(void) 1897void udp_encap_enable(void)
1818{ 1898{
1819 static_key_enable(&udp_encap_needed); 1899 static_branch_enable(&udp_encap_needed_key);
1820} 1900}
1821EXPORT_SYMBOL(udp_encap_enable); 1901EXPORT_SYMBOL(udp_encap_enable);
1822 1902
@@ -1840,7 +1920,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1840 goto drop; 1920 goto drop;
1841 nf_reset(skb); 1921 nf_reset(skb);
1842 1922
1843 if (static_key_false(&udp_encap_needed) && up->encap_type) { 1923 if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
1844 int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); 1924 int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
1845 1925
1846 /* 1926 /*
@@ -2303,7 +2383,7 @@ void udp_destroy_sock(struct sock *sk)
2303 bool slow = lock_sock_fast(sk); 2383 bool slow = lock_sock_fast(sk);
2304 udp_flush_pending_frames(sk); 2384 udp_flush_pending_frames(sk);
2305 unlock_sock_fast(sk, slow); 2385 unlock_sock_fast(sk, slow);
2306 if (static_key_false(&udp_encap_needed) && up->encap_type) { 2386 if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
2307 void (*encap_destroy)(struct sock *sk); 2387 void (*encap_destroy)(struct sock *sk);
2308 encap_destroy = READ_ONCE(up->encap_destroy); 2388 encap_destroy = READ_ONCE(up->encap_destroy);
2309 if (encap_destroy) 2389 if (encap_destroy)
@@ -2368,6 +2448,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
2368 up->no_check6_rx = valbool; 2448 up->no_check6_rx = valbool;
2369 break; 2449 break;
2370 2450
2451 case UDP_SEGMENT:
2452 if (val < 0 || val > USHRT_MAX)
2453 return -EINVAL;
2454 up->gso_size = val;
2455 break;
2456
2371 /* 2457 /*
2372 * UDP-Lite's partial checksum coverage (RFC 3828). 2458 * UDP-Lite's partial checksum coverage (RFC 3828).
2373 */ 2459 */
@@ -2458,6 +2544,10 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
2458 val = up->no_check6_rx; 2544 val = up->no_check6_rx;
2459 break; 2545 break;
2460 2546
2547 case UDP_SEGMENT:
2548 val = up->gso_size;
2549 break;
2550
2461 /* The following two cannot be changed on UDP sockets, the return is 2551 /* The following two cannot be changed on UDP sockets, the return is
2462 * always 0 (which corresponds to the full checksum coverage of UDP). */ 2552 * always 0 (which corresponds to the full checksum coverage of UDP). */
2463 case UDPLITE_SEND_CSCOV: 2553 case UDPLITE_SEND_CSCOV:
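
Together with udp_cmsg_send() earlier in this file, UDP GSO is exposed to applications in two ways: a per-socket default via setsockopt(UDP_SEGMENT) and a per-call override via a SOL_UDP/UDP_SEGMENT cmsg carrying a u16 segment size. A minimal sketch of the socket-option form (the fallback define is only needed with older uapi headers; on routes without UDP checksum offload the send can fail with EIO in this first implementation):

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <unistd.h>

#ifndef UDP_SEGMENT
#define UDP_SEGMENT 103                 /* uapi value from linux/udp.h */
#endif

int main(void)
{
        int fd = socket(AF_INET, SOCK_DGRAM, 0);
        int gso_size = 1400;            /* on-the-wire payload per segment */
        char payload[8 * 1400];
        struct sockaddr_in dst = {
                .sin_family = AF_INET,
                .sin_port   = htons(7777),
        };

        if (fd < 0)
                return 1;
        inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
        memset(payload, 'x', sizeof(payload));

        /* One sendto() of 11200 bytes; the stack (or the NIC) splits it
         * into 1400-byte UDP datagrams instead of the application looping.
         */
        if (setsockopt(fd, IPPROTO_UDP, UDP_SEGMENT,
                       &gso_size, sizeof(gso_size)) < 0)
                perror("setsockopt(UDP_SEGMENT)");
        else if (sendto(fd, payload, sizeof(payload), 0,
                        (struct sockaddr *)&dst, sizeof(dst)) < 0)
                perror("sendto");
        close(fd);
        return 0;
}
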
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index ea6e6e7df0ee..92dc9e5a7ff3 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -187,6 +187,102 @@ out_unlock:
187} 187}
188EXPORT_SYMBOL(skb_udp_tunnel_segment); 188EXPORT_SYMBOL(skb_udp_tunnel_segment);
189 189
190struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
191 netdev_features_t features)
192{
193 struct sock *sk = gso_skb->sk;
194 unsigned int sum_truesize = 0;
195 struct sk_buff *segs, *seg;
196 struct udphdr *uh;
197 unsigned int mss;
198 bool copy_dtor;
199 __sum16 check;
200 __be16 newlen;
201
202 mss = skb_shinfo(gso_skb)->gso_size;
203 if (gso_skb->len <= sizeof(*uh) + mss)
204 return ERR_PTR(-EINVAL);
205
206 skb_pull(gso_skb, sizeof(*uh));
207
208 /* clear destructor to avoid skb_segment assigning it to tail */
209 copy_dtor = gso_skb->destructor == sock_wfree;
210 if (copy_dtor)
211 gso_skb->destructor = NULL;
212
213 segs = skb_segment(gso_skb, features);
214 if (unlikely(IS_ERR_OR_NULL(segs))) {
215 if (copy_dtor)
216 gso_skb->destructor = sock_wfree;
217 return segs;
218 }
219
220 /* GSO partial and frag_list segmentation only requires splitting
221 * the frame into an MSS multiple and possibly a remainder, both
222 * cases return a GSO skb. So update the mss now.
223 */
224 if (skb_is_gso(segs))
225 mss *= skb_shinfo(segs)->gso_segs;
226
227 seg = segs;
228 uh = udp_hdr(seg);
229
230 /* compute checksum adjustment based on old length versus new */
231 newlen = htons(sizeof(*uh) + mss);
232 check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
233
234 for (;;) {
235 if (copy_dtor) {
236 seg->destructor = sock_wfree;
237 seg->sk = sk;
238 sum_truesize += seg->truesize;
239 }
240
241 if (!seg->next)
242 break;
243
244 uh->len = newlen;
245 uh->check = check;
246
247 if (seg->ip_summed == CHECKSUM_PARTIAL)
248 gso_reset_checksum(seg, ~check);
249 else
250 uh->check = gso_make_checksum(seg, ~check) ? :
251 CSUM_MANGLED_0;
252
253 seg = seg->next;
254 uh = udp_hdr(seg);
255 }
256
257 /* last packet can be partial gso_size, account for that in checksum */
258 newlen = htons(skb_tail_pointer(seg) - skb_transport_header(seg) +
259 seg->data_len);
260 check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
261
262 uh->len = newlen;
263 uh->check = check;
264
265 if (seg->ip_summed == CHECKSUM_PARTIAL)
266 gso_reset_checksum(seg, ~check);
267 else
268 uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0;
269
270 /* update refcount for the packet */
271 if (copy_dtor) {
272 int delta = sum_truesize - gso_skb->truesize;
273
274 /* In some pathological cases, delta can be negative.
275 * We need to either use refcount_add() or refcount_sub_and_test()
276 */
277 if (likely(delta >= 0))
278 refcount_add(delta, &sk->sk_wmem_alloc);
279 else
280 WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc));
281 }
282 return segs;
283}
284EXPORT_SYMBOL_GPL(__udp_gso_segment);
285
190static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, 286static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
191 netdev_features_t features) 287 netdev_features_t features)
192{ 288{
@@ -203,12 +299,15 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
203 goto out; 299 goto out;
204 } 300 }
205 301
206 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP)) 302 if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_UDP | SKB_GSO_UDP_L4)))
207 goto out; 303 goto out;
208 304
209 if (!pskb_may_pull(skb, sizeof(struct udphdr))) 305 if (!pskb_may_pull(skb, sizeof(struct udphdr)))
210 goto out; 306 goto out;
211 307
308 if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
309 return __udp_gso_segment(skb, features);
310
212 mss = skb_shinfo(skb)->gso_size; 311 mss = skb_shinfo(skb)->gso_size;
213 if (unlikely(skb->len <= mss)) 312 if (unlikely(skb->len <= mss))
214 goto out; 313 goto out;
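
__udp_gso_segment() above fixes up each segment's UDP header by adjusting the original checksum for the new length rather than recomputing the pseudo-header sum: csum16_sub() removes the old uh->len from the one's-complement sum and csum16_add() folds in the new one, with only the (possibly shorter) last segment handled separately. A standalone sketch of that fold arithmetic (helper names mirror the kernel's; the implementations here are illustrative):

#include <stdio.h>
#include <stdint.h>

/* 16-bit one's-complement add/sub with end-around carry. */
static uint16_t csum16_add(uint16_t csum, uint16_t addend)
{
        uint32_t res = (uint32_t)csum + addend;

        return (uint16_t)(res + (res >> 16));
}

static uint16_t csum16_sub(uint16_t csum, uint16_t addend)
{
        return csum16_add(csum, (uint16_t)~addend);
}

int main(void)
{
        uint16_t check  = 0x1234;       /* example old uh->check  */
        uint16_t oldlen = 0x05dc;       /* example old uh->len    */
        uint16_t newlen = 0x05a8;       /* new per-segment length */

        /* same expression as the diff: adjust the checksum for the new length */
        printf("0x%04x\n", csum16_add(csum16_sub(check, oldlen), newlen));
        return 0;
}
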
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 11e4e80cf7e9..0eff75525da1 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -329,4 +329,9 @@ config IPV6_SEG6_HMAC
329 329
330 If unsure, say N. 330 If unsure, say N.
331 331
332config IPV6_SEG6_BPF
333 def_bool y
334 depends on IPV6_SEG6_LWTUNNEL
335 depends on IPV6 = y
336
332endif # IPV6 337endif # IPV6
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 1b5ea3379d9b..89019bf59f46 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -170,7 +170,7 @@ static void addrconf_type_change(struct net_device *dev,
170 unsigned long event); 170 unsigned long event);
171static int addrconf_ifdown(struct net_device *dev, int how); 171static int addrconf_ifdown(struct net_device *dev, int how);
172 172
173static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, 173static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
174 int plen, 174 int plen,
175 const struct net_device *dev, 175 const struct net_device *dev,
176 u32 flags, u32 noflags); 176 u32 flags, u32 noflags);
@@ -916,7 +916,6 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
916 pr_warn("Freeing alive inet6 address %p\n", ifp); 916 pr_warn("Freeing alive inet6 address %p\n", ifp);
917 return; 917 return;
918 } 918 }
919 ip6_rt_put(ifp->rt);
920 919
921 kfree_rcu(ifp, rcu); 920 kfree_rcu(ifp, rcu);
922} 921}
@@ -987,17 +986,15 @@ static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa)
987/* On success it returns ifp with increased reference count */ 986/* On success it returns ifp with increased reference count */
988 987
989static struct inet6_ifaddr * 988static struct inet6_ifaddr *
990ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, 989ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
991 const struct in6_addr *peer_addr, int pfxlen,
992 int scope, u32 flags, u32 valid_lft, u32 prefered_lft,
993 bool can_block, struct netlink_ext_ack *extack) 990 bool can_block, struct netlink_ext_ack *extack)
994{ 991{
995 gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC; 992 gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC;
993 int addr_type = ipv6_addr_type(cfg->pfx);
996 struct net *net = dev_net(idev->dev); 994 struct net *net = dev_net(idev->dev);
997 struct inet6_ifaddr *ifa = NULL; 995 struct inet6_ifaddr *ifa = NULL;
998 struct rt6_info *rt = NULL; 996 struct fib6_info *f6i = NULL;
999 int err = 0; 997 int err = 0;
1000 int addr_type = ipv6_addr_type(addr);
1001 998
1002 if (addr_type == IPV6_ADDR_ANY || 999 if (addr_type == IPV6_ADDR_ANY ||
1003 addr_type & IPV6_ADDR_MULTICAST || 1000 addr_type & IPV6_ADDR_MULTICAST ||
@@ -1020,7 +1017,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
1020 */ 1017 */
1021 if (can_block) { 1018 if (can_block) {
1022 struct in6_validator_info i6vi = { 1019 struct in6_validator_info i6vi = {
1023 .i6vi_addr = *addr, 1020 .i6vi_addr = *cfg->pfx,
1024 .i6vi_dev = idev, 1021 .i6vi_dev = idev,
1025 .extack = extack, 1022 .extack = extack,
1026 }; 1023 };
@@ -1037,38 +1034,39 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
1037 goto out; 1034 goto out;
1038 } 1035 }
1039 1036
1040 rt = addrconf_dst_alloc(idev, addr, false); 1037 f6i = addrconf_f6i_alloc(net, idev, cfg->pfx, false, gfp_flags);
1041 if (IS_ERR(rt)) { 1038 if (IS_ERR(f6i)) {
1042 err = PTR_ERR(rt); 1039 err = PTR_ERR(f6i);
1043 rt = NULL; 1040 f6i = NULL;
1044 goto out; 1041 goto out;
1045 } 1042 }
1046 1043
1047 if (net->ipv6.devconf_all->disable_policy || 1044 if (net->ipv6.devconf_all->disable_policy ||
1048 idev->cnf.disable_policy) 1045 idev->cnf.disable_policy)
1049 rt->dst.flags |= DST_NOPOLICY; 1046 f6i->dst_nopolicy = true;
1050 1047
1051 neigh_parms_data_state_setall(idev->nd_parms); 1048 neigh_parms_data_state_setall(idev->nd_parms);
1052 1049
1053 ifa->addr = *addr; 1050 ifa->addr = *cfg->pfx;
1054 if (peer_addr) 1051 if (cfg->peer_pfx)
1055 ifa->peer_addr = *peer_addr; 1052 ifa->peer_addr = *cfg->peer_pfx;
1056 1053
1057 spin_lock_init(&ifa->lock); 1054 spin_lock_init(&ifa->lock);
1058 INIT_DELAYED_WORK(&ifa->dad_work, addrconf_dad_work); 1055 INIT_DELAYED_WORK(&ifa->dad_work, addrconf_dad_work);
1059 INIT_HLIST_NODE(&ifa->addr_lst); 1056 INIT_HLIST_NODE(&ifa->addr_lst);
1060 ifa->scope = scope; 1057 ifa->scope = cfg->scope;
1061 ifa->prefix_len = pfxlen; 1058 ifa->prefix_len = cfg->plen;
1062 ifa->flags = flags; 1059 ifa->rt_priority = cfg->rt_priority;
1060 ifa->flags = cfg->ifa_flags;
1063 /* No need to add the TENTATIVE flag for addresses with NODAD */ 1061 /* No need to add the TENTATIVE flag for addresses with NODAD */
1064 if (!(flags & IFA_F_NODAD)) 1062 if (!(cfg->ifa_flags & IFA_F_NODAD))
1065 ifa->flags |= IFA_F_TENTATIVE; 1063 ifa->flags |= IFA_F_TENTATIVE;
1066 ifa->valid_lft = valid_lft; 1064 ifa->valid_lft = cfg->valid_lft;
1067 ifa->prefered_lft = prefered_lft; 1065 ifa->prefered_lft = cfg->preferred_lft;
1068 ifa->cstamp = ifa->tstamp = jiffies; 1066 ifa->cstamp = ifa->tstamp = jiffies;
1069 ifa->tokenized = false; 1067 ifa->tokenized = false;
1070 1068
1071 ifa->rt = rt; 1069 ifa->rt = f6i;
1072 1070
1073 ifa->idev = idev; 1071 ifa->idev = idev;
1074 in6_dev_hold(idev); 1072 in6_dev_hold(idev);
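
From here on, ipv6_add_addr() and its callers pass a single struct ifa6_config instead of a long positional argument list, so related settings such as the new rt_priority travel together. A sketch of the fields this diff actually uses (the real definition lives in include/net/addrconf.h and may carry more members):

#include <stdint.h>
#include <netinet/in.h>

/* Sketch only: reconstructed from the fields referenced in this diff. */
struct ifa6_config {
        const struct in6_addr   *pfx;           /* address being added */
        const struct in6_addr   *peer_pfx;      /* optional peer address */
        unsigned int            plen;           /* prefix length */
        uint32_t                rt_priority;    /* metric for the auto-added route */
        uint32_t                ifa_flags;      /* IFA_F_* flags */
        uint32_t                valid_lft;      /* valid lifetime, seconds */
        uint32_t                preferred_lft;  /* preferred lifetime, seconds */
        int                     scope;          /* address scope */
};
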
@@ -1102,8 +1100,8 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
1102 inet6addr_notifier_call_chain(NETDEV_UP, ifa); 1100 inet6addr_notifier_call_chain(NETDEV_UP, ifa);
1103out: 1101out:
1104 if (unlikely(err < 0)) { 1102 if (unlikely(err < 0)) {
1105 if (rt) 1103 fib6_info_release(f6i);
1106 ip6_rt_put(rt); 1104
1107 if (ifa) { 1105 if (ifa) {
1108 if (ifa->idev) 1106 if (ifa->idev)
1109 in6_dev_put(ifa->idev); 1107 in6_dev_put(ifa->idev);
@@ -1179,19 +1177,19 @@ check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires)
1179static void 1177static void
1180cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt) 1178cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt)
1181{ 1179{
1182 struct rt6_info *rt; 1180 struct fib6_info *f6i;
1183 1181
1184 rt = addrconf_get_prefix_route(&ifp->addr, 1182 f6i = addrconf_get_prefix_route(&ifp->addr,
1185 ifp->prefix_len, 1183 ifp->prefix_len,
1186 ifp->idev->dev, 1184 ifp->idev->dev,
1187 0, RTF_GATEWAY | RTF_DEFAULT); 1185 0, RTF_GATEWAY | RTF_DEFAULT);
1188 if (rt) { 1186 if (f6i) {
1189 if (del_rt) 1187 if (del_rt)
1190 ip6_del_rt(rt); 1188 ip6_del_rt(dev_net(ifp->idev->dev), f6i);
1191 else { 1189 else {
1192 if (!(rt->rt6i_flags & RTF_EXPIRES)) 1190 if (!(f6i->fib6_flags & RTF_EXPIRES))
1193 rt6_set_expires(rt, expires); 1191 fib6_set_expires(f6i, expires);
1194 ip6_rt_put(rt); 1192 fib6_info_release(f6i);
1195 } 1193 }
1196 } 1194 }
1197} 1195}
@@ -1261,11 +1259,10 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp,
1261{ 1259{
1262 struct inet6_dev *idev = ifp->idev; 1260 struct inet6_dev *idev = ifp->idev;
1263 struct in6_addr addr, *tmpaddr; 1261 struct in6_addr addr, *tmpaddr;
1264 unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_tstamp, age; 1262 unsigned long tmp_tstamp, age;
1265 unsigned long regen_advance; 1263 unsigned long regen_advance;
1266 int tmp_plen; 1264 struct ifa6_config cfg;
1267 int ret = 0; 1265 int ret = 0;
1268 u32 addr_flags;
1269 unsigned long now = jiffies; 1266 unsigned long now = jiffies;
1270 long max_desync_factor; 1267 long max_desync_factor;
1271 s32 cnf_temp_preferred_lft; 1268 s32 cnf_temp_preferred_lft;
@@ -1327,13 +1324,12 @@ retry:
1327 } 1324 }
1328 } 1325 }
1329 1326
1330 tmp_valid_lft = min_t(__u32, 1327 cfg.valid_lft = min_t(__u32, ifp->valid_lft,
1331 ifp->valid_lft,
1332 idev->cnf.temp_valid_lft + age); 1328 idev->cnf.temp_valid_lft + age);
1333 tmp_prefered_lft = cnf_temp_preferred_lft + age - 1329 cfg.preferred_lft = cnf_temp_preferred_lft + age - idev->desync_factor;
1334 idev->desync_factor; 1330 cfg.preferred_lft = min_t(__u32, ifp->prefered_lft, cfg.preferred_lft);
1335 tmp_prefered_lft = min_t(__u32, ifp->prefered_lft, tmp_prefered_lft); 1331
1336 tmp_plen = ifp->prefix_len; 1332 cfg.plen = ifp->prefix_len;
1337 tmp_tstamp = ifp->tstamp; 1333 tmp_tstamp = ifp->tstamp;
1338 spin_unlock_bh(&ifp->lock); 1334 spin_unlock_bh(&ifp->lock);
1339 1335
@@ -1347,21 +1343,23 @@ retry:
1347 * temporary addresses being generated. 1343 * temporary addresses being generated.
1348 */ 1344 */
1349 age = (now - tmp_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; 1345 age = (now - tmp_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
1350 if (tmp_prefered_lft <= regen_advance + age) { 1346 if (cfg.preferred_lft <= regen_advance + age) {
1351 in6_ifa_put(ifp); 1347 in6_ifa_put(ifp);
1352 in6_dev_put(idev); 1348 in6_dev_put(idev);
1353 ret = -1; 1349 ret = -1;
1354 goto out; 1350 goto out;
1355 } 1351 }
1356 1352
1357 addr_flags = IFA_F_TEMPORARY; 1353 cfg.ifa_flags = IFA_F_TEMPORARY;
1358 /* set in addrconf_prefix_rcv() */ 1354 /* set in addrconf_prefix_rcv() */
1359 if (ifp->flags & IFA_F_OPTIMISTIC) 1355 if (ifp->flags & IFA_F_OPTIMISTIC)
1360 addr_flags |= IFA_F_OPTIMISTIC; 1356 cfg.ifa_flags |= IFA_F_OPTIMISTIC;
1361 1357
1362 ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen, 1358 cfg.pfx = &addr;
1363 ipv6_addr_scope(&addr), addr_flags, 1359 cfg.scope = ipv6_addr_scope(cfg.pfx);
1364 tmp_valid_lft, tmp_prefered_lft, block, NULL); 1360 cfg.rt_priority = 0;
1361
1362 ift = ipv6_add_addr(idev, &cfg, block, NULL);
1365 if (IS_ERR(ift)) { 1363 if (IS_ERR(ift)) {
1366 in6_ifa_put(ifp); 1364 in6_ifa_put(ifp);
1367 in6_dev_put(idev); 1365 in6_dev_put(idev);
@@ -2032,13 +2030,17 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
2032 spin_lock_bh(&ifp->lock); 2030 spin_lock_bh(&ifp->lock);
2033 2031
2034 if (ifp->flags & IFA_F_STABLE_PRIVACY) { 2032 if (ifp->flags & IFA_F_STABLE_PRIVACY) {
2035 int scope = ifp->scope;
2036 u32 flags = ifp->flags;
2037 struct in6_addr new_addr; 2033 struct in6_addr new_addr;
2038 struct inet6_ifaddr *ifp2; 2034 struct inet6_ifaddr *ifp2;
2039 u32 valid_lft, preferred_lft;
2040 int pfxlen = ifp->prefix_len;
2041 int retries = ifp->stable_privacy_retry + 1; 2035 int retries = ifp->stable_privacy_retry + 1;
2036 struct ifa6_config cfg = {
2037 .pfx = &new_addr,
2038 .plen = ifp->prefix_len,
2039 .ifa_flags = ifp->flags,
2040 .valid_lft = ifp->valid_lft,
2041 .preferred_lft = ifp->prefered_lft,
2042 .scope = ifp->scope,
2043 };
2042 2044
2043 if (retries > net->ipv6.sysctl.idgen_retries) { 2045 if (retries > net->ipv6.sysctl.idgen_retries) {
2044 net_info_ratelimited("%s: privacy stable address generation failed because of DAD conflicts!\n", 2046 net_info_ratelimited("%s: privacy stable address generation failed because of DAD conflicts!\n",
@@ -2051,9 +2053,6 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
2051 idev)) 2053 idev))
2052 goto errdad; 2054 goto errdad;
2053 2055
2054 valid_lft = ifp->valid_lft;
2055 preferred_lft = ifp->prefered_lft;
2056
2057 spin_unlock_bh(&ifp->lock); 2056 spin_unlock_bh(&ifp->lock);
2058 2057
2059 if (idev->cnf.max_addresses && 2058 if (idev->cnf.max_addresses &&
@@ -2064,9 +2063,7 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
2064 net_info_ratelimited("%s: generating new stable privacy address because of DAD conflict\n", 2063 net_info_ratelimited("%s: generating new stable privacy address because of DAD conflict\n",
2065 ifp->idev->dev->name); 2064 ifp->idev->dev->name);
2066 2065
2067 ifp2 = ipv6_add_addr(idev, &new_addr, NULL, pfxlen, 2066 ifp2 = ipv6_add_addr(idev, &cfg, false, NULL);
2068 scope, flags, valid_lft,
2069 preferred_lft, false, NULL);
2070 if (IS_ERR(ifp2)) 2067 if (IS_ERR(ifp2))
2071 goto lock_errdad; 2068 goto lock_errdad;
2072 2069
@@ -2254,6 +2251,7 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
2254 return addrconf_ifid_ieee1394(eui, dev); 2251 return addrconf_ifid_ieee1394(eui, dev);
2255 case ARPHRD_TUNNEL6: 2252 case ARPHRD_TUNNEL6:
2256 case ARPHRD_IP6GRE: 2253 case ARPHRD_IP6GRE:
2254 case ARPHRD_RAWIP:
2257 return addrconf_ifid_ip6tnl(eui, dev); 2255 return addrconf_ifid_ip6tnl(eui, dev);
2258 } 2256 }
2259 return -1; 2257 return -1;
@@ -2319,18 +2317,20 @@ static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad
2319 */ 2317 */
2320 2318
2321static void 2319static void
2322addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, 2320addrconf_prefix_route(struct in6_addr *pfx, int plen, u32 metric,
2323 unsigned long expires, u32 flags) 2321 struct net_device *dev, unsigned long expires,
2322 u32 flags, gfp_t gfp_flags)
2324{ 2323{
2325 struct fib6_config cfg = { 2324 struct fib6_config cfg = {
2326 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX, 2325 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX,
2327 .fc_metric = IP6_RT_PRIO_ADDRCONF, 2326 .fc_metric = metric ? : IP6_RT_PRIO_ADDRCONF,
2328 .fc_ifindex = dev->ifindex, 2327 .fc_ifindex = dev->ifindex,
2329 .fc_expires = expires, 2328 .fc_expires = expires,
2330 .fc_dst_len = plen, 2329 .fc_dst_len = plen,
2331 .fc_flags = RTF_UP | flags, 2330 .fc_flags = RTF_UP | flags,
2332 .fc_nlinfo.nl_net = dev_net(dev), 2331 .fc_nlinfo.nl_net = dev_net(dev),
2333 .fc_protocol = RTPROT_KERNEL, 2332 .fc_protocol = RTPROT_KERNEL,
2333 .fc_type = RTN_UNICAST,
2334 }; 2334 };
2335 2335
2336 cfg.fc_dst = *pfx; 2336 cfg.fc_dst = *pfx;
@@ -2344,17 +2344,17 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
2344 cfg.fc_flags |= RTF_NONEXTHOP; 2344 cfg.fc_flags |= RTF_NONEXTHOP;
2345#endif 2345#endif
2346 2346
2347 ip6_route_add(&cfg, NULL); 2347 ip6_route_add(&cfg, gfp_flags, NULL);
2348} 2348}
2349 2349
2350 2350
2351static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, 2351static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
2352 int plen, 2352 int plen,
2353 const struct net_device *dev, 2353 const struct net_device *dev,
2354 u32 flags, u32 noflags) 2354 u32 flags, u32 noflags)
2355{ 2355{
2356 struct fib6_node *fn; 2356 struct fib6_node *fn;
2357 struct rt6_info *rt = NULL; 2357 struct fib6_info *rt = NULL;
2358 struct fib6_table *table; 2358 struct fib6_table *table;
2359 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX; 2359 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX;
2360 2360
@@ -2368,14 +2368,13 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
2368 goto out; 2368 goto out;
2369 2369
2370 for_each_fib6_node_rt_rcu(fn) { 2370 for_each_fib6_node_rt_rcu(fn) {
2371 if (rt->dst.dev->ifindex != dev->ifindex) 2371 if (rt->fib6_nh.nh_dev->ifindex != dev->ifindex)
2372 continue; 2372 continue;
2373 if ((rt->rt6i_flags & flags) != flags) 2373 if ((rt->fib6_flags & flags) != flags)
2374 continue; 2374 continue;
2375 if ((rt->rt6i_flags & noflags) != 0) 2375 if ((rt->fib6_flags & noflags) != 0)
2376 continue; 2376 continue;
2377 if (!dst_hold_safe(&rt->dst)) 2377 fib6_info_hold(rt);
2378 rt = NULL;
2379 break; 2378 break;
2380 } 2379 }
2381out: 2380out:
@@ -2394,12 +2393,13 @@ static void addrconf_add_mroute(struct net_device *dev)
2394 .fc_ifindex = dev->ifindex, 2393 .fc_ifindex = dev->ifindex,
2395 .fc_dst_len = 8, 2394 .fc_dst_len = 8,
2396 .fc_flags = RTF_UP, 2395 .fc_flags = RTF_UP,
2396 .fc_type = RTN_UNICAST,
2397 .fc_nlinfo.nl_net = dev_net(dev), 2397 .fc_nlinfo.nl_net = dev_net(dev),
2398 }; 2398 };
2399 2399
2400 ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0); 2400 ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0);
2401 2401
2402 ip6_route_add(&cfg, NULL); 2402 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
2403} 2403}
2404 2404
2405static struct inet6_dev *addrconf_add_dev(struct net_device *dev) 2405static struct inet6_dev *addrconf_add_dev(struct net_device *dev)
@@ -2507,12 +2507,20 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
2507 2507
2508 if (!ifp && valid_lft) { 2508 if (!ifp && valid_lft) {
2509 int max_addresses = in6_dev->cnf.max_addresses; 2509 int max_addresses = in6_dev->cnf.max_addresses;
2510 struct ifa6_config cfg = {
2511 .pfx = addr,
2512 .plen = pinfo->prefix_len,
2513 .ifa_flags = addr_flags,
2514 .valid_lft = valid_lft,
2515 .preferred_lft = prefered_lft,
2516 .scope = addr_type & IPV6_ADDR_SCOPE_MASK,
2517 };
2510 2518
2511#ifdef CONFIG_IPV6_OPTIMISTIC_DAD 2519#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
2512 if ((net->ipv6.devconf_all->optimistic_dad || 2520 if ((net->ipv6.devconf_all->optimistic_dad ||
2513 in6_dev->cnf.optimistic_dad) && 2521 in6_dev->cnf.optimistic_dad) &&
2514 !net->ipv6.devconf_all->forwarding && sllao) 2522 !net->ipv6.devconf_all->forwarding && sllao)
2515 addr_flags |= IFA_F_OPTIMISTIC; 2523 cfg.ifa_flags |= IFA_F_OPTIMISTIC;
2516#endif 2524#endif
2517 2525
2518 /* Do not allow to create too much of autoconfigured 2526 /* Do not allow to create too much of autoconfigured
@@ -2520,16 +2528,11 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
2520 */ 2528 */
2521 if (!max_addresses || 2529 if (!max_addresses ||
2522 ipv6_count_addresses(in6_dev) < max_addresses) 2530 ipv6_count_addresses(in6_dev) < max_addresses)
2523 ifp = ipv6_add_addr(in6_dev, addr, NULL, 2531 ifp = ipv6_add_addr(in6_dev, &cfg, false, NULL);
2524 pinfo->prefix_len,
2525 addr_type&IPV6_ADDR_SCOPE_MASK,
2526 addr_flags, valid_lft,
2527 prefered_lft, false, NULL);
2528 2532
2529 if (IS_ERR_OR_NULL(ifp)) 2533 if (IS_ERR_OR_NULL(ifp))
2530 return -1; 2534 return -1;
2531 2535
2532 update_lft = 0;
2533 create = 1; 2536 create = 1;
2534 spin_lock_bh(&ifp->lock); 2537 spin_lock_bh(&ifp->lock);
2535 ifp->flags |= IFA_F_MANAGETEMPADDR; 2538 ifp->flags |= IFA_F_MANAGETEMPADDR;
@@ -2551,7 +2554,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
2551 stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; 2554 stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
2552 else 2555 else
2553 stored_lft = 0; 2556 stored_lft = 0;
2554 if (!update_lft && !create && stored_lft) { 2557 if (!create && stored_lft) {
2555 const u32 minimum_lft = min_t(u32, 2558 const u32 minimum_lft = min_t(u32,
2556 stored_lft, MIN_VALID_LIFETIME); 2559 stored_lft, MIN_VALID_LIFETIME);
2557 valid_lft = max(valid_lft, minimum_lft); 2560 valid_lft = max(valid_lft, minimum_lft);
@@ -2642,7 +2645,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
2642 */ 2645 */
2643 2646
2644 if (pinfo->onlink) { 2647 if (pinfo->onlink) {
2645 struct rt6_info *rt; 2648 struct fib6_info *rt;
2646 unsigned long rt_expires; 2649 unsigned long rt_expires;
2647 2650
2648 /* Avoid arithmetic overflow. Really, we could 2651 /* Avoid arithmetic overflow. Really, we could
@@ -2667,13 +2670,13 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
2667 if (rt) { 2670 if (rt) {
2668 /* Autoconf prefix route */ 2671 /* Autoconf prefix route */
2669 if (valid_lft == 0) { 2672 if (valid_lft == 0) {
2670 ip6_del_rt(rt); 2673 ip6_del_rt(net, rt);
2671 rt = NULL; 2674 rt = NULL;
2672 } else if (addrconf_finite_timeout(rt_expires)) { 2675 } else if (addrconf_finite_timeout(rt_expires)) {
2673 /* not infinity */ 2676 /* not infinity */
2674 rt6_set_expires(rt, jiffies + rt_expires); 2677 fib6_set_expires(rt, jiffies + rt_expires);
2675 } else { 2678 } else {
2676 rt6_clean_expires(rt); 2679 fib6_clean_expires(rt);
2677 } 2680 }
2678 } else if (valid_lft) { 2681 } else if (valid_lft) {
2679 clock_t expires = 0; 2682 clock_t expires = 0;
@@ -2684,9 +2687,10 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
2684 expires = jiffies_to_clock_t(rt_expires); 2687 expires = jiffies_to_clock_t(rt_expires);
2685 } 2688 }
2686 addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, 2689 addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
2687 dev, expires, flags); 2690 0, dev, expires, flags,
2691 GFP_ATOMIC);
2688 } 2692 }
2689 ip6_rt_put(rt); 2693 fib6_info_release(rt);
2690 } 2694 }
2691 2695
2692 /* Try to figure out our local address for this prefix */ 2696 /* Try to figure out our local address for this prefix */
@@ -2831,10 +2835,7 @@ static int ipv6_mc_config(struct sock *sk, bool join,
2831 * Manual configuration of address on an interface 2835 * Manual configuration of address on an interface
2832 */ 2836 */
2833static int inet6_addr_add(struct net *net, int ifindex, 2837static int inet6_addr_add(struct net *net, int ifindex,
2834 const struct in6_addr *pfx, 2838 struct ifa6_config *cfg,
2835 const struct in6_addr *peer_pfx,
2836 unsigned int plen, __u32 ifa_flags,
2837 __u32 prefered_lft, __u32 valid_lft,
2838 struct netlink_ext_ack *extack) 2839 struct netlink_ext_ack *extack)
2839{ 2840{
2840 struct inet6_ifaddr *ifp; 2841 struct inet6_ifaddr *ifp;
@@ -2842,19 +2843,18 @@ static int inet6_addr_add(struct net *net, int ifindex,
2842 struct net_device *dev; 2843 struct net_device *dev;
2843 unsigned long timeout; 2844 unsigned long timeout;
2844 clock_t expires; 2845 clock_t expires;
2845 int scope;
2846 u32 flags; 2846 u32 flags;
2847 2847
2848 ASSERT_RTNL(); 2848 ASSERT_RTNL();
2849 2849
2850 if (plen > 128) 2850 if (cfg->plen > 128)
2851 return -EINVAL; 2851 return -EINVAL;
2852 2852
2853 /* check the lifetime */ 2853 /* check the lifetime */
2854 if (!valid_lft || prefered_lft > valid_lft) 2854 if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft)
2855 return -EINVAL; 2855 return -EINVAL;
2856 2856
2857 if (ifa_flags & IFA_F_MANAGETEMPADDR && plen != 64) 2857 if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64)
2858 return -EINVAL; 2858 return -EINVAL;
2859 2859
2860 dev = __dev_get_by_index(net, ifindex); 2860 dev = __dev_get_by_index(net, ifindex);
@@ -2865,58 +2865,62 @@ static int inet6_addr_add(struct net *net, int ifindex,
2865 if (IS_ERR(idev)) 2865 if (IS_ERR(idev))
2866 return PTR_ERR(idev); 2866 return PTR_ERR(idev);
2867 2867
2868 if (ifa_flags & IFA_F_MCAUTOJOIN) { 2868 if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
2869 int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk, 2869 int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk,
2870 true, pfx, ifindex); 2870 true, cfg->pfx, ifindex);
2871 2871
2872 if (ret < 0) 2872 if (ret < 0)
2873 return ret; 2873 return ret;
2874 } 2874 }
2875 2875
2876 scope = ipv6_addr_scope(pfx); 2876 cfg->scope = ipv6_addr_scope(cfg->pfx);
2877 2877
2878 timeout = addrconf_timeout_fixup(valid_lft, HZ); 2878 timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ);
2879 if (addrconf_finite_timeout(timeout)) { 2879 if (addrconf_finite_timeout(timeout)) {
2880 expires = jiffies_to_clock_t(timeout * HZ); 2880 expires = jiffies_to_clock_t(timeout * HZ);
2881 valid_lft = timeout; 2881 cfg->valid_lft = timeout;
2882 flags = RTF_EXPIRES; 2882 flags = RTF_EXPIRES;
2883 } else { 2883 } else {
2884 expires = 0; 2884 expires = 0;
2885 flags = 0; 2885 flags = 0;
2886 ifa_flags |= IFA_F_PERMANENT; 2886 cfg->ifa_flags |= IFA_F_PERMANENT;
2887 } 2887 }
2888 2888
2889 timeout = addrconf_timeout_fixup(prefered_lft, HZ); 2889 timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ);
2890 if (addrconf_finite_timeout(timeout)) { 2890 if (addrconf_finite_timeout(timeout)) {
2891 if (timeout == 0) 2891 if (timeout == 0)
2892 ifa_flags |= IFA_F_DEPRECATED; 2892 cfg->ifa_flags |= IFA_F_DEPRECATED;
2893 prefered_lft = timeout; 2893 cfg->preferred_lft = timeout;
2894 } 2894 }
2895 2895
2896 ifp = ipv6_add_addr(idev, pfx, peer_pfx, plen, scope, ifa_flags, 2896 ifp = ipv6_add_addr(idev, cfg, true, extack);
2897 valid_lft, prefered_lft, true, extack);
2898
2899 if (!IS_ERR(ifp)) { 2897 if (!IS_ERR(ifp)) {
2900 if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { 2898 if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
2901 addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, 2899 addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
2902 expires, flags); 2900 ifp->rt_priority, dev, expires,
2901 flags, GFP_KERNEL);
2903 } 2902 }
2904 2903
2904 /* Send a netlink notification if DAD is enabled and
2905 * optimistic flag is not set
2906 */
2907 if (!(ifp->flags & (IFA_F_OPTIMISTIC | IFA_F_NODAD)))
2908 ipv6_ifa_notify(0, ifp);
2905 /* 2909 /*
2906 * Note that section 3.1 of RFC 4429 indicates 2910 * Note that section 3.1 of RFC 4429 indicates
2907 * that the Optimistic flag should not be set for 2911 * that the Optimistic flag should not be set for
2908 * manually configured addresses 2912 * manually configured addresses
2909 */ 2913 */
2910 addrconf_dad_start(ifp); 2914 addrconf_dad_start(ifp);
2911 if (ifa_flags & IFA_F_MANAGETEMPADDR) 2915 if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR)
2912 manage_tempaddrs(idev, ifp, valid_lft, prefered_lft, 2916 manage_tempaddrs(idev, ifp, cfg->valid_lft,
2913 true, jiffies); 2917 cfg->preferred_lft, true, jiffies);
2914 in6_ifa_put(ifp); 2918 in6_ifa_put(ifp);
2915 addrconf_verify_rtnl(); 2919 addrconf_verify_rtnl();
2916 return 0; 2920 return 0;
2917 } else if (ifa_flags & IFA_F_MCAUTOJOIN) { 2921 } else if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
2918 ipv6_mc_config(net->ipv6.mc_autojoin_sk, 2922 ipv6_mc_config(net->ipv6.mc_autojoin_sk, false,
2919 false, pfx, ifindex); 2923 cfg->pfx, ifindex);
2920 } 2924 }
2921 2925
2922 return PTR_ERR(ifp); 2926 return PTR_ERR(ifp);
@@ -2967,6 +2971,11 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags,
2967 2971
2968int addrconf_add_ifaddr(struct net *net, void __user *arg) 2972int addrconf_add_ifaddr(struct net *net, void __user *arg)
2969{ 2973{
2974 struct ifa6_config cfg = {
2975 .ifa_flags = IFA_F_PERMANENT,
2976 .preferred_lft = INFINITY_LIFE_TIME,
2977 .valid_lft = INFINITY_LIFE_TIME,
2978 };
2970 struct in6_ifreq ireq; 2979 struct in6_ifreq ireq;
2971 int err; 2980 int err;
2972 2981
@@ -2976,10 +2985,11 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg)
2976 if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) 2985 if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
2977 return -EFAULT; 2986 return -EFAULT;
2978 2987
2988 cfg.pfx = &ireq.ifr6_addr;
2989 cfg.plen = ireq.ifr6_prefixlen;
2990
2979 rtnl_lock(); 2991 rtnl_lock();
2980 err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, NULL, 2992 err = inet6_addr_add(net, ireq.ifr6_ifindex, &cfg, NULL);
2981 ireq.ifr6_prefixlen, IFA_F_PERMANENT,
2982 INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, NULL);
2983 rtnl_unlock(); 2993 rtnl_unlock();
2984 return err; 2994 return err;
2985} 2995}
@@ -3006,11 +3016,16 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
3006 int plen, int scope) 3016 int plen, int scope)
3007{ 3017{
3008 struct inet6_ifaddr *ifp; 3018 struct inet6_ifaddr *ifp;
3019 struct ifa6_config cfg = {
3020 .pfx = addr,
3021 .plen = plen,
3022 .ifa_flags = IFA_F_PERMANENT,
3023 .valid_lft = INFINITY_LIFE_TIME,
3024 .preferred_lft = INFINITY_LIFE_TIME,
3025 .scope = scope
3026 };
3009 3027
3010 ifp = ipv6_add_addr(idev, addr, NULL, plen, 3028 ifp = ipv6_add_addr(idev, &cfg, true, NULL);
3011 scope, IFA_F_PERMANENT,
3012 INFINITY_LIFE_TIME, INFINITY_LIFE_TIME,
3013 true, NULL);
3014 if (!IS_ERR(ifp)) { 3029 if (!IS_ERR(ifp)) {
3015 spin_lock_bh(&ifp->lock); 3030 spin_lock_bh(&ifp->lock);
3016 ifp->flags &= ~IFA_F_TENTATIVE; 3031 ifp->flags &= ~IFA_F_TENTATIVE;
@@ -3047,7 +3062,8 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
3047 3062
3048 if (addr.s6_addr32[3]) { 3063 if (addr.s6_addr32[3]) {
3049 add_addr(idev, &addr, plen, scope); 3064 add_addr(idev, &addr, plen, scope);
3050 addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags); 3065 addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags,
3066 GFP_ATOMIC);
3051 return; 3067 return;
3052 } 3068 }
3053 3069
@@ -3071,8 +3087,8 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
3071 } 3087 }
3072 3088
3073 add_addr(idev, &addr, plen, flag); 3089 add_addr(idev, &addr, plen, flag);
3074 addrconf_prefix_route(&addr, plen, idev->dev, 0, 3090 addrconf_prefix_route(&addr, plen, 0, idev->dev,
3075 pflags); 3091 0, pflags, GFP_ATOMIC);
3076 } 3092 }
3077 } 3093 }
3078 } 3094 }
@@ -3099,20 +3115,27 @@ static void init_loopback(struct net_device *dev)
3099void addrconf_add_linklocal(struct inet6_dev *idev, 3115void addrconf_add_linklocal(struct inet6_dev *idev,
3100 const struct in6_addr *addr, u32 flags) 3116 const struct in6_addr *addr, u32 flags)
3101{ 3117{
3118 struct ifa6_config cfg = {
3119 .pfx = addr,
3120 .plen = 64,
3121 .ifa_flags = flags | IFA_F_PERMANENT,
3122 .valid_lft = INFINITY_LIFE_TIME,
3123 .preferred_lft = INFINITY_LIFE_TIME,
3124 .scope = IFA_LINK
3125 };
3102 struct inet6_ifaddr *ifp; 3126 struct inet6_ifaddr *ifp;
3103 u32 addr_flags = flags | IFA_F_PERMANENT;
3104 3127
3105#ifdef CONFIG_IPV6_OPTIMISTIC_DAD 3128#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
3106 if ((dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad || 3129 if ((dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad ||
3107 idev->cnf.optimistic_dad) && 3130 idev->cnf.optimistic_dad) &&
3108 !dev_net(idev->dev)->ipv6.devconf_all->forwarding) 3131 !dev_net(idev->dev)->ipv6.devconf_all->forwarding)
3109 addr_flags |= IFA_F_OPTIMISTIC; 3132 cfg.ifa_flags |= IFA_F_OPTIMISTIC;
3110#endif 3133#endif
3111 3134
3112 ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, 3135 ifp = ipv6_add_addr(idev, &cfg, true, NULL);
3113 INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true, NULL);
3114 if (!IS_ERR(ifp)) { 3136 if (!IS_ERR(ifp)) {
3115 addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0); 3137 addrconf_prefix_route(&ifp->addr, ifp->prefix_len, 0, idev->dev,
3138 0, 0, GFP_ATOMIC);
3116 addrconf_dad_start(ifp); 3139 addrconf_dad_start(ifp);
3117 in6_ifa_put(ifp); 3140 in6_ifa_put(ifp);
3118 } 3141 }
@@ -3227,7 +3250,8 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
3227 addrconf_add_linklocal(idev, &addr, 3250 addrconf_add_linklocal(idev, &addr,
3228 IFA_F_STABLE_PRIVACY); 3251 IFA_F_STABLE_PRIVACY);
3229 else if (prefix_route) 3252 else if (prefix_route)
3230 addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); 3253 addrconf_prefix_route(&addr, 64, 0, idev->dev,
3254 0, 0, GFP_KERNEL);
3231 break; 3255 break;
3232 case IN6_ADDR_GEN_MODE_EUI64: 3256 case IN6_ADDR_GEN_MODE_EUI64:
3233 /* addrconf_add_linklocal also adds a prefix_route and we 3257 /* addrconf_add_linklocal also adds a prefix_route and we
@@ -3237,7 +3261,8 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
3237 if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0) 3261 if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0)
3238 addrconf_add_linklocal(idev, &addr, 0); 3262 addrconf_add_linklocal(idev, &addr, 0);
3239 else if (prefix_route) 3263 else if (prefix_route)
3240 addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); 3264 addrconf_prefix_route(&addr, 64, 0, idev->dev,
3265 0, 0, GFP_KERNEL);
3241 break; 3266 break;
3242 case IN6_ADDR_GEN_MODE_NONE: 3267 case IN6_ADDR_GEN_MODE_NONE:
3243 default: 3268 default:
@@ -3262,7 +3287,8 @@ static void addrconf_dev_config(struct net_device *dev)
3262 (dev->type != ARPHRD_IP6GRE) && 3287 (dev->type != ARPHRD_IP6GRE) &&
3263 (dev->type != ARPHRD_IPGRE) && 3288 (dev->type != ARPHRD_IPGRE) &&
3264 (dev->type != ARPHRD_TUNNEL) && 3289 (dev->type != ARPHRD_TUNNEL) &&
3265 (dev->type != ARPHRD_NONE)) { 3290 (dev->type != ARPHRD_NONE) &&
3291 (dev->type != ARPHRD_RAWIP)) {
3266 /* Alas, we support only Ethernet autoconfiguration. */ 3292 /* Alas, we support only Ethernet autoconfiguration. */
3267 return; 3293 return;
3268 } 3294 }
@@ -3329,32 +3355,35 @@ static void addrconf_gre_config(struct net_device *dev)
3329} 3355}
3330#endif 3356#endif
3331 3357
3332static int fixup_permanent_addr(struct inet6_dev *idev, 3358static int fixup_permanent_addr(struct net *net,
3359 struct inet6_dev *idev,
3333 struct inet6_ifaddr *ifp) 3360 struct inet6_ifaddr *ifp)
3334{ 3361{
3335 /* !rt6i_node means the host route was removed from the 3362 /* !fib6_node means the host route was removed from the
3336 * FIB, for example, if 'lo' device is taken down. In that 3363 * FIB, for example, if 'lo' device is taken down. In that
3337 * case regenerate the host route. 3364 * case regenerate the host route.
3338 */ 3365 */
3339 if (!ifp->rt || !ifp->rt->rt6i_node) { 3366 if (!ifp->rt || !ifp->rt->fib6_node) {
3340 struct rt6_info *rt, *prev; 3367 struct fib6_info *f6i, *prev;
3341 3368
3342 rt = addrconf_dst_alloc(idev, &ifp->addr, false); 3369 f6i = addrconf_f6i_alloc(net, idev, &ifp->addr, false,
3343 if (IS_ERR(rt)) 3370 GFP_ATOMIC);
3344 return PTR_ERR(rt); 3371 if (IS_ERR(f6i))
3372 return PTR_ERR(f6i);
3345 3373
3346 /* ifp->rt can be accessed outside of rtnl */ 3374 /* ifp->rt can be accessed outside of rtnl */
3347 spin_lock(&ifp->lock); 3375 spin_lock(&ifp->lock);
3348 prev = ifp->rt; 3376 prev = ifp->rt;
3349 ifp->rt = rt; 3377 ifp->rt = f6i;
3350 spin_unlock(&ifp->lock); 3378 spin_unlock(&ifp->lock);
3351 3379
3352 ip6_rt_put(prev); 3380 fib6_info_release(prev);
3353 } 3381 }
3354 3382
3355 if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) { 3383 if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
3356 addrconf_prefix_route(&ifp->addr, ifp->prefix_len, 3384 addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
3357 idev->dev, 0, 0); 3385 ifp->rt_priority, idev->dev, 0, 0,
3386 GFP_ATOMIC);
3358 } 3387 }
3359 3388
3360 if (ifp->state == INET6_IFADDR_STATE_PREDAD) 3389 if (ifp->state == INET6_IFADDR_STATE_PREDAD)
@@ -3363,7 +3392,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev,
3363 return 0; 3392 return 0;
3364} 3393}
3365 3394
3366static void addrconf_permanent_addr(struct net_device *dev) 3395static void addrconf_permanent_addr(struct net *net, struct net_device *dev)
3367{ 3396{
3368 struct inet6_ifaddr *ifp, *tmp; 3397 struct inet6_ifaddr *ifp, *tmp;
3369 struct inet6_dev *idev; 3398 struct inet6_dev *idev;
@@ -3376,7 +3405,7 @@ static void addrconf_permanent_addr(struct net_device *dev)
3376 3405
3377 list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) { 3406 list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) {
3378 if ((ifp->flags & IFA_F_PERMANENT) && 3407 if ((ifp->flags & IFA_F_PERMANENT) &&
3379 fixup_permanent_addr(idev, ifp) < 0) { 3408 fixup_permanent_addr(net, idev, ifp) < 0) {
3380 write_unlock_bh(&idev->lock); 3409 write_unlock_bh(&idev->lock);
3381 in6_ifa_hold(ifp); 3410 in6_ifa_hold(ifp);
3382 ipv6_del_addr(ifp); 3411 ipv6_del_addr(ifp);
@@ -3445,7 +3474,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
3445 3474
3446 if (event == NETDEV_UP) { 3475 if (event == NETDEV_UP) {
3447 /* restore routes for permanent addresses */ 3476 /* restore routes for permanent addresses */
3448 addrconf_permanent_addr(dev); 3477 addrconf_permanent_addr(net, dev);
3449 3478
3450 if (!addrconf_link_ready(dev)) { 3479 if (!addrconf_link_ready(dev)) {
3451 /* device is not ready yet. */ 3480 /* device is not ready yet. */
@@ -3612,8 +3641,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
3612 struct net *net = dev_net(dev); 3641 struct net *net = dev_net(dev);
3613 struct inet6_dev *idev; 3642 struct inet6_dev *idev;
3614 struct inet6_ifaddr *ifa, *tmp; 3643 struct inet6_ifaddr *ifa, *tmp;
3615 int _keep_addr; 3644 bool keep_addr = false;
3616 bool keep_addr;
3617 int state, i; 3645 int state, i;
3618 3646
3619 ASSERT_RTNL(); 3647 ASSERT_RTNL();
@@ -3639,15 +3667,18 @@ static int addrconf_ifdown(struct net_device *dev, int how)
3639 3667
3640 } 3668 }
3641 3669
3642 /* aggregate the system setting and interface setting */
3643 _keep_addr = net->ipv6.devconf_all->keep_addr_on_down;
3644 if (!_keep_addr)
3645 _keep_addr = idev->cnf.keep_addr_on_down;
3646
3647 /* combine the user config with event to determine if permanent 3670 /* combine the user config with event to determine if permanent
3648 * addresses are to be removed from address hash table 3671 * addresses are to be removed from address hash table
3649 */ 3672 */
3650 keep_addr = !(how || _keep_addr <= 0 || idev->cnf.disable_ipv6); 3673 if (!how && !idev->cnf.disable_ipv6) {
3674 /* aggregate the system setting and interface setting */
3675 int _keep_addr = net->ipv6.devconf_all->keep_addr_on_down;
3676
3677 if (!_keep_addr)
3678 _keep_addr = idev->cnf.keep_addr_on_down;
3679
3680 keep_addr = (_keep_addr > 0);
3681 }
3651 3682
3652 /* Step 2: clear hash table */ 3683 /* Step 2: clear hash table */
3653 for (i = 0; i < IN6_ADDR_HSIZE; i++) { 3684 for (i = 0; i < IN6_ADDR_HSIZE; i++) {
@@ -3697,13 +3728,8 @@ restart:
3697 write_lock_bh(&idev->lock); 3728 write_lock_bh(&idev->lock);
3698 } 3729 }
3699 3730
3700 /* re-combine the user config with event to determine if permanent
3701 * addresses are to be removed from the interface list
3702 */
3703 keep_addr = (!how && _keep_addr > 0 && !idev->cnf.disable_ipv6);
3704
3705 list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) { 3731 list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
3706 struct rt6_info *rt = NULL; 3732 struct fib6_info *rt = NULL;
3707 bool keep; 3733 bool keep;
3708 3734
3709 addrconf_del_dad_work(ifa); 3735 addrconf_del_dad_work(ifa);
@@ -3731,7 +3757,7 @@ restart:
3731 spin_unlock_bh(&ifa->lock); 3757 spin_unlock_bh(&ifa->lock);
3732 3758
3733 if (rt) 3759 if (rt)
3734 ip6_del_rt(rt); 3760 ip6_del_rt(net, rt);
3735 3761
3736 if (state != INET6_IFADDR_STATE_DEAD) { 3762 if (state != INET6_IFADDR_STATE_DEAD) {
3737 __ipv6_ifa_notify(RTM_DELADDR, ifa); 3763 __ipv6_ifa_notify(RTM_DELADDR, ifa);
@@ -3849,6 +3875,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
3849 struct inet6_dev *idev = ifp->idev; 3875 struct inet6_dev *idev = ifp->idev;
3850 struct net_device *dev = idev->dev; 3876 struct net_device *dev = idev->dev;
3851 bool bump_id, notify = false; 3877 bool bump_id, notify = false;
3878 struct net *net;
3852 3879
3853 addrconf_join_solict(dev, &ifp->addr); 3880 addrconf_join_solict(dev, &ifp->addr);
3854 3881
@@ -3859,8 +3886,9 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
3859 if (ifp->state == INET6_IFADDR_STATE_DEAD) 3886 if (ifp->state == INET6_IFADDR_STATE_DEAD)
3860 goto out; 3887 goto out;
3861 3888
3889 net = dev_net(dev);
3862 if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || 3890 if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
3863 (dev_net(dev)->ipv6.devconf_all->accept_dad < 1 && 3891 (net->ipv6.devconf_all->accept_dad < 1 &&
3864 idev->cnf.accept_dad < 1) || 3892 idev->cnf.accept_dad < 1) ||
3865 !(ifp->flags&IFA_F_TENTATIVE) || 3893 !(ifp->flags&IFA_F_TENTATIVE) ||
3866 ifp->flags & IFA_F_NODAD) { 3894 ifp->flags & IFA_F_NODAD) {
@@ -3896,8 +3924,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
3896 * Frames right away 3924 * Frames right away
3897 */ 3925 */
3898 if (ifp->flags & IFA_F_OPTIMISTIC) { 3926 if (ifp->flags & IFA_F_OPTIMISTIC) {
3899 ip6_ins_rt(ifp->rt); 3927 ip6_ins_rt(net, ifp->rt);
3900 if (ipv6_use_optimistic_addr(dev_net(dev), idev)) { 3928 if (ipv6_use_optimistic_addr(net, idev)) {
3901 /* Because optimistic nodes can use this address, 3929 /* Because optimistic nodes can use this address,
3902 * notify listeners. If DAD fails, RTM_DELADDR is sent. 3930 * notify listeners. If DAD fails, RTM_DELADDR is sent.
3903 */ 3931 */
@@ -4463,6 +4491,7 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = {
4463 [IFA_LOCAL] = { .len = sizeof(struct in6_addr) }, 4491 [IFA_LOCAL] = { .len = sizeof(struct in6_addr) },
4464 [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, 4492 [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
4465 [IFA_FLAGS] = { .len = sizeof(u32) }, 4493 [IFA_FLAGS] = { .len = sizeof(u32) },
4494 [IFA_RT_PRIORITY] = { .len = sizeof(u32) },
4466}; 4495};
4467 4496
4468static int 4497static int
@@ -4495,8 +4524,38 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
4495 ifm->ifa_prefixlen); 4524 ifm->ifa_prefixlen);
4496} 4525}
4497 4526
4498static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags, 4527static int modify_prefix_route(struct inet6_ifaddr *ifp,
4499 u32 prefered_lft, u32 valid_lft) 4528 unsigned long expires, u32 flags)
4529{
4530 struct fib6_info *f6i;
4531
4532 f6i = addrconf_get_prefix_route(&ifp->addr,
4533 ifp->prefix_len,
4534 ifp->idev->dev,
4535 0, RTF_GATEWAY | RTF_DEFAULT);
4536 if (!f6i)
4537 return -ENOENT;
4538
4539 if (f6i->fib6_metric != ifp->rt_priority) {
4540 /* add new one */
4541 addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
4542 ifp->rt_priority, ifp->idev->dev,
4543 expires, flags, GFP_KERNEL);
4544 /* delete old one */
4545 ip6_del_rt(dev_net(ifp->idev->dev), f6i);
4546 } else {
4547 if (!expires)
4548 fib6_clean_expires(f6i);
4549 else
4550 fib6_set_expires(f6i, expires);
4551
4552 fib6_info_release(f6i);
4553 }
4554
4555 return 0;
4556}
4557
4558static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
4500{ 4559{
4501 u32 flags; 4560 u32 flags;
4502 clock_t expires; 4561 clock_t expires;
@@ -4506,32 +4565,32 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
4506 4565
4507 ASSERT_RTNL(); 4566 ASSERT_RTNL();
4508 4567
4509 if (!valid_lft || (prefered_lft > valid_lft)) 4568 if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft)
4510 return -EINVAL; 4569 return -EINVAL;
4511 4570
4512 if (ifa_flags & IFA_F_MANAGETEMPADDR && 4571 if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR &&
4513 (ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64)) 4572 (ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64))
4514 return -EINVAL; 4573 return -EINVAL;
4515 4574
4516 if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED) 4575 if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED)
4517 ifa_flags &= ~IFA_F_OPTIMISTIC; 4576 cfg->ifa_flags &= ~IFA_F_OPTIMISTIC;
4518 4577
4519 timeout = addrconf_timeout_fixup(valid_lft, HZ); 4578 timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ);
4520 if (addrconf_finite_timeout(timeout)) { 4579 if (addrconf_finite_timeout(timeout)) {
4521 expires = jiffies_to_clock_t(timeout * HZ); 4580 expires = jiffies_to_clock_t(timeout * HZ);
4522 valid_lft = timeout; 4581 cfg->valid_lft = timeout;
4523 flags = RTF_EXPIRES; 4582 flags = RTF_EXPIRES;
4524 } else { 4583 } else {
4525 expires = 0; 4584 expires = 0;
4526 flags = 0; 4585 flags = 0;
4527 ifa_flags |= IFA_F_PERMANENT; 4586 cfg->ifa_flags |= IFA_F_PERMANENT;
4528 } 4587 }
4529 4588
4530 timeout = addrconf_timeout_fixup(prefered_lft, HZ); 4589 timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ);
4531 if (addrconf_finite_timeout(timeout)) { 4590 if (addrconf_finite_timeout(timeout)) {
4532 if (timeout == 0) 4591 if (timeout == 0)
4533 ifa_flags |= IFA_F_DEPRECATED; 4592 cfg->ifa_flags |= IFA_F_DEPRECATED;
4534 prefered_lft = timeout; 4593 cfg->preferred_lft = timeout;
4535 } 4594 }
4536 4595
4537 spin_lock_bh(&ifp->lock); 4596 spin_lock_bh(&ifp->lock);
@@ -4541,18 +4600,30 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
4541 ifp->flags &= ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD | 4600 ifp->flags &= ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD |
4542 IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR | 4601 IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR |
4543 IFA_F_NOPREFIXROUTE); 4602 IFA_F_NOPREFIXROUTE);
4544 ifp->flags |= ifa_flags; 4603 ifp->flags |= cfg->ifa_flags;
4545 ifp->tstamp = jiffies; 4604 ifp->tstamp = jiffies;
4546 ifp->valid_lft = valid_lft; 4605 ifp->valid_lft = cfg->valid_lft;
4547 ifp->prefered_lft = prefered_lft; 4606 ifp->prefered_lft = cfg->preferred_lft;
4607
4608 if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority)
4609 ifp->rt_priority = cfg->rt_priority;
4548 4610
4549 spin_unlock_bh(&ifp->lock); 4611 spin_unlock_bh(&ifp->lock);
4550 if (!(ifp->flags&IFA_F_TENTATIVE)) 4612 if (!(ifp->flags&IFA_F_TENTATIVE))
4551 ipv6_ifa_notify(0, ifp); 4613 ipv6_ifa_notify(0, ifp);
4552 4614
4553 if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { 4615 if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
4554 addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev, 4616 int rc = -ENOENT;
4555 expires, flags); 4617
4618 if (had_prefixroute)
4619 rc = modify_prefix_route(ifp, expires, flags);
4620
4621 /* prefix route could have been deleted; if so restore it */
4622 if (rc == -ENOENT) {
4623 addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
4624 ifp->rt_priority, ifp->idev->dev,
4625 expires, flags, GFP_KERNEL);
4626 }
4556 } else if (had_prefixroute) { 4627 } else if (had_prefixroute) {
4557 enum cleanup_prefix_rt_t action; 4628 enum cleanup_prefix_rt_t action;
4558 unsigned long rt_expires; 4629 unsigned long rt_expires;
@@ -4568,10 +4639,14 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
4568 } 4639 }
4569 4640
4570 if (was_managetempaddr || ifp->flags & IFA_F_MANAGETEMPADDR) { 4641 if (was_managetempaddr || ifp->flags & IFA_F_MANAGETEMPADDR) {
4571 if (was_managetempaddr && !(ifp->flags & IFA_F_MANAGETEMPADDR)) 4642 if (was_managetempaddr &&
4572 valid_lft = prefered_lft = 0; 4643 !(ifp->flags & IFA_F_MANAGETEMPADDR)) {
4573 manage_tempaddrs(ifp->idev, ifp, valid_lft, prefered_lft, 4644 cfg->valid_lft = 0;
4574 !was_managetempaddr, jiffies); 4645 cfg->preferred_lft = 0;
4646 }
4647 manage_tempaddrs(ifp->idev, ifp, cfg->valid_lft,
4648 cfg->preferred_lft, !was_managetempaddr,
4649 jiffies);
4575 } 4650 }
4576 4651
4577 addrconf_verify_rtnl(); 4652 addrconf_verify_rtnl();
@@ -4586,12 +4661,11 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
4586 struct net *net = sock_net(skb->sk); 4661 struct net *net = sock_net(skb->sk);
4587 struct ifaddrmsg *ifm; 4662 struct ifaddrmsg *ifm;
4588 struct nlattr *tb[IFA_MAX+1]; 4663 struct nlattr *tb[IFA_MAX+1];
4589 struct in6_addr *pfx, *peer_pfx; 4664 struct in6_addr *peer_pfx;
4590 struct inet6_ifaddr *ifa; 4665 struct inet6_ifaddr *ifa;
4591 struct net_device *dev; 4666 struct net_device *dev;
4592 struct inet6_dev *idev; 4667 struct inet6_dev *idev;
4593 u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME; 4668 struct ifa6_config cfg;
4594 u32 ifa_flags;
4595 int err; 4669 int err;
4596 4670
4597 err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy, 4671 err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy,
@@ -4599,60 +4673,70 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
4599 if (err < 0) 4673 if (err < 0)
4600 return err; 4674 return err;
4601 4675
4676 memset(&cfg, 0, sizeof(cfg));
4677
4602 ifm = nlmsg_data(nlh); 4678 ifm = nlmsg_data(nlh);
4603 pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx); 4679 cfg.pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx);
4604 if (!pfx) 4680 if (!cfg.pfx)
4605 return -EINVAL; 4681 return -EINVAL;
4606 4682
4683 cfg.peer_pfx = peer_pfx;
4684 cfg.plen = ifm->ifa_prefixlen;
4685 if (tb[IFA_RT_PRIORITY])
4686 cfg.rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
4687
4688 cfg.valid_lft = INFINITY_LIFE_TIME;
4689 cfg.preferred_lft = INFINITY_LIFE_TIME;
4690
4607 if (tb[IFA_CACHEINFO]) { 4691 if (tb[IFA_CACHEINFO]) {
4608 struct ifa_cacheinfo *ci; 4692 struct ifa_cacheinfo *ci;
4609 4693
4610 ci = nla_data(tb[IFA_CACHEINFO]); 4694 ci = nla_data(tb[IFA_CACHEINFO]);
4611 valid_lft = ci->ifa_valid; 4695 cfg.valid_lft = ci->ifa_valid;
4612 preferred_lft = ci->ifa_prefered; 4696 cfg.preferred_lft = ci->ifa_prefered;
4613 } else {
4614 preferred_lft = INFINITY_LIFE_TIME;
4615 valid_lft = INFINITY_LIFE_TIME;
4616 } 4697 }
4617 4698
4618 dev = __dev_get_by_index(net, ifm->ifa_index); 4699 dev = __dev_get_by_index(net, ifm->ifa_index);
4619 if (!dev) 4700 if (!dev)
4620 return -ENODEV; 4701 return -ENODEV;
4621 4702
4622 ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags; 4703 if (tb[IFA_FLAGS])
4704 cfg.ifa_flags = nla_get_u32(tb[IFA_FLAGS]);
4705 else
4706 cfg.ifa_flags = ifm->ifa_flags;
4623 4707
4624 /* We ignore other flags so far. */ 4708 /* We ignore other flags so far. */
4625 ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR | 4709 cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS |
4626 IFA_F_NOPREFIXROUTE | IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC; 4710 IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE |
4711 IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC;
4627 4712
4628 idev = ipv6_find_idev(dev); 4713 idev = ipv6_find_idev(dev);
4629 if (IS_ERR(idev)) 4714 if (IS_ERR(idev))
4630 return PTR_ERR(idev); 4715 return PTR_ERR(idev);
4631 4716
4632 if (!ipv6_allow_optimistic_dad(net, idev)) 4717 if (!ipv6_allow_optimistic_dad(net, idev))
4633 ifa_flags &= ~IFA_F_OPTIMISTIC; 4718 cfg.ifa_flags &= ~IFA_F_OPTIMISTIC;
4634 4719
4635 if (ifa_flags & IFA_F_NODAD && ifa_flags & IFA_F_OPTIMISTIC) { 4720 if (cfg.ifa_flags & IFA_F_NODAD &&
4721 cfg.ifa_flags & IFA_F_OPTIMISTIC) {
4636 NL_SET_ERR_MSG(extack, "IFA_F_NODAD and IFA_F_OPTIMISTIC are mutually exclusive"); 4722 NL_SET_ERR_MSG(extack, "IFA_F_NODAD and IFA_F_OPTIMISTIC are mutually exclusive");
4637 return -EINVAL; 4723 return -EINVAL;
4638 } 4724 }
4639 4725
4640 ifa = ipv6_get_ifaddr(net, pfx, dev, 1); 4726 ifa = ipv6_get_ifaddr(net, cfg.pfx, dev, 1);
4641 if (!ifa) { 4727 if (!ifa) {
4642 /* 4728 /*
4643 * It would be best to check for !NLM_F_CREATE here but 4729 * It would be best to check for !NLM_F_CREATE here but
4644 * userspace already relies on not having to provide this. 4730 * userspace already relies on not having to provide this.
4645 */ 4731 */
4646 return inet6_addr_add(net, ifm->ifa_index, pfx, peer_pfx, 4732 return inet6_addr_add(net, ifm->ifa_index, &cfg, extack);
4647 ifm->ifa_prefixlen, ifa_flags,
4648 preferred_lft, valid_lft, extack);
4649 } 4733 }
4650 4734
4651 if (nlh->nlmsg_flags & NLM_F_EXCL || 4735 if (nlh->nlmsg_flags & NLM_F_EXCL ||
4652 !(nlh->nlmsg_flags & NLM_F_REPLACE)) 4736 !(nlh->nlmsg_flags & NLM_F_REPLACE))
4653 err = -EEXIST; 4737 err = -EEXIST;
4654 else 4738 else
4655 err = inet6_addr_modify(ifa, ifa_flags, preferred_lft, valid_lft); 4739 err = inet6_addr_modify(ifa, &cfg);
4656 4740
4657 in6_ifa_put(ifa); 4741 in6_ifa_put(ifa);
4658 4742
@@ -4703,7 +4787,8 @@ static inline int inet6_ifaddr_msgsize(void)
4703 + nla_total_size(16) /* IFA_LOCAL */ 4787 + nla_total_size(16) /* IFA_LOCAL */
4704 + nla_total_size(16) /* IFA_ADDRESS */ 4788 + nla_total_size(16) /* IFA_ADDRESS */
4705 + nla_total_size(sizeof(struct ifa_cacheinfo)) 4789 + nla_total_size(sizeof(struct ifa_cacheinfo))
4706 + nla_total_size(4) /* IFA_FLAGS */; 4790 + nla_total_size(4) /* IFA_FLAGS */
4791 + nla_total_size(4) /* IFA_RT_PRIORITY */;
4707} 4792}
4708 4793
4709static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, 4794static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
@@ -4749,6 +4834,10 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
4749 if (nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->addr) < 0) 4834 if (nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->addr) < 0)
4750 goto error; 4835 goto error;
4751 4836
4837 if (ifa->rt_priority &&
4838 nla_put_u32(skb, IFA_RT_PRIORITY, ifa->rt_priority))
4839 goto error;
4840
4752 if (put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0) 4841 if (put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0)
4753 goto error; 4842 goto error;
4754 4843
@@ -4792,9 +4881,10 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
4792static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, 4881static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
4793 u32 portid, u32 seq, int event, unsigned int flags) 4882 u32 portid, u32 seq, int event, unsigned int flags)
4794{ 4883{
4884 struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt);
4885 int ifindex = dev ? dev->ifindex : 1;
4795 struct nlmsghdr *nlh; 4886 struct nlmsghdr *nlh;
4796 u8 scope = RT_SCOPE_UNIVERSE; 4887 u8 scope = RT_SCOPE_UNIVERSE;
4797 int ifindex = ifaca->aca_idev->dev->ifindex;
4798 4888
4799 if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE) 4889 if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE)
4800 scope = RT_SCOPE_SITE; 4890 scope = RT_SCOPE_SITE;
@@ -5017,14 +5107,6 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
5017 struct net *net = dev_net(ifa->idev->dev); 5107 struct net *net = dev_net(ifa->idev->dev);
5018 int err = -ENOBUFS; 5108 int err = -ENOBUFS;
5019 5109
5020 /* Don't send DELADDR notification for TENTATIVE address,
5021 * since NEWADDR notification is sent only after removing
5022 * TENTATIVE flag, if DAD has not failed.
5023 */
5024 if (ifa->flags & IFA_F_TENTATIVE && !(ifa->flags & IFA_F_DADFAILED) &&
5025 event == RTM_DELADDR)
5026 return;
5027
5028 skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); 5110 skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
5029 if (!skb) 5111 if (!skb)
5030 goto errout; 5112 goto errout;
@@ -5595,29 +5677,30 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
5595 * our DAD process, so we don't need 5677 * our DAD process, so we don't need
5596 * to do it again 5678 * to do it again
5597 */ 5679 */
5598 if (!rcu_access_pointer(ifp->rt->rt6i_node)) 5680 if (!rcu_access_pointer(ifp->rt->fib6_node))
5599 ip6_ins_rt(ifp->rt); 5681 ip6_ins_rt(net, ifp->rt);
5600 if (ifp->idev->cnf.forwarding) 5682 if (ifp->idev->cnf.forwarding)
5601 addrconf_join_anycast(ifp); 5683 addrconf_join_anycast(ifp);
5602 if (!ipv6_addr_any(&ifp->peer_addr)) 5684 if (!ipv6_addr_any(&ifp->peer_addr))
5603 addrconf_prefix_route(&ifp->peer_addr, 128, 5685 addrconf_prefix_route(&ifp->peer_addr, 128, 0,
5604 ifp->idev->dev, 0, 0); 5686 ifp->idev->dev, 0, 0,
5687 GFP_ATOMIC);
5605 break; 5688 break;
5606 case RTM_DELADDR: 5689 case RTM_DELADDR:
5607 if (ifp->idev->cnf.forwarding) 5690 if (ifp->idev->cnf.forwarding)
5608 addrconf_leave_anycast(ifp); 5691 addrconf_leave_anycast(ifp);
5609 addrconf_leave_solict(ifp->idev, &ifp->addr); 5692 addrconf_leave_solict(ifp->idev, &ifp->addr);
5610 if (!ipv6_addr_any(&ifp->peer_addr)) { 5693 if (!ipv6_addr_any(&ifp->peer_addr)) {
5611 struct rt6_info *rt; 5694 struct fib6_info *rt;
5612 5695
5613 rt = addrconf_get_prefix_route(&ifp->peer_addr, 128, 5696 rt = addrconf_get_prefix_route(&ifp->peer_addr, 128,
5614 ifp->idev->dev, 0, 0); 5697 ifp->idev->dev, 0, 0);
5615 if (rt) 5698 if (rt)
5616 ip6_del_rt(rt); 5699 ip6_del_rt(net, rt);
5617 } 5700 }
5618 if (ifp->rt) { 5701 if (ifp->rt) {
5619 if (dst_hold_safe(&ifp->rt->dst)) 5702 ip6_del_rt(net, ifp->rt);
5620 ip6_del_rt(ifp->rt); 5703 ifp->rt = NULL;
5621 } 5704 }
5622 rt_genid_bump_ipv6(net); 5705 rt_genid_bump_ipv6(net);
5623 break; 5706 break;
@@ -5964,11 +6047,11 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
5964 list_for_each_entry(ifa, &idev->addr_list, if_list) { 6047 list_for_each_entry(ifa, &idev->addr_list, if_list) {
5965 spin_lock(&ifa->lock); 6048 spin_lock(&ifa->lock);
5966 if (ifa->rt) { 6049 if (ifa->rt) {
5967 struct rt6_info *rt = ifa->rt; 6050 struct fib6_info *rt = ifa->rt;
5968 int cpu; 6051 int cpu;
5969 6052
5970 rcu_read_lock(); 6053 rcu_read_lock();
5971 addrconf_set_nopolicy(ifa->rt, val); 6054 ifa->rt->dst_nopolicy = val ? true : false;
5972 if (rt->rt6i_pcpu) { 6055 if (rt->rt6i_pcpu) {
5973 for_each_possible_cpu(cpu) { 6056 for_each_possible_cpu(cpu) {
5974 struct rt6_info **rtp; 6057 struct rt6_info **rtp;
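Note on the addrconf.c hunks above: ipv6_add_addr()'s long argument list is collapsed into a single struct ifa6_config, and addrconf_prefix_route() grows an explicit metric and gfp_t. A minimal sketch of the new caller pattern, using only fields visible in the hunks above (the struct itself lives in the addrconf headers and may carry further members):

	struct ifa6_config cfg = {
		.pfx		= &addr,
		.plen		= 64,
		.ifa_flags	= IFA_F_PERMANENT,
		.valid_lft	= INFINITY_LIFE_TIME,
		.preferred_lft	= INFINITY_LIFE_TIME,
		.scope		= IFA_LINK,
		/* .peer_pfx and .rt_priority (netlink IFA_RT_PRIORITY) are optional */
	};
	struct inet6_ifaddr *ifp;

	ifp = ipv6_add_addr(idev, &cfg, true, NULL);
	if (IS_ERR(ifp))
		return PTR_ERR(ifp);

	/* prefix routes now take an explicit metric (0 falls back to
	 * IP6_RT_PRIO_ADDRCONF) and an allocation context */
	addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->rt_priority,
			      idev->dev, 0, 0, GFP_KERNEL);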
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 32b564dfd02a..5cd0029d930e 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -134,8 +134,47 @@ static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1,
134 return -EAFNOSUPPORT; 134 return -EAFNOSUPPORT;
135} 135}
136 136
137static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id)
138{
139 return NULL;
140}
141
142static struct fib6_info *
143eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table,
144 int oif, struct flowi6 *fl6, int flags)
145{
146 return NULL;
147}
148
149static struct fib6_info *
150eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
151 int flags)
152{
153 return NULL;
154}
155
156static struct fib6_info *
157eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i,
158 struct flowi6 *fl6, int oif,
159 const struct sk_buff *skb, int strict)
160{
161 return f6i;
162}
163
164static u32
165eafnosupport_ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
166 struct in6_addr *saddr)
167{
168 return 0;
169}
170
137const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { 171const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
138 .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, 172 .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
173 .fib6_get_table = eafnosupport_fib6_get_table,
174 .fib6_table_lookup = eafnosupport_fib6_table_lookup,
175 .fib6_lookup = eafnosupport_fib6_lookup,
176 .fib6_multipath_select = eafnosupport_fib6_multipath_select,
177 .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6,
139}; 178};
140EXPORT_SYMBOL_GPL(ipv6_stub); 179EXPORT_SYMBOL_GPL(ipv6_stub);
141 180
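The addrconf_core.c hunk fills ipv6_stub with eafnosupport_* fallbacks, so the new fib6 operations always exist and simply return NULL (or 0 for the MTU helper) when IPv6 route lookup is unavailable. Callers outside net/ipv6 can therefore probe for a route through the stub and treat NULL as "no route"; a hedged sketch of that pattern, with the flowi6 setup purely illustrative:

	struct flowi6 fl6 = {
		.daddr		= *daddr,
		.flowi6_oif	= oif,
	};
	struct fib6_info *f6i;

	f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, 0);
	if (!f6i)
		return -EHOSTUNREACH;	/* no route, or IPv6 not available */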
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d443c18b45fe..74f2a261e8df 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -273,33 +273,8 @@ out_rcu_unlock:
273 goto out; 273 goto out;
274} 274}
275 275
276 276static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
277/* bind for INET6 API */ 277 bool force_bind_address_no_port, bool with_lock)
278int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
279{
280 struct sock *sk = sock->sk;
281 int err = 0;
282
283 /* If the socket has its own bind function then use it. */
284 if (sk->sk_prot->bind)
285 return sk->sk_prot->bind(sk, uaddr, addr_len);
286
287 if (addr_len < SIN6_LEN_RFC2133)
288 return -EINVAL;
289
290 /* BPF prog is run before any checks are done so that if the prog
291 * changes context in a wrong way it will be caught.
292 */
293 err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr);
294 if (err)
295 return err;
296
297 return __inet6_bind(sk, uaddr, addr_len, false, true);
298}
299EXPORT_SYMBOL(inet6_bind);
300
301int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
302 bool force_bind_address_no_port, bool with_lock)
303{ 278{
304 struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; 279 struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr;
305 struct inet_sock *inet = inet_sk(sk); 280 struct inet_sock *inet = inet_sk(sk);
@@ -444,6 +419,30 @@ out_unlock:
444 goto out; 419 goto out;
445} 420}
446 421
422/* bind for INET6 API */
423int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
424{
425 struct sock *sk = sock->sk;
426 int err = 0;
427
428 /* If the socket has its own bind function then use it. */
429 if (sk->sk_prot->bind)
430 return sk->sk_prot->bind(sk, uaddr, addr_len);
431
432 if (addr_len < SIN6_LEN_RFC2133)
433 return -EINVAL;
434
435 /* BPF prog is run before any checks are done so that if the prog
436 * changes context in a wrong way it will be caught.
437 */
438 err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr);
439 if (err)
440 return err;
441
442 return __inet6_bind(sk, uaddr, addr_len, false, true);
443}
444EXPORT_SYMBOL(inet6_bind);
445
447int inet6_release(struct socket *sock) 446int inet6_release(struct socket *sock)
448{ 447{
449 struct sock *sk = sock->sk; 448 struct sock *sk = sock->sk;
@@ -579,7 +578,9 @@ const struct proto_ops inet6_stream_ops = {
579 .getsockopt = sock_common_getsockopt, /* ok */ 578 .getsockopt = sock_common_getsockopt, /* ok */
580 .sendmsg = inet_sendmsg, /* ok */ 579 .sendmsg = inet_sendmsg, /* ok */
581 .recvmsg = inet_recvmsg, /* ok */ 580 .recvmsg = inet_recvmsg, /* ok */
582 .mmap = sock_no_mmap, 581#ifdef CONFIG_MMU
582 .mmap = tcp_mmap,
583#endif
583 .sendpage = inet_sendpage, 584 .sendpage = inet_sendpage,
584 .sendmsg_locked = tcp_sendmsg_locked, 585 .sendmsg_locked = tcp_sendmsg_locked,
585 .sendpage_locked = tcp_sendpage_locked, 586 .sendpage_locked = tcp_sendpage_locked,
@@ -590,6 +591,7 @@ const struct proto_ops inet6_stream_ops = {
590 .compat_setsockopt = compat_sock_common_setsockopt, 591 .compat_setsockopt = compat_sock_common_setsockopt,
591 .compat_getsockopt = compat_sock_common_getsockopt, 592 .compat_getsockopt = compat_sock_common_getsockopt,
592#endif 593#endif
594 .set_rcvlowat = tcp_set_rcvlowat,
593}; 595};
594 596
595const struct proto_ops inet6_dgram_ops = { 597const struct proto_ops inet6_dgram_ops = {
@@ -887,7 +889,12 @@ static struct pernet_operations inet6_net_ops = {
887static const struct ipv6_stub ipv6_stub_impl = { 889static const struct ipv6_stub ipv6_stub_impl = {
888 .ipv6_sock_mc_join = ipv6_sock_mc_join, 890 .ipv6_sock_mc_join = ipv6_sock_mc_join,
889 .ipv6_sock_mc_drop = ipv6_sock_mc_drop, 891 .ipv6_sock_mc_drop = ipv6_sock_mc_drop,
890 .ipv6_dst_lookup = ip6_dst_lookup, 892 .ipv6_dst_lookup = ip6_dst_lookup,
893 .fib6_get_table = fib6_get_table,
894 .fib6_table_lookup = fib6_table_lookup,
895 .fib6_lookup = fib6_lookup,
896 .fib6_multipath_select = fib6_multipath_select,
897 .ip6_mtu_from_fib6 = ip6_mtu_from_fib6,
891 .udpv6_encap_enable = udpv6_encap_enable, 898 .udpv6_encap_enable = udpv6_encap_enable,
892 .ndisc_send_na = ndisc_send_na, 899 .ndisc_send_na = ndisc_send_na,
893 .nd_tbl = &nd_tbl, 900 .nd_tbl = &nd_tbl,
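The inet6_stream_ops changes bring the IPv6 stream ops in line with TCP: mmap() on an IPv6 TCP socket now goes through tcp_mmap() (under CONFIG_MMU) and SO_RCVLOWAT is handled by tcp_set_rcvlowat() rather than the generic socket default. A minimal userspace illustration of the latter, error handling elided:

	int fd = socket(AF_INET6, SOCK_STREAM, 0);
	int lowat = 64 * 1024;	/* don't wake blocking readers or poll() until 64 KB is queued */

	setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));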
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index ebeaf47d5c8d..4e0ff7031edd 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -212,16 +212,14 @@ static void aca_get(struct ifacaddr6 *aca)
212static void aca_put(struct ifacaddr6 *ac) 212static void aca_put(struct ifacaddr6 *ac)
213{ 213{
214 if (refcount_dec_and_test(&ac->aca_refcnt)) { 214 if (refcount_dec_and_test(&ac->aca_refcnt)) {
215 in6_dev_put(ac->aca_idev); 215 fib6_info_release(ac->aca_rt);
216 dst_release(&ac->aca_rt->dst);
217 kfree(ac); 216 kfree(ac);
218 } 217 }
219} 218}
220 219
221static struct ifacaddr6 *aca_alloc(struct rt6_info *rt, 220static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
222 const struct in6_addr *addr) 221 const struct in6_addr *addr)
223{ 222{
224 struct inet6_dev *idev = rt->rt6i_idev;
225 struct ifacaddr6 *aca; 223 struct ifacaddr6 *aca;
226 224
227 aca = kzalloc(sizeof(*aca), GFP_ATOMIC); 225 aca = kzalloc(sizeof(*aca), GFP_ATOMIC);
@@ -229,9 +227,8 @@ static struct ifacaddr6 *aca_alloc(struct rt6_info *rt,
229 return NULL; 227 return NULL;
230 228
231 aca->aca_addr = *addr; 229 aca->aca_addr = *addr;
232 in6_dev_hold(idev); 230 fib6_info_hold(f6i);
233 aca->aca_idev = idev; 231 aca->aca_rt = f6i;
234 aca->aca_rt = rt;
235 aca->aca_users = 1; 232 aca->aca_users = 1;
236 /* aca_tstamp should be updated upon changes */ 233 /* aca_tstamp should be updated upon changes */
237 aca->aca_cstamp = aca->aca_tstamp = jiffies; 234 aca->aca_cstamp = aca->aca_tstamp = jiffies;
@@ -246,7 +243,8 @@ static struct ifacaddr6 *aca_alloc(struct rt6_info *rt,
246int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) 243int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
247{ 244{
248 struct ifacaddr6 *aca; 245 struct ifacaddr6 *aca;
249 struct rt6_info *rt; 246 struct fib6_info *f6i;
247 struct net *net;
250 int err; 248 int err;
251 249
252 ASSERT_RTNL(); 250 ASSERT_RTNL();
@@ -265,14 +263,15 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
265 } 263 }
266 } 264 }
267 265
268 rt = addrconf_dst_alloc(idev, addr, true); 266 net = dev_net(idev->dev);
269 if (IS_ERR(rt)) { 267 f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC);
270 err = PTR_ERR(rt); 268 if (IS_ERR(f6i)) {
269 err = PTR_ERR(f6i);
271 goto out; 270 goto out;
272 } 271 }
273 aca = aca_alloc(rt, addr); 272 aca = aca_alloc(f6i, addr);
274 if (!aca) { 273 if (!aca) {
275 ip6_rt_put(rt); 274 fib6_info_release(f6i);
276 err = -ENOMEM; 275 err = -ENOMEM;
277 goto out; 276 goto out;
278 } 277 }
@@ -286,7 +285,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
286 aca_get(aca); 285 aca_get(aca);
287 write_unlock_bh(&idev->lock); 286 write_unlock_bh(&idev->lock);
288 287
289 ip6_ins_rt(rt); 288 ip6_ins_rt(net, f6i);
290 289
291 addrconf_join_solict(idev->dev, &aca->aca_addr); 290 addrconf_join_solict(idev->dev, &aca->aca_addr);
292 291
@@ -328,8 +327,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
328 write_unlock_bh(&idev->lock); 327 write_unlock_bh(&idev->lock);
329 addrconf_leave_solict(idev, &aca->aca_addr); 328 addrconf_leave_solict(idev, &aca->aca_addr);
330 329
331 dst_hold(&aca->aca_rt->dst); 330 ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
332 ip6_del_rt(aca->aca_rt);
333 331
334 aca_put(aca); 332 aca_put(aca);
335 return 0; 333 return 0;
@@ -356,8 +354,7 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev)
356 354
357 addrconf_leave_solict(idev, &aca->aca_addr); 355 addrconf_leave_solict(idev, &aca->aca_addr);
358 356
359 dst_hold(&aca->aca_rt->dst); 357 ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
360 ip6_del_rt(aca->aca_rt);
361 358
362 aca_put(aca); 359 aca_put(aca);
363 360
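In anycast.c an ifacaddr6 now pins its host route directly: aca_alloc() takes a fib6_info reference instead of holding the inet6_dev, aca_put() releases it, and the deletion paths call ip6_del_rt() with the net pointer rather than taking an extra dst reference first. The reference pairing, condensed from the hunks above:

	/* aca_alloc() */
	fib6_info_hold(f6i);
	aca->aca_rt = f6i;

	/* aca_put(), once the last user is gone */
	fib6_info_release(ac->aca_rt);
	kfree(ac);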
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index bc68eb661970..5bc2bf3733ab 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -280,6 +280,7 @@ static const struct tlvtype_proc tlvprocdestopt_lst[] = {
280 280
281static int ipv6_destopt_rcv(struct sk_buff *skb) 281static int ipv6_destopt_rcv(struct sk_buff *skb)
282{ 282{
283 struct inet6_dev *idev = __in6_dev_get(skb->dev);
283 struct inet6_skb_parm *opt = IP6CB(skb); 284 struct inet6_skb_parm *opt = IP6CB(skb);
284#if IS_ENABLED(CONFIG_IPV6_MIP6) 285#if IS_ENABLED(CONFIG_IPV6_MIP6)
285 __u16 dstbuf; 286 __u16 dstbuf;
@@ -291,7 +292,7 @@ static int ipv6_destopt_rcv(struct sk_buff *skb)
291 if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || 292 if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
292 !pskb_may_pull(skb, (skb_transport_offset(skb) + 293 !pskb_may_pull(skb, (skb_transport_offset(skb) +
293 ((skb_transport_header(skb)[1] + 1) << 3)))) { 294 ((skb_transport_header(skb)[1] + 1) << 3)))) {
294 __IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 295 __IP6_INC_STATS(dev_net(dst->dev), idev,
295 IPSTATS_MIB_INHDRERRORS); 296 IPSTATS_MIB_INHDRERRORS);
296fail_and_free: 297fail_and_free:
297 kfree_skb(skb); 298 kfree_skb(skb);
@@ -319,8 +320,7 @@ fail_and_free:
319 return 1; 320 return 1;
320 } 321 }
321 322
322 __IP6_INC_STATS(dev_net(dst->dev), 323 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
323 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
324 return -1; 324 return -1;
325} 325}
326 326
@@ -416,8 +416,7 @@ looped_back:
416 } 416 }
417 417
418 if (hdr->segments_left >= (hdr->hdrlen >> 1)) { 418 if (hdr->segments_left >= (hdr->hdrlen >> 1)) {
419 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 419 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
420 IPSTATS_MIB_INHDRERRORS);
421 icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, 420 icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
422 ((&hdr->segments_left) - 421 ((&hdr->segments_left) -
423 skb_network_header(skb))); 422 skb_network_header(skb)));
@@ -456,8 +455,7 @@ looped_back:
456 455
457 if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { 456 if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) {
458 if (ipv6_hdr(skb)->hop_limit <= 1) { 457 if (ipv6_hdr(skb)->hop_limit <= 1) {
459 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 458 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
460 IPSTATS_MIB_INHDRERRORS);
461 icmpv6_send(skb, ICMPV6_TIME_EXCEED, 459 icmpv6_send(skb, ICMPV6_TIME_EXCEED,
462 ICMPV6_EXC_HOPLIMIT, 0); 460 ICMPV6_EXC_HOPLIMIT, 0);
463 kfree_skb(skb); 461 kfree_skb(skb);
@@ -481,10 +479,10 @@ looped_back:
481/* called with rcu_read_lock() */ 479/* called with rcu_read_lock() */
482static int ipv6_rthdr_rcv(struct sk_buff *skb) 480static int ipv6_rthdr_rcv(struct sk_buff *skb)
483{ 481{
482 struct inet6_dev *idev = __in6_dev_get(skb->dev);
484 struct inet6_skb_parm *opt = IP6CB(skb); 483 struct inet6_skb_parm *opt = IP6CB(skb);
485 struct in6_addr *addr = NULL; 484 struct in6_addr *addr = NULL;
486 struct in6_addr daddr; 485 struct in6_addr daddr;
487 struct inet6_dev *idev;
488 int n, i; 486 int n, i;
489 struct ipv6_rt_hdr *hdr; 487 struct ipv6_rt_hdr *hdr;
490 struct rt0_hdr *rthdr; 488 struct rt0_hdr *rthdr;
@@ -498,8 +496,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
498 if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || 496 if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
499 !pskb_may_pull(skb, (skb_transport_offset(skb) + 497 !pskb_may_pull(skb, (skb_transport_offset(skb) +
500 ((skb_transport_header(skb)[1] + 1) << 3)))) { 498 ((skb_transport_header(skb)[1] + 1) << 3)))) {
501 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 499 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
502 IPSTATS_MIB_INHDRERRORS);
503 kfree_skb(skb); 500 kfree_skb(skb);
504 return -1; 501 return -1;
505 } 502 }
@@ -508,8 +505,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
508 505
509 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) || 506 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) ||
510 skb->pkt_type != PACKET_HOST) { 507 skb->pkt_type != PACKET_HOST) {
511 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 508 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
512 IPSTATS_MIB_INADDRERRORS);
513 kfree_skb(skb); 509 kfree_skb(skb);
514 return -1; 510 return -1;
515 } 511 }
@@ -527,7 +523,7 @@ looped_back:
527 * processed by own 523 * processed by own
528 */ 524 */
529 if (!addr) { 525 if (!addr) {
530 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 526 __IP6_INC_STATS(net, idev,
531 IPSTATS_MIB_INADDRERRORS); 527 IPSTATS_MIB_INADDRERRORS);
532 kfree_skb(skb); 528 kfree_skb(skb);
533 return -1; 529 return -1;
@@ -553,8 +549,7 @@ looped_back:
553 goto unknown_rh; 549 goto unknown_rh;
554 /* Silently discard invalid RTH type 2 */ 550 /* Silently discard invalid RTH type 2 */
555 if (hdr->hdrlen != 2 || hdr->segments_left != 1) { 551 if (hdr->hdrlen != 2 || hdr->segments_left != 1) {
556 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 552 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
557 IPSTATS_MIB_INHDRERRORS);
558 kfree_skb(skb); 553 kfree_skb(skb);
559 return -1; 554 return -1;
560 } 555 }
@@ -572,8 +567,7 @@ looped_back:
572 n = hdr->hdrlen >> 1; 567 n = hdr->hdrlen >> 1;
573 568
574 if (hdr->segments_left > n) { 569 if (hdr->segments_left > n) {
575 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 570 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
576 IPSTATS_MIB_INHDRERRORS);
577 icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, 571 icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
578 ((&hdr->segments_left) - 572 ((&hdr->segments_left) -
579 skb_network_header(skb))); 573 skb_network_header(skb)));
@@ -609,14 +603,12 @@ looped_back:
609 if (xfrm6_input_addr(skb, (xfrm_address_t *)addr, 603 if (xfrm6_input_addr(skb, (xfrm_address_t *)addr,
610 (xfrm_address_t *)&ipv6_hdr(skb)->saddr, 604 (xfrm_address_t *)&ipv6_hdr(skb)->saddr,
611 IPPROTO_ROUTING) < 0) { 605 IPPROTO_ROUTING) < 0) {
612 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 606 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
613 IPSTATS_MIB_INADDRERRORS);
614 kfree_skb(skb); 607 kfree_skb(skb);
615 return -1; 608 return -1;
616 } 609 }
617 if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) { 610 if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) {
618 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 611 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
619 IPSTATS_MIB_INADDRERRORS);
620 kfree_skb(skb); 612 kfree_skb(skb);
621 return -1; 613 return -1;
622 } 614 }
@@ -627,8 +619,7 @@ looped_back:
627 } 619 }
628 620
629 if (ipv6_addr_is_multicast(addr)) { 621 if (ipv6_addr_is_multicast(addr)) {
630 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 622 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
631 IPSTATS_MIB_INADDRERRORS);
632 kfree_skb(skb); 623 kfree_skb(skb);
633 return -1; 624 return -1;
634 } 625 }
@@ -647,8 +638,7 @@ looped_back:
647 638
648 if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) { 639 if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) {
649 if (ipv6_hdr(skb)->hop_limit <= 1) { 640 if (ipv6_hdr(skb)->hop_limit <= 1) {
650 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 641 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
651 IPSTATS_MIB_INHDRERRORS);
652 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 642 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
653 0); 643 0);
654 kfree_skb(skb); 644 kfree_skb(skb);
@@ -663,7 +653,7 @@ looped_back:
663 return -1; 653 return -1;
664 654
665unknown_rh: 655unknown_rh:
666 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); 656 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
667 icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, 657 icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
668 (&hdr->type) - skb_network_header(skb)); 658 (&hdr->type) - skb_network_header(skb));
669 return -1; 659 return -1;
@@ -755,34 +745,31 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff)
755static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) 745static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
756{ 746{
757 const unsigned char *nh = skb_network_header(skb); 747 const unsigned char *nh = skb_network_header(skb);
748 struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
758 struct net *net = ipv6_skb_net(skb); 749 struct net *net = ipv6_skb_net(skb);
759 u32 pkt_len; 750 u32 pkt_len;
760 751
761 if (nh[optoff + 1] != 4 || (optoff & 3) != 2) { 752 if (nh[optoff + 1] != 4 || (optoff & 3) != 2) {
762 net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", 753 net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n",
763 nh[optoff+1]); 754 nh[optoff+1]);
764 __IP6_INC_STATS(net, ipv6_skb_idev(skb), 755 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
765 IPSTATS_MIB_INHDRERRORS);
766 goto drop; 756 goto drop;
767 } 757 }
768 758
769 pkt_len = ntohl(*(__be32 *)(nh + optoff + 2)); 759 pkt_len = ntohl(*(__be32 *)(nh + optoff + 2));
770 if (pkt_len <= IPV6_MAXPLEN) { 760 if (pkt_len <= IPV6_MAXPLEN) {
771 __IP6_INC_STATS(net, ipv6_skb_idev(skb), 761 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
772 IPSTATS_MIB_INHDRERRORS);
773 icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2); 762 icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2);
774 return false; 763 return false;
775 } 764 }
776 if (ipv6_hdr(skb)->payload_len) { 765 if (ipv6_hdr(skb)->payload_len) {
777 __IP6_INC_STATS(net, ipv6_skb_idev(skb), 766 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
778 IPSTATS_MIB_INHDRERRORS);
779 icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff); 767 icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff);
780 return false; 768 return false;
781 } 769 }
782 770
783 if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { 771 if (pkt_len > skb->len - sizeof(struct ipv6hdr)) {
784 __IP6_INC_STATS(net, ipv6_skb_idev(skb), 772 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTRUNCATEDPKTS);
785 IPSTATS_MIB_INTRUNCATEDPKTS);
786 goto drop; 773 goto drop;
787 } 774 }
788 775
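
The exthdrs.c hunks above all follow one pattern: each receive handler now fetches the inet6_dev once from skb->dev at entry and passes it to __IP6_INC_STATS(), instead of re-deriving it from the dst at every error path via ip6_dst_idev(). A minimal sketch of that pattern follows; it is illustrative only, the handler name is made up, and only calls already visible in the hunks above are used.

/* Sketch of the idev-caching pattern used above (example_exthdr_rcv is a
 * made-up name): grab the inet6_dev once from the receiving device and
 * reuse it for every SNMP counter bump on the error paths.
 */
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/addrconf.h>
#include <net/ipv6.h>

static int example_exthdr_rcv(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get(skb->dev);	/* cached once */
	struct net *net = dev_net(skb->dev);

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
		kfree_skb(skb);
		return -1;
	}

	return 1;
}
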
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index b643f5ce6c80..ae365df8abf7 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -161,7 +161,7 @@ EXPORT_SYMBOL_GPL(ipv6_find_tlv);
161 * if target < 0. "last header" is transport protocol header, ESP, or 161 * if target < 0. "last header" is transport protocol header, ESP, or
162 * "No next header". 162 * "No next header".
163 * 163 *
164 * Note that *offset is used as input/output parameter. an if it is not zero, 164 * Note that *offset is used as input/output parameter, and if it is not zero,
165 * then it must be a valid offset to an inner IPv6 header. This can be used 165 * then it must be a valid offset to an inner IPv6 header. This can be used
166 * to explore inner IPv6 header, eg. ICMPv6 error messages. 166 * to explore inner IPv6 header, eg. ICMPv6 error messages.
167 * 167 *
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index df113c7b5fc8..f590446595d8 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -60,6 +60,39 @@ unsigned int fib6_rules_seq_read(struct net *net)
60 return fib_rules_seq_read(net, AF_INET6); 60 return fib_rules_seq_read(net, AF_INET6);
61} 61}
62 62
63/* called with rcu lock held; no reference taken on fib6_info */
64struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
65 int flags)
66{
67 struct fib6_info *f6i;
68 int err;
69
70 if (net->ipv6.fib6_has_custom_rules) {
71 struct fib_lookup_arg arg = {
72 .lookup_ptr = fib6_table_lookup,
73 .lookup_data = &oif,
74 .flags = FIB_LOOKUP_NOREF,
75 };
76
77 l3mdev_update_flow(net, flowi6_to_flowi(fl6));
78
79 err = fib_rules_lookup(net->ipv6.fib6_rules_ops,
80 flowi6_to_flowi(fl6), flags, &arg);
81 if (err)
82 return ERR_PTR(err);
83
84 f6i = arg.result ? : net->ipv6.fib6_null_entry;
85 } else {
86 f6i = fib6_table_lookup(net, net->ipv6.fib6_local_tbl,
87 oif, fl6, flags);
88 if (!f6i || f6i == net->ipv6.fib6_null_entry)
89 f6i = fib6_table_lookup(net, net->ipv6.fib6_main_tbl,
90 oif, fl6, flags);
91 }
92
93 return f6i;
94}
95
63struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, 96struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
64 const struct sk_buff *skb, 97 const struct sk_buff *skb,
65 int flags, pol_lookup_t lookup) 98 int flags, pol_lookup_t lookup)
@@ -96,8 +129,73 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
96 return &net->ipv6.ip6_null_entry->dst; 129 return &net->ipv6.ip6_null_entry->dst;
97} 130}
98 131
99static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, 132static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
100 int flags, struct fib_lookup_arg *arg) 133 struct flowi6 *flp6, const struct net_device *dev)
134{
135 struct fib6_rule *r = (struct fib6_rule *)rule;
136
137 /* If we need to find a source address for this traffic,
138 * we check the result if it meets requirement of the rule.
139 */
140 if ((rule->flags & FIB_RULE_FIND_SADDR) &&
141 r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
142 struct in6_addr saddr;
143
144 if (ipv6_dev_get_saddr(net, dev, &flp6->daddr,
145 rt6_flags2srcprefs(flags), &saddr))
146 return -EAGAIN;
147
148 if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen))
149 return -EAGAIN;
150
151 flp6->saddr = saddr;
152 }
153
154 return 0;
155}
156
157static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
158 int flags, struct fib_lookup_arg *arg)
159{
160 struct flowi6 *flp6 = &flp->u.ip6;
161 struct net *net = rule->fr_net;
162 struct fib6_table *table;
163 struct fib6_info *f6i;
164 int err = -EAGAIN, *oif;
165 u32 tb_id;
166
167 switch (rule->action) {
168 case FR_ACT_TO_TBL:
169 break;
170 case FR_ACT_UNREACHABLE:
171 return -ENETUNREACH;
172 case FR_ACT_PROHIBIT:
173 return -EACCES;
174 case FR_ACT_BLACKHOLE:
175 default:
176 return -EINVAL;
177 }
178
179 tb_id = fib_rule_get_table(rule, arg);
180 table = fib6_get_table(net, tb_id);
181 if (!table)
182 return -EAGAIN;
183
184 oif = (int *)arg->lookup_data;
185 f6i = fib6_table_lookup(net, table, *oif, flp6, flags);
186 if (f6i != net->ipv6.fib6_null_entry) {
187 err = fib6_rule_saddr(net, rule, flags, flp6,
188 fib6_info_nh_dev(f6i));
189
190 if (likely(!err))
191 arg->result = f6i;
192 }
193
194 return err;
195}
196
197static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
198 int flags, struct fib_lookup_arg *arg)
101{ 199{
102 struct flowi6 *flp6 = &flp->u.ip6; 200 struct flowi6 *flp6 = &flp->u.ip6;
103 struct rt6_info *rt = NULL; 201 struct rt6_info *rt = NULL;
@@ -134,27 +232,12 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
134 232
135 rt = lookup(net, table, flp6, arg->lookup_data, flags); 233 rt = lookup(net, table, flp6, arg->lookup_data, flags);
136 if (rt != net->ipv6.ip6_null_entry) { 234 if (rt != net->ipv6.ip6_null_entry) {
137 struct fib6_rule *r = (struct fib6_rule *)rule; 235 err = fib6_rule_saddr(net, rule, flags, flp6,
138 236 ip6_dst_idev(&rt->dst)->dev);
139 /* 237
140 * If we need to find a source address for this traffic, 238 if (err == -EAGAIN)
141 * we check the result if it meets requirement of the rule. 239 goto again;
142 */ 240
143 if ((rule->flags & FIB_RULE_FIND_SADDR) &&
144 r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
145 struct in6_addr saddr;
146
147 if (ipv6_dev_get_saddr(net,
148 ip6_dst_idev(&rt->dst)->dev,
149 &flp6->daddr,
150 rt6_flags2srcprefs(flags),
151 &saddr))
152 goto again;
153 if (!ipv6_prefix_equal(&saddr, &r->src.addr,
154 r->src.plen))
155 goto again;
156 flp6->saddr = saddr;
157 }
158 err = rt->dst.error; 241 err = rt->dst.error;
159 if (err != -EAGAIN) 242 if (err != -EAGAIN)
160 goto out; 243 goto out;
@@ -172,6 +255,15 @@ out:
172 return err; 255 return err;
173} 256}
174 257
258static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
259 int flags, struct fib_lookup_arg *arg)
260{
261 if (arg->lookup_ptr == fib6_table_lookup)
262 return fib6_rule_action_alt(rule, flp, flags, arg);
263
264 return __fib6_rule_action(rule, flp, flags, arg);
265}
266
175static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) 267static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
176{ 268{
177 struct rt6_info *rt = (struct rt6_info *) arg->result; 269 struct rt6_info *rt = (struct rt6_info *) arg->result;
@@ -245,15 +337,18 @@ static const struct nla_policy fib6_rule_policy[FRA_MAX+1] = {
245 337
246static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, 338static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
247 struct fib_rule_hdr *frh, 339 struct fib_rule_hdr *frh,
248 struct nlattr **tb) 340 struct nlattr **tb,
341 struct netlink_ext_ack *extack)
249{ 342{
250 int err = -EINVAL; 343 int err = -EINVAL;
251 struct net *net = sock_net(skb->sk); 344 struct net *net = sock_net(skb->sk);
252 struct fib6_rule *rule6 = (struct fib6_rule *) rule; 345 struct fib6_rule *rule6 = (struct fib6_rule *) rule;
253 346
254 if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) { 347 if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) {
255 if (rule->table == RT6_TABLE_UNSPEC) 348 if (rule->table == RT6_TABLE_UNSPEC) {
349 NL_SET_ERR_MSG(extack, "Invalid table");
256 goto errout; 350 goto errout;
351 }
257 352
258 if (fib6_new_table(net, rule->table) == NULL) { 353 if (fib6_new_table(net, rule->table) == NULL) {
259 err = -ENOBUFS; 354 err = -ENOBUFS;
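
The fib6_rules.c hunks add a fib6_lookup() entry point that returns a fib6_info directly: with custom rules it goes through fib_rules_lookup() with fib6_table_lookup as the lookup_ptr, otherwise it tries the local table and then the main table. Per the comment above it, no reference is taken on the result, so a caller has to stay under rcu_read_lock(). A hedged usage sketch, with a made-up wrapper function:

/* Illustrative caller of the new fib6_lookup() (example_dst_is_reachable is
 * made up): the returned fib6_info is unreferenced and only valid inside the
 * rcu_read_lock() section.
 */
#include <linux/err.h>
#include <linux/rcupdate.h>
#include <net/flow.h>
#include <net/ip6_fib.h>

static bool example_dst_is_reachable(struct net *net, int oif,
				     const struct in6_addr *daddr)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct fib6_info *f6i;
	bool reachable;

	rcu_read_lock();
	f6i = fib6_lookup(net, oif, &fl6, 0);
	reachable = !IS_ERR_OR_NULL(f6i) &&
		    f6i != net->ipv6.fib6_null_entry;
	rcu_read_unlock();

	return reachable;
}
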
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 01372dd74e38..7aa4c41a3bd9 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -43,7 +43,7 @@ static struct kmem_cache *fib6_node_kmem __read_mostly;
43struct fib6_cleaner { 43struct fib6_cleaner {
44 struct fib6_walker w; 44 struct fib6_walker w;
45 struct net *net; 45 struct net *net;
46 int (*func)(struct rt6_info *, void *arg); 46 int (*func)(struct fib6_info *, void *arg);
47 int sernum; 47 int sernum;
48 void *arg; 48 void *arg;
49}; 49};
@@ -54,7 +54,7 @@ struct fib6_cleaner {
54#define FWS_INIT FWS_L 54#define FWS_INIT FWS_L
55#endif 55#endif
56 56
57static struct rt6_info *fib6_find_prefix(struct net *net, 57static struct fib6_info *fib6_find_prefix(struct net *net,
58 struct fib6_table *table, 58 struct fib6_table *table,
59 struct fib6_node *fn); 59 struct fib6_node *fn);
60static struct fib6_node *fib6_repair_tree(struct net *net, 60static struct fib6_node *fib6_repair_tree(struct net *net,
@@ -105,13 +105,12 @@ enum {
105 FIB6_NO_SERNUM_CHANGE = 0, 105 FIB6_NO_SERNUM_CHANGE = 0,
106}; 106};
107 107
108void fib6_update_sernum(struct rt6_info *rt) 108void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
109{ 109{
110 struct net *net = dev_net(rt->dst.dev);
111 struct fib6_node *fn; 110 struct fib6_node *fn;
112 111
113 fn = rcu_dereference_protected(rt->rt6i_node, 112 fn = rcu_dereference_protected(f6i->fib6_node,
114 lockdep_is_held(&rt->rt6i_table->tb6_lock)); 113 lockdep_is_held(&f6i->fib6_table->tb6_lock));
115 if (fn) 114 if (fn)
116 fn->fn_sernum = fib6_new_sernum(net); 115 fn->fn_sernum = fib6_new_sernum(net);
117} 116}
@@ -146,6 +145,69 @@ static __be32 addr_bit_set(const void *token, int fn_bit)
146 addr[fn_bit >> 5]; 145 addr[fn_bit >> 5];
147} 146}
148 147
148struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
149{
150 struct fib6_info *f6i;
151
152 f6i = kzalloc(sizeof(*f6i), gfp_flags);
153 if (!f6i)
154 return NULL;
155
156 f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
157 if (!f6i->rt6i_pcpu) {
158 kfree(f6i);
159 return NULL;
160 }
161
162 INIT_LIST_HEAD(&f6i->fib6_siblings);
163 f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
164
165 atomic_inc(&f6i->fib6_ref);
166
167 return f6i;
168}
169
170void fib6_info_destroy(struct fib6_info *f6i)
171{
172 struct rt6_exception_bucket *bucket;
173 struct dst_metrics *m;
174
175 WARN_ON(f6i->fib6_node);
176
177 bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
178 if (bucket) {
179 f6i->rt6i_exception_bucket = NULL;
180 kfree(bucket);
181 }
182
183 if (f6i->rt6i_pcpu) {
184 int cpu;
185
186 for_each_possible_cpu(cpu) {
187 struct rt6_info **ppcpu_rt;
188 struct rt6_info *pcpu_rt;
189
190 ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
191 pcpu_rt = *ppcpu_rt;
192 if (pcpu_rt) {
193 dst_dev_put(&pcpu_rt->dst);
194 dst_release(&pcpu_rt->dst);
195 *ppcpu_rt = NULL;
196 }
197 }
198 }
199
200 if (f6i->fib6_nh.nh_dev)
201 dev_put(f6i->fib6_nh.nh_dev);
202
203 m = f6i->fib6_metrics;
204 if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
205 kfree(m);
206
207 kfree(f6i);
208}
209EXPORT_SYMBOL_GPL(fib6_info_destroy);
210
149static struct fib6_node *node_alloc(struct net *net) 211static struct fib6_node *node_alloc(struct net *net)
150{ 212{
151 struct fib6_node *fn; 213 struct fib6_node *fn;
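
fib6_info_alloc() and fib6_info_destroy() above, together with the fib6_info_hold()/fib6_info_release() helpers used throughout the rest of this diff, give fib6 entries their own fib6_ref reference count, decoupled from dst refcounting. A lifetime sketch, assuming only what the hunks here show (the example functions are made up):

/* Lifetime sketch only: fib6_info_alloc() returns an entry with one
 * reference already held on fib6_ref; additional users take
 * fib6_info_hold(), and the final fib6_info_release() frees the entry
 * through fib6_info_destroy().
 */
#include <net/ip6_fib.h>

static struct fib6_info *example_new_entry(gfp_t gfp_flags)
{
	struct fib6_info *f6i;

	f6i = fib6_info_alloc(gfp_flags);
	if (!f6i)
		return NULL;

	return f6i;		/* caller owns the initial reference */
}

static void example_put_entry(struct fib6_info *f6i)
{
	fib6_info_release(f6i);	/* last put frees the entry */
}
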
@@ -176,28 +238,6 @@ static void node_free(struct net *net, struct fib6_node *fn)
176 net->ipv6.rt6_stats->fib_nodes--; 238 net->ipv6.rt6_stats->fib_nodes--;
177} 239}
178 240
179void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
180{
181 int cpu;
182
183 if (!non_pcpu_rt->rt6i_pcpu)
184 return;
185
186 for_each_possible_cpu(cpu) {
187 struct rt6_info **ppcpu_rt;
188 struct rt6_info *pcpu_rt;
189
190 ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu);
191 pcpu_rt = *ppcpu_rt;
192 if (pcpu_rt) {
193 dst_dev_put(&pcpu_rt->dst);
194 dst_release(&pcpu_rt->dst);
195 *ppcpu_rt = NULL;
196 }
197 }
198}
199EXPORT_SYMBOL_GPL(rt6_free_pcpu);
200
201static void fib6_free_table(struct fib6_table *table) 241static void fib6_free_table(struct fib6_table *table)
202{ 242{
203 inetpeer_invalidate_tree(&table->tb6_peers); 243 inetpeer_invalidate_tree(&table->tb6_peers);
@@ -232,7 +272,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
232 if (table) { 272 if (table) {
233 table->tb6_id = id; 273 table->tb6_id = id;
234 rcu_assign_pointer(table->tb6_root.leaf, 274 rcu_assign_pointer(table->tb6_root.leaf,
235 net->ipv6.ip6_null_entry); 275 net->ipv6.fib6_null_entry);
236 table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; 276 table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
237 inet_peer_base_init(&table->tb6_peers); 277 inet_peer_base_init(&table->tb6_peers);
238 } 278 }
@@ -314,6 +354,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
314 return &rt->dst; 354 return &rt->dst;
315} 355}
316 356
357/* called with rcu lock held; no reference taken on fib6_info */
358struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
359 int flags)
360{
361 return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags);
362}
363
317static void __net_init fib6_tables_init(struct net *net) 364static void __net_init fib6_tables_init(struct net *net)
318{ 365{
319 fib6_link_table(net, net->ipv6.fib6_main_tbl); 366 fib6_link_table(net, net->ipv6.fib6_main_tbl);
@@ -340,7 +387,7 @@ unsigned int fib6_tables_seq_read(struct net *net)
340 387
341static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, 388static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
342 enum fib_event_type event_type, 389 enum fib_event_type event_type,
343 struct rt6_info *rt) 390 struct fib6_info *rt)
344{ 391{
345 struct fib6_entry_notifier_info info = { 392 struct fib6_entry_notifier_info info = {
346 .rt = rt, 393 .rt = rt,
@@ -351,7 +398,7 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
351 398
352static int call_fib6_entry_notifiers(struct net *net, 399static int call_fib6_entry_notifiers(struct net *net,
353 enum fib_event_type event_type, 400 enum fib_event_type event_type,
354 struct rt6_info *rt, 401 struct fib6_info *rt,
355 struct netlink_ext_ack *extack) 402 struct netlink_ext_ack *extack)
356{ 403{
357 struct fib6_entry_notifier_info info = { 404 struct fib6_entry_notifier_info info = {
@@ -359,7 +406,7 @@ static int call_fib6_entry_notifiers(struct net *net,
359 .rt = rt, 406 .rt = rt,
360 }; 407 };
361 408
362 rt->rt6i_table->fib_seq++; 409 rt->fib6_table->fib_seq++;
363 return call_fib6_notifiers(net, event_type, &info.info); 410 return call_fib6_notifiers(net, event_type, &info.info);
364} 411}
365 412
@@ -368,16 +415,16 @@ struct fib6_dump_arg {
368 struct notifier_block *nb; 415 struct notifier_block *nb;
369}; 416};
370 417
371static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg) 418static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
372{ 419{
373 if (rt == arg->net->ipv6.ip6_null_entry) 420 if (rt == arg->net->ipv6.fib6_null_entry)
374 return; 421 return;
375 call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt); 422 call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt);
376} 423}
377 424
378static int fib6_node_dump(struct fib6_walker *w) 425static int fib6_node_dump(struct fib6_walker *w)
379{ 426{
380 struct rt6_info *rt; 427 struct fib6_info *rt;
381 428
382 for_each_fib6_walker_rt(w) 429 for_each_fib6_walker_rt(w)
383 fib6_rt_dump(rt, w->args); 430 fib6_rt_dump(rt, w->args);
@@ -426,7 +473,7 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb)
426static int fib6_dump_node(struct fib6_walker *w) 473static int fib6_dump_node(struct fib6_walker *w)
427{ 474{
428 int res; 475 int res;
429 struct rt6_info *rt; 476 struct fib6_info *rt;
430 477
431 for_each_fib6_walker_rt(w) { 478 for_each_fib6_walker_rt(w) {
432 res = rt6_dump_route(rt, w->args); 479 res = rt6_dump_route(rt, w->args);
@@ -441,10 +488,10 @@ static int fib6_dump_node(struct fib6_walker *w)
441 * last sibling of this route (no need to dump the 488 * last sibling of this route (no need to dump the
442 * sibling routes again) 489 * sibling routes again)
443 */ 490 */
444 if (rt->rt6i_nsiblings) 491 if (rt->fib6_nsiblings)
445 rt = list_last_entry(&rt->rt6i_siblings, 492 rt = list_last_entry(&rt->fib6_siblings,
446 struct rt6_info, 493 struct fib6_info,
447 rt6i_siblings); 494 fib6_siblings);
448 } 495 }
449 w->leaf = NULL; 496 w->leaf = NULL;
450 return 0; 497 return 0;
@@ -579,6 +626,24 @@ out:
579 return res; 626 return res;
580} 627}
581 628
629void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val)
630{
631 if (!f6i)
632 return;
633
634 if (f6i->fib6_metrics == &dst_default_metrics) {
635 struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC);
636
637 if (!p)
638 return;
639
640 refcount_set(&p->refcnt, 1);
641 f6i->fib6_metrics = p;
642 }
643
644 f6i->fib6_metrics->metrics[metric - 1] = val;
645}
646
582/* 647/*
583 * Routing Table 648 * Routing Table
584 * 649 *
@@ -608,7 +673,7 @@ static struct fib6_node *fib6_add_1(struct net *net,
608 fn = root; 673 fn = root;
609 674
610 do { 675 do {
611 struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, 676 struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
612 lockdep_is_held(&table->tb6_lock)); 677 lockdep_is_held(&table->tb6_lock));
613 key = (struct rt6key *)((u8 *)leaf + offset); 678 key = (struct rt6key *)((u8 *)leaf + offset);
614 679
@@ -637,11 +702,11 @@ static struct fib6_node *fib6_add_1(struct net *net,
637 /* clean up an intermediate node */ 702 /* clean up an intermediate node */
638 if (!(fn->fn_flags & RTN_RTINFO)) { 703 if (!(fn->fn_flags & RTN_RTINFO)) {
639 RCU_INIT_POINTER(fn->leaf, NULL); 704 RCU_INIT_POINTER(fn->leaf, NULL);
640 rt6_release(leaf); 705 fib6_info_release(leaf);
641 /* remove null_entry in the root node */ 706 /* remove null_entry in the root node */
642 } else if (fn->fn_flags & RTN_TL_ROOT && 707 } else if (fn->fn_flags & RTN_TL_ROOT &&
643 rcu_access_pointer(fn->leaf) == 708 rcu_access_pointer(fn->leaf) ==
644 net->ipv6.ip6_null_entry) { 709 net->ipv6.fib6_null_entry) {
645 RCU_INIT_POINTER(fn->leaf, NULL); 710 RCU_INIT_POINTER(fn->leaf, NULL);
646 } 711 }
647 712
@@ -750,7 +815,7 @@ insert_above:
750 RCU_INIT_POINTER(in->parent, pn); 815 RCU_INIT_POINTER(in->parent, pn);
751 in->leaf = fn->leaf; 816 in->leaf = fn->leaf;
752 atomic_inc(&rcu_dereference_protected(in->leaf, 817 atomic_inc(&rcu_dereference_protected(in->leaf,
753 lockdep_is_held(&table->tb6_lock))->rt6i_ref); 818 lockdep_is_held(&table->tb6_lock))->fib6_ref);
754 819
755 /* update parent pointer */ 820 /* update parent pointer */
756 if (dir) 821 if (dir)
@@ -802,44 +867,37 @@ insert_above:
802 return ln; 867 return ln;
803} 868}
804 869
805static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc) 870static void fib6_drop_pcpu_from(struct fib6_info *f6i,
806{ 871 const struct fib6_table *table)
807 int i;
808
809 for (i = 0; i < RTAX_MAX; i++) {
810 if (test_bit(i, mxc->mx_valid))
811 mp[i] = mxc->mx[i];
812 }
813}
814
815static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc)
816{ 872{
817 if (!mxc->mx) 873 int cpu;
818 return 0;
819
820 if (dst->flags & DST_HOST) {
821 u32 *mp = dst_metrics_write_ptr(dst);
822 874
823 if (unlikely(!mp)) 875 /* release the reference to this fib entry from
824 return -ENOMEM; 876 * all of its cached pcpu routes
877 */
878 for_each_possible_cpu(cpu) {
879 struct rt6_info **ppcpu_rt;
880 struct rt6_info *pcpu_rt;
825 881
826 fib6_copy_metrics(mp, mxc); 882 ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
827 } else { 883 pcpu_rt = *ppcpu_rt;
828 dst_init_metrics(dst, mxc->mx, false); 884 if (pcpu_rt) {
885 struct fib6_info *from;
829 886
830 /* We've stolen mx now. */ 887 from = rcu_dereference_protected(pcpu_rt->from,
831 mxc->mx = NULL; 888 lockdep_is_held(&table->tb6_lock));
889 rcu_assign_pointer(pcpu_rt->from, NULL);
890 fib6_info_release(from);
891 }
832 } 892 }
833
834 return 0;
835} 893}
836 894
837static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, 895static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
838 struct net *net) 896 struct net *net)
839{ 897{
840 struct fib6_table *table = rt->rt6i_table; 898 struct fib6_table *table = rt->fib6_table;
841 899
842 if (atomic_read(&rt->rt6i_ref) != 1) { 900 if (atomic_read(&rt->fib6_ref) != 1) {
843 /* This route is used as dummy address holder in some split 901 /* This route is used as dummy address holder in some split
844 * nodes. It is not leaked, but it still holds other resources, 902 * nodes. It is not leaked, but it still holds other resources,
845 * which must be released in time. So, scan ascendant nodes 903 * which must be released in time. So, scan ascendant nodes
@@ -847,18 +905,22 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
847 * to still alive ones. 905 * to still alive ones.
848 */ 906 */
849 while (fn) { 907 while (fn) {
850 struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, 908 struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
851 lockdep_is_held(&table->tb6_lock)); 909 lockdep_is_held(&table->tb6_lock));
852 struct rt6_info *new_leaf; 910 struct fib6_info *new_leaf;
853 if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { 911 if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
854 new_leaf = fib6_find_prefix(net, table, fn); 912 new_leaf = fib6_find_prefix(net, table, fn);
855 atomic_inc(&new_leaf->rt6i_ref); 913 atomic_inc(&new_leaf->fib6_ref);
914
856 rcu_assign_pointer(fn->leaf, new_leaf); 915 rcu_assign_pointer(fn->leaf, new_leaf);
857 rt6_release(rt); 916 fib6_info_release(rt);
858 } 917 }
859 fn = rcu_dereference_protected(fn->parent, 918 fn = rcu_dereference_protected(fn->parent,
860 lockdep_is_held(&table->tb6_lock)); 919 lockdep_is_held(&table->tb6_lock));
861 } 920 }
921
922 if (rt->rt6i_pcpu)
923 fib6_drop_pcpu_from(rt, table);
862 } 924 }
863} 925}
864 926
@@ -866,37 +928,37 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
866 * Insert routing information in a node. 928 * Insert routing information in a node.
867 */ 929 */
868 930
869static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, 931static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
870 struct nl_info *info, struct mx6_config *mxc, 932 struct nl_info *info,
871 struct netlink_ext_ack *extack) 933 struct netlink_ext_ack *extack)
872{ 934{
873 struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, 935 struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
874 lockdep_is_held(&rt->rt6i_table->tb6_lock)); 936 lockdep_is_held(&rt->fib6_table->tb6_lock));
875 struct rt6_info *iter = NULL; 937 struct fib6_info *iter = NULL, *match = NULL;
876 struct rt6_info __rcu **ins; 938 struct fib6_info __rcu **ins;
877 struct rt6_info __rcu **fallback_ins = NULL;
878 int replace = (info->nlh && 939 int replace = (info->nlh &&
879 (info->nlh->nlmsg_flags & NLM_F_REPLACE)); 940 (info->nlh->nlmsg_flags & NLM_F_REPLACE));
941 int append = (info->nlh &&
942 (info->nlh->nlmsg_flags & NLM_F_APPEND));
880 int add = (!info->nlh || 943 int add = (!info->nlh ||
881 (info->nlh->nlmsg_flags & NLM_F_CREATE)); 944 (info->nlh->nlmsg_flags & NLM_F_CREATE));
882 int found = 0; 945 int found = 0;
883 bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
884 u16 nlflags = NLM_F_EXCL; 946 u16 nlflags = NLM_F_EXCL;
885 int err; 947 int err;
886 948
887 if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) 949 if (append)
888 nlflags |= NLM_F_APPEND; 950 nlflags |= NLM_F_APPEND;
889 951
890 ins = &fn->leaf; 952 ins = &fn->leaf;
891 953
892 for (iter = leaf; iter; 954 for (iter = leaf; iter;
893 iter = rcu_dereference_protected(iter->rt6_next, 955 iter = rcu_dereference_protected(iter->fib6_next,
894 lockdep_is_held(&rt->rt6i_table->tb6_lock))) { 956 lockdep_is_held(&rt->fib6_table->tb6_lock))) {
895 /* 957 /*
896 * Search for duplicates 958 * Search for duplicates
897 */ 959 */
898 960
899 if (iter->rt6i_metric == rt->rt6i_metric) { 961 if (iter->fib6_metric == rt->fib6_metric) {
900 /* 962 /*
901 * Same priority level 963 * Same priority level
902 */ 964 */
@@ -906,56 +968,32 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
906 968
907 nlflags &= ~NLM_F_EXCL; 969 nlflags &= ~NLM_F_EXCL;
908 if (replace) { 970 if (replace) {
909 if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { 971 found++;
910 found++; 972 break;
911 break;
912 }
913 if (rt_can_ecmp)
914 fallback_ins = fallback_ins ?: ins;
915 goto next_iter;
916 } 973 }
917 974
918 if (rt6_duplicate_nexthop(iter, rt)) { 975 if (rt6_duplicate_nexthop(iter, rt)) {
919 if (rt->rt6i_nsiblings) 976 if (rt->fib6_nsiblings)
920 rt->rt6i_nsiblings = 0; 977 rt->fib6_nsiblings = 0;
921 if (!(iter->rt6i_flags & RTF_EXPIRES)) 978 if (!(iter->fib6_flags & RTF_EXPIRES))
922 return -EEXIST; 979 return -EEXIST;
923 if (!(rt->rt6i_flags & RTF_EXPIRES)) 980 if (!(rt->fib6_flags & RTF_EXPIRES))
924 rt6_clean_expires(iter); 981 fib6_clean_expires(iter);
925 else 982 else
926 rt6_set_expires(iter, rt->dst.expires); 983 fib6_set_expires(iter, rt->expires);
927 iter->rt6i_pmtu = rt->rt6i_pmtu; 984 fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu);
928 return -EEXIST; 985 return -EEXIST;
929 } 986 }
930 /* If we have the same destination and the same metric, 987
931 * but not the same gateway, then the route we try to 988 /* first route that matches */
932 * add is sibling to this route, increment our counter 989 if (!match)
933 * of siblings, and later we will add our route to the 990 match = iter;
934 * list.
935 * Only static routes (which don't have flag
936 * RTF_EXPIRES) are used for ECMPv6.
937 *
938 * To avoid long list, we only had siblings if the
939 * route have a gateway.
940 */
941 if (rt_can_ecmp &&
942 rt6_qualify_for_ecmp(iter))
943 rt->rt6i_nsiblings++;
944 } 991 }
945 992
946 if (iter->rt6i_metric > rt->rt6i_metric) 993 if (iter->fib6_metric > rt->fib6_metric)
947 break; 994 break;
948 995
949next_iter: 996 ins = &iter->fib6_next;
950 ins = &iter->rt6_next;
951 }
952
953 if (fallback_ins && !found) {
954 /* No ECMP-able route found, replace first non-ECMP one */
955 ins = fallback_ins;
956 iter = rcu_dereference_protected(*ins,
957 lockdep_is_held(&rt->rt6i_table->tb6_lock));
958 found++;
959 } 997 }
960 998
961 /* Reset round-robin state, if necessary */ 999 /* Reset round-robin state, if necessary */
@@ -963,59 +1001,56 @@ next_iter:
963 fn->rr_ptr = NULL; 1001 fn->rr_ptr = NULL;
964 1002
965 /* Link this route to others same route. */ 1003 /* Link this route to others same route. */
966 if (rt->rt6i_nsiblings) { 1004 if (append && match) {
967 unsigned int rt6i_nsiblings; 1005 struct fib6_info *sibling, *temp_sibling;
968 struct rt6_info *sibling, *temp_sibling; 1006
969 1007 if (rt->fib6_flags & RTF_REJECT) {
970 /* Find the first route that have the same metric */ 1008 NL_SET_ERR_MSG(extack,
971 sibling = leaf; 1009 "Can not append a REJECT route");
972 while (sibling) { 1010 return -EINVAL;
973 if (sibling->rt6i_metric == rt->rt6i_metric && 1011 } else if (match->fib6_flags & RTF_REJECT) {
974 rt6_qualify_for_ecmp(sibling)) { 1012 NL_SET_ERR_MSG(extack,
975 list_add_tail(&rt->rt6i_siblings, 1013 "Can not append to a REJECT route");
976 &sibling->rt6i_siblings); 1014 return -EINVAL;
977 break;
978 }
979 sibling = rcu_dereference_protected(sibling->rt6_next,
980 lockdep_is_held(&rt->rt6i_table->tb6_lock));
981 } 1015 }
1016 rt->fib6_nsiblings = match->fib6_nsiblings;
1017 list_add_tail(&rt->fib6_siblings, &match->fib6_siblings);
1018 match->fib6_nsiblings++;
1019
982 /* For each sibling in the list, increment the counter of 1020 /* For each sibling in the list, increment the counter of
983 * siblings. BUG() if counters does not match, list of siblings 1021 * siblings. BUG() if counters does not match, list of siblings
984 * is broken! 1022 * is broken!
985 */ 1023 */
986 rt6i_nsiblings = 0;
987 list_for_each_entry_safe(sibling, temp_sibling, 1024 list_for_each_entry_safe(sibling, temp_sibling,
988 &rt->rt6i_siblings, rt6i_siblings) { 1025 &match->fib6_siblings, fib6_siblings) {
989 sibling->rt6i_nsiblings++; 1026 sibling->fib6_nsiblings++;
990 BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings); 1027 BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings);
991 rt6i_nsiblings++;
992 } 1028 }
993 BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings); 1029
994 rt6_multipath_rebalance(temp_sibling); 1030 rt6_multipath_rebalance(match);
995 } 1031 }
996 1032
997 /* 1033 /*
998 * insert node 1034 * insert node
999 */ 1035 */
1000 if (!replace) { 1036 if (!replace) {
1037 enum fib_event_type event;
1038
1001 if (!add) 1039 if (!add)
1002 pr_warn("NLM_F_CREATE should be set when creating new route\n"); 1040 pr_warn("NLM_F_CREATE should be set when creating new route\n");
1003 1041
1004add: 1042add:
1005 nlflags |= NLM_F_CREATE; 1043 nlflags |= NLM_F_CREATE;
1006 err = fib6_commit_metrics(&rt->dst, mxc);
1007 if (err)
1008 return err;
1009 1044
1010 err = call_fib6_entry_notifiers(info->nl_net, 1045 event = append ? FIB_EVENT_ENTRY_APPEND : FIB_EVENT_ENTRY_ADD;
1011 FIB_EVENT_ENTRY_ADD, 1046 err = call_fib6_entry_notifiers(info->nl_net, event, rt,
1012 rt, extack); 1047 extack);
1013 if (err) 1048 if (err)
1014 return err; 1049 return err;
1015 1050
1016 rcu_assign_pointer(rt->rt6_next, iter); 1051 rcu_assign_pointer(rt->fib6_next, iter);
1017 atomic_inc(&rt->rt6i_ref); 1052 atomic_inc(&rt->fib6_ref);
1018 rcu_assign_pointer(rt->rt6i_node, fn); 1053 rcu_assign_pointer(rt->fib6_node, fn);
1019 rcu_assign_pointer(*ins, rt); 1054 rcu_assign_pointer(*ins, rt);
1020 if (!info->skip_notify) 1055 if (!info->skip_notify)
1021 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 1056 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
@@ -1027,7 +1062,7 @@ add:
1027 } 1062 }
1028 1063
1029 } else { 1064 } else {
1030 int nsiblings; 1065 struct fib6_info *tmp;
1031 1066
1032 if (!found) { 1067 if (!found) {
1033 if (add) 1068 if (add)
@@ -1036,67 +1071,72 @@ add:
1036 return -ENOENT; 1071 return -ENOENT;
1037 } 1072 }
1038 1073
1039 err = fib6_commit_metrics(&rt->dst, mxc);
1040 if (err)
1041 return err;
1042
1043 err = call_fib6_entry_notifiers(info->nl_net, 1074 err = call_fib6_entry_notifiers(info->nl_net,
1044 FIB_EVENT_ENTRY_REPLACE, 1075 FIB_EVENT_ENTRY_REPLACE,
1045 rt, extack); 1076 rt, extack);
1046 if (err) 1077 if (err)
1047 return err; 1078 return err;
1048 1079
1049 atomic_inc(&rt->rt6i_ref); 1080 /* if route being replaced has siblings, set tmp to
1050 rcu_assign_pointer(rt->rt6i_node, fn); 1081 * last one, otherwise tmp is current route. this is
1051 rt->rt6_next = iter->rt6_next; 1082 * used to set fib6_next for new route
1083 */
1084 if (iter->fib6_nsiblings)
1085 tmp = list_last_entry(&iter->fib6_siblings,
1086 struct fib6_info,
1087 fib6_siblings);
1088 else
1089 tmp = iter;
1090
1091 /* insert new route */
1092 atomic_inc(&rt->fib6_ref);
1093 rcu_assign_pointer(rt->fib6_node, fn);
1094 rt->fib6_next = tmp->fib6_next;
1052 rcu_assign_pointer(*ins, rt); 1095 rcu_assign_pointer(*ins, rt);
1096
1053 if (!info->skip_notify) 1097 if (!info->skip_notify)
1054 inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); 1098 inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
1055 if (!(fn->fn_flags & RTN_RTINFO)) { 1099 if (!(fn->fn_flags & RTN_RTINFO)) {
1056 info->nl_net->ipv6.rt6_stats->fib_route_nodes++; 1100 info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
1057 fn->fn_flags |= RTN_RTINFO; 1101 fn->fn_flags |= RTN_RTINFO;
1058 } 1102 }
1059 nsiblings = iter->rt6i_nsiblings;
1060 iter->rt6i_node = NULL;
1061 fib6_purge_rt(iter, fn, info->nl_net);
1062 if (rcu_access_pointer(fn->rr_ptr) == iter)
1063 fn->rr_ptr = NULL;
1064 rt6_release(iter);
1065 1103
1066 if (nsiblings) { 1104 /* delete old route */
1105 rt = iter;
1106
1107 if (rt->fib6_nsiblings) {
1108 struct fib6_info *tmp;
1109
1067 /* Replacing an ECMP route, remove all siblings */ 1110 /* Replacing an ECMP route, remove all siblings */
1068 ins = &rt->rt6_next; 1111 list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings,
1069 iter = rcu_dereference_protected(*ins, 1112 fib6_siblings) {
1070 lockdep_is_held(&rt->rt6i_table->tb6_lock)); 1113 iter->fib6_node = NULL;
1071 while (iter) { 1114 fib6_purge_rt(iter, fn, info->nl_net);
1072 if (iter->rt6i_metric > rt->rt6i_metric) 1115 if (rcu_access_pointer(fn->rr_ptr) == iter)
1073 break; 1116 fn->rr_ptr = NULL;
1074 if (rt6_qualify_for_ecmp(iter)) { 1117 fib6_info_release(iter);
1075 *ins = iter->rt6_next; 1118
1076 iter->rt6i_node = NULL; 1119 rt->fib6_nsiblings--;
1077 fib6_purge_rt(iter, fn, info->nl_net); 1120 info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
1078 if (rcu_access_pointer(fn->rr_ptr) == iter)
1079 fn->rr_ptr = NULL;
1080 rt6_release(iter);
1081 nsiblings--;
1082 info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
1083 } else {
1084 ins = &iter->rt6_next;
1085 }
1086 iter = rcu_dereference_protected(*ins,
1087 lockdep_is_held(&rt->rt6i_table->tb6_lock));
1088 } 1121 }
1089 WARN_ON(nsiblings != 0);
1090 } 1122 }
1123
1124 WARN_ON(rt->fib6_nsiblings != 0);
1125
1126 rt->fib6_node = NULL;
1127 fib6_purge_rt(rt, fn, info->nl_net);
1128 if (rcu_access_pointer(fn->rr_ptr) == rt)
1129 fn->rr_ptr = NULL;
1130 fib6_info_release(rt);
1091 } 1131 }
1092 1132
1093 return 0; 1133 return 0;
1094} 1134}
1095 1135
1096static void fib6_start_gc(struct net *net, struct rt6_info *rt) 1136static void fib6_start_gc(struct net *net, struct fib6_info *rt)
1097{ 1137{
1098 if (!timer_pending(&net->ipv6.ip6_fib_timer) && 1138 if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
1099 (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE))) 1139 (rt->fib6_flags & RTF_EXPIRES))
1100 mod_timer(&net->ipv6.ip6_fib_timer, 1140 mod_timer(&net->ipv6.ip6_fib_timer,
1101 jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); 1141 jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
1102} 1142}
@@ -1108,22 +1148,22 @@ void fib6_force_start_gc(struct net *net)
1108 jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); 1148 jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
1109} 1149}
1110 1150
1111static void __fib6_update_sernum_upto_root(struct rt6_info *rt, 1151static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
1112 int sernum) 1152 int sernum)
1113{ 1153{
1114 struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, 1154 struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
1115 lockdep_is_held(&rt->rt6i_table->tb6_lock)); 1155 lockdep_is_held(&rt->fib6_table->tb6_lock));
1116 1156
1117 /* paired with smp_rmb() in rt6_get_cookie_safe() */ 1157 /* paired with smp_rmb() in rt6_get_cookie_safe() */
1118 smp_wmb(); 1158 smp_wmb();
1119 while (fn) { 1159 while (fn) {
1120 fn->fn_sernum = sernum; 1160 fn->fn_sernum = sernum;
1121 fn = rcu_dereference_protected(fn->parent, 1161 fn = rcu_dereference_protected(fn->parent,
1122 lockdep_is_held(&rt->rt6i_table->tb6_lock)); 1162 lockdep_is_held(&rt->fib6_table->tb6_lock));
1123 } 1163 }
1124} 1164}
1125 1165
1126void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt) 1166void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
1127{ 1167{
1128 __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); 1168 __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net));
1129} 1169}
@@ -1135,22 +1175,16 @@ void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt)
1135 * Need to own table->tb6_lock 1175 * Need to own table->tb6_lock
1136 */ 1176 */
1137 1177
1138int fib6_add(struct fib6_node *root, struct rt6_info *rt, 1178int fib6_add(struct fib6_node *root, struct fib6_info *rt,
1139 struct nl_info *info, struct mx6_config *mxc, 1179 struct nl_info *info, struct netlink_ext_ack *extack)
1140 struct netlink_ext_ack *extack)
1141{ 1180{
1142 struct fib6_table *table = rt->rt6i_table; 1181 struct fib6_table *table = rt->fib6_table;
1143 struct fib6_node *fn, *pn = NULL; 1182 struct fib6_node *fn, *pn = NULL;
1144 int err = -ENOMEM; 1183 int err = -ENOMEM;
1145 int allow_create = 1; 1184 int allow_create = 1;
1146 int replace_required = 0; 1185 int replace_required = 0;
1147 int sernum = fib6_new_sernum(info->nl_net); 1186 int sernum = fib6_new_sernum(info->nl_net);
1148 1187
1149 if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt)))
1150 return -EINVAL;
1151 if (WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE))
1152 return -EINVAL;
1153
1154 if (info->nlh) { 1188 if (info->nlh) {
1155 if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) 1189 if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
1156 allow_create = 0; 1190 allow_create = 0;
@@ -1161,8 +1195,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
1161 pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); 1195 pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
1162 1196
1163 fn = fib6_add_1(info->nl_net, table, root, 1197 fn = fib6_add_1(info->nl_net, table, root,
1164 &rt->rt6i_dst.addr, rt->rt6i_dst.plen, 1198 &rt->fib6_dst.addr, rt->fib6_dst.plen,
1165 offsetof(struct rt6_info, rt6i_dst), allow_create, 1199 offsetof(struct fib6_info, fib6_dst), allow_create,
1166 replace_required, extack); 1200 replace_required, extack);
1167 if (IS_ERR(fn)) { 1201 if (IS_ERR(fn)) {
1168 err = PTR_ERR(fn); 1202 err = PTR_ERR(fn);
@@ -1173,7 +1207,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
1173 pn = fn; 1207 pn = fn;
1174 1208
1175#ifdef CONFIG_IPV6_SUBTREES 1209#ifdef CONFIG_IPV6_SUBTREES
1176 if (rt->rt6i_src.plen) { 1210 if (rt->fib6_src.plen) {
1177 struct fib6_node *sn; 1211 struct fib6_node *sn;
1178 1212
1179 if (!rcu_access_pointer(fn->subtree)) { 1213 if (!rcu_access_pointer(fn->subtree)) {
@@ -1194,16 +1228,16 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
1194 if (!sfn) 1228 if (!sfn)
1195 goto failure; 1229 goto failure;
1196 1230
1197 atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); 1231 atomic_inc(&info->nl_net->ipv6.fib6_null_entry->fib6_ref);
1198 rcu_assign_pointer(sfn->leaf, 1232 rcu_assign_pointer(sfn->leaf,
1199 info->nl_net->ipv6.ip6_null_entry); 1233 info->nl_net->ipv6.fib6_null_entry);
1200 sfn->fn_flags = RTN_ROOT; 1234 sfn->fn_flags = RTN_ROOT;
1201 1235
1202 /* Now add the first leaf node to new subtree */ 1236 /* Now add the first leaf node to new subtree */
1203 1237
1204 sn = fib6_add_1(info->nl_net, table, sfn, 1238 sn = fib6_add_1(info->nl_net, table, sfn,
1205 &rt->rt6i_src.addr, rt->rt6i_src.plen, 1239 &rt->fib6_src.addr, rt->fib6_src.plen,
1206 offsetof(struct rt6_info, rt6i_src), 1240 offsetof(struct fib6_info, fib6_src),
1207 allow_create, replace_required, extack); 1241 allow_create, replace_required, extack);
1208 1242
1209 if (IS_ERR(sn)) { 1243 if (IS_ERR(sn)) {
@@ -1221,8 +1255,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
1221 rcu_assign_pointer(fn->subtree, sfn); 1255 rcu_assign_pointer(fn->subtree, sfn);
1222 } else { 1256 } else {
1223 sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), 1257 sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
1224 &rt->rt6i_src.addr, rt->rt6i_src.plen, 1258 &rt->fib6_src.addr, rt->fib6_src.plen,
1225 offsetof(struct rt6_info, rt6i_src), 1259 offsetof(struct fib6_info, fib6_src),
1226 allow_create, replace_required, extack); 1260 allow_create, replace_required, extack);
1227 1261
1228 if (IS_ERR(sn)) { 1262 if (IS_ERR(sn)) {
@@ -1235,9 +1269,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
1235 if (fn->fn_flags & RTN_TL_ROOT) { 1269 if (fn->fn_flags & RTN_TL_ROOT) {
1236 /* put back null_entry for root node */ 1270 /* put back null_entry for root node */
1237 rcu_assign_pointer(fn->leaf, 1271 rcu_assign_pointer(fn->leaf,
1238 info->nl_net->ipv6.ip6_null_entry); 1272 info->nl_net->ipv6.fib6_null_entry);
1239 } else { 1273 } else {
1240 atomic_inc(&rt->rt6i_ref); 1274 atomic_inc(&rt->fib6_ref);
1241 rcu_assign_pointer(fn->leaf, rt); 1275 rcu_assign_pointer(fn->leaf, rt);
1242 } 1276 }
1243 } 1277 }
@@ -1245,7 +1279,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
1245 } 1279 }
1246#endif 1280#endif
1247 1281
1248 err = fib6_add_rt2node(fn, rt, info, mxc, extack); 1282 err = fib6_add_rt2node(fn, rt, info, extack);
1249 if (!err) { 1283 if (!err) {
1250 __fib6_update_sernum_upto_root(rt, sernum); 1284 __fib6_update_sernum_upto_root(rt, sernum);
1251 fib6_start_gc(info->nl_net, rt); 1285 fib6_start_gc(info->nl_net, rt);
@@ -1259,13 +1293,13 @@ out:
1259 * super-tree leaf node we have to find a new one for it. 1293 * super-tree leaf node we have to find a new one for it.
1260 */ 1294 */
1261 if (pn != fn) { 1295 if (pn != fn) {
1262 struct rt6_info *pn_leaf = 1296 struct fib6_info *pn_leaf =
1263 rcu_dereference_protected(pn->leaf, 1297 rcu_dereference_protected(pn->leaf,
1264 lockdep_is_held(&table->tb6_lock)); 1298 lockdep_is_held(&table->tb6_lock));
1265 if (pn_leaf == rt) { 1299 if (pn_leaf == rt) {
1266 pn_leaf = NULL; 1300 pn_leaf = NULL;
1267 RCU_INIT_POINTER(pn->leaf, NULL); 1301 RCU_INIT_POINTER(pn->leaf, NULL);
1268 atomic_dec(&rt->rt6i_ref); 1302 fib6_info_release(rt);
1269 } 1303 }
1270 if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { 1304 if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
1271 pn_leaf = fib6_find_prefix(info->nl_net, table, 1305 pn_leaf = fib6_find_prefix(info->nl_net, table,
@@ -1274,10 +1308,10 @@ out:
1274 if (!pn_leaf) { 1308 if (!pn_leaf) {
1275 WARN_ON(!pn_leaf); 1309 WARN_ON(!pn_leaf);
1276 pn_leaf = 1310 pn_leaf =
1277 info->nl_net->ipv6.ip6_null_entry; 1311 info->nl_net->ipv6.fib6_null_entry;
1278 } 1312 }
1279#endif 1313#endif
1280 atomic_inc(&pn_leaf->rt6i_ref); 1314 fib6_info_hold(pn_leaf);
1281 rcu_assign_pointer(pn->leaf, pn_leaf); 1315 rcu_assign_pointer(pn->leaf, pn_leaf);
1282 } 1316 }
1283 } 1317 }
@@ -1299,10 +1333,6 @@ failure:
1299 (fn->fn_flags & RTN_TL_ROOT && 1333 (fn->fn_flags & RTN_TL_ROOT &&
1300 !rcu_access_pointer(fn->leaf)))) 1334 !rcu_access_pointer(fn->leaf))))
1301 fib6_repair_tree(info->nl_net, table, fn); 1335 fib6_repair_tree(info->nl_net, table, fn);
1302 /* Always release dst as dst->__refcnt is guaranteed
1303 * to be taken before entering this function
1304 */
1305 dst_release_immediate(&rt->dst);
1306 return err; 1336 return err;
1307} 1337}
1308 1338
@@ -1312,12 +1342,12 @@ failure:
1312 */ 1342 */
1313 1343
1314struct lookup_args { 1344struct lookup_args {
1315 int offset; /* key offset on rt6_info */ 1345 int offset; /* key offset on fib6_info */
1316 const struct in6_addr *addr; /* search key */ 1346 const struct in6_addr *addr; /* search key */
1317}; 1347};
1318 1348
1319static struct fib6_node *fib6_lookup_1(struct fib6_node *root, 1349static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
1320 struct lookup_args *args) 1350 struct lookup_args *args)
1321{ 1351{
1322 struct fib6_node *fn; 1352 struct fib6_node *fn;
1323 __be32 dir; 1353 __be32 dir;
@@ -1350,7 +1380,7 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
1350 struct fib6_node *subtree = FIB6_SUBTREE(fn); 1380 struct fib6_node *subtree = FIB6_SUBTREE(fn);
1351 1381
1352 if (subtree || fn->fn_flags & RTN_RTINFO) { 1382 if (subtree || fn->fn_flags & RTN_RTINFO) {
1353 struct rt6_info *leaf = rcu_dereference(fn->leaf); 1383 struct fib6_info *leaf = rcu_dereference(fn->leaf);
1354 struct rt6key *key; 1384 struct rt6key *key;
1355 1385
1356 if (!leaf) 1386 if (!leaf)
@@ -1362,7 +1392,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
1362#ifdef CONFIG_IPV6_SUBTREES 1392#ifdef CONFIG_IPV6_SUBTREES
1363 if (subtree) { 1393 if (subtree) {
1364 struct fib6_node *sfn; 1394 struct fib6_node *sfn;
1365 sfn = fib6_lookup_1(subtree, args + 1); 1395 sfn = fib6_node_lookup_1(subtree,
1396 args + 1);
1366 if (!sfn) 1397 if (!sfn)
1367 goto backtrack; 1398 goto backtrack;
1368 fn = sfn; 1399 fn = sfn;
@@ -1384,18 +1415,19 @@ backtrack:
1384 1415
1385/* called with rcu_read_lock() held 1416/* called with rcu_read_lock() held
1386 */ 1417 */
1387struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr, 1418struct fib6_node *fib6_node_lookup(struct fib6_node *root,
1388 const struct in6_addr *saddr) 1419 const struct in6_addr *daddr,
1420 const struct in6_addr *saddr)
1389{ 1421{
1390 struct fib6_node *fn; 1422 struct fib6_node *fn;
1391 struct lookup_args args[] = { 1423 struct lookup_args args[] = {
1392 { 1424 {
1393 .offset = offsetof(struct rt6_info, rt6i_dst), 1425 .offset = offsetof(struct fib6_info, fib6_dst),
1394 .addr = daddr, 1426 .addr = daddr,
1395 }, 1427 },
1396#ifdef CONFIG_IPV6_SUBTREES 1428#ifdef CONFIG_IPV6_SUBTREES
1397 { 1429 {
1398 .offset = offsetof(struct rt6_info, rt6i_src), 1430 .offset = offsetof(struct fib6_info, fib6_src),
1399 .addr = saddr, 1431 .addr = saddr,
1400 }, 1432 },
1401#endif 1433#endif
@@ -1404,7 +1436,7 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad
1404 } 1436 }
1405 }; 1437 };
1406 1438
1407 fn = fib6_lookup_1(root, daddr ? args : args + 1); 1439 fn = fib6_node_lookup_1(root, daddr ? args : args + 1);
1408 if (!fn || fn->fn_flags & RTN_TL_ROOT) 1440 if (!fn || fn->fn_flags & RTN_TL_ROOT)
1409 fn = root; 1441 fn = root;
1410 1442
@@ -1431,7 +1463,7 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root,
1431 struct fib6_node *fn, *prev = NULL; 1463 struct fib6_node *fn, *prev = NULL;
1432 1464
1433 for (fn = root; fn ; ) { 1465 for (fn = root; fn ; ) {
1434 struct rt6_info *leaf = rcu_dereference(fn->leaf); 1466 struct fib6_info *leaf = rcu_dereference(fn->leaf);
1435 struct rt6key *key; 1467 struct rt6key *key;
1436 1468
1437 /* This node is being deleted */ 1469 /* This node is being deleted */
@@ -1480,7 +1512,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
1480 struct fib6_node *fn; 1512 struct fib6_node *fn;
1481 1513
1482 fn = fib6_locate_1(root, daddr, dst_len, 1514 fn = fib6_locate_1(root, daddr, dst_len,
1483 offsetof(struct rt6_info, rt6i_dst), 1515 offsetof(struct fib6_info, fib6_dst),
1484 exact_match); 1516 exact_match);
1485 1517
1486#ifdef CONFIG_IPV6_SUBTREES 1518#ifdef CONFIG_IPV6_SUBTREES
@@ -1491,7 +1523,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
1491 1523
1492 if (subtree) { 1524 if (subtree) {
1493 fn = fib6_locate_1(subtree, saddr, src_len, 1525 fn = fib6_locate_1(subtree, saddr, src_len,
1494 offsetof(struct rt6_info, rt6i_src), 1526 offsetof(struct fib6_info, fib6_src),
1495 exact_match); 1527 exact_match);
1496 } 1528 }
1497 } 1529 }
@@ -1510,14 +1542,14 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
1510 * 1542 *
1511 */ 1543 */
1512 1544
1513static struct rt6_info *fib6_find_prefix(struct net *net, 1545static struct fib6_info *fib6_find_prefix(struct net *net,
1514 struct fib6_table *table, 1546 struct fib6_table *table,
1515 struct fib6_node *fn) 1547 struct fib6_node *fn)
1516{ 1548{
1517 struct fib6_node *child_left, *child_right; 1549 struct fib6_node *child_left, *child_right;
1518 1550
1519 if (fn->fn_flags & RTN_ROOT) 1551 if (fn->fn_flags & RTN_ROOT)
1520 return net->ipv6.ip6_null_entry; 1552 return net->ipv6.fib6_null_entry;
1521 1553
1522 while (fn) { 1554 while (fn) {
1523 child_left = rcu_dereference_protected(fn->left, 1555 child_left = rcu_dereference_protected(fn->left,
@@ -1554,7 +1586,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
1554 1586
1555 /* Set fn->leaf to null_entry for root node. */ 1587 /* Set fn->leaf to null_entry for root node. */
1556 if (fn->fn_flags & RTN_TL_ROOT) { 1588 if (fn->fn_flags & RTN_TL_ROOT) {
1557 rcu_assign_pointer(fn->leaf, net->ipv6.ip6_null_entry); 1589 rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry);
1558 return fn; 1590 return fn;
1559 } 1591 }
1560 1592
@@ -1569,11 +1601,11 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
1569 lockdep_is_held(&table->tb6_lock)); 1601 lockdep_is_held(&table->tb6_lock));
1570 struct fib6_node *pn_l = rcu_dereference_protected(pn->left, 1602 struct fib6_node *pn_l = rcu_dereference_protected(pn->left,
1571 lockdep_is_held(&table->tb6_lock)); 1603 lockdep_is_held(&table->tb6_lock));
1572 struct rt6_info *fn_leaf = rcu_dereference_protected(fn->leaf, 1604 struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
1573 lockdep_is_held(&table->tb6_lock)); 1605 lockdep_is_held(&table->tb6_lock));
1574 struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, 1606 struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
1575 lockdep_is_held(&table->tb6_lock)); 1607 lockdep_is_held(&table->tb6_lock));
1576 struct rt6_info *new_fn_leaf; 1608 struct fib6_info *new_fn_leaf;
1577 1609
1578 RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); 1610 RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
1579 iter++; 1611 iter++;
@@ -1599,10 +1631,10 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
1599#if RT6_DEBUG >= 2 1631#if RT6_DEBUG >= 2
1600 if (!new_fn_leaf) { 1632 if (!new_fn_leaf) {
1601 WARN_ON(!new_fn_leaf); 1633 WARN_ON(!new_fn_leaf);
1602 new_fn_leaf = net->ipv6.ip6_null_entry; 1634 new_fn_leaf = net->ipv6.fib6_null_entry;
1603 } 1635 }
1604#endif 1636#endif
1605 atomic_inc(&new_fn_leaf->rt6i_ref); 1637 fib6_info_hold(new_fn_leaf);
1606 rcu_assign_pointer(fn->leaf, new_fn_leaf); 1638 rcu_assign_pointer(fn->leaf, new_fn_leaf);
1607 return pn; 1639 return pn;
1608 } 1640 }
@@ -1658,26 +1690,24 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
1658 return pn; 1690 return pn;
1659 1691
1660 RCU_INIT_POINTER(pn->leaf, NULL); 1692 RCU_INIT_POINTER(pn->leaf, NULL);
1661 rt6_release(pn_leaf); 1693 fib6_info_release(pn_leaf);
1662 fn = pn; 1694 fn = pn;
1663 } 1695 }
1664} 1696}
1665 1697
1666static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, 1698static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
1667 struct rt6_info __rcu **rtp, struct nl_info *info) 1699 struct fib6_info __rcu **rtp, struct nl_info *info)
1668{ 1700{
1669 struct fib6_walker *w; 1701 struct fib6_walker *w;
1670 struct rt6_info *rt = rcu_dereference_protected(*rtp, 1702 struct fib6_info *rt = rcu_dereference_protected(*rtp,
1671 lockdep_is_held(&table->tb6_lock)); 1703 lockdep_is_held(&table->tb6_lock));
1672 struct net *net = info->nl_net; 1704 struct net *net = info->nl_net;
1673 1705
1674 RT6_TRACE("fib6_del_route\n"); 1706 RT6_TRACE("fib6_del_route\n");
1675 1707
1676 WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE);
1677
1678 /* Unlink it */ 1708 /* Unlink it */
1679 *rtp = rt->rt6_next; 1709 *rtp = rt->fib6_next;
1680 rt->rt6i_node = NULL; 1710 rt->fib6_node = NULL;
1681 net->ipv6.rt6_stats->fib_rt_entries--; 1711 net->ipv6.rt6_stats->fib_rt_entries--;
1682 net->ipv6.rt6_stats->fib_discarded_routes++; 1712 net->ipv6.rt6_stats->fib_discarded_routes++;
1683 1713
@@ -1689,14 +1719,14 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
1689 fn->rr_ptr = NULL; 1719 fn->rr_ptr = NULL;
1690 1720
1691 /* Remove this entry from other siblings */ 1721 /* Remove this entry from other siblings */
1692 if (rt->rt6i_nsiblings) { 1722 if (rt->fib6_nsiblings) {
1693 struct rt6_info *sibling, *next_sibling; 1723 struct fib6_info *sibling, *next_sibling;
1694 1724
1695 list_for_each_entry_safe(sibling, next_sibling, 1725 list_for_each_entry_safe(sibling, next_sibling,
1696 &rt->rt6i_siblings, rt6i_siblings) 1726 &rt->fib6_siblings, fib6_siblings)
1697 sibling->rt6i_nsiblings--; 1727 sibling->fib6_nsiblings--;
1698 rt->rt6i_nsiblings = 0; 1728 rt->fib6_nsiblings = 0;
1699 list_del_init(&rt->rt6i_siblings); 1729 list_del_init(&rt->fib6_siblings);
1700 rt6_multipath_rebalance(next_sibling); 1730 rt6_multipath_rebalance(next_sibling);
1701 } 1731 }
1702 1732
@@ -1705,7 +1735,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
1705 FOR_WALKERS(net, w) { 1735 FOR_WALKERS(net, w) {
1706 if (w->state == FWS_C && w->leaf == rt) { 1736 if (w->state == FWS_C && w->leaf == rt) {
1707 RT6_TRACE("walker %p adjusted by delroute\n", w); 1737 RT6_TRACE("walker %p adjusted by delroute\n", w);
1708 w->leaf = rcu_dereference_protected(rt->rt6_next, 1738 w->leaf = rcu_dereference_protected(rt->fib6_next,
1709 lockdep_is_held(&table->tb6_lock)); 1739 lockdep_is_held(&table->tb6_lock));
1710 if (!w->leaf) 1740 if (!w->leaf)
1711 w->state = FWS_U; 1741 w->state = FWS_U;
@@ -1730,46 +1760,36 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
1730 call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); 1760 call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
1731 if (!info->skip_notify) 1761 if (!info->skip_notify)
1732 inet6_rt_notify(RTM_DELROUTE, rt, info, 0); 1762 inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
1733 rt6_release(rt); 1763 fib6_info_release(rt);
1734} 1764}
1735 1765
1736/* Need to own table->tb6_lock */ 1766/* Need to own table->tb6_lock */
1737int fib6_del(struct rt6_info *rt, struct nl_info *info) 1767int fib6_del(struct fib6_info *rt, struct nl_info *info)
1738{ 1768{
1739 struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, 1769 struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
1740 lockdep_is_held(&rt->rt6i_table->tb6_lock)); 1770 lockdep_is_held(&rt->fib6_table->tb6_lock));
1741 struct fib6_table *table = rt->rt6i_table; 1771 struct fib6_table *table = rt->fib6_table;
1742 struct net *net = info->nl_net; 1772 struct net *net = info->nl_net;
1743 struct rt6_info __rcu **rtp; 1773 struct fib6_info __rcu **rtp;
1744 struct rt6_info __rcu **rtp_next; 1774 struct fib6_info __rcu **rtp_next;
1745 1775
1746#if RT6_DEBUG >= 2 1776 if (!fn || rt == net->ipv6.fib6_null_entry)
1747 if (rt->dst.obsolete > 0) {
1748 WARN_ON(fn);
1749 return -ENOENT;
1750 }
1751#endif
1752 if (!fn || rt == net->ipv6.ip6_null_entry)
1753 return -ENOENT; 1777 return -ENOENT;
1754 1778
1755 WARN_ON(!(fn->fn_flags & RTN_RTINFO)); 1779 WARN_ON(!(fn->fn_flags & RTN_RTINFO));
1756 1780
1757 /* remove cached dst from exception table */
1758 if (rt->rt6i_flags & RTF_CACHE)
1759 return rt6_remove_exception_rt(rt);
1760
1761 /* 1781 /*
1762 * Walk the leaf entries looking for ourself 1782 * Walk the leaf entries looking for ourself
1763 */ 1783 */
1764 1784
1765 for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { 1785 for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
1766 struct rt6_info *cur = rcu_dereference_protected(*rtp, 1786 struct fib6_info *cur = rcu_dereference_protected(*rtp,
1767 lockdep_is_held(&table->tb6_lock)); 1787 lockdep_is_held(&table->tb6_lock));
1768 if (rt == cur) { 1788 if (rt == cur) {
1769 fib6_del_route(table, fn, rtp, info); 1789 fib6_del_route(table, fn, rtp, info);
1770 return 0; 1790 return 0;
1771 } 1791 }
1772 rtp_next = &cur->rt6_next; 1792 rtp_next = &cur->fib6_next;
1773 } 1793 }
1774 return -ENOENT; 1794 return -ENOENT;
1775} 1795}
@@ -1907,7 +1927,7 @@ static int fib6_walk(struct net *net, struct fib6_walker *w)
1907static int fib6_clean_node(struct fib6_walker *w) 1927static int fib6_clean_node(struct fib6_walker *w)
1908{ 1928{
1909 int res; 1929 int res;
1910 struct rt6_info *rt; 1930 struct fib6_info *rt;
1911 struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); 1931 struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
1912 struct nl_info info = { 1932 struct nl_info info = {
1913 .nl_net = c->net, 1933 .nl_net = c->net,
@@ -1932,17 +1952,17 @@ static int fib6_clean_node(struct fib6_walker *w)
1932#if RT6_DEBUG >= 2 1952#if RT6_DEBUG >= 2
1933 pr_debug("%s: del failed: rt=%p@%p err=%d\n", 1953 pr_debug("%s: del failed: rt=%p@%p err=%d\n",
1934 __func__, rt, 1954 __func__, rt,
1935 rcu_access_pointer(rt->rt6i_node), 1955 rcu_access_pointer(rt->fib6_node),
1936 res); 1956 res);
1937#endif 1957#endif
1938 continue; 1958 continue;
1939 } 1959 }
1940 return 0; 1960 return 0;
1941 } else if (res == -2) { 1961 } else if (res == -2) {
1942 if (WARN_ON(!rt->rt6i_nsiblings)) 1962 if (WARN_ON(!rt->fib6_nsiblings))
1943 continue; 1963 continue;
1944 rt = list_last_entry(&rt->rt6i_siblings, 1964 rt = list_last_entry(&rt->fib6_siblings,
1945 struct rt6_info, rt6i_siblings); 1965 struct fib6_info, fib6_siblings);
1946 continue; 1966 continue;
1947 } 1967 }
1948 WARN_ON(res != 0); 1968 WARN_ON(res != 0);
@@ -1961,7 +1981,7 @@ static int fib6_clean_node(struct fib6_walker *w)
1961 */ 1981 */
1962 1982
1963static void fib6_clean_tree(struct net *net, struct fib6_node *root, 1983static void fib6_clean_tree(struct net *net, struct fib6_node *root,
1964 int (*func)(struct rt6_info *, void *arg), 1984 int (*func)(struct fib6_info *, void *arg),
1965 int sernum, void *arg) 1985 int sernum, void *arg)
1966{ 1986{
1967 struct fib6_cleaner c; 1987 struct fib6_cleaner c;
@@ -1979,7 +1999,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
1979} 1999}
1980 2000
1981static void __fib6_clean_all(struct net *net, 2001static void __fib6_clean_all(struct net *net,
1982 int (*func)(struct rt6_info *, void *), 2002 int (*func)(struct fib6_info *, void *),
1983 int sernum, void *arg) 2003 int sernum, void *arg)
1984{ 2004{
1985 struct fib6_table *table; 2005 struct fib6_table *table;
@@ -1999,7 +2019,7 @@ static void __fib6_clean_all(struct net *net,
1999 rcu_read_unlock(); 2019 rcu_read_unlock();
2000} 2020}
2001 2021
2002void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *), 2022void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
2003 void *arg) 2023 void *arg)
2004{ 2024{
2005 __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg); 2025 __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
@@ -2016,7 +2036,7 @@ static void fib6_flush_trees(struct net *net)
2016 * Garbage collection 2036 * Garbage collection
2017 */ 2037 */
2018 2038
2019static int fib6_age(struct rt6_info *rt, void *arg) 2039static int fib6_age(struct fib6_info *rt, void *arg)
2020{ 2040{
2021 struct fib6_gc_args *gc_args = arg; 2041 struct fib6_gc_args *gc_args = arg;
2022 unsigned long now = jiffies; 2042 unsigned long now = jiffies;
@@ -2026,8 +2046,8 @@ static int fib6_age(struct rt6_info *rt, void *arg)
2026 * Routes are expired even if they are in use. 2046 * Routes are expired even if they are in use.
2027 */ 2047 */
2028 2048
2029 if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) { 2049 if (rt->fib6_flags & RTF_EXPIRES && rt->expires) {
2030 if (time_after(now, rt->dst.expires)) { 2050 if (time_after(now, rt->expires)) {
2031 RT6_TRACE("expiring %p\n", rt); 2051 RT6_TRACE("expiring %p\n", rt);
2032 return -1; 2052 return -1;
2033 } 2053 }
@@ -2110,7 +2130,7 @@ static int __net_init fib6_net_init(struct net *net)
2110 2130
2111 net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; 2131 net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
2112 rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, 2132 rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf,
2113 net->ipv6.ip6_null_entry); 2133 net->ipv6.fib6_null_entry);
2114 net->ipv6.fib6_main_tbl->tb6_root.fn_flags = 2134 net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
2115 RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; 2135 RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
2116 inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); 2136 inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
@@ -2122,7 +2142,7 @@ static int __net_init fib6_net_init(struct net *net)
2122 goto out_fib6_main_tbl; 2142 goto out_fib6_main_tbl;
2123 net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; 2143 net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
2124 rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, 2144 rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf,
2125 net->ipv6.ip6_null_entry); 2145 net->ipv6.fib6_null_entry);
2126 net->ipv6.fib6_local_tbl->tb6_root.fn_flags = 2146 net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
2127 RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; 2147 RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
2128 inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); 2148 inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
@@ -2211,25 +2231,26 @@ void fib6_gc_cleanup(void)
2211#ifdef CONFIG_PROC_FS 2231#ifdef CONFIG_PROC_FS
2212static int ipv6_route_seq_show(struct seq_file *seq, void *v) 2232static int ipv6_route_seq_show(struct seq_file *seq, void *v)
2213{ 2233{
2214 struct rt6_info *rt = v; 2234 struct fib6_info *rt = v;
2215 struct ipv6_route_iter *iter = seq->private; 2235 struct ipv6_route_iter *iter = seq->private;
2236 const struct net_device *dev;
2216 2237
2217 seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); 2238 seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
2218 2239
2219#ifdef CONFIG_IPV6_SUBTREES 2240#ifdef CONFIG_IPV6_SUBTREES
2220 seq_printf(seq, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen); 2241 seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen);
2221#else 2242#else
2222 seq_puts(seq, "00000000000000000000000000000000 00 "); 2243 seq_puts(seq, "00000000000000000000000000000000 00 ");
2223#endif 2244#endif
2224 if (rt->rt6i_flags & RTF_GATEWAY) 2245 if (rt->fib6_flags & RTF_GATEWAY)
2225 seq_printf(seq, "%pi6", &rt->rt6i_gateway); 2246 seq_printf(seq, "%pi6", &rt->fib6_nh.nh_gw);
2226 else 2247 else
2227 seq_puts(seq, "00000000000000000000000000000000"); 2248 seq_puts(seq, "00000000000000000000000000000000");
2228 2249
2250 dev = rt->fib6_nh.nh_dev;
2229 seq_printf(seq, " %08x %08x %08x %08x %8s\n", 2251 seq_printf(seq, " %08x %08x %08x %08x %8s\n",
2230 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt), 2252 rt->fib6_metric, atomic_read(&rt->fib6_ref), 0,
2231 rt->dst.__use, rt->rt6i_flags, 2253 rt->fib6_flags, dev ? dev->name : "");
2232 rt->dst.dev ? rt->dst.dev->name : "");
2233 iter->w.leaf = NULL; 2254 iter->w.leaf = NULL;
2234 return 0; 2255 return 0;
2235} 2256}
@@ -2243,7 +2264,7 @@ static int ipv6_route_yield(struct fib6_walker *w)
2243 2264
2244 do { 2265 do {
2245 iter->w.leaf = rcu_dereference_protected( 2266 iter->w.leaf = rcu_dereference_protected(
2246 iter->w.leaf->rt6_next, 2267 iter->w.leaf->fib6_next,
2247 lockdep_is_held(&iter->tbl->tb6_lock)); 2268 lockdep_is_held(&iter->tbl->tb6_lock));
2248 iter->skip--; 2269 iter->skip--;
2249 if (!iter->skip && iter->w.leaf) 2270 if (!iter->skip && iter->w.leaf)
@@ -2302,14 +2323,14 @@ static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
2302static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2323static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2303{ 2324{
2304 int r; 2325 int r;
2305 struct rt6_info *n; 2326 struct fib6_info *n;
2306 struct net *net = seq_file_net(seq); 2327 struct net *net = seq_file_net(seq);
2307 struct ipv6_route_iter *iter = seq->private; 2328 struct ipv6_route_iter *iter = seq->private;
2308 2329
2309 if (!v) 2330 if (!v)
2310 goto iter_table; 2331 goto iter_table;
2311 2332
2312 n = rcu_dereference_bh(((struct rt6_info *)v)->rt6_next); 2333 n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next);
2313 if (n) { 2334 if (n) {
2314 ++*pos; 2335 ++*pos;
2315 return n; 2336 return n;
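
The net/ipv6/ip6_fib.c hunks above replace the old rt6_info reference counting (atomic_inc(&rt->rt6i_ref) / rt6_release()) with the fib6_info_hold()/fib6_info_release() pair on the new fib6_info struct. As a rough, self-contained illustration of that hold/release idiom, here is a userspace analogue (demo_info and its helpers are made-up names for illustration; this is not the kernel code itself):

#include <stdatomic.h>
#include <stdlib.h>

/* Userspace analogue of the fib6_info_hold()/fib6_info_release() pattern. */
struct demo_info {
    atomic_int ref;                     /* plays the role of fib6_ref */
};

static struct demo_info *demo_info_alloc(void)
{
    struct demo_info *i = calloc(1, sizeof(*i));

    if (i)
        atomic_store(&i->ref, 1);       /* creator owns the first reference */
    return i;
}

static void demo_info_hold(struct demo_info *i)
{
    atomic_fetch_add(&i->ref, 1);       /* like fib6_info_hold() */
}

static void demo_info_release(struct demo_info *i)
{
    /* dropping the last reference frees the object, like fib6_info_release() */
    if (atomic_fetch_sub(&i->ref, 1) == 1)
        free(i);
}

int main(void)
{
    struct demo_info *i = demo_info_alloc();

    if (!i)
        return 1;
    demo_info_hold(i);          /* e.g. a tree node takes a reference */
    demo_info_release(i);       /* the node drops it again */
    demo_info_release(i);       /* creator drops the last one; object is freed */
    return 0;
}

The point of the conversion is the same as in this sketch: every holder of a fib6_info pointer takes its own reference, and the object only goes away when the last reference is released.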
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 458de353f5d9..c8cf2fdbb13b 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -848,7 +848,7 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
848} 848}
849 849
850/** 850/**
851 * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own 851 * ip6gre_tnl_addr_conflict - compare packet addresses to tunnel's own
852 * @t: the outgoing tunnel device 852 * @t: the outgoing tunnel device
853 * @hdr: IPv6 header from the incoming packet 853 * @hdr: IPv6 header from the incoming packet
854 * 854 *
@@ -937,6 +937,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
937 struct flowi6 fl6; 937 struct flowi6 fl6;
938 int err = -EINVAL; 938 int err = -EINVAL;
939 __u32 mtu; 939 __u32 mtu;
940 int nhoff;
941 int thoff;
940 942
941 if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) 943 if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr))
942 goto tx_err; 944 goto tx_err;
@@ -949,6 +951,16 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
949 truncate = true; 951 truncate = true;
950 } 952 }
951 953
954 nhoff = skb_network_header(skb) - skb_mac_header(skb);
955 if (skb->protocol == htons(ETH_P_IP) &&
956 (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
957 truncate = true;
958
959 thoff = skb_transport_header(skb) - skb_mac_header(skb);
960 if (skb->protocol == htons(ETH_P_IPV6) &&
961 (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
962 truncate = true;
963
952 if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen)) 964 if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen))
953 goto tx_err; 965 goto tx_err;
954 966
@@ -1376,6 +1388,7 @@ static void ip6gre_dev_free(struct net_device *dev)
1376{ 1388{
1377 struct ip6_tnl *t = netdev_priv(dev); 1389 struct ip6_tnl *t = netdev_priv(dev);
1378 1390
1391 gro_cells_destroy(&t->gro_cells);
1379 dst_cache_destroy(&t->dst_cache); 1392 dst_cache_destroy(&t->dst_cache);
1380 free_percpu(dev->tstats); 1393 free_percpu(dev->tstats);
1381} 1394}
@@ -1443,11 +1456,12 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
1443 return -ENOMEM; 1456 return -ENOMEM;
1444 1457
1445 ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); 1458 ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1446 if (ret) { 1459 if (ret)
1447 free_percpu(dev->tstats); 1460 goto cleanup_alloc_pcpu_stats;
1448 dev->tstats = NULL; 1461
1449 return ret; 1462 ret = gro_cells_init(&tunnel->gro_cells, dev);
1450 } 1463 if (ret)
1464 goto cleanup_dst_cache_init;
1451 1465
1452 t_hlen = ip6gre_calc_hlen(tunnel); 1466 t_hlen = ip6gre_calc_hlen(tunnel);
1453 dev->mtu = ETH_DATA_LEN - t_hlen; 1467 dev->mtu = ETH_DATA_LEN - t_hlen;
@@ -1463,6 +1477,13 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
1463 ip6gre_tnl_init_features(dev); 1477 ip6gre_tnl_init_features(dev);
1464 1478
1465 return 0; 1479 return 0;
1480
1481cleanup_dst_cache_init:
1482 dst_cache_destroy(&tunnel->dst_cache);
1483cleanup_alloc_pcpu_stats:
1484 free_percpu(dev->tstats);
1485 dev->tstats = NULL;
1486 return ret;
1466} 1487}
1467 1488
1468static int ip6gre_tunnel_init(struct net_device *dev) 1489static int ip6gre_tunnel_init(struct net_device *dev)
@@ -1822,11 +1843,12 @@ static int ip6erspan_tap_init(struct net_device *dev)
1822 return -ENOMEM; 1843 return -ENOMEM;
1823 1844
1824 ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); 1845 ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1825 if (ret) { 1846 if (ret)
1826 free_percpu(dev->tstats); 1847 goto cleanup_alloc_pcpu_stats;
1827 dev->tstats = NULL; 1848
1828 return ret; 1849 ret = gro_cells_init(&tunnel->gro_cells, dev);
1829 } 1850 if (ret)
1851 goto cleanup_dst_cache_init;
1830 1852
1831 t_hlen = ip6erspan_calc_hlen(tunnel); 1853 t_hlen = ip6erspan_calc_hlen(tunnel);
1832 dev->mtu = ETH_DATA_LEN - t_hlen; 1854 dev->mtu = ETH_DATA_LEN - t_hlen;
@@ -1839,6 +1861,13 @@ static int ip6erspan_tap_init(struct net_device *dev)
1839 ip6erspan_tnl_link_config(tunnel, 1); 1861 ip6erspan_tnl_link_config(tunnel, 1);
1840 1862
1841 return 0; 1863 return 0;
1864
1865cleanup_dst_cache_init:
1866 dst_cache_destroy(&tunnel->dst_cache);
1867cleanup_alloc_pcpu_stats:
1868 free_percpu(dev->tstats);
1869 dev->tstats = NULL;
1870 return ret;
1842} 1871}
1843 1872
1844static const struct net_device_ops ip6erspan_netdev_ops = { 1873static const struct net_device_ops ip6erspan_netdev_ops = {
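
The ip6gre_tunnel_init_common() and ip6erspan_tap_init() hunks above switch the error handling to the usual goto-unwind style so that a failure in the newly added gro_cells_init() also tears down the dst cache and the per-cpu stats. A minimal standalone sketch of that unwind pattern (plain C; alloc_a()/alloc_b() are hypothetical stand-ins for dev->tstats and the dst cache, not the tunnel code itself):

#include <stdlib.h>

static void *alloc_a(void) { return malloc(16); }
static void *alloc_b(void) { return malloc(16); }

static int demo_init(void **a, void **b)
{
    *a = alloc_a();
    if (!*a)
        return -1;

    *b = alloc_b();
    if (!*b)
        goto cleanup_a;         /* unwind only what was already set up */

    return 0;

cleanup_a:
    free(*a);
    *a = NULL;
    return -1;
}

int main(void)
{
    void *a, *b;

    if (demo_init(&a, &b))
        return 1;
    free(b);
    free(a);
    return 0;
}

Each later setup step jumps to a label that undoes only the earlier steps, which is exactly what the cleanup_dst_cache_init/cleanup_alloc_pcpu_stats labels do in the patch.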
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 9ee208a348f5..f08d34491ece 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -336,7 +336,7 @@ int ip6_mc_input(struct sk_buff *skb)
336 bool deliver; 336 bool deliver;
337 337
338 __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev), 338 __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev),
339 ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST, 339 __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST,
340 skb->len); 340 skb->len);
341 341
342 hdr = ipv6_hdr(skb); 342 hdr = ipv6_hdr(skb);
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 4a87f9428ca5..5b3f2f89ef41 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -88,9 +88,11 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
88 88
89 if (skb->encapsulation && 89 if (skb->encapsulation &&
90 skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6)) 90 skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6))
91 udpfrag = proto == IPPROTO_UDP && encap; 91 udpfrag = proto == IPPROTO_UDP && encap &&
92 (skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
92 else 93 else
93 udpfrag = proto == IPPROTO_UDP && !skb->encapsulation; 94 udpfrag = proto == IPPROTO_UDP && !skb->encapsulation &&
95 (skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
94 96
95 ops = rcu_dereference(inet6_offloads[proto]); 97 ops = rcu_dereference(inet6_offloads[proto]);
96 if (likely(ops && ops->callbacks.gso_segment)) { 98 if (likely(ops && ops->callbacks.gso_segment)) {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7b6d1689087b..021e5aef6ba3 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -383,28 +383,6 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk,
383 return dst_output(net, sk, skb); 383 return dst_output(net, sk, skb);
384} 384}
385 385
386unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
387{
388 unsigned int mtu;
389 struct inet6_dev *idev;
390
391 if (dst_metric_locked(dst, RTAX_MTU)) {
392 mtu = dst_metric_raw(dst, RTAX_MTU);
393 if (mtu)
394 return mtu;
395 }
396
397 mtu = IPV6_MIN_MTU;
398 rcu_read_lock();
399 idev = __in6_dev_get(dst->dev);
400 if (idev)
401 mtu = idev->cnf.mtu6;
402 rcu_read_unlock();
403
404 return mtu;
405}
406EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);
407
408static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) 386static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
409{ 387{
410 if (skb->len <= mtu) 388 if (skb->len <= mtu)
@@ -425,6 +403,7 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
425 403
426int ip6_forward(struct sk_buff *skb) 404int ip6_forward(struct sk_buff *skb)
427{ 405{
406 struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
428 struct dst_entry *dst = skb_dst(skb); 407 struct dst_entry *dst = skb_dst(skb);
429 struct ipv6hdr *hdr = ipv6_hdr(skb); 408 struct ipv6hdr *hdr = ipv6_hdr(skb);
430 struct inet6_skb_parm *opt = IP6CB(skb); 409 struct inet6_skb_parm *opt = IP6CB(skb);
@@ -444,8 +423,7 @@ int ip6_forward(struct sk_buff *skb)
444 goto drop; 423 goto drop;
445 424
446 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { 425 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
447 __IP6_INC_STATS(net, ip6_dst_idev(dst), 426 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
448 IPSTATS_MIB_INDISCARDS);
449 goto drop; 427 goto drop;
450 } 428 }
451 429
@@ -476,8 +454,7 @@ int ip6_forward(struct sk_buff *skb)
476 /* Force OUTPUT device used as source address */ 454 /* Force OUTPUT device used as source address */
477 skb->dev = dst->dev; 455 skb->dev = dst->dev;
478 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); 456 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
479 __IP6_INC_STATS(net, ip6_dst_idev(dst), 457 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
480 IPSTATS_MIB_INHDRERRORS);
481 458
482 kfree_skb(skb); 459 kfree_skb(skb);
483 return -ETIMEDOUT; 460 return -ETIMEDOUT;
@@ -490,15 +467,13 @@ int ip6_forward(struct sk_buff *skb)
490 if (proxied > 0) 467 if (proxied > 0)
491 return ip6_input(skb); 468 return ip6_input(skb);
492 else if (proxied < 0) { 469 else if (proxied < 0) {
493 __IP6_INC_STATS(net, ip6_dst_idev(dst), 470 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
494 IPSTATS_MIB_INDISCARDS);
495 goto drop; 471 goto drop;
496 } 472 }
497 } 473 }
498 474
499 if (!xfrm6_route_forward(skb)) { 475 if (!xfrm6_route_forward(skb)) {
500 __IP6_INC_STATS(net, ip6_dst_idev(dst), 476 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
501 IPSTATS_MIB_INDISCARDS);
502 goto drop; 477 goto drop;
503 } 478 }
504 dst = skb_dst(skb); 479 dst = skb_dst(skb);
@@ -507,7 +482,8 @@ int ip6_forward(struct sk_buff *skb)
507 send redirects to source routed frames. 482 send redirects to source routed frames.
508 We don't send redirects to frames decapsulated from IPsec. 483 We don't send redirects to frames decapsulated from IPsec.
509 */ 484 */
510 if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) { 485 if (IP6CB(skb)->iif == dst->dev->ifindex &&
486 opt->srcrt == 0 && !skb_sec_path(skb)) {
511 struct in6_addr *target = NULL; 487 struct in6_addr *target = NULL;
512 struct inet_peer *peer; 488 struct inet_peer *peer;
513 struct rt6_info *rt; 489 struct rt6_info *rt;
@@ -554,8 +530,7 @@ int ip6_forward(struct sk_buff *skb)
554 /* Again, force OUTPUT device used as source address */ 530 /* Again, force OUTPUT device used as source address */
555 skb->dev = dst->dev; 531 skb->dev = dst->dev;
556 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 532 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
557 __IP6_INC_STATS(net, ip6_dst_idev(dst), 533 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
558 IPSTATS_MIB_INTOOBIGERRORS);
559 __IP6_INC_STATS(net, ip6_dst_idev(dst), 534 __IP6_INC_STATS(net, ip6_dst_idev(dst),
560 IPSTATS_MIB_FRAGFAILS); 535 IPSTATS_MIB_FRAGFAILS);
561 kfree_skb(skb); 536 kfree_skb(skb);
@@ -579,7 +554,7 @@ int ip6_forward(struct sk_buff *skb)
579 ip6_forward_finish); 554 ip6_forward_finish);
580 555
581error: 556error:
582 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); 557 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
583drop: 558drop:
584 kfree_skb(skb); 559 kfree_skb(skb);
585 return -EINVAL; 560 return -EINVAL;
@@ -966,15 +941,21 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
966 * that's why we try it again later. 941 * that's why we try it again later.
967 */ 942 */
968 if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) { 943 if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
944 struct fib6_info *from;
969 struct rt6_info *rt; 945 struct rt6_info *rt;
970 bool had_dst = *dst != NULL; 946 bool had_dst = *dst != NULL;
971 947
972 if (!had_dst) 948 if (!had_dst)
973 *dst = ip6_route_output(net, sk, fl6); 949 *dst = ip6_route_output(net, sk, fl6);
974 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; 950 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
975 err = ip6_route_get_saddr(net, rt, &fl6->daddr, 951
952 rcu_read_lock();
953 from = rt ? rcu_dereference(rt->from) : NULL;
954 err = ip6_route_get_saddr(net, from, &fl6->daddr,
976 sk ? inet6_sk(sk)->srcprefs : 0, 955 sk ? inet6_sk(sk)->srcprefs : 0,
977 &fl6->saddr); 956 &fl6->saddr);
957 rcu_read_unlock();
958
978 if (err) 959 if (err)
979 goto out_err_release; 960 goto out_err_release;
980 961
@@ -1238,6 +1219,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1238 if (mtu < IPV6_MIN_MTU) 1219 if (mtu < IPV6_MIN_MTU)
1239 return -EINVAL; 1220 return -EINVAL;
1240 cork->base.fragsize = mtu; 1221 cork->base.fragsize = mtu;
1222 cork->base.gso_size = sk->sk_type == SOCK_DGRAM ? ipc6->gso_size : 0;
1223
1241 if (dst_allfrag(xfrm_dst_path(&rt->dst))) 1224 if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1242 cork->base.flags |= IPCORK_ALLFRAG; 1225 cork->base.flags |= IPCORK_ALLFRAG;
1243 cork->base.length = 0; 1226 cork->base.length = 0;
@@ -1272,6 +1255,7 @@ static int __ip6_append_data(struct sock *sk,
1272 int csummode = CHECKSUM_NONE; 1255 int csummode = CHECKSUM_NONE;
1273 unsigned int maxnonfragsize, headersize; 1256 unsigned int maxnonfragsize, headersize;
1274 unsigned int wmem_alloc_delta = 0; 1257 unsigned int wmem_alloc_delta = 0;
1258 bool paged;
1275 1259
1276 skb = skb_peek_tail(queue); 1260 skb = skb_peek_tail(queue);
1277 if (!skb) { 1261 if (!skb) {
@@ -1279,7 +1263,8 @@ static int __ip6_append_data(struct sock *sk,
1279 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; 1263 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1280 } 1264 }
1281 1265
1282 mtu = cork->fragsize; 1266 paged = !!cork->gso_size;
1267 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1283 orig_mtu = mtu; 1268 orig_mtu = mtu;
1284 1269
1285 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1270 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
@@ -1327,7 +1312,7 @@ emsgsize:
1327 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && 1312 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1328 headersize == sizeof(struct ipv6hdr) && 1313 headersize == sizeof(struct ipv6hdr) &&
1329 length <= mtu - headersize && 1314 length <= mtu - headersize &&
1330 !(flags & MSG_MORE) && 1315 (!(flags & MSG_MORE) || cork->gso_size) &&
1331 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) 1316 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1332 csummode = CHECKSUM_PARTIAL; 1317 csummode = CHECKSUM_PARTIAL;
1333 1318
@@ -1370,6 +1355,7 @@ emsgsize:
1370 unsigned int fraglen; 1355 unsigned int fraglen;
1371 unsigned int fraggap; 1356 unsigned int fraggap;
1372 unsigned int alloclen; 1357 unsigned int alloclen;
1358 unsigned int pagedlen = 0;
1373alloc_new_skb: 1359alloc_new_skb:
1374 /* There's no room in the current skb */ 1360 /* There's no room in the current skb */
1375 if (skb) 1361 if (skb)
@@ -1392,11 +1378,17 @@ alloc_new_skb:
1392 1378
1393 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) 1379 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1394 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; 1380 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1381 fraglen = datalen + fragheaderlen;
1382
1395 if ((flags & MSG_MORE) && 1383 if ((flags & MSG_MORE) &&
1396 !(rt->dst.dev->features&NETIF_F_SG)) 1384 !(rt->dst.dev->features&NETIF_F_SG))
1397 alloclen = mtu; 1385 alloclen = mtu;
1398 else 1386 else if (!paged)
1399 alloclen = datalen + fragheaderlen; 1387 alloclen = fraglen;
1388 else {
1389 alloclen = min_t(int, fraglen, MAX_HEADER);
1390 pagedlen = fraglen - alloclen;
1391 }
1400 1392
1401 alloclen += dst_exthdrlen; 1393 alloclen += dst_exthdrlen;
1402 1394
@@ -1418,7 +1410,7 @@ alloc_new_skb:
1418 */ 1410 */
1419 alloclen += sizeof(struct frag_hdr); 1411 alloclen += sizeof(struct frag_hdr);
1420 1412
1421 copy = datalen - transhdrlen - fraggap; 1413 copy = datalen - transhdrlen - fraggap - pagedlen;
1422 if (copy < 0) { 1414 if (copy < 0) {
1423 err = -EINVAL; 1415 err = -EINVAL;
1424 goto error; 1416 goto error;
@@ -1457,7 +1449,7 @@ alloc_new_skb:
1457 /* 1449 /*
1458 * Find where to start putting bytes 1450 * Find where to start putting bytes
1459 */ 1451 */
1460 data = skb_put(skb, fraglen); 1452 data = skb_put(skb, fraglen - pagedlen);
1461 skb_set_network_header(skb, exthdrlen); 1453 skb_set_network_header(skb, exthdrlen);
1462 data += fragheaderlen; 1454 data += fragheaderlen;
1463 skb->transport_header = (skb->network_header + 1455 skb->transport_header = (skb->network_header +
@@ -1480,7 +1472,7 @@ alloc_new_skb:
1480 } 1472 }
1481 1473
1482 offset += copy; 1474 offset += copy;
1483 length -= datalen - fraggap; 1475 length -= copy + transhdrlen;
1484 transhdrlen = 0; 1476 transhdrlen = 0;
1485 exthdrlen = 0; 1477 exthdrlen = 0;
1486 dst_exthdrlen = 0; 1478 dst_exthdrlen = 0;
@@ -1754,9 +1746,9 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
1754 void *from, int length, int transhdrlen, 1746 void *from, int length, int transhdrlen,
1755 struct ipcm6_cookie *ipc6, struct flowi6 *fl6, 1747 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1756 struct rt6_info *rt, unsigned int flags, 1748 struct rt6_info *rt, unsigned int flags,
1749 struct inet_cork_full *cork,
1757 const struct sockcm_cookie *sockc) 1750 const struct sockcm_cookie *sockc)
1758{ 1751{
1759 struct inet_cork_full cork;
1760 struct inet6_cork v6_cork; 1752 struct inet6_cork v6_cork;
1761 struct sk_buff_head queue; 1753 struct sk_buff_head queue;
1762 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0); 1754 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
@@ -1767,27 +1759,27 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
1767 1759
1768 __skb_queue_head_init(&queue); 1760 __skb_queue_head_init(&queue);
1769 1761
1770 cork.base.flags = 0; 1762 cork->base.flags = 0;
1771 cork.base.addr = 0; 1763 cork->base.addr = 0;
1772 cork.base.opt = NULL; 1764 cork->base.opt = NULL;
1773 cork.base.dst = NULL; 1765 cork->base.dst = NULL;
1774 v6_cork.opt = NULL; 1766 v6_cork.opt = NULL;
1775 err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6); 1767 err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1776 if (err) { 1768 if (err) {
1777 ip6_cork_release(&cork, &v6_cork); 1769 ip6_cork_release(cork, &v6_cork);
1778 return ERR_PTR(err); 1770 return ERR_PTR(err);
1779 } 1771 }
1780 if (ipc6->dontfrag < 0) 1772 if (ipc6->dontfrag < 0)
1781 ipc6->dontfrag = inet6_sk(sk)->dontfrag; 1773 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1782 1774
1783 err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork, 1775 err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1784 &current->task_frag, getfrag, from, 1776 &current->task_frag, getfrag, from,
1785 length + exthdrlen, transhdrlen + exthdrlen, 1777 length + exthdrlen, transhdrlen + exthdrlen,
1786 flags, ipc6, sockc); 1778 flags, ipc6, sockc);
1787 if (err) { 1779 if (err) {
1788 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork); 1780 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1789 return ERR_PTR(err); 1781 return ERR_PTR(err);
1790 } 1782 }
1791 1783
1792 return __ip6_make_skb(sk, &queue, &cork, &v6_cork); 1784 return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1793} 1785}
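
The ip6_output.c changes above plumb a gso_size value through the cork (cork->base.gso_size) and allow partially paged skbs, which is what lets the UDP GSO work mentioned in the merge summary cover IPv6 as well. From userspace a sender typically opts in with the UDP_SEGMENT socket option; a minimal sketch, assuming a kernel with UDP GSO support (error handling trimmed, loopback destination used purely for illustration):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef SOL_UDP
#define SOL_UDP 17              /* from linux/udp.h */
#endif
#ifndef UDP_SEGMENT
#define UDP_SEGMENT 103         /* from linux/udp.h on UDP GSO capable kernels */
#endif

int main(void)
{
    int fd = socket(AF_INET6, SOCK_DGRAM, 0);
    int gso_size = 1400;                /* payload bytes per segment */
    struct sockaddr_in6 dst = { 0 };
    char buf[4 * 1400];                 /* one sendto(), segmented by the stack */

    dst.sin6_family = AF_INET6;
    dst.sin6_port = htons(9);           /* discard port, illustration only */
    dst.sin6_addr = in6addr_loopback;

    /* Ask the stack to split large UDP payloads into gso_size chunks. */
    setsockopt(fd, SOL_UDP, UDP_SEGMENT, &gso_size, sizeof(gso_size));

    memset(buf, 0, sizeof(buf));
    sendto(fd, buf, sizeof(buf), 0, (struct sockaddr *)&dst, sizeof(dst));
    close(fd);
    return 0;
}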
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index ca957dd93a29..b7f28deddaea 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -743,7 +743,7 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
743} 743}
744 744
745/** 745/**
746 * vti6_tnl_ioctl - configure vti6 tunnels from userspace 746 * vti6_ioctl - configure vti6 tunnels from userspace
747 * @dev: virtual device associated with tunnel 747 * @dev: virtual device associated with tunnel
748 * @ifr: parameters passed from userspace 748 * @ifr: parameters passed from userspace
749 * @cmd: command to be performed 749 * @cmd: command to be performed
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 4a15529d33eb..0d0f0053bb11 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -180,7 +180,8 @@ static const struct nla_policy ip6mr_rule_policy[FRA_MAX + 1] = {
180}; 180};
181 181
182static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, 182static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
183 struct fib_rule_hdr *frh, struct nlattr **tb) 183 struct fib_rule_hdr *frh, struct nlattr **tb,
184 struct netlink_ext_ack *extack)
184{ 185{
185 return 0; 186 return 0;
186} 187}
@@ -227,8 +228,8 @@ static int __net_init ip6mr_rules_init(struct net *net)
227 INIT_LIST_HEAD(&net->ipv6.mr6_tables); 228 INIT_LIST_HEAD(&net->ipv6.mr6_tables);
228 229
229 mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); 230 mrt = ip6mr_new_table(net, RT6_TABLE_DFLT);
230 if (!mrt) { 231 if (IS_ERR(mrt)) {
231 err = -ENOMEM; 232 err = PTR_ERR(mrt);
232 goto err1; 233 goto err1;
233 } 234 }
234 235
@@ -301,8 +302,13 @@ static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
301 302
302static int __net_init ip6mr_rules_init(struct net *net) 303static int __net_init ip6mr_rules_init(struct net *net)
303{ 304{
304 net->ipv6.mrt6 = ip6mr_new_table(net, RT6_TABLE_DFLT); 305 struct mr_table *mrt;
305 return net->ipv6.mrt6 ? 0 : -ENOMEM; 306
307 mrt = ip6mr_new_table(net, RT6_TABLE_DFLT);
308 if (IS_ERR(mrt))
309 return PTR_ERR(mrt);
310 net->ipv6.mrt6 = mrt;
311 return 0;
306} 312}
307 313
308static void __net_exit ip6mr_rules_exit(struct net *net) 314static void __net_exit ip6mr_rules_exit(struct net *net)
@@ -1733,9 +1739,11 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
1733 1739
1734 rtnl_lock(); 1740 rtnl_lock();
1735 ret = 0; 1741 ret = 0;
1736 if (!ip6mr_new_table(net, v)) 1742 mrt = ip6mr_new_table(net, v);
1737 ret = -ENOMEM; 1743 if (IS_ERR(mrt))
1738 raw6_sk(sk)->ip6mr_table = v; 1744 ret = PTR_ERR(mrt);
1745 else
1746 raw6_sk(sk)->ip6mr_table = v;
1739 rtnl_unlock(); 1747 rtnl_unlock();
1740 return ret; 1748 return ret;
1741 } 1749 }
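
The ip6mr.c hunks above adjust the callers for ip6mr_new_table() now returning an ERR_PTR()-encoded error instead of NULL, so the real errno is propagated via IS_ERR()/PTR_ERR() rather than being collapsed into -ENOMEM. A tiny standalone analogue of that err.h convention (simplified re-implementation for illustration only; the real macros live in include/linux/err.h):

#include <errno.h>
#include <stdio.h>

/* Simplified userspace stand-ins for the kernel's ERR_PTR/IS_ERR/PTR_ERR. */
#define MAX_ERRNO   4095
#define ERR_PTR(err)    ((void *)(long)(err))
#define PTR_ERR(ptr)    ((long)(ptr))
#define IS_ERR(ptr)     ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

static int dummy_table;

static void *new_table(int fail)
{
    if (fail)
        return ERR_PTR(-ENOMEM);        /* encode the errno in the pointer */
    return &dummy_table;
}

int main(void)
{
    void *t = new_table(1);

    if (IS_ERR(t))                      /* caller recovers the real error */
        printf("new_table failed: %ld\n", PTR_ERR(t));
    return 0;
}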
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 9de4dfb126ba..e640d2f3c55c 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1155,7 +1155,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
1155 struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb); 1155 struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb);
1156 struct neighbour *neigh = NULL; 1156 struct neighbour *neigh = NULL;
1157 struct inet6_dev *in6_dev; 1157 struct inet6_dev *in6_dev;
1158 struct rt6_info *rt = NULL; 1158 struct fib6_info *rt = NULL;
1159 struct net *net;
1159 int lifetime; 1160 int lifetime;
1160 struct ndisc_options ndopts; 1161 struct ndisc_options ndopts;
1161 int optlen; 1162 int optlen;
@@ -1253,9 +1254,9 @@ static void ndisc_router_discovery(struct sk_buff *skb)
1253 /* Do not accept RA with source-addr found on local machine unless 1254 /* Do not accept RA with source-addr found on local machine unless
1254 * accept_ra_from_local is set to true. 1255 * accept_ra_from_local is set to true.
1255 */ 1256 */
1257 net = dev_net(in6_dev->dev);
1256 if (!in6_dev->cnf.accept_ra_from_local && 1258 if (!in6_dev->cnf.accept_ra_from_local &&
1257 ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, 1259 ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) {
1258 in6_dev->dev, 0)) {
1259 ND_PRINTK(2, info, 1260 ND_PRINTK(2, info,
1260 "RA from local address detected on dev: %s: default router ignored\n", 1261 "RA from local address detected on dev: %s: default router ignored\n",
1261 skb->dev->name); 1262 skb->dev->name);
@@ -1272,20 +1273,22 @@ static void ndisc_router_discovery(struct sk_buff *skb)
1272 pref = ICMPV6_ROUTER_PREF_MEDIUM; 1273 pref = ICMPV6_ROUTER_PREF_MEDIUM;
1273#endif 1274#endif
1274 1275
1275 rt = rt6_get_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev); 1276 rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
1276 1277
1277 if (rt) { 1278 if (rt) {
1278 neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); 1279 neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw,
1280 rt->fib6_nh.nh_dev, NULL,
1281 &ipv6_hdr(skb)->saddr);
1279 if (!neigh) { 1282 if (!neigh) {
1280 ND_PRINTK(0, err, 1283 ND_PRINTK(0, err,
1281 "RA: %s got default router without neighbour\n", 1284 "RA: %s got default router without neighbour\n",
1282 __func__); 1285 __func__);
1283 ip6_rt_put(rt); 1286 fib6_info_release(rt);
1284 return; 1287 return;
1285 } 1288 }
1286 } 1289 }
1287 if (rt && lifetime == 0) { 1290 if (rt && lifetime == 0) {
1288 ip6_del_rt(rt); 1291 ip6_del_rt(net, rt);
1289 rt = NULL; 1292 rt = NULL;
1290 } 1293 }
1291 1294
@@ -1294,7 +1297,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
1294 if (!rt && lifetime) { 1297 if (!rt && lifetime) {
1295 ND_PRINTK(3, info, "RA: adding default router\n"); 1298 ND_PRINTK(3, info, "RA: adding default router\n");
1296 1299
1297 rt = rt6_add_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev, pref); 1300 rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr,
1301 skb->dev, pref);
1298 if (!rt) { 1302 if (!rt) {
1299 ND_PRINTK(0, err, 1303 ND_PRINTK(0, err,
1300 "RA: %s failed to add default route\n", 1304 "RA: %s failed to add default route\n",
@@ -1302,28 +1306,29 @@ static void ndisc_router_discovery(struct sk_buff *skb)
1302 return; 1306 return;
1303 } 1307 }
1304 1308
1305 neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); 1309 neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw,
1310 rt->fib6_nh.nh_dev, NULL,
1311 &ipv6_hdr(skb)->saddr);
1306 if (!neigh) { 1312 if (!neigh) {
1307 ND_PRINTK(0, err, 1313 ND_PRINTK(0, err,
1308 "RA: %s got default router without neighbour\n", 1314 "RA: %s got default router without neighbour\n",
1309 __func__); 1315 __func__);
1310 ip6_rt_put(rt); 1316 fib6_info_release(rt);
1311 return; 1317 return;
1312 } 1318 }
1313 neigh->flags |= NTF_ROUTER; 1319 neigh->flags |= NTF_ROUTER;
1314 } else if (rt) { 1320 } else if (rt) {
1315 rt->rt6i_flags = (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); 1321 rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
1316 } 1322 }
1317 1323
1318 if (rt) 1324 if (rt)
1319 rt6_set_expires(rt, jiffies + (HZ * lifetime)); 1325 fib6_set_expires(rt, jiffies + (HZ * lifetime));
1320 if (in6_dev->cnf.accept_ra_min_hop_limit < 256 && 1326 if (in6_dev->cnf.accept_ra_min_hop_limit < 256 &&
1321 ra_msg->icmph.icmp6_hop_limit) { 1327 ra_msg->icmph.icmp6_hop_limit) {
1322 if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) { 1328 if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) {
1323 in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; 1329 in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit;
1324 if (rt) 1330 fib6_metric_set(rt, RTAX_HOPLIMIT,
1325 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 1331 ra_msg->icmph.icmp6_hop_limit);
1326 ra_msg->icmph.icmp6_hop_limit);
1327 } else { 1332 } else {
1328 ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n"); 1333 ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n");
1329 } 1334 }
@@ -1475,10 +1480,7 @@ skip_routeinfo:
1475 ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu); 1480 ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu);
1476 } else if (in6_dev->cnf.mtu6 != mtu) { 1481 } else if (in6_dev->cnf.mtu6 != mtu) {
1477 in6_dev->cnf.mtu6 = mtu; 1482 in6_dev->cnf.mtu6 = mtu;
1478 1483 fib6_metric_set(rt, RTAX_MTU, mtu);
1479 if (rt)
1480 dst_metric_set(&rt->dst, RTAX_MTU, mtu);
1481
1482 rt6_mtu_change(skb->dev, mtu); 1484 rt6_mtu_change(skb->dev, mtu);
1483 } 1485 }
1484 } 1486 }
@@ -1497,7 +1499,7 @@ skip_routeinfo:
1497 ND_PRINTK(2, warn, "RA: invalid RA options\n"); 1499 ND_PRINTK(2, warn, "RA: invalid RA options\n");
1498 } 1500 }
1499out: 1501out:
1500 ip6_rt_put(rt); 1502 fib6_info_release(rt);
1501 if (neigh) 1503 if (neigh)
1502 neigh_release(neigh); 1504 neigh_release(neigh);
1503} 1505}
@@ -1576,6 +1578,12 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
1576 ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL; 1578 ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL;
1577 bool ret; 1579 bool ret;
1578 1580
1581 if (netif_is_l3_master(skb->dev)) {
1582 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
1583 if (!dev)
1584 return;
1585 }
1586
1579 if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { 1587 if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
1580 ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n", 1588 ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n",
1581 dev->name); 1589 dev->name);
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index ce77bcc2490c..37b14dc9d863 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -29,7 +29,10 @@ config NF_SOCKET_IPV6
29 tristate "IPv6 socket lookup support" 29 tristate "IPv6 socket lookup support"
30 help 30 help
31 This option enables the IPv6 socket lookup infrastructure. This 31 This option enables the IPv6 socket lookup infrastructure. This
32 is used by the ip6tables socket match. 32 is used by the {ip6,nf}tables socket match.
33
34config NF_TPROXY_IPV6
35 tristate "IPv6 tproxy support"
33 36
34if NF_TABLES 37if NF_TABLES
35 38
@@ -136,10 +139,7 @@ config NF_NAT_IPV6
136if NF_NAT_IPV6 139if NF_NAT_IPV6
137 140
138config NF_NAT_MASQUERADE_IPV6 141config NF_NAT_MASQUERADE_IPV6
139 tristate "IPv6 masquerade support" 142 bool
140 help
141 This is the kernel functionality to provide NAT in the masquerade
142 flavour (automatic source address selection) for IPv6.
143 143
144endif # NF_NAT_IPV6 144endif # NF_NAT_IPV6
145 145
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 44273d6f03a5..10a5a1c87320 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -18,14 +18,15 @@ nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
18obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o 18obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o
19 19
20nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o 20nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o
21nf_nat_ipv6-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
21obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o 22obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o
22obj-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
23 23
24# defrag 24# defrag
25nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o 25nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
26obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o 26obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
27 27
28obj-$(CONFIG_NF_SOCKET_IPV6) += nf_socket_ipv6.o 28obj-$(CONFIG_NF_SOCKET_IPV6) += nf_socket_ipv6.o
29obj-$(CONFIG_NF_TPROXY_IPV6) += nf_tproxy_ipv6.o
29 30
30# logging 31# logging
31obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o 32obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 97f79dc943d7..0758b5bcfb29 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -529,7 +529,6 @@ static int check_target(struct ip6t_entry *e, struct net *net, const char *name)
529 .family = NFPROTO_IPV6, 529 .family = NFPROTO_IPV6,
530 }; 530 };
531 531
532 t = ip6t_get_target(e);
533 return xt_check_target(&par, t->u.target_size - sizeof(*t), 532 return xt_check_target(&par, t->u.target_size - sizeof(*t),
534 e->ipv6.proto, 533 e->ipv6.proto,
535 e->ipv6.invflags & IP6T_INV_PROTO); 534 e->ipv6.invflags & IP6T_INV_PROTO);
@@ -1794,6 +1793,8 @@ int ip6t_register_table(struct net *net, const struct xt_table *table,
1794 1793
1795 /* set res now, will see skbs right after nf_register_net_hooks */ 1794 /* set res now, will see skbs right after nf_register_net_hooks */
1796 WRITE_ONCE(*res, new_table); 1795 WRITE_ONCE(*res, new_table);
1796 if (!ops)
1797 return 0;
1797 1798
1798 ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); 1799 ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
1799 if (ret != 0) { 1800 if (ret != 0) {
@@ -1811,7 +1812,8 @@ out_free:
1811void ip6t_unregister_table(struct net *net, struct xt_table *table, 1812void ip6t_unregister_table(struct net *net, struct xt_table *table,
1812 const struct nf_hook_ops *ops) 1813 const struct nf_hook_ops *ops)
1813{ 1814{
1814 nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); 1815 if (ops)
1816 nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
1815 __ip6t_unregister_table(net, table); 1817 __ip6t_unregister_table(net, table);
1816} 1818}
1817 1819
diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c
index 92c0047e7e33..491f808e356a 100644
--- a/net/ipv6/netfilter/ip6t_MASQUERADE.c
+++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c
@@ -29,7 +29,7 @@ masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par)
29 29
30static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) 30static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par)
31{ 31{
32 const struct nf_nat_range *range = par->targinfo; 32 const struct nf_nat_range2 *range = par->targinfo;
33 33
34 if (range->flags & NF_NAT_RANGE_MAP_IPS) 34 if (range->flags & NF_NAT_RANGE_MAP_IPS)
35 return -EINVAL; 35 return -EINVAL;
diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c
index d12f511929f5..0fe61ede77c6 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -48,6 +48,8 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
48 } 48 }
49 49
50 fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; 50 fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
51 if ((flags & XT_RPFILTER_LOOSE) == 0)
52 fl6.flowi6_oif = dev->ifindex;
51 53
52 rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags); 54 rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags);
53 if (rt->dst.error) 55 if (rt->dst.error)
diff --git a/net/ipv6/netfilter/ip6t_srh.c b/net/ipv6/netfilter/ip6t_srh.c
index 33719d5560c8..1059894a6f4c 100644
--- a/net/ipv6/netfilter/ip6t_srh.c
+++ b/net/ipv6/netfilter/ip6t_srh.c
@@ -117,6 +117,130 @@ static bool srh_mt6(const struct sk_buff *skb, struct xt_action_param *par)
117 return true; 117 return true;
118} 118}
119 119
120static bool srh1_mt6(const struct sk_buff *skb, struct xt_action_param *par)
121{
122 int hdrlen, psidoff, nsidoff, lsidoff, srhoff = 0;
123 const struct ip6t_srh1 *srhinfo = par->matchinfo;
124 struct in6_addr *psid, *nsid, *lsid;
125 struct in6_addr _psid, _nsid, _lsid;
126 struct ipv6_sr_hdr *srh;
127 struct ipv6_sr_hdr _srh;
128
129 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
130 return false;
131 srh = skb_header_pointer(skb, srhoff, sizeof(_srh), &_srh);
132 if (!srh)
133 return false;
134
135 hdrlen = ipv6_optlen(srh);
136 if (skb->len - srhoff < hdrlen)
137 return false;
138
139 if (srh->type != IPV6_SRCRT_TYPE_4)
140 return false;
141
142 if (srh->segments_left > srh->first_segment)
143 return false;
144
145 /* Next Header matching */
146 if (srhinfo->mt_flags & IP6T_SRH_NEXTHDR)
147 if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NEXTHDR,
148 !(srh->nexthdr == srhinfo->next_hdr)))
149 return false;
150
151 /* Header Extension Length matching */
152 if (srhinfo->mt_flags & IP6T_SRH_LEN_EQ)
153 if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_EQ,
154 !(srh->hdrlen == srhinfo->hdr_len)))
155 return false;
156 if (srhinfo->mt_flags & IP6T_SRH_LEN_GT)
157 if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_GT,
158 !(srh->hdrlen > srhinfo->hdr_len)))
159 return false;
160 if (srhinfo->mt_flags & IP6T_SRH_LEN_LT)
161 if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_LT,
162 !(srh->hdrlen < srhinfo->hdr_len)))
163 return false;
164
165 /* Segments Left matching */
166 if (srhinfo->mt_flags & IP6T_SRH_SEGS_EQ)
167 if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_EQ,
168 !(srh->segments_left == srhinfo->segs_left)))
169 return false;
170 if (srhinfo->mt_flags & IP6T_SRH_SEGS_GT)
171 if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_GT,
172 !(srh->segments_left > srhinfo->segs_left)))
173 return false;
174 if (srhinfo->mt_flags & IP6T_SRH_SEGS_LT)
175 if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_LT,
176 !(srh->segments_left < srhinfo->segs_left)))
177 return false;
178
179 /**
180 * Last Entry matching
181 * Last_Entry field was introduced in revision 6 of the SRH draft.
182 * It was called First_Segment in the previous revision
183 */
184 if (srhinfo->mt_flags & IP6T_SRH_LAST_EQ)
185 if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_EQ,
186 !(srh->first_segment == srhinfo->last_entry)))
187 return false;
188 if (srhinfo->mt_flags & IP6T_SRH_LAST_GT)
189 if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_GT,
190 !(srh->first_segment > srhinfo->last_entry)))
191 return false;
192 if (srhinfo->mt_flags & IP6T_SRH_LAST_LT)
193 if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_LT,
194 !(srh->first_segment < srhinfo->last_entry)))
195 return false;
196
197 /**
198 * Tag matching
199 * Tag field was introduced in revision 6 of the SRH draft
200 */
201 if (srhinfo->mt_flags & IP6T_SRH_TAG)
202 if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_TAG,
203 !(srh->tag == srhinfo->tag)))
204 return false;
205
206 /* Previous SID matching */
207 if (srhinfo->mt_flags & IP6T_SRH_PSID) {
208 if (srh->segments_left == srh->first_segment)
209 return false;
210 psidoff = srhoff + sizeof(struct ipv6_sr_hdr) +
211 ((srh->segments_left + 1) * sizeof(struct in6_addr));
212 psid = skb_header_pointer(skb, psidoff, sizeof(_psid), &_psid);
213 if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_PSID,
214 ipv6_masked_addr_cmp(psid, &srhinfo->psid_msk,
215 &srhinfo->psid_addr)))
216 return false;
217 }
218
219 /* Next SID matching */
220 if (srhinfo->mt_flags & IP6T_SRH_NSID) {
221 if (srh->segments_left == 0)
222 return false;
223 nsidoff = srhoff + sizeof(struct ipv6_sr_hdr) +
224 ((srh->segments_left - 1) * sizeof(struct in6_addr));
225 nsid = skb_header_pointer(skb, nsidoff, sizeof(_nsid), &_nsid);
226 if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NSID,
227 ipv6_masked_addr_cmp(nsid, &srhinfo->nsid_msk,
228 &srhinfo->nsid_addr)))
229 return false;
230 }
231
232 /* Last SID matching */
233 if (srhinfo->mt_flags & IP6T_SRH_LSID) {
234 lsidoff = srhoff + sizeof(struct ipv6_sr_hdr);
235 lsid = skb_header_pointer(skb, lsidoff, sizeof(_lsid), &_lsid);
236 if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LSID,
237 ipv6_masked_addr_cmp(lsid, &srhinfo->lsid_msk,
238 &srhinfo->lsid_addr)))
239 return false;
240 }
241 return true;
242}
243
120static int srh_mt6_check(const struct xt_mtchk_param *par) 244static int srh_mt6_check(const struct xt_mtchk_param *par)
121{ 245{
122 const struct ip6t_srh *srhinfo = par->matchinfo; 246 const struct ip6t_srh *srhinfo = par->matchinfo;
@@ -136,23 +260,54 @@ static int srh_mt6_check(const struct xt_mtchk_param *par)
136 return 0; 260 return 0;
137} 261}
138 262
139static struct xt_match srh_mt6_reg __read_mostly = { 263static int srh1_mt6_check(const struct xt_mtchk_param *par)
140 .name = "srh", 264{
141 .family = NFPROTO_IPV6, 265 const struct ip6t_srh1 *srhinfo = par->matchinfo;
142 .match = srh_mt6, 266
143 .matchsize = sizeof(struct ip6t_srh), 267 if (srhinfo->mt_flags & ~IP6T_SRH_MASK) {
144 .checkentry = srh_mt6_check, 268 pr_info_ratelimited("unknown srh match flags %X\n",
145 .me = THIS_MODULE, 269 srhinfo->mt_flags);
270 return -EINVAL;
271 }
272
273 if (srhinfo->mt_invflags & ~IP6T_SRH_INV_MASK) {
274 pr_info_ratelimited("unknown srh invflags %X\n",
275 srhinfo->mt_invflags);
276 return -EINVAL;
277 }
278
279 return 0;
280}
281
282static struct xt_match srh_mt6_reg[] __read_mostly = {
283 {
284 .name = "srh",
285 .revision = 0,
286 .family = NFPROTO_IPV6,
287 .match = srh_mt6,
288 .matchsize = sizeof(struct ip6t_srh),
289 .checkentry = srh_mt6_check,
290 .me = THIS_MODULE,
291 },
292 {
293 .name = "srh",
294 .revision = 1,
295 .family = NFPROTO_IPV6,
296 .match = srh1_mt6,
297 .matchsize = sizeof(struct ip6t_srh1),
298 .checkentry = srh1_mt6_check,
299 .me = THIS_MODULE,
300 }
146}; 301};
147 302
148static int __init srh_mt6_init(void) 303static int __init srh_mt6_init(void)
149{ 304{
150 return xt_register_match(&srh_mt6_reg); 305 return xt_register_matches(srh_mt6_reg, ARRAY_SIZE(srh_mt6_reg));
151} 306}
152 307
153static void __exit srh_mt6_exit(void) 308static void __exit srh_mt6_exit(void)
154{ 309{
155 xt_unregister_match(&srh_mt6_reg); 310 xt_unregister_matches(srh_mt6_reg, ARRAY_SIZE(srh_mt6_reg));
156} 311}
157 312
158module_init(srh_mt6_init); 313module_init(srh_mt6_init);
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index 47306e45a80a..67ba70ab9f5c 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -35,75 +35,63 @@ static const struct xt_table nf_nat_ipv6_table = {
35 35
36static unsigned int ip6table_nat_do_chain(void *priv, 36static unsigned int ip6table_nat_do_chain(void *priv,
37 struct sk_buff *skb, 37 struct sk_buff *skb,
38 const struct nf_hook_state *state,
39 struct nf_conn *ct)
40{
41 return ip6t_do_table(skb, state, state->net->ipv6.ip6table_nat);
42}
43
44static unsigned int ip6table_nat_fn(void *priv,
45 struct sk_buff *skb,
46 const struct nf_hook_state *state)
47{
48 return nf_nat_ipv6_fn(priv, skb, state, ip6table_nat_do_chain);
49}
50
51static unsigned int ip6table_nat_in(void *priv,
52 struct sk_buff *skb,
53 const struct nf_hook_state *state)
54{
55 return nf_nat_ipv6_in(priv, skb, state, ip6table_nat_do_chain);
56}
57
58static unsigned int ip6table_nat_out(void *priv,
59 struct sk_buff *skb,
60 const struct nf_hook_state *state)
61{
62 return nf_nat_ipv6_out(priv, skb, state, ip6table_nat_do_chain);
63}
64
65static unsigned int ip6table_nat_local_fn(void *priv,
66 struct sk_buff *skb,
67 const struct nf_hook_state *state) 38 const struct nf_hook_state *state)
68{ 39{
69 return nf_nat_ipv6_local_fn(priv, skb, state, ip6table_nat_do_chain); 40 return ip6t_do_table(skb, state, state->net->ipv6.ip6table_nat);
70} 41}
71 42
72static const struct nf_hook_ops nf_nat_ipv6_ops[] = { 43static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
73 /* Before packet filtering, change destination */
74 { 44 {
75 .hook = ip6table_nat_in, 45 .hook = ip6table_nat_do_chain,
76 .pf = NFPROTO_IPV6, 46 .pf = NFPROTO_IPV6,
77 .nat_hook = true,
78 .hooknum = NF_INET_PRE_ROUTING, 47 .hooknum = NF_INET_PRE_ROUTING,
79 .priority = NF_IP6_PRI_NAT_DST, 48 .priority = NF_IP6_PRI_NAT_DST,
80 }, 49 },
81 /* After packet filtering, change source */
82 { 50 {
83 .hook = ip6table_nat_out, 51 .hook = ip6table_nat_do_chain,
84 .pf = NFPROTO_IPV6, 52 .pf = NFPROTO_IPV6,
85 .nat_hook = true,
86 .hooknum = NF_INET_POST_ROUTING, 53 .hooknum = NF_INET_POST_ROUTING,
87 .priority = NF_IP6_PRI_NAT_SRC, 54 .priority = NF_IP6_PRI_NAT_SRC,
88 }, 55 },
89 /* Before packet filtering, change destination */
90 { 56 {
91 .hook = ip6table_nat_local_fn, 57 .hook = ip6table_nat_do_chain,
92 .pf = NFPROTO_IPV6, 58 .pf = NFPROTO_IPV6,
93 .nat_hook = true,
94 .hooknum = NF_INET_LOCAL_OUT, 59 .hooknum = NF_INET_LOCAL_OUT,
95 .priority = NF_IP6_PRI_NAT_DST, 60 .priority = NF_IP6_PRI_NAT_DST,
96 }, 61 },
97 /* After packet filtering, change source */
98 { 62 {
99 .hook = ip6table_nat_fn, 63 .hook = ip6table_nat_do_chain,
100 .nat_hook = true,
101 .pf = NFPROTO_IPV6, 64 .pf = NFPROTO_IPV6,
102 .hooknum = NF_INET_LOCAL_IN, 65 .hooknum = NF_INET_LOCAL_IN,
103 .priority = NF_IP6_PRI_NAT_SRC, 66 .priority = NF_IP6_PRI_NAT_SRC,
104 }, 67 },
105}; 68};
106 69
70static int ip6t_nat_register_lookups(struct net *net)
71{
72 int i, ret;
73
74 for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++) {
75 ret = nf_nat_l3proto_ipv6_register_fn(net, &nf_nat_ipv6_ops[i]);
76 if (ret) {
77 while (i)
78 nf_nat_l3proto_ipv6_unregister_fn(net, &nf_nat_ipv6_ops[--i]);
79
80 return ret;
81 }
82 }
83
84 return 0;
85}
86
87static void ip6t_nat_unregister_lookups(struct net *net)
88{
89 int i;
90
91 for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++)
92 nf_nat_l3proto_ipv6_unregister_fn(net, &nf_nat_ipv6_ops[i]);
93}
94
107static int __net_init ip6table_nat_table_init(struct net *net) 95static int __net_init ip6table_nat_table_init(struct net *net)
108{ 96{
109 struct ip6t_replace *repl; 97 struct ip6t_replace *repl;
@@ -116,7 +104,17 @@ static int __net_init ip6table_nat_table_init(struct net *net)
116 if (repl == NULL) 104 if (repl == NULL)
117 return -ENOMEM; 105 return -ENOMEM;
118 ret = ip6t_register_table(net, &nf_nat_ipv6_table, repl, 106 ret = ip6t_register_table(net, &nf_nat_ipv6_table, repl,
119 nf_nat_ipv6_ops, &net->ipv6.ip6table_nat); 107 NULL, &net->ipv6.ip6table_nat);
108 if (ret < 0) {
109 kfree(repl);
110 return ret;
111 }
112
113 ret = ip6t_nat_register_lookups(net);
114 if (ret < 0) {
115 ip6t_unregister_table(net, net->ipv6.ip6table_nat, NULL);
116 net->ipv6.ip6table_nat = NULL;
117 }
120 kfree(repl); 118 kfree(repl);
121 return ret; 119 return ret;
122} 120}
@@ -125,7 +123,8 @@ static void __net_exit ip6table_nat_net_exit(struct net *net)
125{ 123{
126 if (!net->ipv6.ip6table_nat) 124 if (!net->ipv6.ip6table_nat)
127 return; 125 return;
128 ip6t_unregister_table(net, net->ipv6.ip6table_nat, nf_nat_ipv6_ops); 126 ip6t_nat_unregister_lookups(net);
127 ip6t_unregister_table(net, net->ipv6.ip6table_nat, NULL);
129 net->ipv6.ip6table_nat = NULL; 128 net->ipv6.ip6table_nat = NULL;
130} 129}
131 130
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index 207cb35569b1..c511d206bf9b 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -3,256 +3,12 @@
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/netfilter.h> 4#include <linux/netfilter.h>
5#include <linux/rhashtable.h> 5#include <linux/rhashtable.h>
6#include <linux/ipv6.h>
7#include <linux/netdevice.h>
8#include <net/ipv6.h>
9#include <net/ip6_route.h>
10#include <net/neighbour.h>
11#include <net/netfilter/nf_flow_table.h> 6#include <net/netfilter/nf_flow_table.h>
12#include <net/netfilter/nf_tables.h> 7#include <net/netfilter/nf_tables.h>
13/* For layer 4 checksum field offset. */
14#include <linux/tcp.h>
15#include <linux/udp.h>
16
17static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
18 struct in6_addr *addr,
19 struct in6_addr *new_addr)
20{
21 struct tcphdr *tcph;
22
23 if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
24 skb_try_make_writable(skb, thoff + sizeof(*tcph)))
25 return -1;
26
27 tcph = (void *)(skb_network_header(skb) + thoff);
28 inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32,
29 new_addr->s6_addr32, true);
30
31 return 0;
32}
33
34static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
35 struct in6_addr *addr,
36 struct in6_addr *new_addr)
37{
38 struct udphdr *udph;
39
40 if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
41 skb_try_make_writable(skb, thoff + sizeof(*udph)))
42 return -1;
43
44 udph = (void *)(skb_network_header(skb) + thoff);
45 if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
46 inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32,
47 new_addr->s6_addr32, true);
48 if (!udph->check)
49 udph->check = CSUM_MANGLED_0;
50 }
51
52 return 0;
53}
54
55static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
56 unsigned int thoff, struct in6_addr *addr,
57 struct in6_addr *new_addr)
58{
59 switch (ip6h->nexthdr) {
60 case IPPROTO_TCP:
61 if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0)
62 return NF_DROP;
63 break;
64 case IPPROTO_UDP:
65 if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0)
66 return NF_DROP;
67 break;
68 }
69
70 return 0;
71}
72
73static int nf_flow_snat_ipv6(const struct flow_offload *flow,
74 struct sk_buff *skb, struct ipv6hdr *ip6h,
75 unsigned int thoff,
76 enum flow_offload_tuple_dir dir)
77{
78 struct in6_addr addr, new_addr;
79
80 switch (dir) {
81 case FLOW_OFFLOAD_DIR_ORIGINAL:
82 addr = ip6h->saddr;
83 new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6;
84 ip6h->saddr = new_addr;
85 break;
86 case FLOW_OFFLOAD_DIR_REPLY:
87 addr = ip6h->daddr;
88 new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6;
89 ip6h->daddr = new_addr;
90 break;
91 default:
92 return -1;
93 }
94
95 return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
96}
97
98static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
99 struct sk_buff *skb, struct ipv6hdr *ip6h,
100 unsigned int thoff,
101 enum flow_offload_tuple_dir dir)
102{
103 struct in6_addr addr, new_addr;
104
105 switch (dir) {
106 case FLOW_OFFLOAD_DIR_ORIGINAL:
107 addr = ip6h->daddr;
108 new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6;
109 ip6h->daddr = new_addr;
110 break;
111 case FLOW_OFFLOAD_DIR_REPLY:
112 addr = ip6h->saddr;
113 new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6;
114 ip6h->saddr = new_addr;
115 break;
116 default:
117 return -1;
118 }
119
120 return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
121}
122
123static int nf_flow_nat_ipv6(const struct flow_offload *flow,
124 struct sk_buff *skb,
125 enum flow_offload_tuple_dir dir)
126{
127 struct ipv6hdr *ip6h = ipv6_hdr(skb);
128 unsigned int thoff = sizeof(*ip6h);
129
130 if (flow->flags & FLOW_OFFLOAD_SNAT &&
131 (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
132 nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
133 return -1;
134 if (flow->flags & FLOW_OFFLOAD_DNAT &&
135 (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
136 nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
137 return -1;
138
139 return 0;
140}
141
142static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
143 struct flow_offload_tuple *tuple)
144{
145 struct flow_ports *ports;
146 struct ipv6hdr *ip6h;
147 unsigned int thoff;
148
149 if (!pskb_may_pull(skb, sizeof(*ip6h)))
150 return -1;
151
152 ip6h = ipv6_hdr(skb);
153
154 if (ip6h->nexthdr != IPPROTO_TCP &&
155 ip6h->nexthdr != IPPROTO_UDP)
156 return -1;
157
158 thoff = sizeof(*ip6h);
159 if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
160 return -1;
161
162 ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
163
164 tuple->src_v6 = ip6h->saddr;
165 tuple->dst_v6 = ip6h->daddr;
166 tuple->src_port = ports->source;
167 tuple->dst_port = ports->dest;
168 tuple->l3proto = AF_INET6;
169 tuple->l4proto = ip6h->nexthdr;
170 tuple->iifidx = dev->ifindex;
171
172 return 0;
173}
174
175/* Based on ip_exceeds_mtu(). */
176static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
177{
178 if (skb->len <= mtu)
179 return false;
180
181 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
182 return false;
183
184 return true;
185}
186
187static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rt6_info *rt)
188{
189 u32 mtu;
190
191 mtu = ip6_dst_mtu_forward(&rt->dst);
192 if (__nf_flow_exceeds_mtu(skb, mtu))
193 return true;
194
195 return false;
196}
197
198unsigned int
199nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
200 const struct nf_hook_state *state)
201{
202 struct flow_offload_tuple_rhash *tuplehash;
203 struct nf_flowtable *flow_table = priv;
204 struct flow_offload_tuple tuple = {};
205 enum flow_offload_tuple_dir dir;
206 struct flow_offload *flow;
207 struct net_device *outdev;
208 struct in6_addr *nexthop;
209 struct ipv6hdr *ip6h;
210 struct rt6_info *rt;
211
212 if (skb->protocol != htons(ETH_P_IPV6))
213 return NF_ACCEPT;
214
215 if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0)
216 return NF_ACCEPT;
217
218 tuplehash = flow_offload_lookup(flow_table, &tuple);
219 if (tuplehash == NULL)
220 return NF_ACCEPT;
221
222 outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
223 if (!outdev)
224 return NF_ACCEPT;
225
226 dir = tuplehash->tuple.dir;
227 flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
228
229 rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
230 if (unlikely(nf_flow_exceeds_mtu(skb, rt)))
231 return NF_ACCEPT;
232
233 if (skb_try_make_writable(skb, sizeof(*ip6h)))
234 return NF_DROP;
235
236 if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
237 nf_flow_nat_ipv6(flow, skb, dir) < 0)
238 return NF_DROP;
239
240 flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
241 ip6h = ipv6_hdr(skb);
242 ip6h->hop_limit--;
243
244 skb->dev = outdev;
245 nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
246 neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
247
248 return NF_STOLEN;
249}
250EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
251 8
252static struct nf_flowtable_type flowtable_ipv6 = { 9static struct nf_flowtable_type flowtable_ipv6 = {
253 .family = NFPROTO_IPV6, 10 .family = NFPROTO_IPV6,
254 .params = &nf_flow_offload_rhash_params, 11 .init = nf_flow_table_init,
255 .gc = nf_flow_offload_work_gc,
256 .free = nf_flow_table_free, 12 .free = nf_flow_table_free,
257 .hook = nf_flow_offload_ipv6_hook, 13 .hook = nf_flow_offload_ipv6_hook,
258 .owner = THIS_MODULE, 14 .owner = THIS_MODULE,
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index 6b7f075f811f..ca6d38698b1a 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -62,7 +62,7 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
62#endif 62#endif
63 63
64static bool nf_nat_ipv6_in_range(const struct nf_conntrack_tuple *t, 64static bool nf_nat_ipv6_in_range(const struct nf_conntrack_tuple *t,
65 const struct nf_nat_range *range) 65 const struct nf_nat_range2 *range)
66{ 66{
67 return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 && 67 return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 &&
68 ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0; 68 ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0;
@@ -151,7 +151,7 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
151 151
152#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 152#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
153static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], 153static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[],
154 struct nf_nat_range *range) 154 struct nf_nat_range2 *range)
155{ 155{
156 if (tb[CTA_NAT_V6_MINIP]) { 156 if (tb[CTA_NAT_V6_MINIP]) {
157 nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], 157 nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP],
@@ -252,18 +252,12 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb,
252} 252}
253EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation); 253EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation);
254 254
255unsigned int 255static unsigned int
256nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, 256nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
257 const struct nf_hook_state *state, 257 const struct nf_hook_state *state)
258 unsigned int (*do_chain)(void *priv,
259 struct sk_buff *skb,
260 const struct nf_hook_state *state,
261 struct nf_conn *ct))
262{ 258{
263 struct nf_conn *ct; 259 struct nf_conn *ct;
264 enum ip_conntrack_info ctinfo; 260 enum ip_conntrack_info ctinfo;
265 struct nf_conn_nat *nat;
266 enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
267 __be16 frag_off; 261 __be16 frag_off;
268 int hdrlen; 262 int hdrlen;
269 u8 nexthdr; 263 u8 nexthdr;
@@ -277,11 +271,7 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
277 if (!ct) 271 if (!ct)
278 return NF_ACCEPT; 272 return NF_ACCEPT;
279 273
280 nat = nfct_nat(ct); 274 if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) {
281
282 switch (ctinfo) {
283 case IP_CT_RELATED:
284 case IP_CT_RELATED_REPLY:
285 nexthdr = ipv6_hdr(skb)->nexthdr; 275 nexthdr = ipv6_hdr(skb)->nexthdr;
286 hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), 276 hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
287 &nexthdr, &frag_off); 277 &nexthdr, &frag_off);
@@ -294,77 +284,29 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
294 else 284 else
295 return NF_ACCEPT; 285 return NF_ACCEPT;
296 } 286 }
297 /* Only ICMPs can be IP_CT_IS_REPLY: */
298 /* fall through */
299 case IP_CT_NEW:
300 /* Seen it before? This can happen for loopback, retrans,
301 * or local packets.
302 */
303 if (!nf_nat_initialized(ct, maniptype)) {
304 unsigned int ret;
305
306 ret = do_chain(priv, skb, state, ct);
307 if (ret != NF_ACCEPT)
308 return ret;
309
310 if (nf_nat_initialized(ct, HOOK2MANIP(state->hook)))
311 break;
312
313 ret = nf_nat_alloc_null_binding(ct, state->hook);
314 if (ret != NF_ACCEPT)
315 return ret;
316 } else {
317 pr_debug("Already setup manip %s for ct %p\n",
318 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
319 ct);
320 if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
321 goto oif_changed;
322 }
323 break;
324
325 default:
326 /* ESTABLISHED */
327 WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
328 ctinfo != IP_CT_ESTABLISHED_REPLY);
329 if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
330 goto oif_changed;
331 } 287 }
332 288
333 return nf_nat_packet(ct, ctinfo, state->hook, skb); 289 return nf_nat_inet_fn(priv, skb, state);
334
335oif_changed:
336 nf_ct_kill_acct(ct, ctinfo, skb);
337 return NF_DROP;
338} 290}
339EXPORT_SYMBOL_GPL(nf_nat_ipv6_fn);
340 291
341unsigned int 292static unsigned int
342nf_nat_ipv6_in(void *priv, struct sk_buff *skb, 293nf_nat_ipv6_in(void *priv, struct sk_buff *skb,
343 const struct nf_hook_state *state, 294 const struct nf_hook_state *state)
344 unsigned int (*do_chain)(void *priv,
345 struct sk_buff *skb,
346 const struct nf_hook_state *state,
347 struct nf_conn *ct))
348{ 295{
349 unsigned int ret; 296 unsigned int ret;
350 struct in6_addr daddr = ipv6_hdr(skb)->daddr; 297 struct in6_addr daddr = ipv6_hdr(skb)->daddr;
351 298
352 ret = nf_nat_ipv6_fn(priv, skb, state, do_chain); 299 ret = nf_nat_ipv6_fn(priv, skb, state);
353 if (ret != NF_DROP && ret != NF_STOLEN && 300 if (ret != NF_DROP && ret != NF_STOLEN &&
354 ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) 301 ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr))
355 skb_dst_drop(skb); 302 skb_dst_drop(skb);
356 303
357 return ret; 304 return ret;
358} 305}
359EXPORT_SYMBOL_GPL(nf_nat_ipv6_in);
360 306
361unsigned int 307static unsigned int
362nf_nat_ipv6_out(void *priv, struct sk_buff *skb, 308nf_nat_ipv6_out(void *priv, struct sk_buff *skb,
363 const struct nf_hook_state *state, 309 const struct nf_hook_state *state)
364 unsigned int (*do_chain)(void *priv,
365 struct sk_buff *skb,
366 const struct nf_hook_state *state,
367 struct nf_conn *ct))
368{ 310{
369#ifdef CONFIG_XFRM 311#ifdef CONFIG_XFRM
370 const struct nf_conn *ct; 312 const struct nf_conn *ct;
@@ -373,7 +315,7 @@ nf_nat_ipv6_out(void *priv, struct sk_buff *skb,
373#endif 315#endif
374 unsigned int ret; 316 unsigned int ret;
375 317
376 ret = nf_nat_ipv6_fn(priv, skb, state, do_chain); 318 ret = nf_nat_ipv6_fn(priv, skb, state);
377#ifdef CONFIG_XFRM 319#ifdef CONFIG_XFRM
378 if (ret != NF_DROP && ret != NF_STOLEN && 320 if (ret != NF_DROP && ret != NF_STOLEN &&
379 !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && 321 !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
@@ -393,22 +335,17 @@ nf_nat_ipv6_out(void *priv, struct sk_buff *skb,
393#endif 335#endif
394 return ret; 336 return ret;
395} 337}
396EXPORT_SYMBOL_GPL(nf_nat_ipv6_out);
397 338
398unsigned int 339static unsigned int
399nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, 340nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
400 const struct nf_hook_state *state, 341 const struct nf_hook_state *state)
401 unsigned int (*do_chain)(void *priv,
402 struct sk_buff *skb,
403 const struct nf_hook_state *state,
404 struct nf_conn *ct))
405{ 342{
406 const struct nf_conn *ct; 343 const struct nf_conn *ct;
407 enum ip_conntrack_info ctinfo; 344 enum ip_conntrack_info ctinfo;
408 unsigned int ret; 345 unsigned int ret;
409 int err; 346 int err;
410 347
411 ret = nf_nat_ipv6_fn(priv, skb, state, do_chain); 348 ret = nf_nat_ipv6_fn(priv, skb, state);
412 if (ret != NF_DROP && ret != NF_STOLEN && 349 if (ret != NF_DROP && ret != NF_STOLEN &&
413 (ct = nf_ct_get(skb, &ctinfo)) != NULL) { 350 (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
414 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 351 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
@@ -432,7 +369,49 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
432 } 369 }
433 return ret; 370 return ret;
434} 371}
435EXPORT_SYMBOL_GPL(nf_nat_ipv6_local_fn); 372
373static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
374 /* Before packet filtering, change destination */
375 {
376 .hook = nf_nat_ipv6_in,
377 .pf = NFPROTO_IPV6,
378 .hooknum = NF_INET_PRE_ROUTING,
379 .priority = NF_IP6_PRI_NAT_DST,
380 },
381 /* After packet filtering, change source */
382 {
383 .hook = nf_nat_ipv6_out,
384 .pf = NFPROTO_IPV6,
385 .hooknum = NF_INET_POST_ROUTING,
386 .priority = NF_IP6_PRI_NAT_SRC,
387 },
388 /* Before packet filtering, change destination */
389 {
390 .hook = nf_nat_ipv6_local_fn,
391 .pf = NFPROTO_IPV6,
392 .hooknum = NF_INET_LOCAL_OUT,
393 .priority = NF_IP6_PRI_NAT_DST,
394 },
395 /* After packet filtering, change source */
396 {
397 .hook = nf_nat_ipv6_fn,
398 .pf = NFPROTO_IPV6,
399 .hooknum = NF_INET_LOCAL_IN,
400 .priority = NF_IP6_PRI_NAT_SRC,
401 },
402};
403
404int nf_nat_l3proto_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops)
405{
406 return nf_nat_register_fn(net, ops, nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops));
407}
408EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv6_register_fn);
409
410void nf_nat_l3proto_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
411{
412 nf_nat_unregister_fn(net, ops, ARRAY_SIZE(nf_nat_ipv6_ops));
413}
414EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv6_unregister_fn);
436 415
437static int __init nf_nat_l3proto_ipv6_init(void) 416static int __init nf_nat_l3proto_ipv6_init(void)
438{ 417{
diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
index 98f61fcb9108..e6eb7cf9b54f 100644
--- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h>
14#include <linux/atomic.h> 13#include <linux/atomic.h>
15#include <linux/netdevice.h> 14#include <linux/netdevice.h>
16#include <linux/ipv6.h> 15#include <linux/ipv6.h>
@@ -26,14 +25,14 @@
26static atomic_t v6_worker_count; 25static atomic_t v6_worker_count;
27 26
28unsigned int 27unsigned int
29nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, 28nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
30 const struct net_device *out) 29 const struct net_device *out)
31{ 30{
32 enum ip_conntrack_info ctinfo; 31 enum ip_conntrack_info ctinfo;
33 struct nf_conn_nat *nat; 32 struct nf_conn_nat *nat;
34 struct in6_addr src; 33 struct in6_addr src;
35 struct nf_conn *ct; 34 struct nf_conn *ct;
36 struct nf_nat_range newrange; 35 struct nf_nat_range2 newrange;
37 36
38 ct = nf_ct_get(skb, &ctinfo); 37 ct = nf_ct_get(skb, &ctinfo);
39 WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || 38 WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
@@ -186,6 +185,3 @@ void nf_nat_masquerade_ipv6_unregister_notifier(void)
186 unregister_netdevice_notifier(&masq_dev_notifier); 185 unregister_netdevice_notifier(&masq_dev_notifier);
187} 186}
188EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_unregister_notifier); 187EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_unregister_notifier);
189
190MODULE_LICENSE("GPL");
191MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
index 57593b00c5b4..d9bf42ba44fa 100644
--- a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
@@ -32,7 +32,7 @@ icmpv6_in_range(const struct nf_conntrack_tuple *tuple,
32static void 32static void
33icmpv6_unique_tuple(const struct nf_nat_l3proto *l3proto, 33icmpv6_unique_tuple(const struct nf_nat_l3proto *l3proto,
34 struct nf_conntrack_tuple *tuple, 34 struct nf_conntrack_tuple *tuple,
35 const struct nf_nat_range *range, 35 const struct nf_nat_range2 *range,
36 enum nf_nat_manip_type maniptype, 36 enum nf_nat_manip_type maniptype,
37 const struct nf_conn *ct) 37 const struct nf_conn *ct)
38{ 38{
diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c
new file mode 100644
index 000000000000..bf1d6c421e3b
--- /dev/null
+++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c
@@ -0,0 +1,146 @@
1#include <net/netfilter/nf_tproxy.h>
2#include <linux/module.h>
3#include <net/inet6_hashtables.h>
4#include <net/addrconf.h>
5#include <net/udp.h>
6#include <net/tcp.h>
7
8const struct in6_addr *
9nf_tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
10 const struct in6_addr *daddr)
11{
12 struct inet6_dev *indev;
13 struct inet6_ifaddr *ifa;
14 struct in6_addr *laddr;
15
16 if (!ipv6_addr_any(user_laddr))
17 return user_laddr;
18 laddr = NULL;
19
20 indev = __in6_dev_get(skb->dev);
21 if (indev) {
22 read_lock_bh(&indev->lock);
23 list_for_each_entry(ifa, &indev->addr_list, if_list) {
24 if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
25 continue;
26
27 laddr = &ifa->addr;
28 break;
29 }
30 read_unlock_bh(&indev->lock);
31 }
32
33 return laddr ? laddr : daddr;
34}
35EXPORT_SYMBOL_GPL(nf_tproxy_laddr6);
36
37struct sock *
38nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
39 struct net *net,
40 const struct in6_addr *laddr,
41 const __be16 lport,
42 struct sock *sk)
43{
44 const struct ipv6hdr *iph = ipv6_hdr(skb);
45 struct tcphdr _hdr, *hp;
46
47 hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
48 if (hp == NULL) {
49 inet_twsk_put(inet_twsk(sk));
50 return NULL;
51 }
52
53 if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
54 /* SYN to a TIME_WAIT socket, we'd rather redirect it
55 * to a listener socket if there's one */
56 struct sock *sk2;
57
58 sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, hp, tproto,
59 &iph->saddr,
60 nf_tproxy_laddr6(skb, laddr, &iph->daddr),
61 hp->source,
62 lport ? lport : hp->dest,
63 skb->dev, NF_TPROXY_LOOKUP_LISTENER);
64 if (sk2) {
65 inet_twsk_deschedule_put(inet_twsk(sk));
66 sk = sk2;
67 }
68 }
69
70 return sk;
71}
72EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait6);
73
74struct sock *
75nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
76 const u8 protocol,
77 const struct in6_addr *saddr, const struct in6_addr *daddr,
78 const __be16 sport, const __be16 dport,
79 const struct net_device *in,
80 const enum nf_tproxy_lookup_t lookup_type)
81{
82 struct sock *sk;
83 struct tcphdr *tcph;
84
85 switch (protocol) {
86 case IPPROTO_TCP:
87 switch (lookup_type) {
88 case NF_TPROXY_LOOKUP_LISTENER:
89 tcph = hp;
90 sk = inet6_lookup_listener(net, &tcp_hashinfo, skb,
91 thoff + __tcp_hdrlen(tcph),
92 saddr, sport,
93 daddr, ntohs(dport),
94 in->ifindex, 0);
95
96 if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
97 sk = NULL;
98 /* NOTE: we return listeners even if bound to
99 * 0.0.0.0, those are filtered out in
100 * xt_socket, since xt_TPROXY needs 0 bound
101 * listeners too
102 */
103 break;
104 case NF_TPROXY_LOOKUP_ESTABLISHED:
105 sk = __inet6_lookup_established(net, &tcp_hashinfo,
106 saddr, sport, daddr, ntohs(dport),
107 in->ifindex, 0);
108 break;
109 default:
110 BUG();
111 }
112 break;
113 case IPPROTO_UDP:
114 sk = udp6_lib_lookup(net, saddr, sport, daddr, dport,
115 in->ifindex);
116 if (sk) {
117 int connected = (sk->sk_state == TCP_ESTABLISHED);
118 int wildcard = ipv6_addr_any(&sk->sk_v6_rcv_saddr);
119
120 /* NOTE: we return listeners even if bound to
121 * 0.0.0.0, those are filtered out in
122 * xt_socket, since xt_TPROXY needs 0 bound
123 * listeners too
124 */
125 if ((lookup_type == NF_TPROXY_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
126 (lookup_type == NF_TPROXY_LOOKUP_LISTENER && connected)) {
127 sock_put(sk);
128 sk = NULL;
129 }
130 }
131 break;
132 default:
133 WARN_ON(1);
134 sk = NULL;
135 }
136
137 pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n",
138 protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk);
139
140 return sk;
141}
142EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v6);
143
144MODULE_LICENSE("GPL");
145MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
146 MODULE_DESCRIPTION("Netfilter IPv6 transparent proxy support");
diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
index 3557b114446c..8a081ad7d5db 100644
--- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
@@ -26,8 +26,7 @@
26 26
27static unsigned int nft_nat_do_chain(void *priv, 27static unsigned int nft_nat_do_chain(void *priv,
28 struct sk_buff *skb, 28 struct sk_buff *skb,
29 const struct nf_hook_state *state, 29 const struct nf_hook_state *state)
30 struct nf_conn *ct)
31{ 30{
32 struct nft_pktinfo pkt; 31 struct nft_pktinfo pkt;
33 32
@@ -37,42 +36,14 @@ static unsigned int nft_nat_do_chain(void *priv,
37 return nft_do_chain(&pkt, priv); 36 return nft_do_chain(&pkt, priv);
38} 37}
39 38
40static unsigned int nft_nat_ipv6_fn(void *priv, 39static int nft_nat_ipv6_reg(struct net *net, const struct nf_hook_ops *ops)
41 struct sk_buff *skb,
42 const struct nf_hook_state *state)
43{
44 return nf_nat_ipv6_fn(priv, skb, state, nft_nat_do_chain);
45}
46
47static unsigned int nft_nat_ipv6_in(void *priv,
48 struct sk_buff *skb,
49 const struct nf_hook_state *state)
50{
51 return nf_nat_ipv6_in(priv, skb, state, nft_nat_do_chain);
52}
53
54static unsigned int nft_nat_ipv6_out(void *priv,
55 struct sk_buff *skb,
56 const struct nf_hook_state *state)
57{
58 return nf_nat_ipv6_out(priv, skb, state, nft_nat_do_chain);
59}
60
61static unsigned int nft_nat_ipv6_local_fn(void *priv,
62 struct sk_buff *skb,
63 const struct nf_hook_state *state)
64{
65 return nf_nat_ipv6_local_fn(priv, skb, state, nft_nat_do_chain);
66}
67
68static int nft_nat_ipv6_init(struct nft_ctx *ctx)
69{ 40{
70 return nf_ct_netns_get(ctx->net, ctx->family); 41 return nf_nat_l3proto_ipv6_register_fn(net, ops);
71} 42}
72 43
73static void nft_nat_ipv6_free(struct nft_ctx *ctx) 44static void nft_nat_ipv6_unreg(struct net *net, const struct nf_hook_ops *ops)
74{ 45{
75 nf_ct_netns_put(ctx->net, ctx->family); 46 nf_nat_l3proto_ipv6_unregister_fn(net, ops);
76} 47}
77 48
78static const struct nft_chain_type nft_chain_nat_ipv6 = { 49static const struct nft_chain_type nft_chain_nat_ipv6 = {
@@ -85,13 +56,13 @@ static const struct nft_chain_type nft_chain_nat_ipv6 = {
85 (1 << NF_INET_LOCAL_OUT) | 56 (1 << NF_INET_LOCAL_OUT) |
86 (1 << NF_INET_LOCAL_IN), 57 (1 << NF_INET_LOCAL_IN),
87 .hooks = { 58 .hooks = {
88 [NF_INET_PRE_ROUTING] = nft_nat_ipv6_in, 59 [NF_INET_PRE_ROUTING] = nft_nat_do_chain,
89 [NF_INET_POST_ROUTING] = nft_nat_ipv6_out, 60 [NF_INET_POST_ROUTING] = nft_nat_do_chain,
90 [NF_INET_LOCAL_OUT] = nft_nat_ipv6_local_fn, 61 [NF_INET_LOCAL_OUT] = nft_nat_do_chain,
91 [NF_INET_LOCAL_IN] = nft_nat_ipv6_fn, 62 [NF_INET_LOCAL_IN] = nft_nat_do_chain,
92 }, 63 },
93 .init = nft_nat_ipv6_init, 64 .ops_register = nft_nat_ipv6_reg,
94 .free = nft_nat_ipv6_free, 65 .ops_unregister = nft_nat_ipv6_unreg,
95}; 66};
96 67
97static int __init nft_chain_nat_ipv6_init(void) 68static int __init nft_chain_nat_ipv6_init(void)
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
index 4146536e9c15..dd0122f3cffe 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -22,7 +22,7 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr,
22 const struct nft_pktinfo *pkt) 22 const struct nft_pktinfo *pkt)
23{ 23{
24 struct nft_masq *priv = nft_expr_priv(expr); 24 struct nft_masq *priv = nft_expr_priv(expr);
25 struct nf_nat_range range; 25 struct nf_nat_range2 range;
26 26
27 memset(&range, 0, sizeof(range)); 27 memset(&range, 0, sizeof(range));
28 range.flags = priv->flags; 28 range.flags = priv->flags;
diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c
index a27e424f690d..74269865acc8 100644
--- a/net/ipv6/netfilter/nft_redir_ipv6.c
+++ b/net/ipv6/netfilter/nft_redir_ipv6.c
@@ -22,7 +22,7 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr,
22 const struct nft_pktinfo *pkt) 22 const struct nft_pktinfo *pkt)
23{ 23{
24 struct nft_redir *priv = nft_expr_priv(expr); 24 struct nft_redir *priv = nft_expr_priv(expr);
25 struct nf_nat_range range; 25 struct nf_nat_range2 range;
26 26
27 memset(&range, 0, sizeof(range)); 27 memset(&range, 0, sizeof(range));
28 if (priv->sreg_proto_min) { 28 if (priv->sreg_proto_min) {
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 4979610287e2..b939b94e7e91 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -163,7 +163,8 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
163} 163}
164 164
165static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, 165static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
166 struct frag_hdr *fhdr, int nhoff) 166 struct frag_hdr *fhdr, int nhoff,
167 u32 *prob_offset)
167{ 168{
168 struct sk_buff *prev, *next; 169 struct sk_buff *prev, *next;
169 struct net_device *dev; 170 struct net_device *dev;
@@ -179,11 +180,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
179 ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); 180 ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));
180 181
181 if ((unsigned int)end > IPV6_MAXPLEN) { 182 if ((unsigned int)end > IPV6_MAXPLEN) {
182 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 183 *prob_offset = (u8 *)&fhdr->frag_off - skb_network_header(skb);
183 IPSTATS_MIB_INHDRERRORS);
184 icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
185 ((u8 *)&fhdr->frag_off -
186 skb_network_header(skb)));
187 return -1; 184 return -1;
188 } 185 }
189 186
@@ -214,10 +211,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
214 /* RFC2460 says always send parameter problem in 211 /* RFC2460 says always send parameter problem in
215 * this case. -DaveM 212 * this case. -DaveM
216 */ 213 */
217 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 214 *prob_offset = offsetof(struct ipv6hdr, payload_len);
218 IPSTATS_MIB_INHDRERRORS);
219 icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
220 offsetof(struct ipv6hdr, payload_len));
221 return -1; 215 return -1;
222 } 216 }
223 if (end > fq->q.len) { 217 if (end > fq->q.len) {
@@ -519,15 +513,22 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
519 iif = skb->dev ? skb->dev->ifindex : 0; 513 iif = skb->dev ? skb->dev->ifindex : 0;
520 fq = fq_find(net, fhdr->identification, hdr, iif); 514 fq = fq_find(net, fhdr->identification, hdr, iif);
521 if (fq) { 515 if (fq) {
516 u32 prob_offset = 0;
522 int ret; 517 int ret;
523 518
524 spin_lock(&fq->q.lock); 519 spin_lock(&fq->q.lock);
525 520
526 fq->iif = iif; 521 fq->iif = iif;
527 ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); 522 ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff,
523 &prob_offset);
528 524
529 spin_unlock(&fq->q.lock); 525 spin_unlock(&fq->q.lock);
530 inet_frag_put(&fq->q); 526 inet_frag_put(&fq->q);
527 if (prob_offset) {
528 __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
529 IPSTATS_MIB_INHDRERRORS);
530 icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, prob_offset);
531 }
531 return ret; 532 return ret;
532 } 533 }
533 534
@@ -536,7 +537,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
536 return -1; 537 return -1;
537 538
538fail_hdr: 539fail_hdr:
539 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 540 __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
540 IPSTATS_MIB_INHDRERRORS); 541 IPSTATS_MIB_INHDRERRORS);
541 icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb)); 542 icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb));
542 return -1; 543 return -1;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a6598762d2c1..fb956989adaf 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -63,14 +63,20 @@
63#include <net/lwtunnel.h> 63#include <net/lwtunnel.h>
64#include <net/ip_tunnels.h> 64#include <net/ip_tunnels.h>
65#include <net/l3mdev.h> 65#include <net/l3mdev.h>
66#include <trace/events/fib6.h> 66#include <net/ip.h>
67
68#include <linux/uaccess.h> 67#include <linux/uaccess.h>
69 68
70#ifdef CONFIG_SYSCTL 69#ifdef CONFIG_SYSCTL
71#include <linux/sysctl.h> 70#include <linux/sysctl.h>
72#endif 71#endif
73 72
73static int ip6_rt_type_to_error(u8 fib6_type);
74
75#define CREATE_TRACE_POINTS
76#include <trace/events/fib6.h>
77EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78#undef CREATE_TRACE_POINTS
79
74enum rt6_nud_state { 80enum rt6_nud_state {
75 RT6_NUD_FAIL_HARD = -3, 81 RT6_NUD_FAIL_HARD = -3,
76 RT6_NUD_FAIL_PROBE = -2, 82 RT6_NUD_FAIL_PROBE = -2,
@@ -78,7 +84,6 @@ enum rt6_nud_state {
78 RT6_NUD_SUCCEED = 1 84 RT6_NUD_SUCCEED = 1
79}; 85};
80 86
81static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); 87static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83static unsigned int ip6_default_advmss(const struct dst_entry *dst); 88static unsigned int ip6_default_advmss(const struct dst_entry *dst);
84static unsigned int ip6_mtu(const struct dst_entry *dst); 89static unsigned int ip6_mtu(const struct dst_entry *dst);
@@ -97,25 +102,24 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97 struct sk_buff *skb, u32 mtu); 102 struct sk_buff *skb, u32 mtu);
98static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, 103static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99 struct sk_buff *skb); 104 struct sk_buff *skb);
100static void rt6_dst_from_metrics_check(struct rt6_info *rt); 105static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
101static int rt6_score_route(struct rt6_info *rt, int oif, int strict); 106static size_t rt6_nlmsg_size(struct fib6_info *rt);
102static size_t rt6_nlmsg_size(struct rt6_info *rt); 107static int rt6_fill_node(struct net *net, struct sk_buff *skb,
103static int rt6_fill_node(struct net *net, 108 struct fib6_info *rt, struct dst_entry *dst,
104 struct sk_buff *skb, struct rt6_info *rt, 109 struct in6_addr *dest, struct in6_addr *src,
105 struct in6_addr *dst, struct in6_addr *src,
106 int iif, int type, u32 portid, u32 seq, 110 int iif, int type, u32 portid, u32 seq,
107 unsigned int flags); 111 unsigned int flags);
108static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, 112static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
109 struct in6_addr *daddr, 113 struct in6_addr *daddr,
110 struct in6_addr *saddr); 114 struct in6_addr *saddr);
111 115
112#ifdef CONFIG_IPV6_ROUTE_INFO 116#ifdef CONFIG_IPV6_ROUTE_INFO
113static struct rt6_info *rt6_add_route_info(struct net *net, 117static struct fib6_info *rt6_add_route_info(struct net *net,
114 const struct in6_addr *prefix, int prefixlen, 118 const struct in6_addr *prefix, int prefixlen,
115 const struct in6_addr *gwaddr, 119 const struct in6_addr *gwaddr,
116 struct net_device *dev, 120 struct net_device *dev,
117 unsigned int pref); 121 unsigned int pref);
118static struct rt6_info *rt6_get_route_info(struct net *net, 122static struct fib6_info *rt6_get_route_info(struct net *net,
119 const struct in6_addr *prefix, int prefixlen, 123 const struct in6_addr *prefix, int prefixlen,
120 const struct in6_addr *gwaddr, 124 const struct in6_addr *gwaddr,
121 struct net_device *dev); 125 struct net_device *dev);
@@ -184,29 +188,10 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
184 } 188 }
185} 189}
186 190
187static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt) 191static inline const void *choose_neigh_daddr(const struct in6_addr *p,
188{
189 return dst_metrics_write_ptr(&rt->from->dst);
190}
191
192static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193{
194 struct rt6_info *rt = (struct rt6_info *)dst;
195
196 if (rt->rt6i_flags & RTF_PCPU)
197 return rt6_pcpu_cow_metrics(rt);
198 else if (rt->rt6i_flags & RTF_CACHE)
199 return NULL;
200 else
201 return dst_cow_metrics_generic(dst, old);
202}
203
204static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205 struct sk_buff *skb, 192 struct sk_buff *skb,
206 const void *daddr) 193 const void *daddr)
207{ 194{
208 struct in6_addr *p = &rt->rt6i_gateway;
209
210 if (!ipv6_addr_any(p)) 195 if (!ipv6_addr_any(p))
211 return (const void *) p; 196 return (const void *) p;
212 else if (skb) 197 else if (skb)
@@ -214,18 +199,27 @@ static inline const void *choose_neigh_daddr(struct rt6_info *rt,
214 return daddr; 199 return daddr;
215} 200}
216 201
217static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, 202struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
218 struct sk_buff *skb, 203 struct net_device *dev,
219 const void *daddr) 204 struct sk_buff *skb,
205 const void *daddr)
220{ 206{
221 struct rt6_info *rt = (struct rt6_info *) dst;
222 struct neighbour *n; 207 struct neighbour *n;
223 208
224 daddr = choose_neigh_daddr(rt, skb, daddr); 209 daddr = choose_neigh_daddr(gw, skb, daddr);
225 n = __ipv6_neigh_lookup(dst->dev, daddr); 210 n = __ipv6_neigh_lookup(dev, daddr);
226 if (n) 211 if (n)
227 return n; 212 return n;
228 return neigh_create(&nd_tbl, daddr, dst->dev); 213 return neigh_create(&nd_tbl, daddr, dev);
214}
215
216static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
217 struct sk_buff *skb,
218 const void *daddr)
219{
220 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
221
222 return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
229} 223}
230 224
231static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) 225static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
@@ -233,7 +227,7 @@ static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
233 struct net_device *dev = dst->dev; 227 struct net_device *dev = dst->dev;
234 struct rt6_info *rt = (struct rt6_info *)dst; 228 struct rt6_info *rt = (struct rt6_info *)dst;
235 229
236 daddr = choose_neigh_daddr(rt, NULL, daddr); 230 daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
237 if (!daddr) 231 if (!daddr)
238 return; 232 return;
239 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) 233 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
@@ -250,7 +244,7 @@ static struct dst_ops ip6_dst_ops_template = {
250 .check = ip6_dst_check, 244 .check = ip6_dst_check,
251 .default_advmss = ip6_default_advmss, 245 .default_advmss = ip6_default_advmss,
252 .mtu = ip6_mtu, 246 .mtu = ip6_mtu,
253 .cow_metrics = ipv6_cow_metrics, 247 .cow_metrics = dst_cow_metrics_generic,
254 .destroy = ip6_dst_destroy, 248 .destroy = ip6_dst_destroy,
255 .ifdown = ip6_dst_ifdown, 249 .ifdown = ip6_dst_ifdown,
256 .negative_advice = ip6_negative_advice, 250 .negative_advice = ip6_negative_advice,
@@ -258,7 +252,7 @@ static struct dst_ops ip6_dst_ops_template = {
258 .update_pmtu = ip6_rt_update_pmtu, 252 .update_pmtu = ip6_rt_update_pmtu,
259 .redirect = rt6_do_redirect, 253 .redirect = rt6_do_redirect,
260 .local_out = __ip6_local_out, 254 .local_out = __ip6_local_out,
261 .neigh_lookup = ip6_neigh_lookup, 255 .neigh_lookup = ip6_dst_neigh_lookup,
262 .confirm_neigh = ip6_confirm_neigh, 256 .confirm_neigh = ip6_confirm_neigh,
263}; 257};
264 258
@@ -288,13 +282,22 @@ static struct dst_ops ip6_dst_blackhole_ops = {
288 .update_pmtu = ip6_rt_blackhole_update_pmtu, 282 .update_pmtu = ip6_rt_blackhole_update_pmtu,
289 .redirect = ip6_rt_blackhole_redirect, 283 .redirect = ip6_rt_blackhole_redirect,
290 .cow_metrics = dst_cow_metrics_generic, 284 .cow_metrics = dst_cow_metrics_generic,
291 .neigh_lookup = ip6_neigh_lookup, 285 .neigh_lookup = ip6_dst_neigh_lookup,
292}; 286};
293 287
294static const u32 ip6_template_metrics[RTAX_MAX] = { 288static const u32 ip6_template_metrics[RTAX_MAX] = {
295 [RTAX_HOPLIMIT - 1] = 0, 289 [RTAX_HOPLIMIT - 1] = 0,
296}; 290};
297 291
292static const struct fib6_info fib6_null_entry_template = {
293 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP),
294 .fib6_protocol = RTPROT_KERNEL,
295 .fib6_metric = ~(u32)0,
296 .fib6_ref = ATOMIC_INIT(1),
297 .fib6_type = RTN_UNREACHABLE,
298 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics,
299};
300
298static const struct rt6_info ip6_null_entry_template = { 301static const struct rt6_info ip6_null_entry_template = {
299 .dst = { 302 .dst = {
300 .__refcnt = ATOMIC_INIT(1), 303 .__refcnt = ATOMIC_INIT(1),
@@ -305,9 +308,6 @@ static const struct rt6_info ip6_null_entry_template = {
305 .output = ip6_pkt_discard_out, 308 .output = ip6_pkt_discard_out,
306 }, 309 },
307 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 310 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
308 .rt6i_protocol = RTPROT_KERNEL,
309 .rt6i_metric = ~(u32) 0,
310 .rt6i_ref = ATOMIC_INIT(1),
311}; 311};
312 312
313#ifdef CONFIG_IPV6_MULTIPLE_TABLES 313#ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -322,9 +322,6 @@ static const struct rt6_info ip6_prohibit_entry_template = {
322 .output = ip6_pkt_prohibit_out, 322 .output = ip6_pkt_prohibit_out,
323 }, 323 },
324 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 324 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
325 .rt6i_protocol = RTPROT_KERNEL,
326 .rt6i_metric = ~(u32) 0,
327 .rt6i_ref = ATOMIC_INIT(1),
328}; 325};
329 326
330static const struct rt6_info ip6_blk_hole_entry_template = { 327static const struct rt6_info ip6_blk_hole_entry_template = {
@@ -337,9 +334,6 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
337 .output = dst_discard_out, 334 .output = dst_discard_out,
338 }, 335 },
339 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 336 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
340 .rt6i_protocol = RTPROT_KERNEL,
341 .rt6i_metric = ~(u32) 0,
342 .rt6i_ref = ATOMIC_INIT(1),
343}; 337};
344 338
345#endif 339#endif
@@ -349,14 +343,12 @@ static void rt6_info_init(struct rt6_info *rt)
349 struct dst_entry *dst = &rt->dst; 343 struct dst_entry *dst = &rt->dst;
350 344
351 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); 345 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352 INIT_LIST_HEAD(&rt->rt6i_siblings);
353 INIT_LIST_HEAD(&rt->rt6i_uncached); 346 INIT_LIST_HEAD(&rt->rt6i_uncached);
354} 347}
355 348
356/* allocate dst with ip6_dst_ops */ 349/* allocate dst with ip6_dst_ops */
357static struct rt6_info *__ip6_dst_alloc(struct net *net, 350struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
358 struct net_device *dev, 351 int flags)
359 int flags)
360{ 352{
361 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 353 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362 1, DST_OBSOLETE_FORCE_CHK, flags); 354 1, DST_OBSOLETE_FORCE_CHK, flags);
@@ -368,34 +360,15 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net,
368 360
369 return rt; 361 return rt;
370} 362}
371
372struct rt6_info *ip6_dst_alloc(struct net *net,
373 struct net_device *dev,
374 int flags)
375{
376 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377
378 if (rt) {
379 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380 if (!rt->rt6i_pcpu) {
381 dst_release_immediate(&rt->dst);
382 return NULL;
383 }
384 }
385
386 return rt;
387}
388EXPORT_SYMBOL(ip6_dst_alloc); 363EXPORT_SYMBOL(ip6_dst_alloc);
389 364
390static void ip6_dst_destroy(struct dst_entry *dst) 365static void ip6_dst_destroy(struct dst_entry *dst)
391{ 366{
392 struct rt6_info *rt = (struct rt6_info *)dst; 367 struct rt6_info *rt = (struct rt6_info *)dst;
393 struct rt6_exception_bucket *bucket; 368 struct fib6_info *from;
394 struct rt6_info *from = rt->from;
395 struct inet6_dev *idev; 369 struct inet6_dev *idev;
396 370
397 dst_destroy_metrics_generic(dst); 371 dst_destroy_metrics_generic(dst);
398 free_percpu(rt->rt6i_pcpu);
399 rt6_uncached_list_del(rt); 372 rt6_uncached_list_del(rt);
400 373
401 idev = rt->rt6i_idev; 374 idev = rt->rt6i_idev;
@@ -403,14 +376,12 @@ static void ip6_dst_destroy(struct dst_entry *dst)
403 rt->rt6i_idev = NULL; 376 rt->rt6i_idev = NULL;
404 in6_dev_put(idev); 377 in6_dev_put(idev);
405 } 378 }
406 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407 if (bucket) {
408 rt->rt6i_exception_bucket = NULL;
409 kfree(bucket);
410 }
411 379
412 rt->from = NULL; 380 rcu_read_lock();
413 dst_release(&from->dst); 381 from = rcu_dereference(rt->from);
382 rcu_assign_pointer(rt->from, NULL);
383 fib6_info_release(from);
384 rcu_read_unlock();
414} 385}
415 386
416static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 387static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -440,23 +411,27 @@ static bool __rt6_check_expired(const struct rt6_info *rt)
440 411
441static bool rt6_check_expired(const struct rt6_info *rt) 412static bool rt6_check_expired(const struct rt6_info *rt)
442{ 413{
414 struct fib6_info *from;
415
416 from = rcu_dereference(rt->from);
417
443 if (rt->rt6i_flags & RTF_EXPIRES) { 418 if (rt->rt6i_flags & RTF_EXPIRES) {
444 if (time_after(jiffies, rt->dst.expires)) 419 if (time_after(jiffies, rt->dst.expires))
445 return true; 420 return true;
446 } else if (rt->from) { 421 } else if (from) {
447 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || 422 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448 rt6_check_expired(rt->from); 423 fib6_check_expired(from);
449 } 424 }
450 return false; 425 return false;
451} 426}
452 427
453static struct rt6_info *rt6_multipath_select(const struct net *net, 428struct fib6_info *fib6_multipath_select(const struct net *net,
454 struct rt6_info *match, 429 struct fib6_info *match,
455 struct flowi6 *fl6, int oif, 430 struct flowi6 *fl6, int oif,
456 const struct sk_buff *skb, 431 const struct sk_buff *skb,
457 int strict) 432 int strict)
458{ 433{
459 struct rt6_info *sibling, *next_sibling; 434 struct fib6_info *sibling, *next_sibling;
460 435
461 /* We might have already computed the hash for ICMPv6 errors. In such 436 /* We might have already computed the hash for ICMPv6 errors. In such
462 * case it will always be non-zero. Otherwise now is the time to do it. 437 * case it will always be non-zero. Otherwise now is the time to do it.
@@ -464,12 +439,15 @@ static struct rt6_info *rt6_multipath_select(const struct net *net,
464 if (!fl6->mp_hash) 439 if (!fl6->mp_hash)
465 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); 440 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
466 441
467 if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) 442 if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
468 return match; 443 return match;
469 444
470 list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings, 445 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
471 rt6i_siblings) { 446 fib6_siblings) {
472 if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound)) 447 int nh_upper_bound;
448
449 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
450 if (fl6->mp_hash > nh_upper_bound)
473 continue; 451 continue;
474 if (rt6_score_route(sibling, oif, strict) < 0) 452 if (rt6_score_route(sibling, oif, strict) < 0)
475 break; 453 break;
@@ -484,38 +462,27 @@ static struct rt6_info *rt6_multipath_select(const struct net *net,
484 * Route lookup. rcu_read_lock() should be held. 462 * Route lookup. rcu_read_lock() should be held.
485 */ 463 */
486 464
487static inline struct rt6_info *rt6_device_match(struct net *net, 465static inline struct fib6_info *rt6_device_match(struct net *net,
488 struct rt6_info *rt, 466 struct fib6_info *rt,
489 const struct in6_addr *saddr, 467 const struct in6_addr *saddr,
490 int oif, 468 int oif,
491 int flags) 469 int flags)
492{ 470{
493 struct rt6_info *local = NULL; 471 struct fib6_info *sprt;
494 struct rt6_info *sprt;
495 472
496 if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD)) 473 if (!oif && ipv6_addr_any(saddr) &&
474 !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
497 return rt; 475 return rt;
498 476
499 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) { 477 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
500 struct net_device *dev = sprt->dst.dev; 478 const struct net_device *dev = sprt->fib6_nh.nh_dev;
501 479
502 if (sprt->rt6i_nh_flags & RTNH_F_DEAD) 480 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
503 continue; 481 continue;
504 482
505 if (oif) { 483 if (oif) {
506 if (dev->ifindex == oif) 484 if (dev->ifindex == oif)
507 return sprt; 485 return sprt;
508 if (dev->flags & IFF_LOOPBACK) {
509 if (!sprt->rt6i_idev ||
510 sprt->rt6i_idev->dev->ifindex != oif) {
511 if (flags & RT6_LOOKUP_F_IFACE)
512 continue;
513 if (local &&
514 local->rt6i_idev->dev->ifindex == oif)
515 continue;
516 }
517 local = sprt;
518 }
519 } else { 486 } else {
520 if (ipv6_chk_addr(net, saddr, dev, 487 if (ipv6_chk_addr(net, saddr, dev,
521 flags & RT6_LOOKUP_F_IFACE)) 488 flags & RT6_LOOKUP_F_IFACE))
@@ -523,15 +490,10 @@ static inline struct rt6_info *rt6_device_match(struct net *net,
523 } 490 }
524 } 491 }
525 492
526 if (oif) { 493 if (oif && flags & RT6_LOOKUP_F_IFACE)
527 if (local) 494 return net->ipv6.fib6_null_entry;
528 return local;
529
530 if (flags & RT6_LOOKUP_F_IFACE)
531 return net->ipv6.ip6_null_entry;
532 }
533 495
534 return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt; 496 return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
535} 497}
536 498
537#ifdef CONFIG_IPV6_ROUTER_PREF 499#ifdef CONFIG_IPV6_ROUTER_PREF
@@ -553,10 +515,13 @@ static void rt6_probe_deferred(struct work_struct *w)
553 kfree(work); 515 kfree(work);
554} 516}
555 517
556static void rt6_probe(struct rt6_info *rt) 518static void rt6_probe(struct fib6_info *rt)
557{ 519{
558 struct __rt6_probe_work *work; 520 struct __rt6_probe_work *work;
521 const struct in6_addr *nh_gw;
559 struct neighbour *neigh; 522 struct neighbour *neigh;
523 struct net_device *dev;
524
560 /* 525 /*
561 * Okay, this does not seem to be appropriate 526 * Okay, this does not seem to be appropriate
562 * for now, however, we need to check if it 527 * for now, however, we need to check if it
@@ -565,20 +530,25 @@ static void rt6_probe(struct rt6_info *rt)
565 * Router Reachability Probe MUST be rate-limited 530 * Router Reachability Probe MUST be rate-limited
566 * to no more than one per minute. 531 * to no more than one per minute.
567 */ 532 */
568 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY)) 533 if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
569 return; 534 return;
535
536 nh_gw = &rt->fib6_nh.nh_gw;
537 dev = rt->fib6_nh.nh_dev;
570 rcu_read_lock_bh(); 538 rcu_read_lock_bh();
571 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); 539 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
572 if (neigh) { 540 if (neigh) {
541 struct inet6_dev *idev;
542
573 if (neigh->nud_state & NUD_VALID) 543 if (neigh->nud_state & NUD_VALID)
574 goto out; 544 goto out;
575 545
546 idev = __in6_dev_get(dev);
576 work = NULL; 547 work = NULL;
577 write_lock(&neigh->lock); 548 write_lock(&neigh->lock);
578 if (!(neigh->nud_state & NUD_VALID) && 549 if (!(neigh->nud_state & NUD_VALID) &&
579 time_after(jiffies, 550 time_after(jiffies,
580 neigh->updated + 551 neigh->updated + idev->cnf.rtr_probe_interval)) {
581 rt->rt6i_idev->cnf.rtr_probe_interval)) {
582 work = kmalloc(sizeof(*work), GFP_ATOMIC); 552 work = kmalloc(sizeof(*work), GFP_ATOMIC);
583 if (work) 553 if (work)
584 __neigh_set_probe_once(neigh); 554 __neigh_set_probe_once(neigh);
@@ -590,9 +560,9 @@ static void rt6_probe(struct rt6_info *rt)
590 560
591 if (work) { 561 if (work) {
592 INIT_WORK(&work->work, rt6_probe_deferred); 562 INIT_WORK(&work->work, rt6_probe_deferred);
593 work->target = rt->rt6i_gateway; 563 work->target = *nh_gw;
594 dev_hold(rt->dst.dev); 564 dev_hold(dev);
595 work->dev = rt->dst.dev; 565 work->dev = dev;
596 schedule_work(&work->work); 566 schedule_work(&work->work);
597 } 567 }
598 568
@@ -600,7 +570,7 @@ out:
600 rcu_read_unlock_bh(); 570 rcu_read_unlock_bh();
601} 571}
602#else 572#else
603static inline void rt6_probe(struct rt6_info *rt) 573static inline void rt6_probe(struct fib6_info *rt)
604{ 574{
605} 575}
606#endif 576#endif
@@ -608,28 +578,27 @@ static inline void rt6_probe(struct rt6_info *rt)
608/* 578/*
609 * Default Router Selection (RFC 2461 6.3.6) 579 * Default Router Selection (RFC 2461 6.3.6)
610 */ 580 */
611static inline int rt6_check_dev(struct rt6_info *rt, int oif) 581static inline int rt6_check_dev(struct fib6_info *rt, int oif)
612{ 582{
613 struct net_device *dev = rt->dst.dev; 583 const struct net_device *dev = rt->fib6_nh.nh_dev;
584
614 if (!oif || dev->ifindex == oif) 585 if (!oif || dev->ifindex == oif)
615 return 2; 586 return 2;
616 if ((dev->flags & IFF_LOOPBACK) &&
617 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
618 return 1;
619 return 0; 587 return 0;
620} 588}
621 589
622static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt) 590static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
623{ 591{
624 struct neighbour *neigh;
625 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; 592 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
593 struct neighbour *neigh;
626 594
627 if (rt->rt6i_flags & RTF_NONEXTHOP || 595 if (rt->fib6_flags & RTF_NONEXTHOP ||
628 !(rt->rt6i_flags & RTF_GATEWAY)) 596 !(rt->fib6_flags & RTF_GATEWAY))
629 return RT6_NUD_SUCCEED; 597 return RT6_NUD_SUCCEED;
630 598
631 rcu_read_lock_bh(); 599 rcu_read_lock_bh();
632 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); 600 neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
601 &rt->fib6_nh.nh_gw);
633 if (neigh) { 602 if (neigh) {
634 read_lock(&neigh->lock); 603 read_lock(&neigh->lock);
635 if (neigh->nud_state & NUD_VALID) 604 if (neigh->nud_state & NUD_VALID)
@@ -650,8 +619,7 @@ static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
650 return ret; 619 return ret;
651} 620}
652 621
653static int rt6_score_route(struct rt6_info *rt, int oif, 622static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
654 int strict)
655{ 623{
656 int m; 624 int m;
657 625
@@ -659,7 +627,7 @@ static int rt6_score_route(struct rt6_info *rt, int oif,
659 if (!m && (strict & RT6_LOOKUP_F_IFACE)) 627 if (!m && (strict & RT6_LOOKUP_F_IFACE))
660 return RT6_NUD_FAIL_HARD; 628 return RT6_NUD_FAIL_HARD;
661#ifdef CONFIG_IPV6_ROUTER_PREF 629#ifdef CONFIG_IPV6_ROUTER_PREF
662 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2; 630 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
663#endif 631#endif
664 if (strict & RT6_LOOKUP_F_REACHABLE) { 632 if (strict & RT6_LOOKUP_F_REACHABLE) {
665 int n = rt6_check_neigh(rt); 633 int n = rt6_check_neigh(rt);
@@ -669,23 +637,37 @@ static int rt6_score_route(struct rt6_info *rt, int oif,
669 return m; 637 return m;
670} 638}
671 639
672static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, 640/* called with rcu_read_lock held */
673 int *mpri, struct rt6_info *match, 641static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
642{
643 const struct net_device *dev = fib6_info_nh_dev(f6i);
644 bool rc = false;
645
646 if (dev) {
647 const struct inet6_dev *idev = __in6_dev_get(dev);
648
649 rc = !!idev->cnf.ignore_routes_with_linkdown;
650 }
651
652 return rc;
653}
654
655static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
656 int *mpri, struct fib6_info *match,
674 bool *do_rr) 657 bool *do_rr)
675{ 658{
676 int m; 659 int m;
677 bool match_do_rr = false; 660 bool match_do_rr = false;
678 struct inet6_dev *idev = rt->rt6i_idev;
679 661
680 if (rt->rt6i_nh_flags & RTNH_F_DEAD) 662 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
681 goto out; 663 goto out;
682 664
683 if (idev->cnf.ignore_routes_with_linkdown && 665 if (fib6_ignore_linkdown(rt) &&
684 rt->rt6i_nh_flags & RTNH_F_LINKDOWN && 666 rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
685 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) 667 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
686 goto out; 668 goto out;
687 669
688 if (rt6_check_expired(rt)) 670 if (fib6_check_expired(rt))
689 goto out; 671 goto out;
690 672
691 m = rt6_score_route(rt, oif, strict); 673 m = rt6_score_route(rt, oif, strict);
@@ -709,19 +691,19 @@ out:
709 return match; 691 return match;
710} 692}
711 693
712static struct rt6_info *find_rr_leaf(struct fib6_node *fn, 694static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
713 struct rt6_info *leaf, 695 struct fib6_info *leaf,
714 struct rt6_info *rr_head, 696 struct fib6_info *rr_head,
715 u32 metric, int oif, int strict, 697 u32 metric, int oif, int strict,
716 bool *do_rr) 698 bool *do_rr)
717{ 699{
718 struct rt6_info *rt, *match, *cont; 700 struct fib6_info *rt, *match, *cont;
719 int mpri = -1; 701 int mpri = -1;
720 702
721 match = NULL; 703 match = NULL;
722 cont = NULL; 704 cont = NULL;
723 for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) { 705 for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
724 if (rt->rt6i_metric != metric) { 706 if (rt->fib6_metric != metric) {
725 cont = rt; 707 cont = rt;
726 break; 708 break;
727 } 709 }
@@ -730,8 +712,8 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
730 } 712 }
731 713
732 for (rt = leaf; rt && rt != rr_head; 714 for (rt = leaf; rt && rt != rr_head;
733 rt = rcu_dereference(rt->rt6_next)) { 715 rt = rcu_dereference(rt->fib6_next)) {
734 if (rt->rt6i_metric != metric) { 716 if (rt->fib6_metric != metric) {
735 cont = rt; 717 cont = rt;
736 break; 718 break;
737 } 719 }
@@ -742,22 +724,22 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
742 if (match || !cont) 724 if (match || !cont)
743 return match; 725 return match;
744 726
745 for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next)) 727 for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
746 match = find_match(rt, oif, strict, &mpri, match, do_rr); 728 match = find_match(rt, oif, strict, &mpri, match, do_rr);
747 729
748 return match; 730 return match;
749} 731}
750 732
751static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, 733static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
752 int oif, int strict) 734 int oif, int strict)
753{ 735{
754 struct rt6_info *leaf = rcu_dereference(fn->leaf); 736 struct fib6_info *leaf = rcu_dereference(fn->leaf);
755 struct rt6_info *match, *rt0; 737 struct fib6_info *match, *rt0;
756 bool do_rr = false; 738 bool do_rr = false;
757 int key_plen; 739 int key_plen;
758 740
759 if (!leaf || leaf == net->ipv6.ip6_null_entry) 741 if (!leaf || leaf == net->ipv6.fib6_null_entry)
760 return net->ipv6.ip6_null_entry; 742 return net->ipv6.fib6_null_entry;
761 743
762 rt0 = rcu_dereference(fn->rr_ptr); 744 rt0 = rcu_dereference(fn->rr_ptr);
763 if (!rt0) 745 if (!rt0)
@@ -768,39 +750,39 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
768 * (This might happen if all routes under fn are deleted from 750 * (This might happen if all routes under fn are deleted from
769 * the tree and fib6_repair_tree() is called on the node.) 751 * the tree and fib6_repair_tree() is called on the node.)
770 */ 752 */
771 key_plen = rt0->rt6i_dst.plen; 753 key_plen = rt0->fib6_dst.plen;
772#ifdef CONFIG_IPV6_SUBTREES 754#ifdef CONFIG_IPV6_SUBTREES
773 if (rt0->rt6i_src.plen) 755 if (rt0->fib6_src.plen)
774 key_plen = rt0->rt6i_src.plen; 756 key_plen = rt0->fib6_src.plen;
775#endif 757#endif
776 if (fn->fn_bit != key_plen) 758 if (fn->fn_bit != key_plen)
777 return net->ipv6.ip6_null_entry; 759 return net->ipv6.fib6_null_entry;
778 760
779 match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict, 761 match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
780 &do_rr); 762 &do_rr);
781 763
782 if (do_rr) { 764 if (do_rr) {
783 struct rt6_info *next = rcu_dereference(rt0->rt6_next); 765 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
784 766
785 /* no entries matched; do round-robin */ 767 /* no entries matched; do round-robin */
786 if (!next || next->rt6i_metric != rt0->rt6i_metric) 768 if (!next || next->fib6_metric != rt0->fib6_metric)
787 next = leaf; 769 next = leaf;
788 770
789 if (next != rt0) { 771 if (next != rt0) {
790 spin_lock_bh(&leaf->rt6i_table->tb6_lock); 772 spin_lock_bh(&leaf->fib6_table->tb6_lock);
791 /* make sure next is not being deleted from the tree */ 773 /* make sure next is not being deleted from the tree */
792 if (next->rt6i_node) 774 if (next->fib6_node)
793 rcu_assign_pointer(fn->rr_ptr, next); 775 rcu_assign_pointer(fn->rr_ptr, next);
794 spin_unlock_bh(&leaf->rt6i_table->tb6_lock); 776 spin_unlock_bh(&leaf->fib6_table->tb6_lock);
795 } 777 }
796 } 778 }
797 779
798 return match ? match : net->ipv6.ip6_null_entry; 780 return match ? match : net->ipv6.fib6_null_entry;
799} 781}
800 782
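
The new rt6_select()/find_rr_leaf() shape above rotates fn->rr_ptr only among siblings that share the current metric, wrapping back to the leaf once the metric changes. A minimal userspace sketch of that advance, using toy types and names that are not the kernel's:

/* Toy model of the round-robin advance in rt6_select(): when do_rr is set,
 * rr_ptr moves to the next route of the same metric, or wraps to the leaf.
 */
#include <stdio.h>

struct toy_route {
	const char *name;
	unsigned int metric;
	struct toy_route *next;
};

static struct toy_route *toy_rr_advance(struct toy_route *leaf,
					struct toy_route *rr_head)
{
	struct toy_route *next = rr_head->next;

	/* no more siblings with the same metric: wrap to the leaf */
	if (!next || next->metric != rr_head->metric)
		next = leaf;

	return next;
}

int main(void)
{
	struct toy_route c = { "rt-c", 1024, NULL };
	struct toy_route b = { "rt-b", 1,    &c   };
	struct toy_route a = { "rt-a", 1,    &b   };
	struct toy_route *rr = &a;
	int i;

	/* the two equal-metric entries are visited in turn; rt-c is skipped */
	for (i = 0; i < 4; i++) {
		printf("selected %s\n", rr->name);
		rr = toy_rr_advance(&a, rr);
	}
	return 0;
}
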
801static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt) 783static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
802{ 784{
803 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); 785 return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
804} 786}
805 787
806#ifdef CONFIG_IPV6_ROUTE_INFO 788#ifdef CONFIG_IPV6_ROUTE_INFO
@@ -812,7 +794,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
812 struct in6_addr prefix_buf, *prefix; 794 struct in6_addr prefix_buf, *prefix;
813 unsigned int pref; 795 unsigned int pref;
814 unsigned long lifetime; 796 unsigned long lifetime;
815 struct rt6_info *rt; 797 struct fib6_info *rt;
816 798
817 if (len < sizeof(struct route_info)) { 799 if (len < sizeof(struct route_info)) {
818 return -EINVAL; 800 return -EINVAL;
@@ -850,13 +832,13 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
850 } 832 }
851 833
852 if (rinfo->prefix_len == 0) 834 if (rinfo->prefix_len == 0)
853 rt = rt6_get_dflt_router(gwaddr, dev); 835 rt = rt6_get_dflt_router(net, gwaddr, dev);
854 else 836 else
855 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, 837 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
856 gwaddr, dev); 838 gwaddr, dev);
857 839
858 if (rt && !lifetime) { 840 if (rt && !lifetime) {
859 ip6_del_rt(rt); 841 ip6_del_rt(net, rt);
860 rt = NULL; 842 rt = NULL;
861 } 843 }
862 844
@@ -864,21 +846,162 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
864 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, 846 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
865 dev, pref); 847 dev, pref);
866 else if (rt) 848 else if (rt)
867 rt->rt6i_flags = RTF_ROUTEINFO | 849 rt->fib6_flags = RTF_ROUTEINFO |
868 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); 850 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
869 851
870 if (rt) { 852 if (rt) {
871 if (!addrconf_finite_timeout(lifetime)) 853 if (!addrconf_finite_timeout(lifetime))
872 rt6_clean_expires(rt); 854 fib6_clean_expires(rt);
873 else 855 else
874 rt6_set_expires(rt, jiffies + HZ * lifetime); 856 fib6_set_expires(rt, jiffies + HZ * lifetime);
875 857
876 ip6_rt_put(rt); 858 fib6_info_release(rt);
877 } 859 }
878 return 0; 860 return 0;
879} 861}
880#endif 862#endif
881 863
864/*
865 * Misc support functions
866 */
867
868/* called with rcu_lock held */
869static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
870{
871 struct net_device *dev = rt->fib6_nh.nh_dev;
872
873 if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
874 /* for copies of local routes, dst->dev needs to be the
875 * device if it is a master device, the master device if
876 * device is enslaved, and the loopback as the default
877 */
878 if (netif_is_l3_slave(dev) &&
879 !rt6_need_strict(&rt->fib6_dst.addr))
880 dev = l3mdev_master_dev_rcu(dev);
881 else if (!netif_is_l3_master(dev))
882 dev = dev_net(dev)->loopback_dev;
883 /* last case is netif_is_l3_master(dev) is true in which
884 * case we want dev returned to be dev
885 */
886 }
887
888 return dev;
889}
890
891static const int fib6_prop[RTN_MAX + 1] = {
892 [RTN_UNSPEC] = 0,
893 [RTN_UNICAST] = 0,
894 [RTN_LOCAL] = 0,
895 [RTN_BROADCAST] = 0,
896 [RTN_ANYCAST] = 0,
897 [RTN_MULTICAST] = 0,
898 [RTN_BLACKHOLE] = -EINVAL,
899 [RTN_UNREACHABLE] = -EHOSTUNREACH,
900 [RTN_PROHIBIT] = -EACCES,
901 [RTN_THROW] = -EAGAIN,
902 [RTN_NAT] = -EINVAL,
903 [RTN_XRESOLVE] = -EINVAL,
904};
905
906static int ip6_rt_type_to_error(u8 fib6_type)
907{
908 return fib6_prop[fib6_type];
909}
910
911static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
912{
913 unsigned short flags = 0;
914
915 if (rt->dst_nocount)
916 flags |= DST_NOCOUNT;
917 if (rt->dst_nopolicy)
918 flags |= DST_NOPOLICY;
919 if (rt->dst_host)
920 flags |= DST_HOST;
921
922 return flags;
923}
924
925static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
926{
927 rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
928
929 switch (ort->fib6_type) {
930 case RTN_BLACKHOLE:
931 rt->dst.output = dst_discard_out;
932 rt->dst.input = dst_discard;
933 break;
934 case RTN_PROHIBIT:
935 rt->dst.output = ip6_pkt_prohibit_out;
936 rt->dst.input = ip6_pkt_prohibit;
937 break;
938 case RTN_THROW:
939 case RTN_UNREACHABLE:
940 default:
941 rt->dst.output = ip6_pkt_discard_out;
942 rt->dst.input = ip6_pkt_discard;
943 break;
944 }
945}
946
947static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
948{
949 rt->dst.flags |= fib6_info_dst_flags(ort);
950
951 if (ort->fib6_flags & RTF_REJECT) {
952 ip6_rt_init_dst_reject(rt, ort);
953 return;
954 }
955
956 rt->dst.error = 0;
957 rt->dst.output = ip6_output;
958
959 if (ort->fib6_type == RTN_LOCAL) {
960 rt->dst.input = ip6_input;
961 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
962 rt->dst.input = ip6_mc_input;
963 } else {
964 rt->dst.input = ip6_forward;
965 }
966
967 if (ort->fib6_nh.nh_lwtstate) {
968 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
969 lwtunnel_set_redirect(&rt->dst);
970 }
971
972 rt->dst.lastuse = jiffies;
973}
974
975static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
976{
977 rt->rt6i_flags &= ~RTF_EXPIRES;
978 fib6_info_hold(from);
979 rcu_assign_pointer(rt->from, from);
980 dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
981 if (from->fib6_metrics != &dst_default_metrics) {
982 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
983 refcount_inc(&from->fib6_metrics->refcnt);
984 }
985}
986
987static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
988{
989 struct net_device *dev = fib6_info_nh_dev(ort);
990
991 ip6_rt_init_dst(rt, ort);
992
993 rt->rt6i_dst = ort->fib6_dst;
994 rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
995 rt->rt6i_gateway = ort->fib6_nh.nh_gw;
996 rt->rt6i_flags = ort->fib6_flags;
997 rt6_set_from(rt, ort);
998#ifdef CONFIG_IPV6_SUBTREES
999 rt->rt6i_src = ort->fib6_src;
1000#endif
1001 rt->rt6i_prefsrc = ort->fib6_prefsrc;
1002 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
1003}
1004
882static struct fib6_node* fib6_backtrack(struct fib6_node *fn, 1005static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
883 struct in6_addr *saddr) 1006 struct in6_addr *saddr)
884{ 1007{
@@ -889,7 +1012,7 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
889 pn = rcu_dereference(fn->parent); 1012 pn = rcu_dereference(fn->parent);
890 sn = FIB6_SUBTREE(pn); 1013 sn = FIB6_SUBTREE(pn);
891 if (sn && sn != fn) 1014 if (sn && sn != fn)
892 fn = fib6_lookup(sn, NULL, saddr); 1015 fn = fib6_node_lookup(sn, NULL, saddr);
893 else 1016 else
894 fn = pn; 1017 fn = pn;
895 if (fn->fn_flags & RTN_RTINFO) 1018 if (fn->fn_flags & RTN_RTINFO)
@@ -914,50 +1037,74 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
914 return false; 1037 return false;
915} 1038}
916 1039
1040/* called with rcu_lock held */
1041static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1042{
1043 unsigned short flags = fib6_info_dst_flags(rt);
1044 struct net_device *dev = rt->fib6_nh.nh_dev;
1045 struct rt6_info *nrt;
1046
1047 nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1048 if (nrt)
1049 ip6_rt_copy_init(nrt, rt);
1050
1051 return nrt;
1052}
1053
917static struct rt6_info *ip6_pol_route_lookup(struct net *net, 1054static struct rt6_info *ip6_pol_route_lookup(struct net *net,
918 struct fib6_table *table, 1055 struct fib6_table *table,
919 struct flowi6 *fl6, 1056 struct flowi6 *fl6,
920 const struct sk_buff *skb, 1057 const struct sk_buff *skb,
921 int flags) 1058 int flags)
922{ 1059{
923 struct rt6_info *rt, *rt_cache; 1060 struct fib6_info *f6i;
924 struct fib6_node *fn; 1061 struct fib6_node *fn;
1062 struct rt6_info *rt;
925 1063
926 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 1064 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
927 flags &= ~RT6_LOOKUP_F_IFACE; 1065 flags &= ~RT6_LOOKUP_F_IFACE;
928 1066
929 rcu_read_lock(); 1067 rcu_read_lock();
930 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1068 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
931restart: 1069restart:
932 rt = rcu_dereference(fn->leaf); 1070 f6i = rcu_dereference(fn->leaf);
933 if (!rt) { 1071 if (!f6i) {
934 rt = net->ipv6.ip6_null_entry; 1072 f6i = net->ipv6.fib6_null_entry;
935 } else { 1073 } else {
936 rt = rt6_device_match(net, rt, &fl6->saddr, 1074 f6i = rt6_device_match(net, f6i, &fl6->saddr,
937 fl6->flowi6_oif, flags); 1075 fl6->flowi6_oif, flags);
938 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) 1076 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
939 rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif, 1077 f6i = fib6_multipath_select(net, f6i, fl6,
940 skb, flags); 1078 fl6->flowi6_oif, skb,
1079 flags);
941 } 1080 }
942 if (rt == net->ipv6.ip6_null_entry) { 1081 if (f6i == net->ipv6.fib6_null_entry) {
943 fn = fib6_backtrack(fn, &fl6->saddr); 1082 fn = fib6_backtrack(fn, &fl6->saddr);
944 if (fn) 1083 if (fn)
945 goto restart; 1084 goto restart;
946 } 1085 }
947 /* Search through exception table */
948 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
949 if (rt_cache)
950 rt = rt_cache;
951 1086
952 if (ip6_hold_safe(net, &rt, true)) 1087 trace_fib6_table_lookup(net, f6i, table, fl6);
953 dst_use_noref(&rt->dst, jiffies);
954 1088
955 rcu_read_unlock(); 1089 /* Search through exception table */
1090 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1091 if (rt) {
1092 if (ip6_hold_safe(net, &rt, true))
1093 dst_use_noref(&rt->dst, jiffies);
1094 } else if (f6i == net->ipv6.fib6_null_entry) {
1095 rt = net->ipv6.ip6_null_entry;
1096 dst_hold(&rt->dst);
1097 } else {
1098 rt = ip6_create_rt_rcu(f6i);
1099 if (!rt) {
1100 rt = net->ipv6.ip6_null_entry;
1101 dst_hold(&rt->dst);
1102 }
1103 }
956 1104
957 trace_fib6_table_lookup(net, rt, table, fl6); 1105 rcu_read_unlock();
958 1106
959 return rt; 1107 return rt;
960
961} 1108}
962 1109
963struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, 1110struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
@@ -999,55 +1146,28 @@ EXPORT_SYMBOL(rt6_lookup);
999 * Caller must hold dst before calling it. 1146 * Caller must hold dst before calling it.
1000 */ 1147 */
1001 1148
1002static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, 1149static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1003 struct mx6_config *mxc,
1004 struct netlink_ext_ack *extack) 1150 struct netlink_ext_ack *extack)
1005{ 1151{
1006 int err; 1152 int err;
1007 struct fib6_table *table; 1153 struct fib6_table *table;
1008 1154
1009 table = rt->rt6i_table; 1155 table = rt->fib6_table;
1010 spin_lock_bh(&table->tb6_lock); 1156 spin_lock_bh(&table->tb6_lock);
1011 err = fib6_add(&table->tb6_root, rt, info, mxc, extack); 1157 err = fib6_add(&table->tb6_root, rt, info, extack);
1012 spin_unlock_bh(&table->tb6_lock); 1158 spin_unlock_bh(&table->tb6_lock);
1013 1159
1014 return err; 1160 return err;
1015} 1161}
1016 1162
1017int ip6_ins_rt(struct rt6_info *rt) 1163int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1018{ 1164{
1019 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), }; 1165 struct nl_info info = { .nl_net = net, };
1020 struct mx6_config mxc = { .mx = NULL, };
1021 1166
1022 /* Hold dst to account for the reference from the fib6 tree */ 1167 return __ip6_ins_rt(rt, &info, NULL);
1023 dst_hold(&rt->dst);
1024 return __ip6_ins_rt(rt, &info, &mxc, NULL);
1025} 1168}
1026 1169
1027/* called with rcu_lock held */ 1170static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1028static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1029{
1030 struct net_device *dev = rt->dst.dev;
1031
1032 if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1033 /* for copies of local routes, dst->dev needs to be the
1034 * device if it is a master device, the master device if
1035 * device is enslaved, and the loopback as the default
1036 */
1037 if (netif_is_l3_slave(dev) &&
1038 !rt6_need_strict(&rt->rt6i_dst.addr))
1039 dev = l3mdev_master_dev_rcu(dev);
1040 else if (!netif_is_l3_master(dev))
1041 dev = dev_net(dev)->loopback_dev;
1042 /* last case is netif_is_l3_master(dev) is true in which
1043 * case we want dev returned to be dev
1044 */
1045 }
1046
1047 return dev;
1048}
1049
1050static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1051 const struct in6_addr *daddr, 1171 const struct in6_addr *daddr,
1052 const struct in6_addr *saddr) 1172 const struct in6_addr *saddr)
1053{ 1173{
@@ -1058,26 +1178,20 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1058 * Clone the route. 1178 * Clone the route.
1059 */ 1179 */
1060 1180
1061 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1062 ort = ort->from;
1063
1064 rcu_read_lock();
1065 dev = ip6_rt_get_dev_rcu(ort); 1181 dev = ip6_rt_get_dev_rcu(ort);
1066 rt = __ip6_dst_alloc(dev_net(dev), dev, 0); 1182 rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1067 rcu_read_unlock();
1068 if (!rt) 1183 if (!rt)
1069 return NULL; 1184 return NULL;
1070 1185
1071 ip6_rt_copy_init(rt, ort); 1186 ip6_rt_copy_init(rt, ort);
1072 rt->rt6i_flags |= RTF_CACHE; 1187 rt->rt6i_flags |= RTF_CACHE;
1073 rt->rt6i_metric = 0;
1074 rt->dst.flags |= DST_HOST; 1188 rt->dst.flags |= DST_HOST;
1075 rt->rt6i_dst.addr = *daddr; 1189 rt->rt6i_dst.addr = *daddr;
1076 rt->rt6i_dst.plen = 128; 1190 rt->rt6i_dst.plen = 128;
1077 1191
1078 if (!rt6_is_gw_or_nonexthop(ort)) { 1192 if (!rt6_is_gw_or_nonexthop(ort)) {
1079 if (ort->rt6i_dst.plen != 128 && 1193 if (ort->fib6_dst.plen != 128 &&
1080 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) 1194 ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1081 rt->rt6i_flags |= RTF_ANYCAST; 1195 rt->rt6i_flags |= RTF_ANYCAST;
1082#ifdef CONFIG_IPV6_SUBTREES 1196#ifdef CONFIG_IPV6_SUBTREES
1083 if (rt->rt6i_src.plen && saddr) { 1197 if (rt->rt6i_src.plen && saddr) {
@@ -1090,45 +1204,44 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1090 return rt; 1204 return rt;
1091} 1205}
1092 1206
1093static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) 1207static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1094{ 1208{
1209 unsigned short flags = fib6_info_dst_flags(rt);
1095 struct net_device *dev; 1210 struct net_device *dev;
1096 struct rt6_info *pcpu_rt; 1211 struct rt6_info *pcpu_rt;
1097 1212
1098 rcu_read_lock(); 1213 rcu_read_lock();
1099 dev = ip6_rt_get_dev_rcu(rt); 1214 dev = ip6_rt_get_dev_rcu(rt);
1100 pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags); 1215 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1101 rcu_read_unlock(); 1216 rcu_read_unlock();
1102 if (!pcpu_rt) 1217 if (!pcpu_rt)
1103 return NULL; 1218 return NULL;
1104 ip6_rt_copy_init(pcpu_rt, rt); 1219 ip6_rt_copy_init(pcpu_rt, rt);
1105 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1106 pcpu_rt->rt6i_flags |= RTF_PCPU; 1220 pcpu_rt->rt6i_flags |= RTF_PCPU;
1107 return pcpu_rt; 1221 return pcpu_rt;
1108} 1222}
1109 1223
1110/* It should be called with rcu_read_lock() acquired */ 1224/* It should be called with rcu_read_lock() acquired */
1111static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) 1225static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1112{ 1226{
1113 struct rt6_info *pcpu_rt, **p; 1227 struct rt6_info *pcpu_rt, **p;
1114 1228
1115 p = this_cpu_ptr(rt->rt6i_pcpu); 1229 p = this_cpu_ptr(rt->rt6i_pcpu);
1116 pcpu_rt = *p; 1230 pcpu_rt = *p;
1117 1231
1118 if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false)) 1232 if (pcpu_rt)
1119 rt6_dst_from_metrics_check(pcpu_rt); 1233 ip6_hold_safe(NULL, &pcpu_rt, false);
1120 1234
1121 return pcpu_rt; 1235 return pcpu_rt;
1122} 1236}
1123 1237
1124static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) 1238static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1239 struct fib6_info *rt)
1125{ 1240{
1126 struct rt6_info *pcpu_rt, *prev, **p; 1241 struct rt6_info *pcpu_rt, *prev, **p;
1127 1242
1128 pcpu_rt = ip6_rt_pcpu_alloc(rt); 1243 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1129 if (!pcpu_rt) { 1244 if (!pcpu_rt) {
1130 struct net *net = dev_net(rt->dst.dev);
1131
1132 dst_hold(&net->ipv6.ip6_null_entry->dst); 1245 dst_hold(&net->ipv6.ip6_null_entry->dst);
1133 return net->ipv6.ip6_null_entry; 1246 return net->ipv6.ip6_null_entry;
1134 } 1247 }
@@ -1138,7 +1251,6 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1138 prev = cmpxchg(p, NULL, pcpu_rt); 1251 prev = cmpxchg(p, NULL, pcpu_rt);
1139 BUG_ON(prev); 1252 BUG_ON(prev);
1140 1253
1141 rt6_dst_from_metrics_check(pcpu_rt);
1142 return pcpu_rt; 1254 return pcpu_rt;
1143} 1255}
1144 1256
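
rt6_get_pcpu_route()/rt6_make_pcpu_route() above follow an "allocate once per CPU slot" pattern: the slot starts NULL and the first lookup publishes a freshly built copy with a compare-and-swap. A standalone sketch of that pattern, with plain C11 atomics standing in for this_cpu_ptr()/cmpxchg() (all names below are made up for illustration):

/* Toy single-slot version of the per-CPU dst cache install. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_dst {
	int cloned_from;
};

static _Atomic(struct toy_dst *) pcpu_slot;	/* one slot per CPU in reality */

static struct toy_dst *toy_get_or_make(int template_id)
{
	struct toy_dst *cur = atomic_load(&pcpu_slot);
	struct toy_dst *expected = NULL;
	struct toy_dst *nrt;

	if (cur)			/* fast path: already cached */
		return cur;

	nrt = malloc(sizeof(*nrt));	/* rt6_make_pcpu_route() analogue */
	if (!nrt)
		return NULL;
	nrt->cloned_from = template_id;

	if (atomic_compare_exchange_strong(&pcpu_slot, &expected, nrt))
		return nrt;

	free(nrt);			/* another context won the race */
	return expected;
}

int main(void)
{
	struct toy_dst *d1 = toy_get_or_make(42);
	struct toy_dst *d2 = toy_get_or_make(42);

	printf("same cached copy: %s\n", d1 == d2 ? "yes" : "no");
	free(d1);
	return 0;
}

(The kernel variant can BUG_ON() a non-NULL previous value because the lookup runs under rcu_read_lock() with bottom halves disabled, so the same CPU slot has a single writer.)
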
@@ -1158,9 +1270,8 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1158 return; 1270 return;
1159 1271
1160 net = dev_net(rt6_ex->rt6i->dst.dev); 1272 net = dev_net(rt6_ex->rt6i->dst.dev);
1161 rt6_ex->rt6i->rt6i_node = NULL;
1162 hlist_del_rcu(&rt6_ex->hlist); 1273 hlist_del_rcu(&rt6_ex->hlist);
1163 rt6_release(rt6_ex->rt6i); 1274 dst_release(&rt6_ex->rt6i->dst);
1164 kfree_rcu(rt6_ex, rcu); 1275 kfree_rcu(rt6_ex, rcu);
1165 WARN_ON_ONCE(!bucket->depth); 1276 WARN_ON_ONCE(!bucket->depth);
1166 bucket->depth--; 1277 bucket->depth--;
@@ -1268,20 +1379,36 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1268 return NULL; 1379 return NULL;
1269} 1380}
1270 1381
1382static unsigned int fib6_mtu(const struct fib6_info *rt)
1383{
1384 unsigned int mtu;
1385
1386 if (rt->fib6_pmtu) {
1387 mtu = rt->fib6_pmtu;
1388 } else {
1389 struct net_device *dev = fib6_info_nh_dev(rt);
1390 struct inet6_dev *idev;
1391
1392 rcu_read_lock();
1393 idev = __in6_dev_get(dev);
1394 mtu = idev->cnf.mtu6;
1395 rcu_read_unlock();
1396 }
1397
1398 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1399
1400 return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1401}
1402
1271static int rt6_insert_exception(struct rt6_info *nrt, 1403static int rt6_insert_exception(struct rt6_info *nrt,
1272 struct rt6_info *ort) 1404 struct fib6_info *ort)
1273{ 1405{
1274 struct net *net = dev_net(ort->dst.dev); 1406 struct net *net = dev_net(nrt->dst.dev);
1275 struct rt6_exception_bucket *bucket; 1407 struct rt6_exception_bucket *bucket;
1276 struct in6_addr *src_key = NULL; 1408 struct in6_addr *src_key = NULL;
1277 struct rt6_exception *rt6_ex; 1409 struct rt6_exception *rt6_ex;
1278 int err = 0; 1410 int err = 0;
1279 1411
1280 /* ort can't be a cache or pcpu route */
1281 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1282 ort = ort->from;
1283 WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1284
1285 spin_lock_bh(&rt6_exception_lock); 1412 spin_lock_bh(&rt6_exception_lock);
1286 1413
1287 if (ort->exception_bucket_flushed) { 1414 if (ort->exception_bucket_flushed) {
@@ -1308,19 +1435,19 @@ static int rt6_insert_exception(struct rt6_info *nrt,
1308 * Otherwise, the exception table is indexed by 1435 * Otherwise, the exception table is indexed by
1309 * a hash of only rt6i_dst. 1436 * a hash of only rt6i_dst.
1310 */ 1437 */
1311 if (ort->rt6i_src.plen) 1438 if (ort->fib6_src.plen)
1312 src_key = &nrt->rt6i_src.addr; 1439 src_key = &nrt->rt6i_src.addr;
1313#endif 1440#endif
1314 1441
1315 /* Update rt6i_prefsrc as it could be changed 1442 /* Update rt6i_prefsrc as it could be changed
1316 * in rt6_remove_prefsrc() 1443 * in rt6_remove_prefsrc()
1317 */ 1444 */
1318 nrt->rt6i_prefsrc = ort->rt6i_prefsrc; 1445 nrt->rt6i_prefsrc = ort->fib6_prefsrc;
1319 /* rt6_mtu_change() might lower mtu on ort. 1446 /* rt6_mtu_change() might lower mtu on ort.
1320 * Only insert this exception route if its mtu 1447 * Only insert this exception route if its mtu
1321 * is less than ort's mtu value. 1448 * is less than ort's mtu value.
1322 */ 1449 */
1323 if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) { 1450 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1324 err = -EINVAL; 1451 err = -EINVAL;
1325 goto out; 1452 goto out;
1326 } 1453 }
@@ -1337,8 +1464,6 @@ static int rt6_insert_exception(struct rt6_info *nrt,
1337 } 1464 }
1338 rt6_ex->rt6i = nrt; 1465 rt6_ex->rt6i = nrt;
1339 rt6_ex->stamp = jiffies; 1466 rt6_ex->stamp = jiffies;
1340 atomic_inc(&nrt->rt6i_ref);
1341 nrt->rt6i_node = ort->rt6i_node;
1342 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); 1467 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1343 bucket->depth++; 1468 bucket->depth++;
1344 net->ipv6.rt6_stats->fib_rt_cache++; 1469 net->ipv6.rt6_stats->fib_rt_cache++;
@@ -1351,16 +1476,16 @@ out:
1351 1476
1352 /* Update fn->fn_sernum to invalidate all cached dst */ 1477 /* Update fn->fn_sernum to invalidate all cached dst */
1353 if (!err) { 1478 if (!err) {
1354 spin_lock_bh(&ort->rt6i_table->tb6_lock); 1479 spin_lock_bh(&ort->fib6_table->tb6_lock);
1355 fib6_update_sernum(ort); 1480 fib6_update_sernum(net, ort);
1356 spin_unlock_bh(&ort->rt6i_table->tb6_lock); 1481 spin_unlock_bh(&ort->fib6_table->tb6_lock);
1357 fib6_force_start_gc(net); 1482 fib6_force_start_gc(net);
1358 } 1483 }
1359 1484
1360 return err; 1485 return err;
1361} 1486}
1362 1487
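
The exception-table hunks in this patch all repeat the same keying rule: the destination address is always part of the bucket key, while the source address only participates when the owning fib entry carries a source prefix (the CONFIG_IPV6_SUBTREES case). A small self-contained sketch of that decision, assuming toy types and a toy FNV-style hash rather than the kernel's rt6_exception_hash():

#include <stdint.h>
#include <stdio.h>

struct toy_addr { uint8_t s6_addr[16]; };

static uint32_t toy_hash(const struct toy_addr *dst, const struct toy_addr *src)
{
	uint32_t h = 2166136261u;
	size_t i;

	for (i = 0; i < sizeof(dst->s6_addr); i++)
		h = (h ^ dst->s6_addr[i]) * 16777619u;
	if (src)	/* src only contributes for subtree (source-prefixed) routes */
		for (i = 0; i < sizeof(src->s6_addr); i++)
			h = (h ^ src->s6_addr[i]) * 16777619u;
	return h;
}

int main(void)
{
	struct toy_addr daddr = { { 0x20, 0x01, 0x0d, 0xb8 } };
	struct toy_addr saddr = { { 0xfd, 0x00 } };
	int fib6_src_plen = 64;		/* pretend the entry has a source prefix */
	const struct toy_addr *src_key = fib6_src_plen ? &saddr : NULL;

	printf("bucket hash: %08x\n", toy_hash(&daddr, src_key));
	return 0;
}
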
1363void rt6_flush_exceptions(struct rt6_info *rt) 1488void rt6_flush_exceptions(struct fib6_info *rt)
1364{ 1489{
1365 struct rt6_exception_bucket *bucket; 1490 struct rt6_exception_bucket *bucket;
1366 struct rt6_exception *rt6_ex; 1491 struct rt6_exception *rt6_ex;
@@ -1390,7 +1515,7 @@ out:
1390/* Find cached rt in the hash table inside passed in rt 1515/* Find cached rt in the hash table inside passed in rt
1391 * Caller has to hold rcu_read_lock() 1516 * Caller has to hold rcu_read_lock()
1392 */ 1517 */
1393static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, 1518static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1394 struct in6_addr *daddr, 1519 struct in6_addr *daddr,
1395 struct in6_addr *saddr) 1520 struct in6_addr *saddr)
1396{ 1521{
@@ -1408,7 +1533,7 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1408 * Otherwise, the exception table is indexed by 1533 * Otherwise, the exception table is indexed by
1409 * a hash of only rt6i_dst. 1534 * a hash of only rt6i_dst.
1410 */ 1535 */
1411 if (rt->rt6i_src.plen) 1536 if (rt->fib6_src.plen)
1412 src_key = saddr; 1537 src_key = saddr;
1413#endif 1538#endif
1414 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 1539 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
@@ -1420,14 +1545,15 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1420} 1545}
1421 1546
1422/* Remove the passed in cached rt from the hash table that contains it */ 1547/* Remove the passed in cached rt from the hash table that contains it */
1423int rt6_remove_exception_rt(struct rt6_info *rt) 1548static int rt6_remove_exception_rt(struct rt6_info *rt)
1424{ 1549{
1425 struct rt6_exception_bucket *bucket; 1550 struct rt6_exception_bucket *bucket;
1426 struct rt6_info *from = rt->from;
1427 struct in6_addr *src_key = NULL; 1551 struct in6_addr *src_key = NULL;
1428 struct rt6_exception *rt6_ex; 1552 struct rt6_exception *rt6_ex;
1553 struct fib6_info *from;
1429 int err; 1554 int err;
1430 1555
1556 from = rcu_dereference(rt->from);
1431 if (!from || 1557 if (!from ||
1432 !(rt->rt6i_flags & RTF_CACHE)) 1558 !(rt->rt6i_flags & RTF_CACHE))
1433 return -EINVAL; 1559 return -EINVAL;
@@ -1445,7 +1571,7 @@ int rt6_remove_exception_rt(struct rt6_info *rt)
1445 * Otherwise, the exception table is indexed by 1571 * Otherwise, the exception table is indexed by
1446 * a hash of only rt6i_dst. 1572 * a hash of only rt6i_dst.
1447 */ 1573 */
1448 if (from->rt6i_src.plen) 1574 if (from->fib6_src.plen)
1449 src_key = &rt->rt6i_src.addr; 1575 src_key = &rt->rt6i_src.addr;
1450#endif 1576#endif
1451 rt6_ex = __rt6_find_exception_spinlock(&bucket, 1577 rt6_ex = __rt6_find_exception_spinlock(&bucket,
@@ -1468,7 +1594,7 @@ int rt6_remove_exception_rt(struct rt6_info *rt)
1468static void rt6_update_exception_stamp_rt(struct rt6_info *rt) 1594static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1469{ 1595{
1470 struct rt6_exception_bucket *bucket; 1596 struct rt6_exception_bucket *bucket;
1471 struct rt6_info *from = rt->from; 1597 struct fib6_info *from = rt->from;
1472 struct in6_addr *src_key = NULL; 1598 struct in6_addr *src_key = NULL;
1473 struct rt6_exception *rt6_ex; 1599 struct rt6_exception *rt6_ex;
1474 1600
@@ -1486,7 +1612,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1486 * Otherwise, the exception table is indexed by 1612 * Otherwise, the exception table is indexed by
1487 * a hash of only rt6i_dst. 1613 * a hash of only rt6i_dst.
1488 */ 1614 */
1489 if (from->rt6i_src.plen) 1615 if (from->fib6_src.plen)
1490 src_key = &rt->rt6i_src.addr; 1616 src_key = &rt->rt6i_src.addr;
1491#endif 1617#endif
1492 rt6_ex = __rt6_find_exception_rcu(&bucket, 1618 rt6_ex = __rt6_find_exception_rcu(&bucket,
@@ -1498,7 +1624,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1498 rcu_read_unlock(); 1624 rcu_read_unlock();
1499} 1625}
1500 1626
1501static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt) 1627static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1502{ 1628{
1503 struct rt6_exception_bucket *bucket; 1629 struct rt6_exception_bucket *bucket;
1504 struct rt6_exception *rt6_ex; 1630 struct rt6_exception *rt6_ex;
@@ -1540,7 +1666,7 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1540} 1666}
1541 1667
1542static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, 1668static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1543 struct rt6_info *rt, int mtu) 1669 struct fib6_info *rt, int mtu)
1544{ 1670{
1545 struct rt6_exception_bucket *bucket; 1671 struct rt6_exception_bucket *bucket;
1546 struct rt6_exception *rt6_ex; 1672 struct rt6_exception *rt6_ex;
@@ -1557,12 +1683,12 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1557 struct rt6_info *entry = rt6_ex->rt6i; 1683 struct rt6_info *entry = rt6_ex->rt6i;
1558 1684
1559 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected 1685 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1560 * route), the metrics of its rt->dst.from have already 1686 * route), the metrics of its rt->from have already
1561 * been updated. 1687 * been updated.
1562 */ 1688 */
1563 if (entry->rt6i_pmtu && 1689 if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1564 rt6_mtu_change_route_allowed(idev, entry, mtu)) 1690 rt6_mtu_change_route_allowed(idev, entry, mtu))
1565 entry->rt6i_pmtu = mtu; 1691 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1566 } 1692 }
1567 bucket++; 1693 bucket++;
1568 } 1694 }
@@ -1570,7 +1696,7 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1570 1696
1571#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) 1697#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
1572 1698
1573static void rt6_exceptions_clean_tohost(struct rt6_info *rt, 1699static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1574 struct in6_addr *gateway) 1700 struct in6_addr *gateway)
1575{ 1701{
1576 struct rt6_exception_bucket *bucket; 1702 struct rt6_exception_bucket *bucket;
@@ -1649,7 +1775,7 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1649 gc_args->more++; 1775 gc_args->more++;
1650} 1776}
1651 1777
1652void rt6_age_exceptions(struct rt6_info *rt, 1778void rt6_age_exceptions(struct fib6_info *rt,
1653 struct fib6_gc_args *gc_args, 1779 struct fib6_gc_args *gc_args,
1654 unsigned long now) 1780 unsigned long now)
1655{ 1781{
@@ -1680,32 +1806,22 @@ void rt6_age_exceptions(struct rt6_info *rt,
1680 rcu_read_unlock_bh(); 1806 rcu_read_unlock_bh();
1681} 1807}
1682 1808
1683struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, 1809/* must be called with rcu lock held */
1684 int oif, struct flowi6 *fl6, 1810struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1685 const struct sk_buff *skb, int flags) 1811 int oif, struct flowi6 *fl6, int strict)
1686{ 1812{
1687 struct fib6_node *fn, *saved_fn; 1813 struct fib6_node *fn, *saved_fn;
1688 struct rt6_info *rt, *rt_cache; 1814 struct fib6_info *f6i;
1689 int strict = 0;
1690 1815
1691 strict |= flags & RT6_LOOKUP_F_IFACE; 1816 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1692 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1693 if (net->ipv6.devconf_all->forwarding == 0)
1694 strict |= RT6_LOOKUP_F_REACHABLE;
1695
1696 rcu_read_lock();
1697
1698 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1699 saved_fn = fn; 1817 saved_fn = fn;
1700 1818
1701 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 1819 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1702 oif = 0; 1820 oif = 0;
1703 1821
1704redo_rt6_select: 1822redo_rt6_select:
1705 rt = rt6_select(net, fn, oif, strict); 1823 f6i = rt6_select(net, fn, oif, strict);
1706 if (rt->rt6i_nsiblings) 1824 if (f6i == net->ipv6.fib6_null_entry) {
1707 rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
1708 if (rt == net->ipv6.ip6_null_entry) {
1709 fn = fib6_backtrack(fn, &fl6->saddr); 1825 fn = fib6_backtrack(fn, &fl6->saddr);
1710 if (fn) 1826 if (fn)
1711 goto redo_rt6_select; 1827 goto redo_rt6_select;
@@ -1717,45 +1833,57 @@ redo_rt6_select:
1717 } 1833 }
1718 } 1834 }
1719 1835
1720 /*Search through exception table */ 1836 trace_fib6_table_lookup(net, f6i, table, fl6);
1721 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1722 if (rt_cache)
1723 rt = rt_cache;
1724 1837
1725 if (rt == net->ipv6.ip6_null_entry) { 1838 return f6i;
1839}
1840
1841struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1842 int oif, struct flowi6 *fl6,
1843 const struct sk_buff *skb, int flags)
1844{
1845 struct fib6_info *f6i;
1846 struct rt6_info *rt;
1847 int strict = 0;
1848
1849 strict |= flags & RT6_LOOKUP_F_IFACE;
1850 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1851 if (net->ipv6.devconf_all->forwarding == 0)
1852 strict |= RT6_LOOKUP_F_REACHABLE;
1853
1854 rcu_read_lock();
1855
1856 f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1857 if (f6i->fib6_nsiblings)
1858 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1859
1860 if (f6i == net->ipv6.fib6_null_entry) {
1861 rt = net->ipv6.ip6_null_entry;
1726 rcu_read_unlock(); 1862 rcu_read_unlock();
1727 dst_hold(&rt->dst); 1863 dst_hold(&rt->dst);
1728 trace_fib6_table_lookup(net, rt, table, fl6);
1729 return rt; 1864 return rt;
1730 } else if (rt->rt6i_flags & RTF_CACHE) { 1865 }
1731 if (ip6_hold_safe(net, &rt, true)) { 1866
1867 /*Search through exception table */
1868 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1869 if (rt) {
1870 if (ip6_hold_safe(net, &rt, true))
1732 dst_use_noref(&rt->dst, jiffies); 1871 dst_use_noref(&rt->dst, jiffies);
1733 rt6_dst_from_metrics_check(rt); 1872
1734 }
1735 rcu_read_unlock(); 1873 rcu_read_unlock();
1736 trace_fib6_table_lookup(net, rt, table, fl6);
1737 return rt; 1874 return rt;
1738 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && 1875 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1739 !(rt->rt6i_flags & RTF_GATEWAY))) { 1876 !(f6i->fib6_flags & RTF_GATEWAY))) {
1740 /* Create a RTF_CACHE clone which will not be 1877 /* Create a RTF_CACHE clone which will not be
1741 * owned by the fib6 tree. It is for the special case where 1878 * owned by the fib6 tree. It is for the special case where
1742 * the daddr in the skb during the neighbor look-up is different 1879 * the daddr in the skb during the neighbor look-up is different
1743 * from the fl6->daddr used to look-up route here. 1880 * from the fl6->daddr used to look-up route here.
1744 */ 1881 */
1745
1746 struct rt6_info *uncached_rt; 1882 struct rt6_info *uncached_rt;
1747 1883
1748 if (ip6_hold_safe(net, &rt, true)) { 1884 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1749 dst_use_noref(&rt->dst, jiffies);
1750 } else {
1751 rcu_read_unlock();
1752 uncached_rt = rt;
1753 goto uncached_rt_out;
1754 }
1755 rcu_read_unlock();
1756 1885
1757 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); 1886 rcu_read_unlock();
1758 dst_release(&rt->dst);
1759 1887
1760 if (uncached_rt) { 1888 if (uncached_rt) {
1761 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() 1889 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
@@ -1768,36 +1896,21 @@ redo_rt6_select:
1768 dst_hold(&uncached_rt->dst); 1896 dst_hold(&uncached_rt->dst);
1769 } 1897 }
1770 1898
1771uncached_rt_out:
1772 trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1773 return uncached_rt; 1899 return uncached_rt;
1774
1775 } else { 1900 } else {
1776 /* Get a percpu copy */ 1901 /* Get a percpu copy */
1777 1902
1778 struct rt6_info *pcpu_rt; 1903 struct rt6_info *pcpu_rt;
1779 1904
1780 dst_use_noref(&rt->dst, jiffies);
1781 local_bh_disable(); 1905 local_bh_disable();
1782 pcpu_rt = rt6_get_pcpu_route(rt); 1906 pcpu_rt = rt6_get_pcpu_route(f6i);
1783 1907
1784 if (!pcpu_rt) { 1908 if (!pcpu_rt)
1785 /* atomic_inc_not_zero() is needed when using rcu */ 1909 pcpu_rt = rt6_make_pcpu_route(net, f6i);
1786 if (atomic_inc_not_zero(&rt->rt6i_ref)) { 1910
1787 /* No dst_hold() on rt is needed because grabbing
1788 * rt->rt6i_ref makes sure rt can't be released.
1789 */
1790 pcpu_rt = rt6_make_pcpu_route(rt);
1791 rt6_release(rt);
1792 } else {
1793 /* rt is already removed from tree */
1794 pcpu_rt = net->ipv6.ip6_null_entry;
1795 dst_hold(&pcpu_rt->dst);
1796 }
1797 }
1798 local_bh_enable(); 1911 local_bh_enable();
1799 rcu_read_unlock(); 1912 rcu_read_unlock();
1800 trace_fib6_table_lookup(net, pcpu_rt, table, fl6); 1913
1801 return pcpu_rt; 1914 return pcpu_rt;
1802 } 1915 }
1803} 1916}
@@ -1868,7 +1981,7 @@ out:
1868 } else { 1981 } else {
1869 keys->addrs.v6addrs.src = key_iph->saddr; 1982 keys->addrs.v6addrs.src = key_iph->saddr;
1870 keys->addrs.v6addrs.dst = key_iph->daddr; 1983 keys->addrs.v6addrs.dst = key_iph->daddr;
1871 keys->tags.flow_label = ip6_flowinfo(key_iph); 1984 keys->tags.flow_label = ip6_flowlabel(key_iph);
1872 keys->basic.ip_proto = key_iph->nexthdr; 1985 keys->basic.ip_proto = key_iph->nexthdr;
1873 } 1986 }
1874} 1987}
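
The multipath-hash hunks here switch the hash input from the full flow-info field to just the 20-bit flow label. A standalone illustration of the difference, working on the first 32-bit word of an IPv6 header (version:4, traffic class:8, flow label:20, per RFC 8200); the helper names and masks below are made up for this sketch and operate in host byte order:

#include <stdint.h>
#include <stdio.h>

#define TOY_FLOWINFO_MASK  0x0fffffffu	/* traffic class + flow label */
#define TOY_FLOWLABEL_MASK 0x000fffffu	/* flow label only */

static uint32_t toy_flowinfo(uint32_t first_word)
{
	return first_word & TOY_FLOWINFO_MASK;
}

static uint32_t toy_flowlabel(uint32_t first_word)
{
	return first_word & TOY_FLOWLABEL_MASK;
}

int main(void)
{
	/* version 6, traffic class 0xb8, flow label 0x12345 */
	uint32_t word = (6u << 28) | (0xb8u << 20) | 0x12345u;

	printf("flowinfo : 0x%07x\n", toy_flowinfo(word));   /* 0xb812345 */
	printf("flowlabel: 0x%05x\n", toy_flowlabel(word));  /* 0x12345 */
	return 0;
}

Hashing only the label keeps ECMP path selection independent of DSCP/ECN remarking along the path.
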
@@ -1889,7 +2002,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1889 } else { 2002 } else {
1890 hash_keys.addrs.v6addrs.src = fl6->saddr; 2003 hash_keys.addrs.v6addrs.src = fl6->saddr;
1891 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2004 hash_keys.addrs.v6addrs.dst = fl6->daddr;
1892 hash_keys.tags.flow_label = (__force u32)fl6->flowlabel; 2005 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
1893 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2006 hash_keys.basic.ip_proto = fl6->flowi6_proto;
1894 } 2007 }
1895 break; 2008 break;
@@ -2020,7 +2133,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
2020 rt->rt6i_idev = in6_dev_get(loopback_dev); 2133 rt->rt6i_idev = in6_dev_get(loopback_dev);
2021 rt->rt6i_gateway = ort->rt6i_gateway; 2134 rt->rt6i_gateway = ort->rt6i_gateway;
2022 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2135 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2023 rt->rt6i_metric = 0;
2024 2136
2025 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2137 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2026#ifdef CONFIG_IPV6_SUBTREES 2138#ifdef CONFIG_IPV6_SUBTREES
@@ -2036,18 +2148,27 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
2036 * Destination cache support functions 2148 * Destination cache support functions
2037 */ 2149 */
2038 2150
2039static void rt6_dst_from_metrics_check(struct rt6_info *rt) 2151static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2040{ 2152{
2041 if (rt->from && 2153 u32 rt_cookie = 0;
2042 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst)) 2154
2043 dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true); 2155 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2156 return false;
2157
2158 if (fib6_check_expired(f6i))
2159 return false;
2160
2161 return true;
2044} 2162}
2045 2163
2046static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) 2164static struct dst_entry *rt6_check(struct rt6_info *rt,
2165 struct fib6_info *from,
2166 u32 cookie)
2047{ 2167{
2048 u32 rt_cookie = 0; 2168 u32 rt_cookie = 0;
2049 2169
2050 if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie) 2170 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2171 rt_cookie != cookie)
2051 return NULL; 2172 return NULL;
2052 2173
2053 if (rt6_check_expired(rt)) 2174 if (rt6_check_expired(rt))
@@ -2056,11 +2177,13 @@ static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
2056 return &rt->dst; 2177 return &rt->dst;
2057} 2178}
2058 2179
2059static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) 2180static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2181 struct fib6_info *from,
2182 u32 cookie)
2060{ 2183{
2061 if (!__rt6_check_expired(rt) && 2184 if (!__rt6_check_expired(rt) &&
2062 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2185 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2063 rt6_check(rt->from, cookie)) 2186 fib6_check(from, cookie))
2064 return &rt->dst; 2187 return &rt->dst;
2065 else 2188 else
2066 return NULL; 2189 return NULL;
@@ -2068,22 +2191,30 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
2068 2191
2069static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2192static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2070{ 2193{
2194 struct dst_entry *dst_ret;
2195 struct fib6_info *from;
2071 struct rt6_info *rt; 2196 struct rt6_info *rt;
2072 2197
2073 rt = (struct rt6_info *) dst; 2198 rt = container_of(dst, struct rt6_info, dst);
2199
2200 rcu_read_lock();
2074 2201
2075 /* All IPV6 dsts are created with ->obsolete set to the value 2202 /* All IPV6 dsts are created with ->obsolete set to the value
2076 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2203 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2077 * into this function always. 2204 * into this function always.
2078 */ 2205 */
2079 2206
2080 rt6_dst_from_metrics_check(rt); 2207 from = rcu_dereference(rt->from);
2081 2208
2082 if (rt->rt6i_flags & RTF_PCPU || 2209 if (from && (rt->rt6i_flags & RTF_PCPU ||
2083 (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) 2210 unlikely(!list_empty(&rt->rt6i_uncached))))
2084 return rt6_dst_from_check(rt, cookie); 2211 dst_ret = rt6_dst_from_check(rt, from, cookie);
2085 else 2212 else
2086 return rt6_check(rt, cookie); 2213 dst_ret = rt6_check(rt, from, cookie);
2214
2215 rcu_read_unlock();
2216
2217 return dst_ret;
2087} 2218}
2088 2219
2089static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2220static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
@@ -2092,10 +2223,12 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2092 2223
2093 if (rt) { 2224 if (rt) {
2094 if (rt->rt6i_flags & RTF_CACHE) { 2225 if (rt->rt6i_flags & RTF_CACHE) {
2226 rcu_read_lock();
2095 if (rt6_check_expired(rt)) { 2227 if (rt6_check_expired(rt)) {
2096 ip6_del_rt(rt); 2228 rt6_remove_exception_rt(rt);
2097 dst = NULL; 2229 dst = NULL;
2098 } 2230 }
2231 rcu_read_unlock();
2099 } else { 2232 } else {
2100 dst_release(dst); 2233 dst_release(dst);
2101 dst = NULL; 2234 dst = NULL;
@@ -2112,35 +2245,60 @@ static void ip6_link_failure(struct sk_buff *skb)
2112 2245
2113 rt = (struct rt6_info *) skb_dst(skb); 2246 rt = (struct rt6_info *) skb_dst(skb);
2114 if (rt) { 2247 if (rt) {
2248 rcu_read_lock();
2115 if (rt->rt6i_flags & RTF_CACHE) { 2249 if (rt->rt6i_flags & RTF_CACHE) {
2116 if (dst_hold_safe(&rt->dst)) 2250 if (dst_hold_safe(&rt->dst))
2117 ip6_del_rt(rt); 2251 rt6_remove_exception_rt(rt);
2118 } else { 2252 } else {
2253 struct fib6_info *from;
2119 struct fib6_node *fn; 2254 struct fib6_node *fn;
2120 2255
2121 rcu_read_lock(); 2256 from = rcu_dereference(rt->from);
2122 fn = rcu_dereference(rt->rt6i_node); 2257 if (from) {
2123 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2258 fn = rcu_dereference(from->fib6_node);
2124 fn->fn_sernum = -1; 2259 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2125 rcu_read_unlock(); 2260 fn->fn_sernum = -1;
2261 }
2126 } 2262 }
2263 rcu_read_unlock();
2127 } 2264 }
2128} 2265}
2129 2266
2267static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2268{
2269 if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2270 struct fib6_info *from;
2271
2272 rcu_read_lock();
2273 from = rcu_dereference(rt0->from);
2274 if (from)
2275 rt0->dst.expires = from->expires;
2276 rcu_read_unlock();
2277 }
2278
2279 dst_set_expires(&rt0->dst, timeout);
2280 rt0->rt6i_flags |= RTF_EXPIRES;
2281}
2282
2130static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2283static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2131{ 2284{
2132 struct net *net = dev_net(rt->dst.dev); 2285 struct net *net = dev_net(rt->dst.dev);
2133 2286
2287 dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2134 rt->rt6i_flags |= RTF_MODIFIED; 2288 rt->rt6i_flags |= RTF_MODIFIED;
2135 rt->rt6i_pmtu = mtu;
2136 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2289 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2137} 2290}
2138 2291
2139static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2292static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2140{ 2293{
2294 bool from_set;
2295
2296 rcu_read_lock();
2297 from_set = !!rcu_dereference(rt->from);
2298 rcu_read_unlock();
2299
2141 return !(rt->rt6i_flags & RTF_CACHE) && 2300 return !(rt->rt6i_flags & RTF_CACHE) &&
2142 (rt->rt6i_flags & RTF_PCPU || 2301 (rt->rt6i_flags & RTF_PCPU || from_set);
2143 rcu_access_pointer(rt->rt6i_node));
2144} 2302}
2145 2303
2146static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2304static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
@@ -2176,14 +2334,18 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2176 if (rt6->rt6i_flags & RTF_CACHE) 2334 if (rt6->rt6i_flags & RTF_CACHE)
2177 rt6_update_exception_stamp_rt(rt6); 2335 rt6_update_exception_stamp_rt(rt6);
2178 } else if (daddr) { 2336 } else if (daddr) {
2337 struct fib6_info *from;
2179 struct rt6_info *nrt6; 2338 struct rt6_info *nrt6;
2180 2339
2181 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); 2340 rcu_read_lock();
2341 from = rcu_dereference(rt6->from);
2342 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2182 if (nrt6) { 2343 if (nrt6) {
2183 rt6_do_update_pmtu(nrt6, mtu); 2344 rt6_do_update_pmtu(nrt6, mtu);
2184 if (rt6_insert_exception(nrt6, rt6)) 2345 if (rt6_insert_exception(nrt6, from))
2185 dst_release_immediate(&nrt6->dst); 2346 dst_release_immediate(&nrt6->dst);
2186 } 2347 }
2348 rcu_read_unlock();
2187 } 2349 }
2188} 2350}
2189 2351
@@ -2264,7 +2426,8 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
2264 int flags) 2426 int flags)
2265{ 2427{
2266 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2428 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2267 struct rt6_info *rt, *rt_cache; 2429 struct rt6_info *ret = NULL, *rt_cache;
2430 struct fib6_info *rt;
2268 struct fib6_node *fn; 2431 struct fib6_node *fn;
2269 2432
2270 /* Get the "current" route for this destination and 2433 /* Get the "current" route for this destination and
@@ -2278,32 +2441,32 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
2278 */ 2441 */
2279 2442
2280 rcu_read_lock(); 2443 rcu_read_lock();
2281 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2444 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2282restart: 2445restart:
2283 for_each_fib6_node_rt_rcu(fn) { 2446 for_each_fib6_node_rt_rcu(fn) {
2284 if (rt->rt6i_nh_flags & RTNH_F_DEAD) 2447 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2285 continue; 2448 continue;
2286 if (rt6_check_expired(rt)) 2449 if (fib6_check_expired(rt))
2287 continue; 2450 continue;
2288 if (rt->dst.error) 2451 if (rt->fib6_flags & RTF_REJECT)
2289 break; 2452 break;
2290 if (!(rt->rt6i_flags & RTF_GATEWAY)) 2453 if (!(rt->fib6_flags & RTF_GATEWAY))
2291 continue; 2454 continue;
2292 if (fl6->flowi6_oif != rt->dst.dev->ifindex) 2455 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2293 continue; 2456 continue;
2294 /* rt_cache's gateway might be different from its 'parent' 2457 /* rt_cache's gateway might be different from its 'parent'
2295 * in the case of an ip redirect. 2458 * in the case of an ip redirect.
2296 * So we keep searching in the exception table if the gateway 2459 * So we keep searching in the exception table if the gateway
2297 * is different. 2460 * is different.
2298 */ 2461 */
2299 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) { 2462 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2300 rt_cache = rt6_find_cached_rt(rt, 2463 rt_cache = rt6_find_cached_rt(rt,
2301 &fl6->daddr, 2464 &fl6->daddr,
2302 &fl6->saddr); 2465 &fl6->saddr);
2303 if (rt_cache && 2466 if (rt_cache &&
2304 ipv6_addr_equal(&rdfl->gateway, 2467 ipv6_addr_equal(&rdfl->gateway,
2305 &rt_cache->rt6i_gateway)) { 2468 &rt_cache->rt6i_gateway)) {
2306 rt = rt_cache; 2469 ret = rt_cache;
2307 break; 2470 break;
2308 } 2471 }
2309 continue; 2472 continue;
@@ -2312,25 +2475,28 @@ restart:
2312 } 2475 }
2313 2476
2314 if (!rt) 2477 if (!rt)
2315 rt = net->ipv6.ip6_null_entry; 2478 rt = net->ipv6.fib6_null_entry;
2316 else if (rt->dst.error) { 2479 else if (rt->fib6_flags & RTF_REJECT) {
2317 rt = net->ipv6.ip6_null_entry; 2480 ret = net->ipv6.ip6_null_entry;
2318 goto out; 2481 goto out;
2319 } 2482 }
2320 2483
2321 if (rt == net->ipv6.ip6_null_entry) { 2484 if (rt == net->ipv6.fib6_null_entry) {
2322 fn = fib6_backtrack(fn, &fl6->saddr); 2485 fn = fib6_backtrack(fn, &fl6->saddr);
2323 if (fn) 2486 if (fn)
2324 goto restart; 2487 goto restart;
2325 } 2488 }
2326 2489
2327out: 2490out:
2328 ip6_hold_safe(net, &rt, true); 2491 if (ret)
2492 dst_hold(&ret->dst);
2493 else
2494 ret = ip6_create_rt_rcu(rt);
2329 2495
2330 rcu_read_unlock(); 2496 rcu_read_unlock();
2331 2497
2332 trace_fib6_table_lookup(net, rt, table, fl6); 2498 trace_fib6_table_lookup(net, rt, table, fl6);
2333 return rt; 2499 return ret;
2334}; 2500};
2335 2501
2336static struct dst_entry *ip6_route_redirect(struct net *net, 2502static struct dst_entry *ip6_route_redirect(struct net *net,
@@ -2422,12 +2588,8 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2422 2588
2423static unsigned int ip6_mtu(const struct dst_entry *dst) 2589static unsigned int ip6_mtu(const struct dst_entry *dst)
2424{ 2590{
2425 const struct rt6_info *rt = (const struct rt6_info *)dst;
2426 unsigned int mtu = rt->rt6i_pmtu;
2427 struct inet6_dev *idev; 2591 struct inet6_dev *idev;
2428 2592 unsigned int mtu;
2429 if (mtu)
2430 goto out;
2431 2593
2432 mtu = dst_metric_raw(dst, RTAX_MTU); 2594 mtu = dst_metric_raw(dst, RTAX_MTU);
2433 if (mtu) 2595 if (mtu)
@@ -2447,6 +2609,54 @@ out:
2447 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2609 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2448} 2610}
2449 2611
2612/* MTU selection:
2613 * 1. mtu on route is locked - use it
2614 * 2. mtu from nexthop exception
2615 * 3. mtu from egress device
2616 *
2617 * based on ip6_dst_mtu_forward and exception logic of
2618 * rt6_find_cached_rt; called with rcu_read_lock
2619 */
2620u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2621 struct in6_addr *saddr)
2622{
2623 struct rt6_exception_bucket *bucket;
2624 struct rt6_exception *rt6_ex;
2625 struct in6_addr *src_key;
2626 struct inet6_dev *idev;
2627 u32 mtu = 0;
2628
2629 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2630 mtu = f6i->fib6_pmtu;
2631 if (mtu)
2632 goto out;
2633 }
2634
2635 src_key = NULL;
2636#ifdef CONFIG_IPV6_SUBTREES
2637 if (f6i->fib6_src.plen)
2638 src_key = saddr;
2639#endif
2640
2641 bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2642 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2643 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2644 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2645
2646 if (likely(!mtu)) {
2647 struct net_device *dev = fib6_info_nh_dev(f6i);
2648
2649 mtu = IPV6_MIN_MTU;
2650 idev = __in6_dev_get(dev);
2651 if (idev && idev->cnf.mtu6 > mtu)
2652 mtu = idev->cnf.mtu6;
2653 }
2654
2655 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2656out:
2657 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2658}
2659
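
A reduced model of the MTU precedence implemented by ip6_mtu_from_fib6() just above: a locked per-route MTU wins, then a cached nexthop-exception MTU, then the egress device MTU, with the result capped and any tunnel encapsulation headroom subtracted. All types and values below are illustrative stand-ins, not the kernel's structures:

#include <stdio.h>

#define TOY_IPV6_MIN_MTU 1280u
#define TOY_IP6_MAX_MTU  65535u

struct toy_ctx {
	unsigned int route_mtu;		/* f6i->fib6_pmtu, 0 if unset */
	int mtu_locked;			/* route has RTAX_MTU locked */
	unsigned int exception_mtu;	/* cached exception MTU, 0 if none */
	unsigned int dev_mtu;		/* egress device mtu6 */
	unsigned int lwt_headroom;	/* lwtunnel encap headroom */
};

static unsigned int toy_mtu_from_fib6(const struct toy_ctx *c)
{
	unsigned int mtu = 0;

	if (c->mtu_locked && c->route_mtu)	/* 1. locked route MTU */
		mtu = c->route_mtu;
	if (!mtu)				/* 2. nexthop exception */
		mtu = c->exception_mtu;
	if (!mtu) {				/* 3. egress device */
		mtu = TOY_IPV6_MIN_MTU;
		if (c->dev_mtu > mtu)
			mtu = c->dev_mtu;
	}

	if (mtu > TOY_IP6_MAX_MTU)
		mtu = TOY_IP6_MAX_MTU;

	return mtu - c->lwt_headroom;
}

int main(void)
{
	struct toy_ctx c = { .route_mtu = 0, .mtu_locked = 0,
			     .exception_mtu = 1400, .dev_mtu = 1500,
			     .lwt_headroom = 0 };

	printf("effective mtu: %u\n", toy_mtu_from_fib6(&c));	/* 1400 */
	return 0;
}
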
2450struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2660struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2451 struct flowi6 *fl6) 2661 struct flowi6 *fl6)
2452{ 2662{
@@ -2511,60 +2721,22 @@ out:
2511 return entries > rt_max_size; 2721 return entries > rt_max_size;
2512} 2722}
2513 2723
2514static int ip6_convert_metrics(struct mx6_config *mxc, 2724static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2515 const struct fib6_config *cfg) 2725 struct fib6_config *cfg)
2516{ 2726{
2517 struct net *net = cfg->fc_nlinfo.nl_net; 2727 struct dst_metrics *p;
2518 bool ecn_ca = false;
2519 struct nlattr *nla;
2520 int remaining;
2521 u32 *mp;
2522 2728
2523 if (!cfg->fc_mx) 2729 if (!cfg->fc_mx)
2524 return 0; 2730 return 0;
2525 2731
2526 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); 2732 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2527 if (unlikely(!mp)) 2733 if (unlikely(!p))
2528 return -ENOMEM; 2734 return -ENOMEM;
2529 2735
2530 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { 2736 refcount_set(&p->refcnt, 1);
2531 int type = nla_type(nla); 2737 rt->fib6_metrics = p;
2532 u32 val;
2533
2534 if (!type)
2535 continue;
2536 if (unlikely(type > RTAX_MAX))
2537 goto err;
2538
2539 if (type == RTAX_CC_ALGO) {
2540 char tmp[TCP_CA_NAME_MAX];
2541
2542 nla_strlcpy(tmp, nla, sizeof(tmp));
2543 val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2544 if (val == TCP_CA_UNSPEC)
2545 goto err;
2546 } else {
2547 val = nla_get_u32(nla);
2548 }
2549 if (type == RTAX_HOPLIMIT && val > 255)
2550 val = 255;
2551 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2552 goto err;
2553
2554 mp[type - 1] = val;
2555 __set_bit(type - 1, mxc->mx_valid);
2556 }
2557
2558 if (ecn_ca) {
2559 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2560 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2561 }
2562 2738
2563 mxc->mx = mp; 2739 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2564 return 0;
2565 err:
2566 kfree(mp);
2567 return -EINVAL;
2568} 2740}
2569 2741
2570static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2742static struct rt6_info *ip6_nh_lookup_table(struct net *net,
@@ -2750,11 +2922,12 @@ out:
2750 return err; 2922 return err;
2751} 2923}
2752 2924
2753static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, 2925static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2926 gfp_t gfp_flags,
2754 struct netlink_ext_ack *extack) 2927 struct netlink_ext_ack *extack)
2755{ 2928{
2756 struct net *net = cfg->fc_nlinfo.nl_net; 2929 struct net *net = cfg->fc_nlinfo.nl_net;
2757 struct rt6_info *rt = NULL; 2930 struct fib6_info *rt = NULL;
2758 struct net_device *dev = NULL; 2931 struct net_device *dev = NULL;
2759 struct inet6_dev *idev = NULL; 2932 struct inet6_dev *idev = NULL;
2760 struct fib6_table *table; 2933 struct fib6_table *table;
@@ -2773,6 +2946,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2773 goto out; 2946 goto out;
2774 } 2947 }
2775 2948
2949 if (cfg->fc_type > RTN_MAX) {
2950 NL_SET_ERR_MSG(extack, "Invalid route type");
2951 goto out;
2952 }
2953
2776 if (cfg->fc_dst_len > 128) { 2954 if (cfg->fc_dst_len > 128) {
2777 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2955 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2778 goto out; 2956 goto out;
@@ -2831,35 +3009,30 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2831 if (!table) 3009 if (!table)
2832 goto out; 3010 goto out;
2833 3011
2834 rt = ip6_dst_alloc(net, NULL, 3012 err = -ENOMEM;
2835 (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT); 3013 rt = fib6_info_alloc(gfp_flags);
3014 if (!rt)
3015 goto out;
3016
3017 if (cfg->fc_flags & RTF_ADDRCONF)
3018 rt->dst_nocount = true;
2836 3019
2837 if (!rt) { 3020 err = ip6_convert_metrics(net, rt, cfg);
2838 err = -ENOMEM; 3021 if (err < 0)
2839 goto out; 3022 goto out;
2840 }
2841 3023
2842 if (cfg->fc_flags & RTF_EXPIRES) 3024 if (cfg->fc_flags & RTF_EXPIRES)
2843 rt6_set_expires(rt, jiffies + 3025 fib6_set_expires(rt, jiffies +
2844 clock_t_to_jiffies(cfg->fc_expires)); 3026 clock_t_to_jiffies(cfg->fc_expires));
2845 else 3027 else
2846 rt6_clean_expires(rt); 3028 fib6_clean_expires(rt);
2847 3029
2848 if (cfg->fc_protocol == RTPROT_UNSPEC) 3030 if (cfg->fc_protocol == RTPROT_UNSPEC)
2849 cfg->fc_protocol = RTPROT_BOOT; 3031 cfg->fc_protocol = RTPROT_BOOT;
2850 rt->rt6i_protocol = cfg->fc_protocol; 3032 rt->fib6_protocol = cfg->fc_protocol;
2851 3033
2852 addr_type = ipv6_addr_type(&cfg->fc_dst); 3034 addr_type = ipv6_addr_type(&cfg->fc_dst);
2853 3035
2854 if (addr_type & IPV6_ADDR_MULTICAST)
2855 rt->dst.input = ip6_mc_input;
2856 else if (cfg->fc_flags & RTF_LOCAL)
2857 rt->dst.input = ip6_input;
2858 else
2859 rt->dst.input = ip6_forward;
2860
2861 rt->dst.output = ip6_output;
2862
2863 if (cfg->fc_encap) { 3036 if (cfg->fc_encap) {
2864 struct lwtunnel_state *lwtstate; 3037 struct lwtunnel_state *lwtstate;
2865 3038
@@ -2868,22 +3041,23 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2868 &lwtstate, extack); 3041 &lwtstate, extack);
2869 if (err) 3042 if (err)
2870 goto out; 3043 goto out;
2871 rt->dst.lwtstate = lwtstate_get(lwtstate); 3044 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
2872 lwtunnel_set_redirect(&rt->dst);
2873 } 3045 }
2874 3046
2875 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3047 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2876 rt->rt6i_dst.plen = cfg->fc_dst_len; 3048 rt->fib6_dst.plen = cfg->fc_dst_len;
2877 if (rt->rt6i_dst.plen == 128) 3049 if (rt->fib6_dst.plen == 128)
2878 rt->dst.flags |= DST_HOST; 3050 rt->dst_host = true;
2879 3051
2880#ifdef CONFIG_IPV6_SUBTREES 3052#ifdef CONFIG_IPV6_SUBTREES
2881 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); 3053 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
2882 rt->rt6i_src.plen = cfg->fc_src_len; 3054 rt->fib6_src.plen = cfg->fc_src_len;
2883#endif 3055#endif
2884 3056
2885 rt->rt6i_metric = cfg->fc_metric; 3057 rt->fib6_metric = cfg->fc_metric;
2886 rt->rt6i_nh_weight = 1; 3058 rt->fib6_nh.nh_weight = 1;
3059
3060 rt->fib6_type = cfg->fc_type;
2887 3061
2888 /* We cannot add true routes via loopback here, 3062 /* We cannot add true routes via loopback here,
2889 they would result in kernel looping; promote them to reject routes 3063 they would result in kernel looping; promote them to reject routes
@@ -2906,28 +3080,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2906 goto out; 3080 goto out;
2907 } 3081 }
2908 } 3082 }
2909 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; 3083 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
2910 switch (cfg->fc_type) {
2911 case RTN_BLACKHOLE:
2912 rt->dst.error = -EINVAL;
2913 rt->dst.output = dst_discard_out;
2914 rt->dst.input = dst_discard;
2915 break;
2916 case RTN_PROHIBIT:
2917 rt->dst.error = -EACCES;
2918 rt->dst.output = ip6_pkt_prohibit_out;
2919 rt->dst.input = ip6_pkt_prohibit;
2920 break;
2921 case RTN_THROW:
2922 case RTN_UNREACHABLE:
2923 default:
2924 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2925 : (cfg->fc_type == RTN_UNREACHABLE)
2926 ? -EHOSTUNREACH : -ENETUNREACH;
2927 rt->dst.output = ip6_pkt_discard_out;
2928 rt->dst.input = ip6_pkt_discard;
2929 break;
2930 }
2931 goto install_route; 3084 goto install_route;
2932 } 3085 }
2933 3086
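The removed switch above hard-wired each reject route type to a dst error and discard handlers at creation time; after this change only fib6_type is stored, and the mapping is applied later when a dst is built from the entry. A minimal stand-alone sketch of that type-to-errno mapping (the enum names are illustrative; the errno values are the ones visible in the removed lines):

#include <errno.h>
#include <stdio.h>

enum model_rtn {
	MODEL_RTN_BLACKHOLE,
	MODEL_RTN_PROHIBIT,
	MODEL_RTN_THROW,
	MODEL_RTN_UNREACHABLE,
	MODEL_RTN_OTHER,
};

/* same errno choices as the removed switch */
static int model_reject_error(enum model_rtn type)
{
	switch (type) {
	case MODEL_RTN_BLACKHOLE:   return -EINVAL;
	case MODEL_RTN_PROHIBIT:    return -EACCES;
	case MODEL_RTN_THROW:       return -EAGAIN;
	case MODEL_RTN_UNREACHABLE: return -EHOSTUNREACH;
	default:                    return -ENETUNREACH;
	}
}

int main(void)
{
	printf("prohibit -> %d, throw -> %d\n",
	       model_reject_error(MODEL_RTN_PROHIBIT),
	       model_reject_error(MODEL_RTN_THROW));
	return 0;
}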
@@ -2936,7 +3089,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2936 if (err) 3089 if (err)
2937 goto out; 3090 goto out;
2938 3091
2939 rt->rt6i_gateway = cfg->fc_gateway; 3092 rt->fib6_nh.nh_gw = cfg->fc_gateway;
2940 } 3093 }
2941 3094
2942 err = -ENODEV; 3095 err = -ENODEV;
@@ -2961,96 +3114,82 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2961 err = -EINVAL; 3114 err = -EINVAL;
2962 goto out; 3115 goto out;
2963 } 3116 }
2964 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc; 3117 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
2965 rt->rt6i_prefsrc.plen = 128; 3118 rt->fib6_prefsrc.plen = 128;
2966 } else 3119 } else
2967 rt->rt6i_prefsrc.plen = 0; 3120 rt->fib6_prefsrc.plen = 0;
2968 3121
2969 rt->rt6i_flags = cfg->fc_flags; 3122 rt->fib6_flags = cfg->fc_flags;
2970 3123
2971install_route: 3124install_route:
2972 if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3125 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2973 !netif_carrier_ok(dev)) 3126 !netif_carrier_ok(dev))
2974 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; 3127 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
2975 rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); 3128 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
2976 rt->dst.dev = dev; 3129 rt->fib6_nh.nh_dev = dev;
2977 rt->rt6i_idev = idev; 3130 rt->fib6_table = table;
2978 rt->rt6i_table = table;
2979 3131
2980 cfg->fc_nlinfo.nl_net = dev_net(dev); 3132 cfg->fc_nlinfo.nl_net = dev_net(dev);
2981 3133
3134 if (idev)
3135 in6_dev_put(idev);
3136
2982 return rt; 3137 return rt;
2983out: 3138out:
2984 if (dev) 3139 if (dev)
2985 dev_put(dev); 3140 dev_put(dev);
2986 if (idev) 3141 if (idev)
2987 in6_dev_put(idev); 3142 in6_dev_put(idev);
2988 if (rt)
2989 dst_release_immediate(&rt->dst);
2990 3143
3144 fib6_info_release(rt);
2991 return ERR_PTR(err); 3145 return ERR_PTR(err);
2992} 3146}
2993 3147
2994int ip6_route_add(struct fib6_config *cfg, 3148int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
2995 struct netlink_ext_ack *extack) 3149 struct netlink_ext_ack *extack)
2996{ 3150{
2997 struct mx6_config mxc = { .mx = NULL, }; 3151 struct fib6_info *rt;
2998 struct rt6_info *rt;
2999 int err; 3152 int err;
3000 3153
3001 rt = ip6_route_info_create(cfg, extack); 3154 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3002 if (IS_ERR(rt)) { 3155 if (IS_ERR(rt))
3003 err = PTR_ERR(rt); 3156 return PTR_ERR(rt);
3004 rt = NULL;
3005 goto out;
3006 }
3007
3008 err = ip6_convert_metrics(&mxc, cfg);
3009 if (err)
3010 goto out;
3011
3012 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
3013
3014 kfree(mxc.mx);
3015 3157
3016 return err; 3158 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3017out: 3159 fib6_info_release(rt);
3018 if (rt)
3019 dst_release_immediate(&rt->dst);
3020 3160
3021 return err; 3161 return err;
3022} 3162}
3023 3163
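After this hunk, ip6_route_add() always drops its creation reference with fib6_info_release() once __ip6_ins_rt() has run, presumably relying on the reference taken during insertion to keep the entry alive. A minimal stand-alone model of that hold/release pattern using C11 atomics (this models only the idea, not the kernel's fib6_info):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	atomic_int refcnt;
	int        metric;
};

static struct entry *entry_alloc(int metric)
{
	struct entry *e = calloc(1, sizeof(*e));

	if (e) {
		atomic_init(&e->refcnt, 1);   /* creator's reference */
		e->metric = metric;
	}
	return e;
}

static void entry_hold(struct entry *e)
{
	atomic_fetch_add(&e->refcnt, 1);
}

static void entry_release(struct entry *e)
{
	if (e && atomic_fetch_sub(&e->refcnt, 1) == 1)
		free(e);   /* last reference gone */
}

int main(void)
{
	struct entry *e = entry_alloc(1024);

	if (!e)
		return 1;
	entry_hold(e);      /* the "table" takes its own reference on insert */
	entry_release(e);   /* creator drops its reference; entry survives */
	entry_release(e);   /* table drops its reference; entry is freed */
	return 0;
}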
3024static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) 3164static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3025{ 3165{
3026 int err; 3166 struct net *net = info->nl_net;
3027 struct fib6_table *table; 3167 struct fib6_table *table;
3028 struct net *net = dev_net(rt->dst.dev); 3168 int err;
3029 3169
3030 if (rt == net->ipv6.ip6_null_entry) { 3170 if (rt == net->ipv6.fib6_null_entry) {
3031 err = -ENOENT; 3171 err = -ENOENT;
3032 goto out; 3172 goto out;
3033 } 3173 }
3034 3174
3035 table = rt->rt6i_table; 3175 table = rt->fib6_table;
3036 spin_lock_bh(&table->tb6_lock); 3176 spin_lock_bh(&table->tb6_lock);
3037 err = fib6_del(rt, info); 3177 err = fib6_del(rt, info);
3038 spin_unlock_bh(&table->tb6_lock); 3178 spin_unlock_bh(&table->tb6_lock);
3039 3179
3040out: 3180out:
3041 ip6_rt_put(rt); 3181 fib6_info_release(rt);
3042 return err; 3182 return err;
3043} 3183}
3044 3184
3045int ip6_del_rt(struct rt6_info *rt) 3185int ip6_del_rt(struct net *net, struct fib6_info *rt)
3046{ 3186{
3047 struct nl_info info = { 3187 struct nl_info info = { .nl_net = net };
3048 .nl_net = dev_net(rt->dst.dev), 3188
3049 };
3050 return __ip6_del_rt(rt, &info); 3189 return __ip6_del_rt(rt, &info);
3051} 3190}
3052 3191
3053static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) 3192static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3054{ 3193{
3055 struct nl_info *info = &cfg->fc_nlinfo; 3194 struct nl_info *info = &cfg->fc_nlinfo;
3056 struct net *net = info->nl_net; 3195 struct net *net = info->nl_net;
@@ -3058,20 +3197,20 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
3058 struct fib6_table *table; 3197 struct fib6_table *table;
3059 int err = -ENOENT; 3198 int err = -ENOENT;
3060 3199
3061 if (rt == net->ipv6.ip6_null_entry) 3200 if (rt == net->ipv6.fib6_null_entry)
3062 goto out_put; 3201 goto out_put;
3063 table = rt->rt6i_table; 3202 table = rt->fib6_table;
3064 spin_lock_bh(&table->tb6_lock); 3203 spin_lock_bh(&table->tb6_lock);
3065 3204
3066 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { 3205 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3067 struct rt6_info *sibling, *next_sibling; 3206 struct fib6_info *sibling, *next_sibling;
3068 3207
3069 /* prefer to send a single notification with all hops */ 3208 /* prefer to send a single notification with all hops */
3070 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3209 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3071 if (skb) { 3210 if (skb) {
3072 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3211 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3073 3212
3074 if (rt6_fill_node(net, skb, rt, 3213 if (rt6_fill_node(net, skb, rt, NULL,
3075 NULL, NULL, 0, RTM_DELROUTE, 3214 NULL, NULL, 0, RTM_DELROUTE,
3076 info->portid, seq, 0) < 0) { 3215 info->portid, seq, 0) < 0) {
3077 kfree_skb(skb); 3216 kfree_skb(skb);
@@ -3081,8 +3220,8 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
3081 } 3220 }
3082 3221
3083 list_for_each_entry_safe(sibling, next_sibling, 3222 list_for_each_entry_safe(sibling, next_sibling,
3084 &rt->rt6i_siblings, 3223 &rt->fib6_siblings,
3085 rt6i_siblings) { 3224 fib6_siblings) {
3086 err = fib6_del(sibling, info); 3225 err = fib6_del(sibling, info);
3087 if (err) 3226 if (err)
3088 goto out_unlock; 3227 goto out_unlock;
@@ -3093,7 +3232,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
3093out_unlock: 3232out_unlock:
3094 spin_unlock_bh(&table->tb6_lock); 3233 spin_unlock_bh(&table->tb6_lock);
3095out_put: 3234out_put:
3096 ip6_rt_put(rt); 3235 fib6_info_release(rt);
3097 3236
3098 if (skb) { 3237 if (skb) {
3099 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3238 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
@@ -3102,11 +3241,28 @@ out_put:
3102 return err; 3241 return err;
3103} 3242}
3104 3243
3244static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3245{
3246 int rc = -ESRCH;
3247
3248 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3249 goto out;
3250
3251 if (cfg->fc_flags & RTF_GATEWAY &&
3252 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3253 goto out;
3254 if (dst_hold_safe(&rt->dst))
3255 rc = rt6_remove_exception_rt(rt);
3256out:
3257 return rc;
3258}
3259
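The new ip6_del_cached_rt() helper above only removes an exception (cached) route when the delete request's ifindex and gateway, if given, match the cached entry. A small stand-alone sketch of that filter; the struct and field names are made up for illustration:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct model_addr6 { unsigned char s6[16]; };

struct model_cached_rt {
	int                ifindex;
	struct model_addr6 gw;
};

struct model_del_req {
	int                ifindex;   /* 0 means "any device" */
	bool               match_gw;  /* a gateway was given in the request */
	struct model_addr6 gw;
};

/* mirror the two checks in ip6_del_cached_rt(): device first, then gateway */
static bool cached_rt_matches(const struct model_cached_rt *rt,
			      const struct model_del_req *req)
{
	if (req->ifindex && rt->ifindex != req->ifindex)
		return false;
	if (req->match_gw && memcmp(&req->gw, &rt->gw, sizeof(req->gw)) != 0)
		return false;
	return true;
}

int main(void)
{
	struct model_cached_rt rt = { .ifindex = 2 };
	struct model_del_req req = { .ifindex = 3 };

	printf("match: %d\n", cached_rt_matches(&rt, &req));   /* 0: wrong device */
	return 0;
}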
3105static int ip6_route_del(struct fib6_config *cfg, 3260static int ip6_route_del(struct fib6_config *cfg,
3106 struct netlink_ext_ack *extack) 3261 struct netlink_ext_ack *extack)
3107{ 3262{
3108 struct rt6_info *rt, *rt_cache; 3263 struct rt6_info *rt_cache;
3109 struct fib6_table *table; 3264 struct fib6_table *table;
3265 struct fib6_info *rt;
3110 struct fib6_node *fn; 3266 struct fib6_node *fn;
3111 int err = -ESRCH; 3267 int err = -ESRCH;
3112 3268
@@ -3126,25 +3282,31 @@ static int ip6_route_del(struct fib6_config *cfg,
3126 if (fn) { 3282 if (fn) {
3127 for_each_fib6_node_rt_rcu(fn) { 3283 for_each_fib6_node_rt_rcu(fn) {
3128 if (cfg->fc_flags & RTF_CACHE) { 3284 if (cfg->fc_flags & RTF_CACHE) {
3285 int rc;
3286
3129 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 3287 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3130 &cfg->fc_src); 3288 &cfg->fc_src);
3131 if (!rt_cache) 3289 if (rt_cache) {
3132 continue; 3290 rc = ip6_del_cached_rt(rt_cache, cfg);
3133 rt = rt_cache; 3291 if (rc != -ESRCH) {
3292 rcu_read_unlock();
3293 return rc;
3294 }
3295 }
3296 continue;
3134 } 3297 }
3135 if (cfg->fc_ifindex && 3298 if (cfg->fc_ifindex &&
3136 (!rt->dst.dev || 3299 (!rt->fib6_nh.nh_dev ||
3137 rt->dst.dev->ifindex != cfg->fc_ifindex)) 3300 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3138 continue; 3301 continue;
3139 if (cfg->fc_flags & RTF_GATEWAY && 3302 if (cfg->fc_flags & RTF_GATEWAY &&
3140 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3303 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3141 continue; 3304 continue;
3142 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) 3305 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3143 continue; 3306 continue;
3144 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) 3307 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3145 continue; 3308 continue;
3146 if (!dst_hold_safe(&rt->dst)) 3309 fib6_info_hold(rt);
3147 break;
3148 rcu_read_unlock(); 3310 rcu_read_unlock();
3149 3311
3150 /* if gateway was specified only delete the one hop */ 3312 /* if gateway was specified only delete the one hop */
@@ -3166,6 +3328,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
3166 struct ndisc_options ndopts; 3328 struct ndisc_options ndopts;
3167 struct inet6_dev *in6_dev; 3329 struct inet6_dev *in6_dev;
3168 struct neighbour *neigh; 3330 struct neighbour *neigh;
3331 struct fib6_info *from;
3169 struct rd_msg *msg; 3332 struct rd_msg *msg;
3170 int optlen, on_link; 3333 int optlen, on_link;
3171 u8 *lladdr; 3334 u8 *lladdr;
@@ -3247,7 +3410,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
3247 NEIGH_UPDATE_F_ISROUTER)), 3410 NEIGH_UPDATE_F_ISROUTER)),
3248 NDISC_REDIRECT, &ndopts); 3411 NDISC_REDIRECT, &ndopts);
3249 3412
3250 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL); 3413 rcu_read_lock();
3414 from = rcu_dereference(rt->from);
3415 fib6_info_hold(from);
3416 rcu_read_unlock();
3417
3418 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3251 if (!nrt) 3419 if (!nrt)
3252 goto out; 3420 goto out;
3253 3421
@@ -3255,14 +3423,13 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
3255 if (on_link) 3423 if (on_link)
3256 nrt->rt6i_flags &= ~RTF_GATEWAY; 3424 nrt->rt6i_flags &= ~RTF_GATEWAY;
3257 3425
3258 nrt->rt6i_protocol = RTPROT_REDIRECT;
3259 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3426 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3260 3427
3261 /* No need to remove rt from the exception table if rt is 3428 /* No need to remove rt from the exception table if rt is
3262 * a cached route because rt6_insert_exception() will 3429 * a cached route because rt6_insert_exception() will
3263 * take care of it 3430
3264 */ 3431 */
3265 if (rt6_insert_exception(nrt, rt)) { 3432 if (rt6_insert_exception(nrt, from)) {
3266 dst_release_immediate(&nrt->dst); 3433 dst_release_immediate(&nrt->dst);
3267 goto out; 3434 goto out;
3268 } 3435 }
@@ -3274,47 +3441,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
3274 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3441 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3275 3442
3276out: 3443out:
3444 fib6_info_release(from);
3277 neigh_release(neigh); 3445 neigh_release(neigh);
3278} 3446}
3279 3447
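The redirect path above now dereferences rt->from under rcu_read_lock() and takes a fib6_info reference before unlocking, so the parent entry stays valid while the exception route is built. A deliberately sequential, non-RCU sketch of that "take a private reference before leaving the read-side section" pattern; the comments mark where the kernel's lock calls would sit, and the types are illustrative:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct parent {
	atomic_int refcnt;
	int        id;
};

static struct parent *parent_hold(struct parent *p)
{
	if (p)
		atomic_fetch_add(&p->refcnt, 1);
	return p;
}

static void parent_release(struct parent *p)
{
	if (p && atomic_fetch_sub(&p->refcnt, 1) == 1)
		free(p);
}

int main(void)
{
	struct parent *published = malloc(sizeof(*published));

	if (!published)
		return 1;
	atomic_init(&published->refcnt, 1);
	published->id = 42;

	/* --- read-side section begins (rcu_read_lock() in the kernel) --- */
	struct parent *from = parent_hold(published);
	/* --- read-side section ends (rcu_read_unlock()) --- */

	printf("using parent %d after leaving the read section\n", from->id);
	parent_release(from);        /* our reference */
	parent_release(published);   /* publisher's reference */
	return 0;
}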
3280/*
3281 * Misc support functions
3282 */
3283
3284static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3285{
3286 BUG_ON(from->from);
3287
3288 rt->rt6i_flags &= ~RTF_EXPIRES;
3289 dst_hold(&from->dst);
3290 rt->from = from;
3291 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3292}
3293
3294static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3295{
3296 rt->dst.input = ort->dst.input;
3297 rt->dst.output = ort->dst.output;
3298 rt->rt6i_dst = ort->rt6i_dst;
3299 rt->dst.error = ort->dst.error;
3300 rt->rt6i_idev = ort->rt6i_idev;
3301 if (rt->rt6i_idev)
3302 in6_dev_hold(rt->rt6i_idev);
3303 rt->dst.lastuse = jiffies;
3304 rt->rt6i_gateway = ort->rt6i_gateway;
3305 rt->rt6i_flags = ort->rt6i_flags;
3306 rt6_set_from(rt, ort);
3307 rt->rt6i_metric = ort->rt6i_metric;
3308#ifdef CONFIG_IPV6_SUBTREES
3309 rt->rt6i_src = ort->rt6i_src;
3310#endif
3311 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3312 rt->rt6i_table = ort->rt6i_table;
3313 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3314}
3315
3316#ifdef CONFIG_IPV6_ROUTE_INFO 3448#ifdef CONFIG_IPV6_ROUTE_INFO
3317static struct rt6_info *rt6_get_route_info(struct net *net, 3449static struct fib6_info *rt6_get_route_info(struct net *net,
3318 const struct in6_addr *prefix, int prefixlen, 3450 const struct in6_addr *prefix, int prefixlen,
3319 const struct in6_addr *gwaddr, 3451 const struct in6_addr *gwaddr,
3320 struct net_device *dev) 3452 struct net_device *dev)
@@ -3322,7 +3454,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
3322 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3454 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3323 int ifindex = dev->ifindex; 3455 int ifindex = dev->ifindex;
3324 struct fib6_node *fn; 3456 struct fib6_node *fn;
3325 struct rt6_info *rt = NULL; 3457 struct fib6_info *rt = NULL;
3326 struct fib6_table *table; 3458 struct fib6_table *table;
3327 3459
3328 table = fib6_get_table(net, tb_id); 3460 table = fib6_get_table(net, tb_id);
@@ -3335,13 +3467,13 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
3335 goto out; 3467 goto out;
3336 3468
3337 for_each_fib6_node_rt_rcu(fn) { 3469 for_each_fib6_node_rt_rcu(fn) {
3338 if (rt->dst.dev->ifindex != ifindex) 3470 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3339 continue; 3471 continue;
3340 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3472 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3341 continue; 3473 continue;
3342 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) 3474 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3343 continue; 3475 continue;
3344 ip6_hold_safe(NULL, &rt, false); 3476 fib6_info_hold(rt);
3345 break; 3477 break;
3346 } 3478 }
3347out: 3479out:
@@ -3349,7 +3481,7 @@ out:
3349 return rt; 3481 return rt;
3350} 3482}
3351 3483
3352static struct rt6_info *rt6_add_route_info(struct net *net, 3484static struct fib6_info *rt6_add_route_info(struct net *net,
3353 const struct in6_addr *prefix, int prefixlen, 3485 const struct in6_addr *prefix, int prefixlen,
3354 const struct in6_addr *gwaddr, 3486 const struct in6_addr *gwaddr,
3355 struct net_device *dev, 3487 struct net_device *dev,
@@ -3362,6 +3494,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
3362 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3494 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3363 RTF_UP | RTF_PREF(pref), 3495 RTF_UP | RTF_PREF(pref),
3364 .fc_protocol = RTPROT_RA, 3496 .fc_protocol = RTPROT_RA,
3497 .fc_type = RTN_UNICAST,
3365 .fc_nlinfo.portid = 0, 3498 .fc_nlinfo.portid = 0,
3366 .fc_nlinfo.nlh = NULL, 3499 .fc_nlinfo.nlh = NULL,
3367 .fc_nlinfo.nl_net = net, 3500 .fc_nlinfo.nl_net = net,
@@ -3375,36 +3508,39 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
3375 if (!prefixlen) 3508 if (!prefixlen)
3376 cfg.fc_flags |= RTF_DEFAULT; 3509 cfg.fc_flags |= RTF_DEFAULT;
3377 3510
3378 ip6_route_add(&cfg, NULL); 3511 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3379 3512
3380 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3513 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3381} 3514}
3382#endif 3515#endif
3383 3516
3384struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev) 3517struct fib6_info *rt6_get_dflt_router(struct net *net,
3518 const struct in6_addr *addr,
3519 struct net_device *dev)
3385{ 3520{
3386 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 3521 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3387 struct rt6_info *rt; 3522 struct fib6_info *rt;
3388 struct fib6_table *table; 3523 struct fib6_table *table;
3389 3524
3390 table = fib6_get_table(dev_net(dev), tb_id); 3525 table = fib6_get_table(net, tb_id);
3391 if (!table) 3526 if (!table)
3392 return NULL; 3527 return NULL;
3393 3528
3394 rcu_read_lock(); 3529 rcu_read_lock();
3395 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3530 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3396 if (dev == rt->dst.dev && 3531 if (dev == rt->fib6_nh.nh_dev &&
3397 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3532 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3398 ipv6_addr_equal(&rt->rt6i_gateway, addr)) 3533 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3399 break; 3534 break;
3400 } 3535 }
3401 if (rt) 3536 if (rt)
3402 ip6_hold_safe(NULL, &rt, false); 3537 fib6_info_hold(rt);
3403 rcu_read_unlock(); 3538 rcu_read_unlock();
3404 return rt; 3539 return rt;
3405} 3540}
3406 3541
3407struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, 3542struct fib6_info *rt6_add_dflt_router(struct net *net,
3543 const struct in6_addr *gwaddr,
3408 struct net_device *dev, 3544 struct net_device *dev,
3409 unsigned int pref) 3545 unsigned int pref)
3410{ 3546{
@@ -3415,14 +3551,15 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3415 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3551 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3416 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3552 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3417 .fc_protocol = RTPROT_RA, 3553 .fc_protocol = RTPROT_RA,
3554 .fc_type = RTN_UNICAST,
3418 .fc_nlinfo.portid = 0, 3555 .fc_nlinfo.portid = 0,
3419 .fc_nlinfo.nlh = NULL, 3556 .fc_nlinfo.nlh = NULL,
3420 .fc_nlinfo.nl_net = dev_net(dev), 3557 .fc_nlinfo.nl_net = net,
3421 }; 3558 };
3422 3559
3423 cfg.fc_gateway = *gwaddr; 3560 cfg.fc_gateway = *gwaddr;
3424 3561
3425 if (!ip6_route_add(&cfg, NULL)) { 3562 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3426 struct fib6_table *table; 3563 struct fib6_table *table;
3427 3564
3428 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3565 table = fib6_get_table(dev_net(dev), cfg.fc_table);
@@ -3430,24 +3567,25 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3430 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3567 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3431 } 3568 }
3432 3569
3433 return rt6_get_dflt_router(gwaddr, dev); 3570 return rt6_get_dflt_router(net, gwaddr, dev);
3434} 3571}
3435 3572
3436static void __rt6_purge_dflt_routers(struct fib6_table *table) 3573static void __rt6_purge_dflt_routers(struct net *net,
3574 struct fib6_table *table)
3437{ 3575{
3438 struct rt6_info *rt; 3576 struct fib6_info *rt;
3439 3577
3440restart: 3578restart:
3441 rcu_read_lock(); 3579 rcu_read_lock();
3442 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3580 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3443 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3581 struct net_device *dev = fib6_info_nh_dev(rt);
3444 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { 3582 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3445 if (dst_hold_safe(&rt->dst)) { 3583
3446 rcu_read_unlock(); 3584 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3447 ip6_del_rt(rt); 3585 (!idev || idev->cnf.accept_ra != 2)) {
3448 } else { 3586 fib6_info_hold(rt);
3449 rcu_read_unlock(); 3587 rcu_read_unlock();
3450 } 3588 ip6_del_rt(net, rt);
3451 goto restart; 3589 goto restart;
3452 } 3590 }
3453 } 3591 }
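__rt6_purge_dflt_routers() above drops the RCU read lock, deletes the matching route, and then restarts the walk from the table root, because the iteration state is stale once an entry has been removed. A stand-alone model of that delete-and-restart pattern over a plain singly linked list (no RCU; names are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct node {
	int          flags;
	struct node *next;
};

#define F_DEFAULT 0x1

static void purge_default(struct node **head)
{
restart:
	for (struct node **pp = head; *pp; pp = &(*pp)->next) {
		if ((*pp)->flags & F_DEFAULT) {
			struct node *victim = *pp;

			*pp = victim->next;
			free(victim);
			goto restart;   /* iteration state is stale after removal */
		}
	}
}

static struct node *node_new(int flags, struct node *next)
{
	struct node *n = malloc(sizeof(*n));

	if (n) {
		n->flags = flags;
		n->next = next;
	}
	return n;
}

int main(void)
{
	struct node *head = node_new(F_DEFAULT,
			    node_new(F_DEFAULT,
			    node_new(0, NULL)));

	if (!head)
		return 1;
	purge_default(&head);
	printf("remaining head flags: %d\n", head ? head->flags : -1);
	while (head) {
		struct node *next = head->next;

		free(head);
		head = next;
	}
	return 0;
}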
@@ -3468,7 +3606,7 @@ void rt6_purge_dflt_routers(struct net *net)
3468 head = &net->ipv6.fib_table_hash[h]; 3606 head = &net->ipv6.fib_table_hash[h];
3469 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3607 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3470 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3608 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3471 __rt6_purge_dflt_routers(table); 3609 __rt6_purge_dflt_routers(net, table);
3472 } 3610 }
3473 } 3611 }
3474 3612
@@ -3489,6 +3627,7 @@ static void rtmsg_to_fib6_config(struct net *net,
3489 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 3627 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3490 cfg->fc_src_len = rtmsg->rtmsg_src_len; 3628 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3491 cfg->fc_flags = rtmsg->rtmsg_flags; 3629 cfg->fc_flags = rtmsg->rtmsg_flags;
3630 cfg->fc_type = rtmsg->rtmsg_type;
3492 3631
3493 cfg->fc_nlinfo.nl_net = net; 3632 cfg->fc_nlinfo.nl_net = net;
3494 3633
@@ -3518,7 +3657,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3518 rtnl_lock(); 3657 rtnl_lock();
3519 switch (cmd) { 3658 switch (cmd) {
3520 case SIOCADDRT: 3659 case SIOCADDRT:
3521 err = ip6_route_add(&cfg, NULL); 3660 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3522 break; 3661 break;
3523 case SIOCDELRT: 3662 case SIOCDELRT:
3524 err = ip6_route_del(&cfg, NULL); 3663 err = ip6_route_del(&cfg, NULL);
@@ -3546,7 +3685,8 @@ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3546 case IPSTATS_MIB_INNOROUTES: 3685 case IPSTATS_MIB_INNOROUTES:
3547 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3686 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3548 if (type == IPV6_ADDR_ANY) { 3687 if (type == IPV6_ADDR_ANY) {
3549 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3688 IP6_INC_STATS(dev_net(dst->dev),
3689 __in6_dev_get_safely(skb->dev),
3550 IPSTATS_MIB_INADDRERRORS); 3690 IPSTATS_MIB_INADDRERRORS);
3551 break; 3691 break;
3552 } 3692 }
@@ -3587,40 +3727,40 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff
3587 * Allocate a dst for local (unicast / anycast) address. 3727 * Allocate a dst for local (unicast / anycast) address.
3588 */ 3728 */
3589 3729
3590struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, 3730struct fib6_info *addrconf_f6i_alloc(struct net *net,
3591 const struct in6_addr *addr, 3731 struct inet6_dev *idev,
3592 bool anycast) 3732 const struct in6_addr *addr,
3733 bool anycast, gfp_t gfp_flags)
3593{ 3734{
3594 u32 tb_id; 3735 u32 tb_id;
3595 struct net *net = dev_net(idev->dev);
3596 struct net_device *dev = idev->dev; 3736 struct net_device *dev = idev->dev;
3597 struct rt6_info *rt; 3737 struct fib6_info *f6i;
3598 3738
3599 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT); 3739 f6i = fib6_info_alloc(gfp_flags);
3600 if (!rt) 3740 if (!f6i)
3601 return ERR_PTR(-ENOMEM); 3741 return ERR_PTR(-ENOMEM);
3602 3742
3603 in6_dev_hold(idev); 3743 f6i->dst_nocount = true;
3604 3744 f6i->dst_host = true;
3605 rt->dst.flags |= DST_HOST; 3745 f6i->fib6_protocol = RTPROT_KERNEL;
3606 rt->dst.input = ip6_input; 3746 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3607 rt->dst.output = ip6_output; 3747 if (anycast) {
3608 rt->rt6i_idev = idev; 3748 f6i->fib6_type = RTN_ANYCAST;
3609 3749 f6i->fib6_flags |= RTF_ANYCAST;
3610 rt->rt6i_protocol = RTPROT_KERNEL; 3750 } else {
3611 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; 3751 f6i->fib6_type = RTN_LOCAL;
3612 if (anycast) 3752 f6i->fib6_flags |= RTF_LOCAL;
3613 rt->rt6i_flags |= RTF_ANYCAST; 3753 }
3614 else
3615 rt->rt6i_flags |= RTF_LOCAL;
3616 3754
3617 rt->rt6i_gateway = *addr; 3755 f6i->fib6_nh.nh_gw = *addr;
3618 rt->rt6i_dst.addr = *addr; 3756 dev_hold(dev);
3619 rt->rt6i_dst.plen = 128; 3757 f6i->fib6_nh.nh_dev = dev;
3758 f6i->fib6_dst.addr = *addr;
3759 f6i->fib6_dst.plen = 128;
3620 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 3760 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3621 rt->rt6i_table = fib6_get_table(net, tb_id); 3761 f6i->fib6_table = fib6_get_table(net, tb_id);
3622 3762
3623 return rt; 3763 return f6i;
3624} 3764}
3625 3765
3626/* remove deleted ip from prefsrc entries */ 3766/* remove deleted ip from prefsrc entries */
@@ -3630,18 +3770,18 @@ struct arg_dev_net_ip {
3630 struct in6_addr *addr; 3770 struct in6_addr *addr;
3631}; 3771};
3632 3772
3633static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) 3773static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3634{ 3774{
3635 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3775 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3636 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3776 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3637 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3777 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3638 3778
3639 if (((void *)rt->dst.dev == dev || !dev) && 3779 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3640 rt != net->ipv6.ip6_null_entry && 3780 rt != net->ipv6.fib6_null_entry &&
3641 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { 3781 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3642 spin_lock_bh(&rt6_exception_lock); 3782 spin_lock_bh(&rt6_exception_lock);
3643 /* remove prefsrc entry */ 3783 /* remove prefsrc entry */
3644 rt->rt6i_prefsrc.plen = 0; 3784 rt->fib6_prefsrc.plen = 0;
3645 /* need to update cache as well */ 3785 /* need to update cache as well */
3646 rt6_exceptions_remove_prefsrc(rt); 3786 rt6_exceptions_remove_prefsrc(rt);
3647 spin_unlock_bh(&rt6_exception_lock); 3787 spin_unlock_bh(&rt6_exception_lock);
@@ -3663,12 +3803,12 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3663#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3803#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3664 3804
3665/* Remove routers and update dst entries when gateway turns into host. */ 3805
3666static int fib6_clean_tohost(struct rt6_info *rt, void *arg) 3806static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3667{ 3807{
3668 struct in6_addr *gateway = (struct in6_addr *)arg; 3808 struct in6_addr *gateway = (struct in6_addr *)arg;
3669 3809
3670 if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3810 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3671 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { 3811 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3672 return -1; 3812 return -1;
3673 } 3813 }
3674 3814
@@ -3694,85 +3834,85 @@ struct arg_netdev_event {
3694 }; 3834 };
3695}; 3835};
3696 3836
3697static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt) 3837static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3698{ 3838{
3699 struct rt6_info *iter; 3839 struct fib6_info *iter;
3700 struct fib6_node *fn; 3840 struct fib6_node *fn;
3701 3841
3702 fn = rcu_dereference_protected(rt->rt6i_node, 3842 fn = rcu_dereference_protected(rt->fib6_node,
3703 lockdep_is_held(&rt->rt6i_table->tb6_lock)); 3843 lockdep_is_held(&rt->fib6_table->tb6_lock));
3704 iter = rcu_dereference_protected(fn->leaf, 3844 iter = rcu_dereference_protected(fn->leaf,
3705 lockdep_is_held(&rt->rt6i_table->tb6_lock)); 3845 lockdep_is_held(&rt->fib6_table->tb6_lock));
3706 while (iter) { 3846 while (iter) {
3707 if (iter->rt6i_metric == rt->rt6i_metric && 3847 if (iter->fib6_metric == rt->fib6_metric &&
3708 rt6_qualify_for_ecmp(iter)) 3848 iter->fib6_nsiblings)
3709 return iter; 3849 return iter;
3710 iter = rcu_dereference_protected(iter->rt6_next, 3850 iter = rcu_dereference_protected(iter->fib6_next,
3711 lockdep_is_held(&rt->rt6i_table->tb6_lock)); 3851 lockdep_is_held(&rt->fib6_table->tb6_lock));
3712 } 3852 }
3713 3853
3714 return NULL; 3854 return NULL;
3715} 3855}
3716 3856
3717static bool rt6_is_dead(const struct rt6_info *rt) 3857static bool rt6_is_dead(const struct fib6_info *rt)
3718{ 3858{
3719 if (rt->rt6i_nh_flags & RTNH_F_DEAD || 3859 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3720 (rt->rt6i_nh_flags & RTNH_F_LINKDOWN && 3860 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3721 rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) 3861 fib6_ignore_linkdown(rt)))
3722 return true; 3862 return true;
3723 3863
3724 return false; 3864 return false;
3725} 3865}
3726 3866
3727static int rt6_multipath_total_weight(const struct rt6_info *rt) 3867static int rt6_multipath_total_weight(const struct fib6_info *rt)
3728{ 3868{
3729 struct rt6_info *iter; 3869 struct fib6_info *iter;
3730 int total = 0; 3870 int total = 0;
3731 3871
3732 if (!rt6_is_dead(rt)) 3872 if (!rt6_is_dead(rt))
3733 total += rt->rt6i_nh_weight; 3873 total += rt->fib6_nh.nh_weight;
3734 3874
3735 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) { 3875 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3736 if (!rt6_is_dead(iter)) 3876 if (!rt6_is_dead(iter))
3737 total += iter->rt6i_nh_weight; 3877 total += iter->fib6_nh.nh_weight;
3738 } 3878 }
3739 3879
3740 return total; 3880 return total;
3741} 3881}
3742 3882
3743static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) 3883static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3744{ 3884{
3745 int upper_bound = -1; 3885 int upper_bound = -1;
3746 3886
3747 if (!rt6_is_dead(rt)) { 3887 if (!rt6_is_dead(rt)) {
3748 *weight += rt->rt6i_nh_weight; 3888 *weight += rt->fib6_nh.nh_weight;
3749 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 3889 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3750 total) - 1; 3890 total) - 1;
3751 } 3891 }
3752 atomic_set(&rt->rt6i_nh_upper_bound, upper_bound); 3892 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3753} 3893}
3754 3894
3755static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total) 3895static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3756{ 3896{
3757 struct rt6_info *iter; 3897 struct fib6_info *iter;
3758 int weight = 0; 3898 int weight = 0;
3759 3899
3760 rt6_upper_bound_set(rt, &weight, total); 3900 rt6_upper_bound_set(rt, &weight, total);
3761 3901
3762 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) 3902 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3763 rt6_upper_bound_set(iter, &weight, total); 3903 rt6_upper_bound_set(iter, &weight, total);
3764} 3904}
3765 3905
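rt6_upper_bound_set() above assigns each nexthop an upper bound in the 31-bit hash space proportional to its cumulative weight, via DIV_ROUND_CLOSEST_ULL((u64)weight << 31, total) - 1. A small stand-alone program that reproduces the arithmetic for a hypothetical three-nexthop group (the weights are illustrative, e.g. rtnh_hops + 1 per nexthop):

#include <inttypes.h>
#include <stdio.h>

static int64_t upper_bound(int cumulative_weight, int total_weight)
{
	/* round-to-closest of (cumulative << 31) / total, minus one,
	 * mirroring DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1 */
	uint64_t scaled = (uint64_t)cumulative_weight << 31;

	return (int64_t)((scaled + total_weight / 2) / total_weight) - 1;
}

int main(void)
{
	int weights[] = { 1, 2, 1 };
	int total = 4, cumulative = 0;

	for (unsigned int i = 0; i < sizeof(weights) / sizeof(weights[0]); i++) {
		cumulative += weights[i];
		printf("nexthop %u: upper bound %" PRId64 "\n",
		       i, upper_bound(cumulative, total));
	}
	return 0;
}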
3766void rt6_multipath_rebalance(struct rt6_info *rt) 3906void rt6_multipath_rebalance(struct fib6_info *rt)
3767{ 3907{
3768 struct rt6_info *first; 3908 struct fib6_info *first;
3769 int total; 3909 int total;
3770 3910
3771 /* In case the entire multipath route was marked for flushing, 3911 /* In case the entire multipath route was marked for flushing,
3772 * then there is no need to rebalance upon the removal of every 3912 * then there is no need to rebalance upon the removal of every
3773 * sibling route. 3913 * sibling route.
3774 */ 3914 */
3775 if (!rt->rt6i_nsiblings || rt->should_flush) 3915 if (!rt->fib6_nsiblings || rt->should_flush)
3776 return; 3916 return;
3777 3917
3778 /* During lookup routes are evaluated in order, so we need to 3918 /* During lookup routes are evaluated in order, so we need to
@@ -3787,14 +3927,14 @@ void rt6_multipath_rebalance(struct rt6_info *rt)
3787 rt6_multipath_upper_bound_set(first, total); 3927 rt6_multipath_upper_bound_set(first, total);
3788} 3928}
3789 3929
3790static int fib6_ifup(struct rt6_info *rt, void *p_arg) 3930static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3791{ 3931{
3792 const struct arg_netdev_event *arg = p_arg; 3932 const struct arg_netdev_event *arg = p_arg;
3793 const struct net *net = dev_net(arg->dev); 3933 struct net *net = dev_net(arg->dev);
3794 3934
3795 if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) { 3935 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3796 rt->rt6i_nh_flags &= ~arg->nh_flags; 3936 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3797 fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt); 3937 fib6_update_sernum_upto_root(net, rt);
3798 rt6_multipath_rebalance(rt); 3938 rt6_multipath_rebalance(rt);
3799 } 3939 }
3800 3940
@@ -3816,95 +3956,96 @@ void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3816 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 3956 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3817} 3957}
3818 3958
3819static bool rt6_multipath_uses_dev(const struct rt6_info *rt, 3959static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3820 const struct net_device *dev) 3960 const struct net_device *dev)
3821{ 3961{
3822 struct rt6_info *iter; 3962 struct fib6_info *iter;
3823 3963
3824 if (rt->dst.dev == dev) 3964 if (rt->fib6_nh.nh_dev == dev)
3825 return true; 3965 return true;
3826 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) 3966 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3827 if (iter->dst.dev == dev) 3967 if (iter->fib6_nh.nh_dev == dev)
3828 return true; 3968 return true;
3829 3969
3830 return false; 3970 return false;
3831} 3971}
3832 3972
3833static void rt6_multipath_flush(struct rt6_info *rt) 3973static void rt6_multipath_flush(struct fib6_info *rt)
3834{ 3974{
3835 struct rt6_info *iter; 3975 struct fib6_info *iter;
3836 3976
3837 rt->should_flush = 1; 3977 rt->should_flush = 1;
3838 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) 3978 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3839 iter->should_flush = 1; 3979 iter->should_flush = 1;
3840} 3980}
3841 3981
3842static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt, 3982static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3843 const struct net_device *down_dev) 3983 const struct net_device *down_dev)
3844{ 3984{
3845 struct rt6_info *iter; 3985 struct fib6_info *iter;
3846 unsigned int dead = 0; 3986 unsigned int dead = 0;
3847 3987
3848 if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD) 3988 if (rt->fib6_nh.nh_dev == down_dev ||
3989 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3849 dead++; 3990 dead++;
3850 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) 3991 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3851 if (iter->dst.dev == down_dev || 3992 if (iter->fib6_nh.nh_dev == down_dev ||
3852 iter->rt6i_nh_flags & RTNH_F_DEAD) 3993 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3853 dead++; 3994 dead++;
3854 3995
3855 return dead; 3996 return dead;
3856} 3997}
3857 3998
3858static void rt6_multipath_nh_flags_set(struct rt6_info *rt, 3999static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3859 const struct net_device *dev, 4000 const struct net_device *dev,
3860 unsigned int nh_flags) 4001 unsigned int nh_flags)
3861{ 4002{
3862 struct rt6_info *iter; 4003 struct fib6_info *iter;
3863 4004
3864 if (rt->dst.dev == dev) 4005 if (rt->fib6_nh.nh_dev == dev)
3865 rt->rt6i_nh_flags |= nh_flags; 4006 rt->fib6_nh.nh_flags |= nh_flags;
3866 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) 4007 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3867 if (iter->dst.dev == dev) 4008 if (iter->fib6_nh.nh_dev == dev)
3868 iter->rt6i_nh_flags |= nh_flags; 4009 iter->fib6_nh.nh_flags |= nh_flags;
3869} 4010}
3870 4011
3871/* called with write lock held for table with rt */ 4012/* called with write lock held for table with rt */
3872static int fib6_ifdown(struct rt6_info *rt, void *p_arg) 4013static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3873{ 4014{
3874 const struct arg_netdev_event *arg = p_arg; 4015 const struct arg_netdev_event *arg = p_arg;
3875 const struct net_device *dev = arg->dev; 4016 const struct net_device *dev = arg->dev;
3876 const struct net *net = dev_net(dev); 4017 struct net *net = dev_net(dev);
3877 4018
3878 if (rt == net->ipv6.ip6_null_entry) 4019 if (rt == net->ipv6.fib6_null_entry)
3879 return 0; 4020 return 0;
3880 4021
3881 switch (arg->event) { 4022 switch (arg->event) {
3882 case NETDEV_UNREGISTER: 4023 case NETDEV_UNREGISTER:
3883 return rt->dst.dev == dev ? -1 : 0; 4024 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3884 case NETDEV_DOWN: 4025 case NETDEV_DOWN:
3885 if (rt->should_flush) 4026 if (rt->should_flush)
3886 return -1; 4027 return -1;
3887 if (!rt->rt6i_nsiblings) 4028 if (!rt->fib6_nsiblings)
3888 return rt->dst.dev == dev ? -1 : 0; 4029 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3889 if (rt6_multipath_uses_dev(rt, dev)) { 4030 if (rt6_multipath_uses_dev(rt, dev)) {
3890 unsigned int count; 4031 unsigned int count;
3891 4032
3892 count = rt6_multipath_dead_count(rt, dev); 4033 count = rt6_multipath_dead_count(rt, dev);
3893 if (rt->rt6i_nsiblings + 1 == count) { 4034 if (rt->fib6_nsiblings + 1 == count) {
3894 rt6_multipath_flush(rt); 4035 rt6_multipath_flush(rt);
3895 return -1; 4036 return -1;
3896 } 4037 }
3897 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4038 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3898 RTNH_F_LINKDOWN); 4039 RTNH_F_LINKDOWN);
3899 fib6_update_sernum(rt); 4040 fib6_update_sernum(net, rt);
3900 rt6_multipath_rebalance(rt); 4041 rt6_multipath_rebalance(rt);
3901 } 4042 }
3902 return -2; 4043 return -2;
3903 case NETDEV_CHANGE: 4044 case NETDEV_CHANGE:
3904 if (rt->dst.dev != dev || 4045 if (rt->fib6_nh.nh_dev != dev ||
3905 rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) 4046 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
3906 break; 4047 break;
3907 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; 4048 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3908 rt6_multipath_rebalance(rt); 4049 rt6_multipath_rebalance(rt);
3909 break; 4050 break;
3910 } 4051 }
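For NETDEV_DOWN the code above counts how many nexthops of the multipath group are dead once the device goes away: if all of them are, the whole group is flushed, otherwise only the affected nexthops are marked dead/linkdown and the group is rebalanced. A stand-alone sketch of that decision, simplified to a flat array; names are illustrative:

#include <stdbool.h>
#include <stdio.h>

struct model_nh {
	int  dev;
	bool dead;
};

static void model_netdev_down(struct model_nh *nhs, int n, int down_dev)
{
	int dead = 0, marked = 0;

	for (int i = 0; i < n; i++)
		if (nhs[i].dev == down_dev || nhs[i].dead)
			dead++;

	if (dead == n) {
		printf("all %d nexthops dead: flush the whole route\n", n);
		return;
	}
	for (int i = 0; i < n; i++)
		if (nhs[i].dev == down_dev && !nhs[i].dead) {
			nhs[i].dead = true;
			marked++;
		}
	printf("%d nexthop(s) marked dead, route kept\n", marked);
}

int main(void)
{
	struct model_nh nhs[] = { { .dev = 1 }, { .dev = 2 }, { .dev = 1 } };

	model_netdev_down(nhs, 3, 1);
	return 0;
}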
@@ -3936,7 +4077,7 @@ struct rt6_mtu_change_arg {
3936 unsigned int mtu; 4077 unsigned int mtu;
3937}; 4078};
3938 4079
3939static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) 4080static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
3940{ 4081{
3941 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4082 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3942 struct inet6_dev *idev; 4083 struct inet6_dev *idev;
@@ -3956,12 +4097,15 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3956 Since RFC 1981 doesn't include administrative MTU increase, 4097
3957 updating PMTU on increase is a MUST (i.e. jumbo frame). 4098
3958 */ 4099 */
3959 if (rt->dst.dev == arg->dev && 4100 if (rt->fib6_nh.nh_dev == arg->dev &&
3960 !dst_metric_locked(&rt->dst, RTAX_MTU)) { 4101 !fib6_metric_locked(rt, RTAX_MTU)) {
4102 u32 mtu = rt->fib6_pmtu;
4103
4104 if (mtu >= arg->mtu ||
4105 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4106 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4107
3961 spin_lock_bh(&rt6_exception_lock); 4108 spin_lock_bh(&rt6_exception_lock);
3962 if (dst_metric_raw(&rt->dst, RTAX_MTU) &&
3963 rt6_mtu_change_route_allowed(idev, rt, arg->mtu))
3964 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3965 rt6_exceptions_update_pmtu(idev, rt, arg->mtu); 4109 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
3966 spin_unlock_bh(&rt6_exception_lock); 4110 spin_unlock_bh(&rt6_exception_lock);
3967 } 4111 }
@@ -3993,6 +4137,9 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3993 [RTA_UID] = { .type = NLA_U32 }, 4137 [RTA_UID] = { .type = NLA_U32 },
3994 [RTA_MARK] = { .type = NLA_U32 }, 4138 [RTA_MARK] = { .type = NLA_U32 },
3995 [RTA_TABLE] = { .type = NLA_U32 }, 4139 [RTA_TABLE] = { .type = NLA_U32 },
4140 [RTA_IP_PROTO] = { .type = NLA_U8 },
4141 [RTA_SPORT] = { .type = NLA_U16 },
4142 [RTA_DPORT] = { .type = NLA_U16 },
3996}; 4143};
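The policy table above gains entries for RTA_IP_PROTO (u8) and RTA_SPORT/RTA_DPORT (u16), so attributes with the wrong payload size can be rejected before the handlers look at them. A minimal stand-alone model of such a size-based attribute policy check; the attribute numbers and the acceptance rule are illustrative, not netlink's exact semantics:

#include <stdint.h>
#include <stdio.h>

enum { MODEL_ATTR_IP_PROTO = 1, MODEL_ATTR_SPORT, MODEL_ATTR_DPORT, MODEL_ATTR_MAX };

static const size_t policy_len[MODEL_ATTR_MAX + 1] = {
	[MODEL_ATTR_IP_PROTO] = sizeof(uint8_t),
	[MODEL_ATTR_SPORT]    = sizeof(uint16_t),
	[MODEL_ATTR_DPORT]    = sizeof(uint16_t),
};

/* reject unknown attribute types and payloads shorter than the policy asks for */
static int model_validate(int type, size_t payload_len)
{
	if (type <= 0 || type > MODEL_ATTR_MAX || !policy_len[type])
		return -1;
	return payload_len >= policy_len[type] ? 0 : -1;
}

int main(void)
{
	printf("sport u16: %d\n", model_validate(MODEL_ATTR_SPORT, 2));   /* ok */
	printf("sport u8 : %d\n", model_validate(MODEL_ATTR_SPORT, 1));   /* rejected */
	return 0;
}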
3997 4144
3998static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4145static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -4122,9 +4269,8 @@ errout:
4122} 4269}
4123 4270
4124struct rt6_nh { 4271struct rt6_nh {
4125 struct rt6_info *rt6_info; 4272 struct fib6_info *fib6_info;
4126 struct fib6_config r_cfg; 4273 struct fib6_config r_cfg;
4127 struct mx6_config mxc;
4128 struct list_head next; 4274 struct list_head next;
4129}; 4275};
4130 4276
@@ -4139,23 +4285,25 @@ static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4139 } 4285 }
4140} 4286}
4141 4287
4142static int ip6_route_info_append(struct list_head *rt6_nh_list, 4288static int ip6_route_info_append(struct net *net,
4143 struct rt6_info *rt, struct fib6_config *r_cfg) 4289 struct list_head *rt6_nh_list,
4290 struct fib6_info *rt,
4291 struct fib6_config *r_cfg)
4144{ 4292{
4145 struct rt6_nh *nh; 4293 struct rt6_nh *nh;
4146 int err = -EEXIST; 4294 int err = -EEXIST;
4147 4295
4148 list_for_each_entry(nh, rt6_nh_list, next) { 4296 list_for_each_entry(nh, rt6_nh_list, next) {
4149 /* check if rt6_info already exists */ 4297 /* check if fib6_info already exists */
4150 if (rt6_duplicate_nexthop(nh->rt6_info, rt)) 4298 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4151 return err; 4299 return err;
4152 } 4300 }
4153 4301
4154 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4302 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4155 if (!nh) 4303 if (!nh)
4156 return -ENOMEM; 4304 return -ENOMEM;
4157 nh->rt6_info = rt; 4305 nh->fib6_info = rt;
4158 err = ip6_convert_metrics(&nh->mxc, r_cfg); 4306 err = ip6_convert_metrics(net, rt, r_cfg);
4159 if (err) { 4307 if (err) {
4160 kfree(nh); 4308 kfree(nh);
4161 return err; 4309 return err;
@@ -4166,8 +4314,8 @@ static int ip6_route_info_append(struct list_head *rt6_nh_list,
4166 return 0; 4314 return 0;
4167} 4315}
4168 4316
4169static void ip6_route_mpath_notify(struct rt6_info *rt, 4317static void ip6_route_mpath_notify(struct fib6_info *rt,
4170 struct rt6_info *rt_last, 4318 struct fib6_info *rt_last,
4171 struct nl_info *info, 4319 struct nl_info *info,
4172 __u16 nlflags) 4320 __u16 nlflags)
4173{ 4321{
@@ -4177,10 +4325,10 @@ static void ip6_route_mpath_notify(struct rt6_info *rt,
4177 * nexthop. Since sibling routes are always added at the end of 4325 * nexthop. Since sibling routes are always added at the end of
4178 * the list, find the first sibling of the last route appended 4326 * the list, find the first sibling of the last route appended
4179 */ 4327 */
4180 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) { 4328 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4181 rt = list_first_entry(&rt_last->rt6i_siblings, 4329 rt = list_first_entry(&rt_last->fib6_siblings,
4182 struct rt6_info, 4330 struct fib6_info,
4183 rt6i_siblings); 4331 fib6_siblings);
4184 } 4332 }
4185 4333
4186 if (rt) 4334 if (rt)
@@ -4190,11 +4338,11 @@ static void ip6_route_mpath_notify(struct rt6_info *rt,
4190static int ip6_route_multipath_add(struct fib6_config *cfg, 4338static int ip6_route_multipath_add(struct fib6_config *cfg,
4191 struct netlink_ext_ack *extack) 4339 struct netlink_ext_ack *extack)
4192{ 4340{
4193 struct rt6_info *rt_notif = NULL, *rt_last = NULL; 4341 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4194 struct nl_info *info = &cfg->fc_nlinfo; 4342 struct nl_info *info = &cfg->fc_nlinfo;
4195 struct fib6_config r_cfg; 4343 struct fib6_config r_cfg;
4196 struct rtnexthop *rtnh; 4344 struct rtnexthop *rtnh;
4197 struct rt6_info *rt; 4345 struct fib6_info *rt;
4198 struct rt6_nh *err_nh; 4346 struct rt6_nh *err_nh;
4199 struct rt6_nh *nh, *nh_safe; 4347 struct rt6_nh *nh, *nh_safe;
4200 __u16 nlflags; 4348 __u16 nlflags;
@@ -4214,7 +4362,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
4214 rtnh = (struct rtnexthop *)cfg->fc_mp; 4362 rtnh = (struct rtnexthop *)cfg->fc_mp;
4215 4363
4216 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 4364 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4217 * rt6_info structs per nexthop 4365 * fib6_info structs per nexthop
4218 */ 4366 */
4219 while (rtnh_ok(rtnh, remaining)) { 4367 while (rtnh_ok(rtnh, remaining)) {
4220 memcpy(&r_cfg, cfg, sizeof(*cfg)); 4368 memcpy(&r_cfg, cfg, sizeof(*cfg));
@@ -4237,18 +4385,19 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
4237 } 4385 }
4238 4386
4239 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 4387 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4240 rt = ip6_route_info_create(&r_cfg, extack); 4388 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4241 if (IS_ERR(rt)) { 4389 if (IS_ERR(rt)) {
4242 err = PTR_ERR(rt); 4390 err = PTR_ERR(rt);
4243 rt = NULL; 4391 rt = NULL;
4244 goto cleanup; 4392 goto cleanup;
4245 } 4393 }
4246 4394
4247 rt->rt6i_nh_weight = rtnh->rtnh_hops + 1; 4395 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4248 4396
4249 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); 4397 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4398 rt, &r_cfg);
4250 if (err) { 4399 if (err) {
4251 dst_release_immediate(&rt->dst); 4400 fib6_info_release(rt);
4252 goto cleanup; 4401 goto cleanup;
4253 } 4402 }
4254 4403
@@ -4263,14 +4412,20 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
4263 4412
4264 err_nh = NULL; 4413 err_nh = NULL;
4265 list_for_each_entry(nh, &rt6_nh_list, next) { 4414 list_for_each_entry(nh, &rt6_nh_list, next) {
4266 rt_last = nh->rt6_info; 4415 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4267 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack); 4416 fib6_info_release(nh->fib6_info);
4268 /* save reference to first route for notification */ 4417
4269 if (!rt_notif && !err) 4418 if (!err) {
4270 rt_notif = nh->rt6_info; 4419 /* save reference to last route successfully inserted */
4271 4420 rt_last = nh->fib6_info;
4272 /* nh->rt6_info is used or freed at this point, reset to NULL*/ 4421
4273 nh->rt6_info = NULL; 4422 /* save reference to first route for notification */
4423 if (!rt_notif)
4424 rt_notif = nh->fib6_info;
4425 }
4426
4427 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4428 nh->fib6_info = NULL;
4274 if (err) { 4429 if (err) {
4275 if (replace && nhn) 4430 if (replace && nhn)
4276 ip6_print_replace_route_err(&rt6_nh_list); 4431 ip6_print_replace_route_err(&rt6_nh_list);
@@ -4287,6 +4442,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
4287 */ 4442 */
4288 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4443 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4289 NLM_F_REPLACE); 4444 NLM_F_REPLACE);
4445 cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND;
4290 nhn++; 4446 nhn++;
4291 } 4447 }
4292 4448
@@ -4311,9 +4467,8 @@ add_errout:
4311 4467
4312cleanup: 4468cleanup:
4313 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 4469 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4314 if (nh->rt6_info) 4470 if (nh->fib6_info)
4315 dst_release_immediate(&nh->rt6_info->dst); 4471 fib6_info_release(nh->fib6_info);
4316 kfree(nh->mxc.mx);
4317 list_del(&nh->next); 4472 list_del(&nh->next);
4318 kfree(nh); 4473 kfree(nh);
4319 } 4474 }
@@ -4390,20 +4545,20 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4390 if (cfg.fc_mp) 4545 if (cfg.fc_mp)
4391 return ip6_route_multipath_add(&cfg, extack); 4546 return ip6_route_multipath_add(&cfg, extack);
4392 else 4547 else
4393 return ip6_route_add(&cfg, extack); 4548 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4394} 4549}
4395 4550
4396static size_t rt6_nlmsg_size(struct rt6_info *rt) 4551static size_t rt6_nlmsg_size(struct fib6_info *rt)
4397{ 4552{
4398 int nexthop_len = 0; 4553 int nexthop_len = 0;
4399 4554
4400 if (rt->rt6i_nsiblings) { 4555 if (rt->fib6_nsiblings) {
4401 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 4556 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4402 + NLA_ALIGN(sizeof(struct rtnexthop)) 4557 + NLA_ALIGN(sizeof(struct rtnexthop))
4403 + nla_total_size(16) /* RTA_GATEWAY */ 4558 + nla_total_size(16) /* RTA_GATEWAY */
4404 + lwtunnel_get_encap_size(rt->dst.lwtstate); 4559 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4405 4560
4406 nexthop_len *= rt->rt6i_nsiblings; 4561 nexthop_len *= rt->fib6_nsiblings;
4407 } 4562 }
4408 4563
4409 return NLMSG_ALIGN(sizeof(struct rtmsg)) 4564 return NLMSG_ALIGN(sizeof(struct rtmsg))
@@ -4419,38 +4574,41 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt)
4419 + nla_total_size(sizeof(struct rta_cacheinfo)) 4574 + nla_total_size(sizeof(struct rta_cacheinfo))
4420 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 4575 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4421 + nla_total_size(1) /* RTA_PREF */ 4576 + nla_total_size(1) /* RTA_PREF */
4422 + lwtunnel_get_encap_size(rt->dst.lwtstate) 4577 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4423 + nexthop_len; 4578 + nexthop_len;
4424} 4579}
4425 4580
4426static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, 4581static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4427 unsigned int *flags, bool skip_oif) 4582 unsigned int *flags, bool skip_oif)
4428{ 4583{
4429 if (rt->rt6i_nh_flags & RTNH_F_DEAD) 4584 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4430 *flags |= RTNH_F_DEAD; 4585 *flags |= RTNH_F_DEAD;
4431 4586
4432 if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) { 4587 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4433 *flags |= RTNH_F_LINKDOWN; 4588 *flags |= RTNH_F_LINKDOWN;
4434 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) 4589
4590 rcu_read_lock();
4591 if (fib6_ignore_linkdown(rt))
4435 *flags |= RTNH_F_DEAD; 4592 *flags |= RTNH_F_DEAD;
4593 rcu_read_unlock();
4436 } 4594 }
4437 4595
4438 if (rt->rt6i_flags & RTF_GATEWAY) { 4596 if (rt->fib6_flags & RTF_GATEWAY) {
4439 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0) 4597 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4440 goto nla_put_failure; 4598 goto nla_put_failure;
4441 } 4599 }
4442 4600
4443 *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK); 4601 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4444 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD) 4602 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4445 *flags |= RTNH_F_OFFLOAD; 4603 *flags |= RTNH_F_OFFLOAD;
4446 4604
4447 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 4605 /* not needed for multipath encoding b/c it has a rtnexthop struct */
4448 if (!skip_oif && rt->dst.dev && 4606 if (!skip_oif && rt->fib6_nh.nh_dev &&
4449 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) 4607 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4450 goto nla_put_failure; 4608 goto nla_put_failure;
4451 4609
4452 if (rt->dst.lwtstate && 4610 if (rt->fib6_nh.nh_lwtstate &&
4453 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0) 4611 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4454 goto nla_put_failure; 4612 goto nla_put_failure;
4455 4613
4456 return 0; 4614 return 0;
@@ -4460,8 +4618,9 @@ nla_put_failure:
4460} 4618}
4461 4619
4462/* add multipath next hop */ 4620/* add multipath next hop */
4463static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) 4621static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4464{ 4622{
4623 const struct net_device *dev = rt->fib6_nh.nh_dev;
4465 struct rtnexthop *rtnh; 4624 struct rtnexthop *rtnh;
4466 unsigned int flags = 0; 4625 unsigned int flags = 0;
4467 4626
@@ -4469,8 +4628,8 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4469 if (!rtnh) 4628 if (!rtnh)
4470 goto nla_put_failure; 4629 goto nla_put_failure;
4471 4630
4472 rtnh->rtnh_hops = rt->rt6i_nh_weight - 1; 4631 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4473 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0; 4632 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4474 4633
4475 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 4634 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4476 goto nla_put_failure; 4635 goto nla_put_failure;
@@ -4486,16 +4645,16 @@ nla_put_failure:
4486 return -EMSGSIZE; 4645 return -EMSGSIZE;
4487} 4646}
4488 4647
4489static int rt6_fill_node(struct net *net, 4648static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4490 struct sk_buff *skb, struct rt6_info *rt, 4649 struct fib6_info *rt, struct dst_entry *dst,
4491 struct in6_addr *dst, struct in6_addr *src, 4650 struct in6_addr *dest, struct in6_addr *src,
4492 int iif, int type, u32 portid, u32 seq, 4651 int iif, int type, u32 portid, u32 seq,
4493 unsigned int flags) 4652 unsigned int flags)
4494{ 4653{
4495 u32 metrics[RTAX_MAX];
4496 struct rtmsg *rtm; 4654 struct rtmsg *rtm;
4497 struct nlmsghdr *nlh; 4655 struct nlmsghdr *nlh;
4498 long expires; 4656 long expires = 0;
4657 u32 *pmetrics;
4499 u32 table; 4658 u32 table;
4500 4659
4501 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4660 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
@@ -4504,53 +4663,31 @@ static int rt6_fill_node(struct net *net,
4504 4663
4505 rtm = nlmsg_data(nlh); 4664 rtm = nlmsg_data(nlh);
4506 rtm->rtm_family = AF_INET6; 4665 rtm->rtm_family = AF_INET6;
4507 rtm->rtm_dst_len = rt->rt6i_dst.plen; 4666 rtm->rtm_dst_len = rt->fib6_dst.plen;
4508 rtm->rtm_src_len = rt->rt6i_src.plen; 4667 rtm->rtm_src_len = rt->fib6_src.plen;
4509 rtm->rtm_tos = 0; 4668 rtm->rtm_tos = 0;
4510 if (rt->rt6i_table) 4669 if (rt->fib6_table)
4511 table = rt->rt6i_table->tb6_id; 4670 table = rt->fib6_table->tb6_id;
4512 else 4671 else
4513 table = RT6_TABLE_UNSPEC; 4672 table = RT6_TABLE_UNSPEC;
4514 rtm->rtm_table = table; 4673 rtm->rtm_table = table;
4515 if (nla_put_u32(skb, RTA_TABLE, table)) 4674 if (nla_put_u32(skb, RTA_TABLE, table))
4516 goto nla_put_failure; 4675 goto nla_put_failure;
4517 if (rt->rt6i_flags & RTF_REJECT) { 4676
4518 switch (rt->dst.error) { 4677 rtm->rtm_type = rt->fib6_type;
4519 case -EINVAL:
4520 rtm->rtm_type = RTN_BLACKHOLE;
4521 break;
4522 case -EACCES:
4523 rtm->rtm_type = RTN_PROHIBIT;
4524 break;
4525 case -EAGAIN:
4526 rtm->rtm_type = RTN_THROW;
4527 break;
4528 default:
4529 rtm->rtm_type = RTN_UNREACHABLE;
4530 break;
4531 }
4532 }
4533 else if (rt->rt6i_flags & RTF_LOCAL)
4534 rtm->rtm_type = RTN_LOCAL;
4535 else if (rt->rt6i_flags & RTF_ANYCAST)
4536 rtm->rtm_type = RTN_ANYCAST;
4537 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4538 rtm->rtm_type = RTN_LOCAL;
4539 else
4540 rtm->rtm_type = RTN_UNICAST;
4541 rtm->rtm_flags = 0; 4678 rtm->rtm_flags = 0;
4542 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4679 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4543 rtm->rtm_protocol = rt->rt6i_protocol; 4680 rtm->rtm_protocol = rt->fib6_protocol;
4544 4681
4545 if (rt->rt6i_flags & RTF_CACHE) 4682 if (rt->fib6_flags & RTF_CACHE)
4546 rtm->rtm_flags |= RTM_F_CLONED; 4683 rtm->rtm_flags |= RTM_F_CLONED;
4547 4684
4548 if (dst) { 4685 if (dest) {
4549 if (nla_put_in6_addr(skb, RTA_DST, dst)) 4686 if (nla_put_in6_addr(skb, RTA_DST, dest))
4550 goto nla_put_failure; 4687 goto nla_put_failure;
4551 rtm->rtm_dst_len = 128; 4688 rtm->rtm_dst_len = 128;
4552 } else if (rtm->rtm_dst_len) 4689 } else if (rtm->rtm_dst_len)
4553 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr)) 4690 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
4554 goto nla_put_failure; 4691 goto nla_put_failure;
4555#ifdef CONFIG_IPV6_SUBTREES 4692#ifdef CONFIG_IPV6_SUBTREES
4556 if (src) { 4693 if (src) {
@@ -4558,12 +4695,12 @@ static int rt6_fill_node(struct net *net,
4558 goto nla_put_failure; 4695 goto nla_put_failure;
4559 rtm->rtm_src_len = 128; 4696 rtm->rtm_src_len = 128;
4560 } else if (rtm->rtm_src_len && 4697 } else if (rtm->rtm_src_len &&
4561 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr)) 4698 nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
4562 goto nla_put_failure; 4699 goto nla_put_failure;
4563#endif 4700#endif
4564 if (iif) { 4701 if (iif) {
4565#ifdef CONFIG_IPV6_MROUTE 4702#ifdef CONFIG_IPV6_MROUTE
4566 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { 4703 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
4567 int err = ip6mr_get_route(net, skb, rtm, portid); 4704 int err = ip6mr_get_route(net, skb, rtm, portid);
4568 4705
4569 if (err == 0) 4706 if (err == 0)
@@ -4574,34 +4711,32 @@ static int rt6_fill_node(struct net *net,
4574#endif 4711#endif
4575 if (nla_put_u32(skb, RTA_IIF, iif)) 4712 if (nla_put_u32(skb, RTA_IIF, iif))
4576 goto nla_put_failure; 4713 goto nla_put_failure;
4577 } else if (dst) { 4714 } else if (dest) {
4578 struct in6_addr saddr_buf; 4715 struct in6_addr saddr_buf;
4579 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 && 4716 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4580 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4717 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4581 goto nla_put_failure; 4718 goto nla_put_failure;
4582 } 4719 }
4583 4720
4584 if (rt->rt6i_prefsrc.plen) { 4721 if (rt->fib6_prefsrc.plen) {
4585 struct in6_addr saddr_buf; 4722 struct in6_addr saddr_buf;
4586 saddr_buf = rt->rt6i_prefsrc.addr; 4723 saddr_buf = rt->fib6_prefsrc.addr;
4587 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4724 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4588 goto nla_put_failure; 4725 goto nla_put_failure;
4589 } 4726 }
4590 4727
4591 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); 4728 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4592 if (rt->rt6i_pmtu) 4729 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4593 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4594 if (rtnetlink_put_metrics(skb, metrics) < 0)
4595 goto nla_put_failure; 4730 goto nla_put_failure;
4596 4731
4597 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) 4732 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4598 goto nla_put_failure; 4733 goto nla_put_failure;
4599 4734
4600 /* For multipath routes, walk the siblings list and add 4735 /* For multipath routes, walk the siblings list and add
4601 * each as a nexthop within RTA_MULTIPATH. 4736 * each as a nexthop within RTA_MULTIPATH.
4602 */ 4737 */
4603 if (rt->rt6i_nsiblings) { 4738 if (rt->fib6_nsiblings) {
4604 struct rt6_info *sibling, *next_sibling; 4739 struct fib6_info *sibling, *next_sibling;
4605 struct nlattr *mp; 4740 struct nlattr *mp;
4606 4741
4607 mp = nla_nest_start(skb, RTA_MULTIPATH); 4742 mp = nla_nest_start(skb, RTA_MULTIPATH);
@@ -4612,7 +4747,7 @@ static int rt6_fill_node(struct net *net,
4612 goto nla_put_failure; 4747 goto nla_put_failure;
4613 4748
4614 list_for_each_entry_safe(sibling, next_sibling, 4749 list_for_each_entry_safe(sibling, next_sibling,
4615 &rt->rt6i_siblings, rt6i_siblings) { 4750 &rt->fib6_siblings, fib6_siblings) {
4616 if (rt6_add_nexthop(skb, sibling) < 0) 4751 if (rt6_add_nexthop(skb, sibling) < 0)
4617 goto nla_put_failure; 4752 goto nla_put_failure;
4618 } 4753 }
@@ -4623,12 +4758,15 @@ static int rt6_fill_node(struct net *net,
4623 goto nla_put_failure; 4758 goto nla_put_failure;
4624 } 4759 }
4625 4760
4626 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0; 4761 if (rt->fib6_flags & RTF_EXPIRES) {
4762 expires = dst ? dst->expires : rt->expires;
4763 expires -= jiffies;
4764 }
4627 4765
4628 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) 4766 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4629 goto nla_put_failure; 4767 goto nla_put_failure;
4630 4768
4631 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) 4769 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
4632 goto nla_put_failure; 4770 goto nla_put_failure;
4633 4771
4634 4772
@@ -4640,12 +4778,12 @@ nla_put_failure:
4640 return -EMSGSIZE; 4778 return -EMSGSIZE;
4641} 4779}
4642 4780
4643int rt6_dump_route(struct rt6_info *rt, void *p_arg) 4781int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4644{ 4782{
4645 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4783 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4646 struct net *net = arg->net; 4784 struct net *net = arg->net;
4647 4785
4648 if (rt == net->ipv6.ip6_null_entry) 4786 if (rt == net->ipv6.fib6_null_entry)
4649 return 0; 4787 return 0;
4650 4788
4651 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 4789 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
@@ -4653,16 +4791,15 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4653 4791
4654 /* user wants prefix routes only */ 4792 /* user wants prefix routes only */
4655 if (rtm->rtm_flags & RTM_F_PREFIX && 4793 if (rtm->rtm_flags & RTM_F_PREFIX &&
4656 !(rt->rt6i_flags & RTF_PREFIX_RT)) { 4794 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4657 /* success since this is not a prefix route */ 4795 /* success since this is not a prefix route */
4658 return 1; 4796 return 1;
4659 } 4797 }
4660 } 4798 }
4661 4799
4662 return rt6_fill_node(net, 4800 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4663 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, 4801 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4664 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq, 4802 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4665 NLM_F_MULTI);
4666} 4803}
4667 4804
4668static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4805static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
@@ -4671,6 +4808,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4671 struct net *net = sock_net(in_skb->sk); 4808 struct net *net = sock_net(in_skb->sk);
4672 struct nlattr *tb[RTA_MAX+1]; 4809 struct nlattr *tb[RTA_MAX+1];
4673 int err, iif = 0, oif = 0; 4810 int err, iif = 0, oif = 0;
4811 struct fib6_info *from;
4674 struct dst_entry *dst; 4812 struct dst_entry *dst;
4675 struct rt6_info *rt; 4813 struct rt6_info *rt;
4676 struct sk_buff *skb; 4814 struct sk_buff *skb;
@@ -4718,6 +4856,19 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4718 else 4856 else
4719 fl6.flowi6_uid = iif ? INVALID_UID : current_uid(); 4857 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4720 4858
4859 if (tb[RTA_SPORT])
4860 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4861
4862 if (tb[RTA_DPORT])
4863 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4864
4865 if (tb[RTA_IP_PROTO]) {
4866 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4867 &fl6.flowi6_proto, extack);
4868 if (err)
4869 goto errout;
4870 }
4871
4721 if (iif) { 4872 if (iif) {
4722 struct net_device *dev; 4873 struct net_device *dev;
4723 int flags = 0; 4874 int flags = 0;
@@ -4759,14 +4910,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4759 goto errout; 4910 goto errout;
4760 } 4911 }
4761 4912
4762 if (fibmatch && rt->from) {
4763 struct rt6_info *ort = rt->from;
4764
4765 dst_hold(&ort->dst);
4766 ip6_rt_put(rt);
4767 rt = ort;
4768 }
4769
4770 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 4913 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4771 if (!skb) { 4914 if (!skb) {
4772 ip6_rt_put(rt); 4915 ip6_rt_put(rt);
@@ -4775,14 +4918,21 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4775 } 4918 }
4776 4919
4777 skb_dst_set(skb, &rt->dst); 4920 skb_dst_set(skb, &rt->dst);
4921
4922 rcu_read_lock();
4923 from = rcu_dereference(rt->from);
4924
4778 if (fibmatch) 4925 if (fibmatch)
4779 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif, 4926 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4780 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 4927 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4781 nlh->nlmsg_seq, 0); 4928 nlh->nlmsg_seq, 0);
4782 else 4929 else
4783 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif, 4930 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4784 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 4931 &fl6.saddr, iif, RTM_NEWROUTE,
4785 nlh->nlmsg_seq, 0); 4932 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4933 0);
4934 rcu_read_unlock();
4935
4786 if (err < 0) { 4936 if (err < 0) {
4787 kfree_skb(skb); 4937 kfree_skb(skb);
4788 goto errout; 4938 goto errout;
@@ -4793,7 +4943,7 @@ errout:
4793 return err; 4943 return err;
4794} 4944}
4795 4945
4796void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, 4946void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4797 unsigned int nlm_flags) 4947 unsigned int nlm_flags)
4798{ 4948{
4799 struct sk_buff *skb; 4949 struct sk_buff *skb;
@@ -4808,8 +4958,8 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4808 if (!skb) 4958 if (!skb)
4809 goto errout; 4959 goto errout;
4810 4960
4811 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, 4961 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4812 event, info->portid, seq, nlm_flags); 4962 event, info->portid, seq, nlm_flags);
4813 if (err < 0) { 4963 if (err < 0) {
4814 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 4964 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4815 WARN_ON(err == -EMSGSIZE); 4965 WARN_ON(err == -EMSGSIZE);
@@ -4834,6 +4984,7 @@ static int ip6_route_dev_notify(struct notifier_block *this,
4834 return NOTIFY_OK; 4984 return NOTIFY_OK;
4835 4985
4836 if (event == NETDEV_REGISTER) { 4986 if (event == NETDEV_REGISTER) {
4987 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
4837 net->ipv6.ip6_null_entry->dst.dev = dev; 4988 net->ipv6.ip6_null_entry->dst.dev = dev;
4838 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 4989 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4839#ifdef CONFIG_IPV6_MULTIPLE_TABLES 4990#ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -5010,11 +5161,17 @@ static int __net_init ip6_route_net_init(struct net *net)
5010 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 5161 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5011 goto out_ip6_dst_ops; 5162 goto out_ip6_dst_ops;
5012 5163
5164 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5165 sizeof(*net->ipv6.fib6_null_entry),
5166 GFP_KERNEL);
5167 if (!net->ipv6.fib6_null_entry)
5168 goto out_ip6_dst_entries;
5169
5013 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 5170 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5014 sizeof(*net->ipv6.ip6_null_entry), 5171 sizeof(*net->ipv6.ip6_null_entry),
5015 GFP_KERNEL); 5172 GFP_KERNEL);
5016 if (!net->ipv6.ip6_null_entry) 5173 if (!net->ipv6.ip6_null_entry)
5017 goto out_ip6_dst_entries; 5174 goto out_fib6_null_entry;
5018 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 5175 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5019 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 5176 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5020 ip6_template_metrics, true); 5177 ip6_template_metrics, true);
@@ -5061,6 +5218,8 @@ out_ip6_prohibit_entry:
5061out_ip6_null_entry: 5218out_ip6_null_entry:
5062 kfree(net->ipv6.ip6_null_entry); 5219 kfree(net->ipv6.ip6_null_entry);
5063#endif 5220#endif
5221out_fib6_null_entry:
5222 kfree(net->ipv6.fib6_null_entry);
5064out_ip6_dst_entries: 5223out_ip6_dst_entries:
5065 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 5224 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5066out_ip6_dst_ops: 5225out_ip6_dst_ops:
@@ -5069,6 +5228,7 @@ out_ip6_dst_ops:
5069 5228
5070static void __net_exit ip6_route_net_exit(struct net *net) 5229static void __net_exit ip6_route_net_exit(struct net *net)
5071{ 5230{
5231 kfree(net->ipv6.fib6_null_entry);
5072 kfree(net->ipv6.ip6_null_entry); 5232 kfree(net->ipv6.ip6_null_entry);
5073#ifdef CONFIG_IPV6_MULTIPLE_TABLES 5233#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5074 kfree(net->ipv6.ip6_prohibit_entry); 5234 kfree(net->ipv6.ip6_prohibit_entry);
@@ -5141,6 +5301,7 @@ void __init ip6_route_init_special_entries(void)
5141 /* Registering of the loopback is done before this portion of code, 5301 /* Registering of the loopback is done before this portion of code,
5142 * the loopback reference in rt6_info will not be taken, do it 5302 * the loopback reference in rt6_info will not be taken, do it
5143 * manually for init_net */ 5303 * manually for init_net */
5304 init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5144 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 5305 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5145 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 5306 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5146 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 5307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
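
The inet6_rtm_getroute() hunks above add RTA_SPORT, RTA_DPORT and RTA_IP_PROTO to the flow key used for the lookup, so a caller can ask which route a specific flow would take. A hypothetical userspace sketch of such a request follows; the attribute names come from the uapi rtnetlink header (recent enough to define them), the helper and function names are illustrative only, and error handling plus parsing of the RTM_NEWROUTE reply built by rt6_fill_node() are omitted. RTA_SPORT would be added the same way as RTA_DPORT.

/* Hypothetical sketch, not part of the patch. */
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static void put_attr(struct nlmsghdr *nh, unsigned short type,
		     const void *data, unsigned short len)
{
	struct rtattr *rta = (struct rtattr *)((char *)nh + NLMSG_ALIGN(nh->nlmsg_len));

	rta->rta_type = type;
	rta->rta_len = RTA_LENGTH(len);
	memcpy(RTA_DATA(rta), data, len);
	nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

static int query_udp6_route(const struct in6_addr *daddr, uint16_t dport_be)
{
	struct {
		struct nlmsghdr nh;
		struct rtmsg rtm;
		char buf[256];
	} req = { 0 };
	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
	uint8_t proto = IPPROTO_UDP;
	int fd, ret;

	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nh.nlmsg_type = RTM_GETROUTE;
	req.nh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET6;

	put_attr(&req.nh, RTA_DST, daddr, sizeof(*daddr));
	put_attr(&req.nh, RTA_IP_PROTO, &proto, sizeof(proto));
	put_attr(&req.nh, RTA_DPORT, &dport_be, sizeof(dport_be));

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return -1;
	ret = sendto(fd, &req, req.nh.nlmsg_len, 0,
		     (struct sockaddr *)&nladdr, sizeof(nladdr)) < 0 ? -1 : 0;
	/* a real caller would now recv() and walk the RTM_NEWROUTE reply */
	close(fd);
	return ret;
}
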
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 7f5621d09571..0fdf2a55e746 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -226,7 +226,6 @@ static int seg6_genl_get_tunsrc(struct sk_buff *skb, struct genl_info *info)
226 226
227nla_put_failure: 227nla_put_failure:
228 rcu_read_unlock(); 228 rcu_read_unlock();
229 genlmsg_cancel(msg, hdr);
230free_msg: 229free_msg:
231 nlmsg_free(msg); 230 nlmsg_free(msg);
232 return -ENOMEM; 231 return -ENOMEM;
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index bf4763fd68c2..19ccf0dc996c 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -91,6 +91,24 @@ static void set_tun_src(struct net *net, struct net_device *dev,
91 rcu_read_unlock(); 91 rcu_read_unlock();
92} 92}
93 93
94/* Compute flowlabel for outer IPv6 header */
95static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb,
96 struct ipv6hdr *inner_hdr)
97{
98 int do_flowlabel = net->ipv6.sysctl.seg6_flowlabel;
99 __be32 flowlabel = 0;
100 u32 hash;
101
102 if (do_flowlabel > 0) {
103 hash = skb_get_hash(skb);
104 rol32(hash, 16);
105 flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
106 } else if (!do_flowlabel && skb->protocol == htons(ETH_P_IPV6)) {
107 flowlabel = ip6_flowlabel(inner_hdr);
108 }
109 return flowlabel;
110}
111
94/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ 112/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */
95int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) 113int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
96{ 114{
@@ -99,6 +117,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
99 struct ipv6hdr *hdr, *inner_hdr; 117 struct ipv6hdr *hdr, *inner_hdr;
100 struct ipv6_sr_hdr *isrh; 118 struct ipv6_sr_hdr *isrh;
101 int hdrlen, tot_len, err; 119 int hdrlen, tot_len, err;
120 __be32 flowlabel;
102 121
103 hdrlen = (osrh->hdrlen + 1) << 3; 122 hdrlen = (osrh->hdrlen + 1) << 3;
104 tot_len = hdrlen + sizeof(*hdr); 123 tot_len = hdrlen + sizeof(*hdr);
@@ -108,6 +127,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
108 return err; 127 return err;
109 128
110 inner_hdr = ipv6_hdr(skb); 129 inner_hdr = ipv6_hdr(skb);
130 flowlabel = seg6_make_flowlabel(net, skb, inner_hdr);
111 131
112 skb_push(skb, tot_len); 132 skb_push(skb, tot_len);
113 skb_reset_network_header(skb); 133 skb_reset_network_header(skb);
@@ -121,10 +141,10 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
121 141
122 if (skb->protocol == htons(ETH_P_IPV6)) { 142 if (skb->protocol == htons(ETH_P_IPV6)) {
123 ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)), 143 ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
124 ip6_flowlabel(inner_hdr)); 144 flowlabel);
125 hdr->hop_limit = inner_hdr->hop_limit; 145 hdr->hop_limit = inner_hdr->hop_limit;
126 } else { 146 } else {
127 ip6_flow_hdr(hdr, 0, 0); 147 ip6_flow_hdr(hdr, 0, flowlabel);
128 hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb)); 148 hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb));
129 } 149 }
130 150
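
seg6_make_flowlabel() above chooses the outer flowlabel from the new seg6_flowlabel sysctl: a positive value derives it from the skb flow hash, zero inherits the inner label when the inner packet is IPv6, and anything else leaves it at zero. A standalone restatement of that policy, for illustration only (kernel context assumed for rol32(), __be32 and IPV6_FLOWLABEL_MASK; note that the bare rol32(hash, 16) in the hunk discards its return value, and this sketch assumes the rotated hash is what is meant to be used):

/* Illustrative only; mirrors the selection logic of seg6_make_flowlabel(). */
static inline __be32 srv6_outer_flowlabel(int sysctl_seg6_flowlabel, u32 skb_hash,
					  __be32 inner_flowlabel, bool inner_is_ipv6)
{
	if (sysctl_seg6_flowlabel > 0)
		return (__force __be32)rol32(skb_hash, 16) & IPV6_FLOWLABEL_MASK;
	if (!sysctl_seg6_flowlabel && inner_is_ipv6)
		return inner_flowlabel;
	return 0;
}
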
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 45722327375a..cd6e4cab63f6 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -1,8 +1,9 @@
1/* 1/*
2 * SR-IPv6 implementation 2 * SR-IPv6 implementation
3 * 3 *
4 * Author: 4 * Authors:
5 * David Lebrun <david.lebrun@uclouvain.be> 5 * David Lebrun <david.lebrun@uclouvain.be>
6 * eBPF support: Mathieu Xhonneux <m.xhonneux@gmail.com>
6 * 7 *
7 * 8 *
8 * This program is free software; you can redistribute it and/or 9 * This program is free software; you can redistribute it and/or
@@ -30,7 +31,9 @@
30#ifdef CONFIG_IPV6_SEG6_HMAC 31#ifdef CONFIG_IPV6_SEG6_HMAC
31#include <net/seg6_hmac.h> 32#include <net/seg6_hmac.h>
32#endif 33#endif
34#include <net/seg6_local.h>
33#include <linux/etherdevice.h> 35#include <linux/etherdevice.h>
36#include <linux/bpf.h>
34 37
35struct seg6_local_lwt; 38struct seg6_local_lwt;
36 39
@@ -41,6 +44,11 @@ struct seg6_action_desc {
41 int static_headroom; 44 int static_headroom;
42}; 45};
43 46
47struct bpf_lwt_prog {
48 struct bpf_prog *prog;
49 char *name;
50};
51
44struct seg6_local_lwt { 52struct seg6_local_lwt {
45 int action; 53 int action;
46 struct ipv6_sr_hdr *srh; 54 struct ipv6_sr_hdr *srh;
@@ -49,6 +57,7 @@ struct seg6_local_lwt {
49 struct in6_addr nh6; 57 struct in6_addr nh6;
50 int iif; 58 int iif;
51 int oif; 59 int oif;
60 struct bpf_lwt_prog bpf;
52 61
53 int headroom; 62 int headroom;
54 struct seg6_action_desc *desc; 63 struct seg6_action_desc *desc;
@@ -140,8 +149,8 @@ static void advance_nextseg(struct ipv6_sr_hdr *srh, struct in6_addr *daddr)
140 *daddr = *addr; 149 *daddr = *addr;
141} 150}
142 151
143static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, 152int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
144 u32 tbl_id) 153 u32 tbl_id)
145{ 154{
146 struct net *net = dev_net(skb->dev); 155 struct net *net = dev_net(skb->dev);
147 struct ipv6hdr *hdr = ipv6_hdr(skb); 156 struct ipv6hdr *hdr = ipv6_hdr(skb);
@@ -187,6 +196,7 @@ out:
187 196
188 skb_dst_drop(skb); 197 skb_dst_drop(skb);
189 skb_dst_set(skb, dst); 198 skb_dst_set(skb, dst);
199 return dst->error;
190} 200}
191 201
192/* regular endpoint function */ 202/* regular endpoint function */
@@ -200,7 +210,7 @@ static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt)
200 210
201 advance_nextseg(srh, &ipv6_hdr(skb)->daddr); 211 advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
202 212
203 lookup_nexthop(skb, NULL, 0); 213 seg6_lookup_nexthop(skb, NULL, 0);
204 214
205 return dst_input(skb); 215 return dst_input(skb);
206 216
@@ -220,7 +230,7 @@ static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt)
220 230
221 advance_nextseg(srh, &ipv6_hdr(skb)->daddr); 231 advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
222 232
223 lookup_nexthop(skb, &slwt->nh6, 0); 233 seg6_lookup_nexthop(skb, &slwt->nh6, 0);
224 234
225 return dst_input(skb); 235 return dst_input(skb);
226 236
@@ -239,7 +249,7 @@ static int input_action_end_t(struct sk_buff *skb, struct seg6_local_lwt *slwt)
239 249
240 advance_nextseg(srh, &ipv6_hdr(skb)->daddr); 250 advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
241 251
242 lookup_nexthop(skb, NULL, slwt->table); 252 seg6_lookup_nexthop(skb, NULL, slwt->table);
243 253
244 return dst_input(skb); 254 return dst_input(skb);
245 255
@@ -331,7 +341,7 @@ static int input_action_end_dx6(struct sk_buff *skb,
331 if (!ipv6_addr_any(&slwt->nh6)) 341 if (!ipv6_addr_any(&slwt->nh6))
332 nhaddr = &slwt->nh6; 342 nhaddr = &slwt->nh6;
333 343
334 lookup_nexthop(skb, nhaddr, 0); 344 seg6_lookup_nexthop(skb, nhaddr, 0);
335 345
336 return dst_input(skb); 346 return dst_input(skb);
337drop: 347drop:
@@ -380,7 +390,7 @@ static int input_action_end_dt6(struct sk_buff *skb,
380 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) 390 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
381 goto drop; 391 goto drop;
382 392
383 lookup_nexthop(skb, NULL, slwt->table); 393 seg6_lookup_nexthop(skb, NULL, slwt->table);
384 394
385 return dst_input(skb); 395 return dst_input(skb);
386 396
@@ -406,7 +416,7 @@ static int input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt)
406 ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); 416 ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
407 skb_set_transport_header(skb, sizeof(struct ipv6hdr)); 417 skb_set_transport_header(skb, sizeof(struct ipv6hdr));
408 418
409 lookup_nexthop(skb, NULL, 0); 419 seg6_lookup_nexthop(skb, NULL, 0);
410 420
411 return dst_input(skb); 421 return dst_input(skb);
412 422
@@ -438,7 +448,7 @@ static int input_action_end_b6_encap(struct sk_buff *skb,
438 ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); 448 ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
439 skb_set_transport_header(skb, sizeof(struct ipv6hdr)); 449 skb_set_transport_header(skb, sizeof(struct ipv6hdr));
440 450
441 lookup_nexthop(skb, NULL, 0); 451 seg6_lookup_nexthop(skb, NULL, 0);
442 452
443 return dst_input(skb); 453 return dst_input(skb);
444 454
@@ -447,6 +457,71 @@ drop:
447 return err; 457 return err;
448} 458}
449 459
460DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states);
461
462static int input_action_end_bpf(struct sk_buff *skb,
463 struct seg6_local_lwt *slwt)
464{
465 struct seg6_bpf_srh_state *srh_state =
466 this_cpu_ptr(&seg6_bpf_srh_states);
467 struct seg6_bpf_srh_state local_srh_state;
468 struct ipv6_sr_hdr *srh;
469 int srhoff = 0;
470 int ret;
471
472 srh = get_and_validate_srh(skb);
473 if (!srh)
474 goto drop;
475 advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
476
477 /* preempt_disable is needed to protect the per-CPU buffer srh_state,
478 * which is also accessed by the bpf_lwt_seg6_* helpers
479 */
480 preempt_disable();
481 srh_state->hdrlen = srh->hdrlen << 3;
482 srh_state->valid = 1;
483
484 rcu_read_lock();
485 bpf_compute_data_pointers(skb);
486 ret = bpf_prog_run_save_cb(slwt->bpf.prog, skb);
487 rcu_read_unlock();
488
489 local_srh_state = *srh_state;
490 preempt_enable();
491
492 switch (ret) {
493 case BPF_OK:
494 case BPF_REDIRECT:
495 break;
496 case BPF_DROP:
497 goto drop;
498 default:
499 pr_warn_once("bpf-seg6local: Illegal return value %u\n", ret);
500 goto drop;
501 }
502
503 if (unlikely((local_srh_state.hdrlen & 7) != 0))
504 goto drop;
505
506 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
507 goto drop;
508 srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
509 srh->hdrlen = (u8)(local_srh_state.hdrlen >> 3);
510
511 if (!local_srh_state.valid &&
512 unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)))
513 goto drop;
514
515 if (ret != BPF_REDIRECT)
516 seg6_lookup_nexthop(skb, NULL, 0);
517
518 return dst_input(skb);
519
520drop:
521 kfree_skb(skb);
522 return -EINVAL;
523}
524
450static struct seg6_action_desc seg6_action_table[] = { 525static struct seg6_action_desc seg6_action_table[] = {
451 { 526 {
452 .action = SEG6_LOCAL_ACTION_END, 527 .action = SEG6_LOCAL_ACTION_END,
@@ -493,7 +568,13 @@ static struct seg6_action_desc seg6_action_table[] = {
493 .attrs = (1 << SEG6_LOCAL_SRH), 568 .attrs = (1 << SEG6_LOCAL_SRH),
494 .input = input_action_end_b6_encap, 569 .input = input_action_end_b6_encap,
495 .static_headroom = sizeof(struct ipv6hdr), 570 .static_headroom = sizeof(struct ipv6hdr),
496 } 571 },
572 {
573 .action = SEG6_LOCAL_ACTION_END_BPF,
574 .attrs = (1 << SEG6_LOCAL_BPF),
575 .input = input_action_end_bpf,
576 },
577
497}; 578};
498 579
499static struct seg6_action_desc *__get_action_desc(int action) 580static struct seg6_action_desc *__get_action_desc(int action)
@@ -538,6 +619,7 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
538 .len = sizeof(struct in6_addr) }, 619 .len = sizeof(struct in6_addr) },
539 [SEG6_LOCAL_IIF] = { .type = NLA_U32 }, 620 [SEG6_LOCAL_IIF] = { .type = NLA_U32 },
540 [SEG6_LOCAL_OIF] = { .type = NLA_U32 }, 621 [SEG6_LOCAL_OIF] = { .type = NLA_U32 },
622 [SEG6_LOCAL_BPF] = { .type = NLA_NESTED },
541}; 623};
542 624
543static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt) 625static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt)
@@ -715,6 +797,75 @@ static int cmp_nla_oif(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
715 return 0; 797 return 0;
716} 798}
717 799
800#define MAX_PROG_NAME 256
801static const struct nla_policy bpf_prog_policy[SEG6_LOCAL_BPF_PROG_MAX + 1] = {
802 [SEG6_LOCAL_BPF_PROG] = { .type = NLA_U32, },
803 [SEG6_LOCAL_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
804 .len = MAX_PROG_NAME },
805};
806
807static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt)
808{
809 struct nlattr *tb[SEG6_LOCAL_BPF_PROG_MAX + 1];
810 struct bpf_prog *p;
811 int ret;
812 u32 fd;
813
814 ret = nla_parse_nested(tb, SEG6_LOCAL_BPF_PROG_MAX,
815 attrs[SEG6_LOCAL_BPF], bpf_prog_policy, NULL);
816 if (ret < 0)
817 return ret;
818
819 if (!tb[SEG6_LOCAL_BPF_PROG] || !tb[SEG6_LOCAL_BPF_PROG_NAME])
820 return -EINVAL;
821
822 slwt->bpf.name = nla_memdup(tb[SEG6_LOCAL_BPF_PROG_NAME], GFP_KERNEL);
823 if (!slwt->bpf.name)
824 return -ENOMEM;
825
826 fd = nla_get_u32(tb[SEG6_LOCAL_BPF_PROG]);
827 p = bpf_prog_get_type(fd, BPF_PROG_TYPE_LWT_SEG6LOCAL);
828 if (IS_ERR(p)) {
829 kfree(slwt->bpf.name);
830 return PTR_ERR(p);
831 }
832
833 slwt->bpf.prog = p;
834 return 0;
835}
836
837static int put_nla_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt)
838{
839 struct nlattr *nest;
840
841 if (!slwt->bpf.prog)
842 return 0;
843
844 nest = nla_nest_start(skb, SEG6_LOCAL_BPF);
845 if (!nest)
846 return -EMSGSIZE;
847
848 if (nla_put_u32(skb, SEG6_LOCAL_BPF_PROG, slwt->bpf.prog->aux->id))
849 return -EMSGSIZE;
850
851 if (slwt->bpf.name &&
852 nla_put_string(skb, SEG6_LOCAL_BPF_PROG_NAME, slwt->bpf.name))
853 return -EMSGSIZE;
854
855 return nla_nest_end(skb, nest);
856}
857
858static int cmp_nla_bpf(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
859{
860 if (!a->bpf.name && !b->bpf.name)
861 return 0;
862
863 if (!a->bpf.name || !b->bpf.name)
864 return 1;
865
866 return strcmp(a->bpf.name, b->bpf.name);
867}
868
718struct seg6_action_param { 869struct seg6_action_param {
719 int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt); 870 int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt);
720 int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt); 871 int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
@@ -745,6 +896,11 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = {
745 [SEG6_LOCAL_OIF] = { .parse = parse_nla_oif, 896 [SEG6_LOCAL_OIF] = { .parse = parse_nla_oif,
746 .put = put_nla_oif, 897 .put = put_nla_oif,
747 .cmp = cmp_nla_oif }, 898 .cmp = cmp_nla_oif },
899
900 [SEG6_LOCAL_BPF] = { .parse = parse_nla_bpf,
901 .put = put_nla_bpf,
902 .cmp = cmp_nla_bpf },
903
748}; 904};
749 905
750static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt) 906static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
@@ -830,6 +986,13 @@ static void seg6_local_destroy_state(struct lwtunnel_state *lwt)
830 struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt); 986 struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
831 987
832 kfree(slwt->srh); 988 kfree(slwt->srh);
989
990 if (slwt->desc->attrs & (1 << SEG6_LOCAL_BPF)) {
991 kfree(slwt->bpf.name);
992 bpf_prog_put(slwt->bpf.prog);
993 }
994
995 return;
833} 996}
834 997
835static int seg6_local_fill_encap(struct sk_buff *skb, 998static int seg6_local_fill_encap(struct sk_buff *skb,
@@ -882,6 +1045,11 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt)
882 if (attrs & (1 << SEG6_LOCAL_OIF)) 1045 if (attrs & (1 << SEG6_LOCAL_OIF))
883 nlsize += nla_total_size(4); 1046 nlsize += nla_total_size(4);
884 1047
1048 if (attrs & (1 << SEG6_LOCAL_BPF))
1049 nlsize += nla_total_size(sizeof(struct nlattr)) +
1050 nla_total_size(MAX_PROG_NAME) +
1051 nla_total_size(4);
1052
885 return nlsize; 1053 return nlsize;
886} 1054}
887 1055
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 6fbdef630152..e15cd37024fd 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -152,6 +152,13 @@ static struct ctl_table ipv6_table_template[] = {
152 .extra1 = &zero, 152 .extra1 = &zero,
153 .extra2 = &one, 153 .extra2 = &one,
154 }, 154 },
155 {
156 .procname = "seg6_flowlabel",
157 .data = &init_net.ipv6.sysctl.seg6_flowlabel,
158 .maxlen = sizeof(int),
159 .mode = 0644,
160 .proc_handler = proc_dointvec
161 },
155 { } 162 { }
156}; 163};
157 164
@@ -217,6 +224,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
217 ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len; 224 ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len;
218 ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len; 225 ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len;
219 ipv6_table[14].data = &net->ipv6.sysctl.multipath_hash_policy, 226 ipv6_table[14].data = &net->ipv6.sysctl.multipath_hash_policy,
227 ipv6_table[15].data = &net->ipv6.sysctl.seg6_flowlabel;
220 228
221 ipv6_route_table = ipv6_route_sysctl_init(net); 229 ipv6_route_table = ipv6_route_sysctl_init(net);
222 if (!ipv6_route_table) 230 if (!ipv6_route_table)
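
These sysctl hunks expose the knob read by seg6_make_flowlabel() as net.ipv6.seg6_flowlabel, per network namespace. A trivial sketch of flipping it from userspace; the procfs path follows from the procname registered above:

/* Illustrative only: enable hash-based outer flowlabels for SRv6 encapsulation. */
#include <fcntl.h>
#include <unistd.h>

static int enable_seg6_flowlabel(void)
{
	int fd = open("/proc/sys/net/ipv6/seg6_flowlabel", O_WRONLY);
	int ret = -1;

	if (fd >= 0) {
		ret = write(fd, "1", 1) == 1 ? 0 : -1;
		close(fd);
	}
	return ret;
}
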
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d2ce66b23430..b620d9b72e59 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -803,6 +803,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
803 unsigned int tot_len = sizeof(struct tcphdr); 803 unsigned int tot_len = sizeof(struct tcphdr);
804 struct dst_entry *dst; 804 struct dst_entry *dst;
805 __be32 *topt; 805 __be32 *topt;
806 __u32 mark = 0;
806 807
807 if (tsecr) 808 if (tsecr)
808 tot_len += TCPOLEN_TSTAMP_ALIGNED; 809 tot_len += TCPOLEN_TSTAMP_ALIGNED;
@@ -871,7 +872,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
871 fl6.flowi6_oif = oif; 872 fl6.flowi6_oif = oif;
872 } 873 }
873 874
874 fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); 875 if (sk)
876 mark = (sk->sk_state == TCP_TIME_WAIT) ?
877 inet_twsk(sk)->tw_mark : sk->sk_mark;
878 fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
875 fl6.fl6_dport = t1->dest; 879 fl6.fl6_dport = t1->dest;
876 fl6.fl6_sport = t1->source; 880 fl6.fl6_sport = t1->source;
877 fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 881 fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
@@ -1318,7 +1322,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
1318 } 1322 }
1319 } 1323 }
1320 1324
1321 tcp_rcv_established(sk, skb, tcp_hdr(skb)); 1325 tcp_rcv_established(sk, skb);
1322 if (opt_skb) 1326 if (opt_skb)
1323 goto ipv6_pktoptions; 1327 goto ipv6_pktoptions;
1324 return 0; 1328 return 0;
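
The tcp_v6_send_response() hunk above makes TCP's generated replies (resets and ACKs built outside a full socket) inherit the originating socket's sk_mark, or tw_mark for a timewait socket, when reply-mark reflection is not in use. The mark itself is normally set per socket with SO_MARK; a one-function sketch, illustrative only (SO_MARK requires CAP_NET_ADMIN):

/* Illustrative only: mark a socket so routing/filtering can match its traffic,
 * including the kernel-generated replies covered by the hunk above.
 */
#include <sys/socket.h>

static int set_sock_mark(int fd, unsigned int mark)
{
	return setsockopt(fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark));
}
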
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 00e2112da26d..164afd31aebf 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -285,9 +285,7 @@ EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb);
285/* Must be called under rcu_read_lock(). 285/* Must be called under rcu_read_lock().
286 * Does increment socket refcount. 286 * Does increment socket refcount.
287 */ 287 */
288#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \ 288#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6) || IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
289 IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) || \
290 IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
291struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, 289struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
292 const struct in6_addr *daddr, __be16 dport, int dif) 290 const struct in6_addr *daddr, __be16 dport, int dif)
293{ 291{
@@ -546,10 +544,10 @@ static __inline__ void udpv6_err(struct sk_buff *skb,
546 __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); 544 __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
547} 545}
548 546
549static struct static_key udpv6_encap_needed __read_mostly; 547static DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
550void udpv6_encap_enable(void) 548void udpv6_encap_enable(void)
551{ 549{
552 static_key_enable(&udpv6_encap_needed); 550 static_branch_enable(&udpv6_encap_needed_key);
553} 551}
554EXPORT_SYMBOL(udpv6_encap_enable); 552EXPORT_SYMBOL(udpv6_encap_enable);
555 553
@@ -561,7 +559,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
561 if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) 559 if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
562 goto drop; 560 goto drop;
563 561
564 if (static_key_false(&udpv6_encap_needed) && up->encap_type) { 562 if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) {
565 int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); 563 int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
566 564
567 /* 565 /*
@@ -1023,7 +1021,8 @@ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
1023 * Sending 1021 * Sending
1024 */ 1022 */
1025 1023
1026static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6) 1024static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
1025 struct inet_cork *cork)
1027{ 1026{
1028 struct sock *sk = skb->sk; 1027 struct sock *sk = skb->sk;
1029 struct udphdr *uh; 1028 struct udphdr *uh;
@@ -1042,12 +1041,32 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6)
1042 uh->len = htons(len); 1041 uh->len = htons(len);
1043 uh->check = 0; 1042 uh->check = 0;
1044 1043
1044 if (cork->gso_size) {
1045 const int hlen = skb_network_header_len(skb) +
1046 sizeof(struct udphdr);
1047
1048 if (hlen + cork->gso_size > cork->fragsize)
1049 return -EINVAL;
1050 if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
1051 return -EINVAL;
1052 if (udp_sk(sk)->no_check6_tx)
1053 return -EINVAL;
1054 if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
1055 dst_xfrm(skb_dst(skb)))
1056 return -EIO;
1057
1058 skb_shinfo(skb)->gso_size = cork->gso_size;
1059 skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
1060 goto csum_partial;
1061 }
1062
1045 if (is_udplite) 1063 if (is_udplite)
1046 csum = udplite_csum(skb); 1064 csum = udplite_csum(skb);
1047 else if (udp_sk(sk)->no_check6_tx) { /* UDP csum disabled */ 1065 else if (udp_sk(sk)->no_check6_tx) { /* UDP csum disabled */
1048 skb->ip_summed = CHECKSUM_NONE; 1066 skb->ip_summed = CHECKSUM_NONE;
1049 goto send; 1067 goto send;
1050 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ 1068 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
1069csum_partial:
1051 udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr, len); 1070 udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr, len);
1052 goto send; 1071 goto send;
1053 } else 1072 } else
@@ -1093,7 +1112,7 @@ static int udp_v6_push_pending_frames(struct sock *sk)
1093 if (!skb) 1112 if (!skb)
1094 goto out; 1113 goto out;
1095 1114
1096 err = udp_v6_send_skb(skb, &fl6); 1115 err = udp_v6_send_skb(skb, &fl6, &inet_sk(sk)->cork.base);
1097 1116
1098out: 1117out:
1099 up->len = 0; 1118 up->len = 0;
@@ -1127,6 +1146,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
1127 ipc6.hlimit = -1; 1146 ipc6.hlimit = -1;
1128 ipc6.tclass = -1; 1147 ipc6.tclass = -1;
1129 ipc6.dontfrag = -1; 1148 ipc6.dontfrag = -1;
1149 ipc6.gso_size = up->gso_size;
1130 sockc.tsflags = sk->sk_tsflags; 1150 sockc.tsflags = sk->sk_tsflags;
1131 1151
1132 /* destination address check */ 1152 /* destination address check */
@@ -1259,7 +1279,10 @@ do_udp_sendmsg:
1259 opt->tot_len = sizeof(*opt); 1279 opt->tot_len = sizeof(*opt);
1260 ipc6.opt = opt; 1280 ipc6.opt = opt;
1261 1281
1262 err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc); 1282 err = udp_cmsg_send(sk, msg, &ipc6.gso_size);
1283 if (err > 0)
1284 err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6,
1285 &ipc6, &sockc);
1263 if (err < 0) { 1286 if (err < 0) {
1264 fl6_sock_release(flowlabel); 1287 fl6_sock_release(flowlabel);
1265 return err; 1288 return err;
@@ -1291,6 +1314,29 @@ do_udp_sendmsg:
1291 fl6.saddr = np->saddr; 1314 fl6.saddr = np->saddr;
1292 fl6.fl6_sport = inet->inet_sport; 1315 fl6.fl6_sport = inet->inet_sport;
1293 1316
1317 if (cgroup_bpf_enabled && !connected) {
1318 err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
1319 (struct sockaddr *)sin6, &fl6.saddr);
1320 if (err)
1321 goto out_no_dst;
1322 if (sin6) {
1323 if (ipv6_addr_v4mapped(&sin6->sin6_addr)) {
1324 /* BPF program rewrote IPv6-only by IPv4-mapped
1325 * IPv6. It's currently unsupported.
1326 */
1327 err = -ENOTSUPP;
1328 goto out_no_dst;
1329 }
1330 if (sin6->sin6_port == 0) {
1331 /* BPF program set invalid port. Reject it. */
1332 err = -EINVAL;
1333 goto out_no_dst;
1334 }
1335 fl6.fl6_dport = sin6->sin6_port;
1336 fl6.daddr = sin6->sin6_addr;
1337 }
1338 }
1339
1294 final_p = fl6_update_dst(&fl6, opt, &final); 1340 final_p = fl6_update_dst(&fl6, opt, &final);
1295 if (final_p) 1341 if (final_p)
1296 connected = false; 1342 connected = false;
@@ -1324,15 +1370,16 @@ back_from_confirm:
1324 1370
1325 /* Lockless fast path for the non-corking case */ 1371 /* Lockless fast path for the non-corking case */
1326 if (!corkreq) { 1372 if (!corkreq) {
1373 struct inet_cork_full cork;
1327 struct sk_buff *skb; 1374 struct sk_buff *skb;
1328 1375
1329 skb = ip6_make_skb(sk, getfrag, msg, ulen, 1376 skb = ip6_make_skb(sk, getfrag, msg, ulen,
1330 sizeof(struct udphdr), &ipc6, 1377 sizeof(struct udphdr), &ipc6,
1331 &fl6, (struct rt6_info *)dst, 1378 &fl6, (struct rt6_info *)dst,
1332 msg->msg_flags, &sockc); 1379 msg->msg_flags, &cork, &sockc);
1333 err = PTR_ERR(skb); 1380 err = PTR_ERR(skb);
1334 if (!IS_ERR_OR_NULL(skb)) 1381 if (!IS_ERR_OR_NULL(skb))
1335 err = udp_v6_send_skb(skb, &fl6); 1382 err = udp_v6_send_skb(skb, &fl6, &cork.base);
1336 goto out; 1383 goto out;
1337 } 1384 }
1338 1385
@@ -1369,6 +1416,7 @@ do_append_data:
1369 1416
1370out: 1417out:
1371 dst_release(dst); 1418 dst_release(dst);
1419out_no_dst:
1372 fl6_sock_release(flowlabel); 1420 fl6_sock_release(flowlabel);
1373 txopt_put(opt_to_free); 1421 txopt_put(opt_to_free);
1374 if (!err) 1422 if (!err)
@@ -1402,7 +1450,7 @@ void udpv6_destroy_sock(struct sock *sk)
1402 udp_v6_flush_pending_frames(sk); 1450 udp_v6_flush_pending_frames(sk);
1403 release_sock(sk); 1451 release_sock(sk);
1404 1452
1405 if (static_key_false(&udpv6_encap_needed) && up->encap_type) { 1453 if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) {
1406 void (*encap_destroy)(struct sock *sk); 1454 void (*encap_destroy)(struct sock *sk);
1407 encap_destroy = READ_ONCE(up->encap_destroy); 1455 encap_destroy = READ_ONCE(up->encap_destroy);
1408 if (encap_destroy) 1456 if (encap_destroy)
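
udp_v6_send_skb() above gains the UDP GSO transmit path: when the cork carries a gso_size, the skb is tagged SKB_GSO_UDP_L4 and forced through the partial-checksum path, with the size and checksum constraints checked up front. Userspace opts in per socket (or per message via cmsg) with the UDP_SEGMENT socket option from this series. A minimal sketch, assuming a uapi <linux/udp.h> recent enough to define UDP_SEGMENT:

/* Illustrative only: send one 16000-byte buffer that the stack segments into
 * 1400-byte UDP datagrams on the way out (see the cork->gso_size checks above).
 */
#include <netinet/in.h>
#include <sys/socket.h>
#include <linux/udp.h>		/* UDP_SEGMENT */

static ssize_t send_udp_gso(int fd, const void *buf, size_t len)
{
	int gso_size = 1400;

	/* IPPROTO_UDP == SOL_UDP, the level checked by udp_lib_setsockopt() */
	if (setsockopt(fd, IPPROTO_UDP, UDP_SEGMENT, &gso_size, sizeof(gso_size)))
		return -1;
	return send(fd, buf, len, 0);	/* fd: a connected AF_INET6 SOCK_DGRAM socket */
}
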
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 2a04dc9c781b..03a2ff3fe1e6 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -42,12 +42,15 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
42 const struct ipv6hdr *ipv6h; 42 const struct ipv6hdr *ipv6h;
43 struct udphdr *uh; 43 struct udphdr *uh;
44 44
45 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP)) 45 if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_UDP | SKB_GSO_UDP_L4)))
46 goto out; 46 goto out;
47 47
48 if (!pskb_may_pull(skb, sizeof(struct udphdr))) 48 if (!pskb_may_pull(skb, sizeof(struct udphdr)))
49 goto out; 49 goto out;
50 50
51 if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
52 return __udp_gso_segment(skb, features);
53
51 /* Do software UFO. Complete and fill in the UDP checksum as HW cannot 54 /* Do software UFO. Complete and fill in the UDP checksum as HW cannot
52 * do checksum of UDP packets sent as multiple IP fragments. 55 * do checksum of UDP packets sent as multiple IP fragments.
53 */ 56 */
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 86dba282a147..ef3defaf43b9 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -107,8 +107,6 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
107 * it was magically lost, so this code needs audit */ 107 * it was magically lost, so this code needs audit */
108 xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST | 108 xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST |
109 RTF_LOCAL); 109 RTF_LOCAL);
110 xdst->u.rt6.rt6i_metric = rt->rt6i_metric;
111 xdst->u.rt6.rt6i_node = rt->rt6i_node;
112 xdst->route_cookie = rt6_get_cookie(rt); 110 xdst->route_cookie = rt6_get_cookie(rt);
113 xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; 111 xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway;
114 xdst->u.rt6.rt6i_dst = rt->rt6i_dst; 112 xdst->u.rt6.rt6i_dst = rt->rt6i_dst;
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 16f434791763..5bdca3d5d6b7 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -60,11 +60,9 @@ xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
60static int 60static int
61__xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass) 61__xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass)
62{ 62{
63 int i; 63 int count[XFRM_MAX_DEPTH] = { };
64 int class[XFRM_MAX_DEPTH]; 64 int class[XFRM_MAX_DEPTH];
65 int count[maxclass]; 65 int i;
66
67 memset(count, 0, sizeof(count));
68 66
69 for (i = 0; i < n; i++) { 67 for (i = 0; i < n; i++) {
70 int c; 68 int c;
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 7f1e842ef05a..e87686f7d63c 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -57,6 +57,10 @@ static void l2tp_dfs_next_tunnel(struct l2tp_dfs_seq_data *pd)
57 57
58static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd) 58static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd)
59{ 59{
60 /* Drop reference taken during previous invocation */
61 if (pd->session)
62 l2tp_session_dec_refcount(pd->session);
63
60 pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx); 64 pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx);
61 pd->session_idx++; 65 pd->session_idx++;
62 66
@@ -105,11 +109,16 @@ static void l2tp_dfs_seq_stop(struct seq_file *p, void *v)
105 if (!pd || pd == SEQ_START_TOKEN) 109 if (!pd || pd == SEQ_START_TOKEN)
106 return; 110 return;
107 111
108 /* Drop reference taken by last invocation of l2tp_dfs_next_tunnel() */ 112 /* Drop reference taken by last invocation of l2tp_dfs_next_session()
113 * or l2tp_dfs_next_tunnel().
114 */
115 if (pd->session) {
116 l2tp_session_dec_refcount(pd->session);
117 pd->session = NULL;
118 }
109 if (pd->tunnel) { 119 if (pd->tunnel) {
110 l2tp_tunnel_dec_refcount(pd->tunnel); 120 l2tp_tunnel_dec_refcount(pd->tunnel);
111 pd->tunnel = NULL; 121 pd->tunnel = NULL;
112 pd->session = NULL;
113 } 122 }
114} 123}
115 124
@@ -250,13 +259,10 @@ static int l2tp_dfs_seq_show(struct seq_file *m, void *v)
250 goto out; 259 goto out;
251 } 260 }
252 261
253 /* Show the tunnel or session context */ 262 if (!pd->session)
254 if (!pd->session) {
255 l2tp_dfs_seq_tunnel_show(m, pd->tunnel); 263 l2tp_dfs_seq_tunnel_show(m, pd->tunnel);
256 } else { 264 else
257 l2tp_dfs_seq_session_show(m, pd->session); 265 l2tp_dfs_seq_session_show(m, pd->session);
258 l2tp_session_dec_refcount(pd->session);
259 }
260 266
261out: 267out:
262 return 0; 268 return 0;
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 3d8ca1231f8f..b56cb1df4fc0 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -428,16 +428,6 @@ static void pppol2tp_put_sk(struct rcu_head *head)
428 */ 428 */
429static void pppol2tp_session_close(struct l2tp_session *session) 429static void pppol2tp_session_close(struct l2tp_session *session)
430{ 430{
431 struct pppol2tp_session *ps;
432
433 ps = l2tp_session_priv(session);
434 mutex_lock(&ps->sk_lock);
435 ps->__sk = rcu_dereference_protected(ps->sk,
436 lockdep_is_held(&ps->sk_lock));
437 RCU_INIT_POINTER(ps->sk, NULL);
438 if (ps->__sk)
439 call_rcu(&ps->rcu, pppol2tp_put_sk);
440 mutex_unlock(&ps->sk_lock);
441} 431}
442 432
443/* Really kill the session socket. (Called from sock_put() if 433/* Really kill the session socket. (Called from sock_put() if
@@ -480,15 +470,24 @@ static int pppol2tp_release(struct socket *sock)
480 sock_orphan(sk); 470 sock_orphan(sk);
481 sock->sk = NULL; 471 sock->sk = NULL;
482 472
483 /* If the socket is associated with a session,
484 * l2tp_session_delete will call pppol2tp_session_close which
485 * will drop the session's ref on the socket.
486 */
487 session = pppol2tp_sock_to_session(sk); 473 session = pppol2tp_sock_to_session(sk);
488 if (session) { 474 if (session) {
475 struct pppol2tp_session *ps;
476
489 l2tp_session_delete(session); 477 l2tp_session_delete(session);
490 /* drop the ref obtained by pppol2tp_sock_to_session */ 478
491 sock_put(sk); 479 ps = l2tp_session_priv(session);
480 mutex_lock(&ps->sk_lock);
481 ps->__sk = rcu_dereference_protected(ps->sk,
482 lockdep_is_held(&ps->sk_lock));
483 RCU_INIT_POINTER(ps->sk, NULL);
484 mutex_unlock(&ps->sk_lock);
485 call_rcu(&ps->rcu, pppol2tp_put_sk);
486
487 /* Rely on the sock_put() call at the end of the function for
488 * dropping the reference held by pppol2tp_sock_to_session().
489 * The last reference will be dropped by pppol2tp_put_sk().
490 */
492 } 491 }
493 492
494 release_sock(sk); 493 release_sock(sk);
@@ -742,7 +741,8 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
742 */ 741 */
743 mutex_lock(&ps->sk_lock); 742 mutex_lock(&ps->sk_lock);
744 if (rcu_dereference_protected(ps->sk, 743 if (rcu_dereference_protected(ps->sk,
745 lockdep_is_held(&ps->sk_lock))) { 744 lockdep_is_held(&ps->sk_lock)) ||
745 ps->__sk) {
746 mutex_unlock(&ps->sk_lock); 746 mutex_unlock(&ps->sk_lock);
747 error = -EEXIST; 747 error = -EEXIST;
748 goto end; 748 goto end;
@@ -803,7 +803,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
803 803
804out_no_ppp: 804out_no_ppp:
805 /* This is how we get the session context from the socket. */ 805 /* This is how we get the session context from the socket. */
806 sock_hold(sk);
807 sk->sk_user_data = session; 806 sk->sk_user_data = session;
808 rcu_assign_pointer(ps->sk, sk); 807 rcu_assign_pointer(ps->sk, sk);
809 mutex_unlock(&ps->sk_lock); 808 mutex_unlock(&ps->sk_lock);
@@ -1576,6 +1575,10 @@ static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd)
1576 1575
1577static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd) 1576static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd)
1578{ 1577{
1578 /* Drop reference taken during previous invocation */
1579 if (pd->session)
1580 l2tp_session_dec_refcount(pd->session);
1581
1579 pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx); 1582 pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx);
1580 pd->session_idx++; 1583 pd->session_idx++;
1581 1584
@@ -1624,11 +1627,16 @@ static void pppol2tp_seq_stop(struct seq_file *p, void *v)
1624 if (!pd || pd == SEQ_START_TOKEN) 1627 if (!pd || pd == SEQ_START_TOKEN)
1625 return; 1628 return;
1626 1629
1627 /* Drop reference taken by last invocation of pppol2tp_next_tunnel() */ 1630 /* Drop reference taken by last invocation of pppol2tp_next_session()
1631 * or pppol2tp_next_tunnel().
1632 */
1633 if (pd->session) {
1634 l2tp_session_dec_refcount(pd->session);
1635 pd->session = NULL;
1636 }
1628 if (pd->tunnel) { 1637 if (pd->tunnel) {
1629 l2tp_tunnel_dec_refcount(pd->tunnel); 1638 l2tp_tunnel_dec_refcount(pd->tunnel);
1630 pd->tunnel = NULL; 1639 pd->tunnel = NULL;
1631 pd->session = NULL;
1632 } 1640 }
1633} 1641}
1634 1642
@@ -1723,14 +1731,10 @@ static int pppol2tp_seq_show(struct seq_file *m, void *v)
1723 goto out; 1731 goto out;
1724 } 1732 }
1725 1733
1726 /* Show the tunnel or session context. 1734 if (!pd->session)
1727 */
1728 if (!pd->session) {
1729 pppol2tp_seq_tunnel_show(m, pd->tunnel); 1735 pppol2tp_seq_tunnel_show(m, pd->tunnel);
1730 } else { 1736 else
1731 pppol2tp_seq_session_show(m, pd->session); 1737 pppol2tp_seq_session_show(m, pd->session);
1732 l2tp_session_dec_refcount(pd->session);
1733 }
1734 1738
1735out: 1739out:
1736 return 0; 1740 return 0;
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 85dbaa891059..bdf6fa78d0d2 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -695,7 +695,7 @@ static int ieee80211_dump_station(struct wiphy *wiphy, struct net_device *dev,
695 if (sta) { 695 if (sta) {
696 ret = 0; 696 ret = 0;
697 memcpy(mac, sta->sta.addr, ETH_ALEN); 697 memcpy(mac, sta->sta.addr, ETH_ALEN);
698 sta_set_sinfo(sta, sinfo); 698 sta_set_sinfo(sta, sinfo, true);
699 } 699 }
700 700
701 mutex_unlock(&local->sta_mtx); 701 mutex_unlock(&local->sta_mtx);
@@ -724,7 +724,7 @@ static int ieee80211_get_station(struct wiphy *wiphy, struct net_device *dev,
724 sta = sta_info_get_bss(sdata, mac); 724 sta = sta_info_get_bss(sdata, mac);
725 if (sta) { 725 if (sta) {
726 ret = 0; 726 ret = 0;
727 sta_set_sinfo(sta, sinfo); 727 sta_set_sinfo(sta, sinfo, true);
728 } 728 }
729 729
730 mutex_unlock(&local->sta_mtx); 730 mutex_unlock(&local->sta_mtx);
@@ -2376,6 +2376,11 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
2376 (WIPHY_PARAM_RETRY_SHORT | WIPHY_PARAM_RETRY_LONG)) 2376 (WIPHY_PARAM_RETRY_SHORT | WIPHY_PARAM_RETRY_LONG))
2377 ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_RETRY_LIMITS); 2377 ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_RETRY_LIMITS);
2378 2378
2379 if (changed & (WIPHY_PARAM_TXQ_LIMIT |
2380 WIPHY_PARAM_TXQ_MEMORY_LIMIT |
2381 WIPHY_PARAM_TXQ_QUANTUM))
2382 ieee80211_txq_set_params(local);
2383
2379 return 0; 2384 return 0;
2380} 2385}
2381 2386
@@ -3705,6 +3710,99 @@ static int ieee80211_set_multicast_to_unicast(struct wiphy *wiphy,
3705 return 0; 3710 return 0;
3706} 3711}
3707 3712
3713void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats,
3714 struct txq_info *txqi)
3715{
3716 if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_BACKLOG_BYTES))) {
3717 txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_BYTES);
3718 txqstats->backlog_bytes = txqi->tin.backlog_bytes;
3719 }
3720
3721 if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS))) {
3722 txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS);
3723 txqstats->backlog_packets = txqi->tin.backlog_packets;
3724 }
3725
3726 if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_FLOWS))) {
3727 txqstats->filled |= BIT(NL80211_TXQ_STATS_FLOWS);
3728 txqstats->flows = txqi->tin.flows;
3729 }
3730
3731 if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_DROPS))) {
3732 txqstats->filled |= BIT(NL80211_TXQ_STATS_DROPS);
3733 txqstats->drops = txqi->cstats.drop_count;
3734 }
3735
3736 if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_ECN_MARKS))) {
3737 txqstats->filled |= BIT(NL80211_TXQ_STATS_ECN_MARKS);
3738 txqstats->ecn_marks = txqi->cstats.ecn_mark;
3739 }
3740
3741 if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_OVERLIMIT))) {
3742 txqstats->filled |= BIT(NL80211_TXQ_STATS_OVERLIMIT);
3743 txqstats->overlimit = txqi->tin.overlimit;
3744 }
3745
3746 if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_COLLISIONS))) {
3747 txqstats->filled |= BIT(NL80211_TXQ_STATS_COLLISIONS);
3748 txqstats->collisions = txqi->tin.collisions;
3749 }
3750
3751 if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_TX_BYTES))) {
3752 txqstats->filled |= BIT(NL80211_TXQ_STATS_TX_BYTES);
3753 txqstats->tx_bytes = txqi->tin.tx_bytes;
3754 }
3755
3756 if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_TX_PACKETS))) {
3757 txqstats->filled |= BIT(NL80211_TXQ_STATS_TX_PACKETS);
3758 txqstats->tx_packets = txqi->tin.tx_packets;
3759 }
3760}
3761
3762static int ieee80211_get_txq_stats(struct wiphy *wiphy,
3763 struct wireless_dev *wdev,
3764 struct cfg80211_txq_stats *txqstats)
3765{
3766 struct ieee80211_local *local = wiphy_priv(wiphy);
3767 struct ieee80211_sub_if_data *sdata;
3768 int ret = 0;
3769
3770 if (!local->ops->wake_tx_queue)
3771 return 1;
3772
3773 spin_lock_bh(&local->fq.lock);
3774 rcu_read_lock();
3775
3776 if (wdev) {
3777 sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
3778 if (!sdata->vif.txq) {
3779 ret = 1;
3780 goto out;
3781 }
3782 ieee80211_fill_txq_stats(txqstats, to_txq_info(sdata->vif.txq));
3783 } else {
3784 /* phy stats */
3785 txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS) |
3786 BIT(NL80211_TXQ_STATS_BACKLOG_BYTES) |
3787 BIT(NL80211_TXQ_STATS_OVERLIMIT) |
3788 BIT(NL80211_TXQ_STATS_OVERMEMORY) |
3789 BIT(NL80211_TXQ_STATS_COLLISIONS) |
3790 BIT(NL80211_TXQ_STATS_MAX_FLOWS);
3791 txqstats->backlog_packets = local->fq.backlog;
3792 txqstats->backlog_bytes = local->fq.memory_usage;
3793 txqstats->overlimit = local->fq.overlimit;
3794 txqstats->overmemory = local->fq.overmemory;
3795 txqstats->collisions = local->fq.collisions;
3796 txqstats->max_flows = local->fq.flows_cnt;
3797 }
3798
3799out:
3800 rcu_read_unlock();
3801 spin_unlock_bh(&local->fq.lock);
3802
3803 return ret;
3804}
3805
3708const struct cfg80211_ops mac80211_config_ops = { 3806const struct cfg80211_ops mac80211_config_ops = {
3709 .add_virtual_intf = ieee80211_add_iface, 3807 .add_virtual_intf = ieee80211_add_iface,
3710 .del_virtual_intf = ieee80211_del_iface, 3808 .del_virtual_intf = ieee80211_del_iface,
@@ -3798,4 +3896,5 @@ const struct cfg80211_ops mac80211_config_ops = {
3798 .del_nan_func = ieee80211_del_nan_func, 3896 .del_nan_func = ieee80211_del_nan_func,
3799 .set_multicast_to_unicast = ieee80211_set_multicast_to_unicast, 3897 .set_multicast_to_unicast = ieee80211_set_multicast_to_unicast,
3800 .tx_control_port = ieee80211_tx_control_port, 3898 .tx_control_port = ieee80211_tx_control_port,
3899 .get_txq_stats = ieee80211_get_txq_stats,
3801}; 3900};
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 4d82fe7d627c..8f6998091d26 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -2,6 +2,7 @@
2/* 2/*
3* Portions of this file 3* Portions of this file
4* Copyright(c) 2016 Intel Deutschland GmbH 4* Copyright(c) 2016 Intel Deutschland GmbH
5* Copyright (C) 2018 Intel Corporation
5*/ 6*/
6 7
7#ifndef __MAC80211_DRIVER_OPS 8#ifndef __MAC80211_DRIVER_OPS
@@ -813,7 +814,8 @@ drv_allow_buffered_frames(struct ieee80211_local *local,
813} 814}
814 815
815static inline void drv_mgd_prepare_tx(struct ieee80211_local *local, 816static inline void drv_mgd_prepare_tx(struct ieee80211_local *local,
816 struct ieee80211_sub_if_data *sdata) 817 struct ieee80211_sub_if_data *sdata,
818 u16 duration)
817{ 819{
818 might_sleep(); 820 might_sleep();
819 821
@@ -821,9 +823,9 @@ static inline void drv_mgd_prepare_tx(struct ieee80211_local *local,
821 return; 823 return;
822 WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); 824 WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION);
823 825
824 trace_drv_mgd_prepare_tx(local, sdata); 826 trace_drv_mgd_prepare_tx(local, sdata, duration);
825 if (local->ops->mgd_prepare_tx) 827 if (local->ops->mgd_prepare_tx)
826 local->ops->mgd_prepare_tx(&local->hw, &sdata->vif); 828 local->ops->mgd_prepare_tx(&local->hw, &sdata->vif, duration);
827 trace_drv_return_void(local); 829 trace_drv_return_void(local);
828} 830}
829 831
diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c
index 9cc986deda61..690c142a7a44 100644
--- a/net/mac80211/ethtool.c
+++ b/net/mac80211/ethtool.c
@@ -4,6 +4,7 @@
4 * Copied from cfg.c - originally 4 * Copied from cfg.c - originally
5 * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> 5 * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
6 * Copyright 2014 Intel Corporation (Author: Johannes Berg) 6 * Copyright 2014 Intel Corporation (Author: Johannes Berg)
7 * Copyright (C) 2018 Intel Corporation
7 * 8 *
8 * This file is GPLv2 as found in COPYING. 9 * This file is GPLv2 as found in COPYING.
9 */ 10 */
@@ -106,8 +107,8 @@ static void ieee80211_get_stats(struct net_device *dev,
106 if (!(sta && !WARN_ON(sta->sdata->dev != dev))) 107 if (!(sta && !WARN_ON(sta->sdata->dev != dev)))
107 goto do_survey; 108 goto do_survey;
108 109
109 sinfo.filled = 0; 110 memset(&sinfo, 0, sizeof(sinfo));
110 sta_set_sinfo(sta, &sinfo); 111 sta_set_sinfo(sta, &sinfo, false);
111 112
112 i = 0; 113 i = 0;
113 ADD_STA_STATS(sta); 114 ADD_STA_STATS(sta);
@@ -116,11 +117,11 @@ static void ieee80211_get_stats(struct net_device *dev,
116 117
117 118
118 if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) 119 if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE))
119 data[i] = 100000 * 120 data[i] = 100000ULL *
120 cfg80211_calculate_bitrate(&sinfo.txrate); 121 cfg80211_calculate_bitrate(&sinfo.txrate);
121 i++; 122 i++;
122 if (sinfo.filled & BIT(NL80211_STA_INFO_RX_BITRATE)) 123 if (sinfo.filled & BIT(NL80211_STA_INFO_RX_BITRATE))
123 data[i] = 100000 * 124 data[i] = 100000ULL *
124 cfg80211_calculate_bitrate(&sinfo.rxrate); 125 cfg80211_calculate_bitrate(&sinfo.rxrate);
125 i++; 126 i++;
126 127
@@ -133,8 +134,8 @@ static void ieee80211_get_stats(struct net_device *dev,
133 if (sta->sdata->dev != dev) 134 if (sta->sdata->dev != dev)
134 continue; 135 continue;
135 136
136 sinfo.filled = 0; 137 memset(&sinfo, 0, sizeof(sinfo));
137 sta_set_sinfo(sta, &sinfo); 138 sta_set_sinfo(sta, &sinfo, false);
138 i = 0; 139 i = 0;
139 ADD_STA_STATS(sta); 140 ADD_STA_STATS(sta);
140 } 141 }
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index c78036a0ac94..26a7ba3b698f 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -301,26 +301,27 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta,
301 ___ieee80211_stop_tx_ba_session(sta, i, reason); 301 ___ieee80211_stop_tx_ba_session(sta, i, reason);
302 mutex_unlock(&sta->ampdu_mlme.mtx); 302 mutex_unlock(&sta->ampdu_mlme.mtx);
303 303
304 /* stopping might queue the work again - so cancel only afterwards */
305 cancel_work_sync(&sta->ampdu_mlme.work);
306
307 /* 304 /*
308 * In case the tear down is part of a reconfigure due to HW restart 305 * In case the tear down is part of a reconfigure due to HW restart
309 * request, it is possible that the low level driver requested to stop 306 * request, it is possible that the low level driver requested to stop
310 * the BA session, so handle it to properly clean tid_tx data. 307 * the BA session, so handle it to properly clean tid_tx data.
311 */ 308 */
312 mutex_lock(&sta->ampdu_mlme.mtx); 309 if(reason == AGG_STOP_DESTROY_STA) {
313 for (i = 0; i < IEEE80211_NUM_TIDS; i++) { 310 cancel_work_sync(&sta->ampdu_mlme.work);
314 struct tid_ampdu_tx *tid_tx =
315 rcu_dereference_protected_tid_tx(sta, i);
316 311
317 if (!tid_tx) 312 mutex_lock(&sta->ampdu_mlme.mtx);
318 continue; 313 for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
314 struct tid_ampdu_tx *tid_tx =
315 rcu_dereference_protected_tid_tx(sta, i);
319 316
320 if (test_and_clear_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state)) 317 if (!tid_tx)
321 ieee80211_stop_tx_ba_cb(sta, i, tid_tx); 318 continue;
319
320 if (test_and_clear_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state))
321 ieee80211_stop_tx_ba_cb(sta, i, tid_tx);
322 }
323 mutex_unlock(&sta->ampdu_mlme.mtx);
322 } 324 }
323 mutex_unlock(&sta->ampdu_mlme.mtx);
324} 325}
325 326
326void ieee80211_ba_session_work(struct work_struct *work) 327void ieee80211_ba_session_work(struct work_struct *work)
@@ -328,16 +329,11 @@ void ieee80211_ba_session_work(struct work_struct *work)
328 struct sta_info *sta = 329 struct sta_info *sta =
329 container_of(work, struct sta_info, ampdu_mlme.work); 330 container_of(work, struct sta_info, ampdu_mlme.work);
330 struct tid_ampdu_tx *tid_tx; 331 struct tid_ampdu_tx *tid_tx;
332 bool blocked;
331 int tid; 333 int tid;
332 334
333 /* 335 /* When this flag is set, new sessions should be blocked. */
334 * When this flag is set, new sessions should be 336 blocked = test_sta_flag(sta, WLAN_STA_BLOCK_BA);
335 * blocked, and existing sessions will be torn
336 * down by the code that set the flag, so this
337 * need not run.
338 */
339 if (test_sta_flag(sta, WLAN_STA_BLOCK_BA))
340 return;
341 337
342 mutex_lock(&sta->ampdu_mlme.mtx); 338 mutex_lock(&sta->ampdu_mlme.mtx);
343 for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) { 339 for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) {
@@ -352,7 +348,8 @@ void ieee80211_ba_session_work(struct work_struct *work)
352 sta, tid, WLAN_BACK_RECIPIENT, 348 sta, tid, WLAN_BACK_RECIPIENT,
353 WLAN_REASON_UNSPECIFIED, true); 349 WLAN_REASON_UNSPECIFIED, true);
354 350
355 if (test_and_clear_bit(tid, 351 if (!blocked &&
352 test_and_clear_bit(tid,
356 sta->ampdu_mlme.tid_rx_manage_offl)) 353 sta->ampdu_mlme.tid_rx_manage_offl))
357 ___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, 354 ___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid,
358 IEEE80211_MAX_AMPDU_BUF, 355 IEEE80211_MAX_AMPDU_BUF,
@@ -367,7 +364,7 @@ void ieee80211_ba_session_work(struct work_struct *work)
367 spin_lock_bh(&sta->lock); 364 spin_lock_bh(&sta->lock);
368 365
369 tid_tx = sta->ampdu_mlme.tid_start_tx[tid]; 366 tid_tx = sta->ampdu_mlme.tid_start_tx[tid];
370 if (tid_tx) { 367 if (!blocked && tid_tx) {
371 /* 368 /*
372 * Assign it over to the normal tid_tx array 369 * Assign it over to the normal tid_tx array
373 * where it "goes live". 370 * where it "goes live".
@@ -390,7 +387,8 @@ void ieee80211_ba_session_work(struct work_struct *work)
390 if (!tid_tx) 387 if (!tid_tx)
391 continue; 388 continue;
392 389
393 if (test_and_clear_bit(HT_AGG_STATE_START_CB, &tid_tx->state)) 390 if (!blocked &&
391 test_and_clear_bit(HT_AGG_STATE_START_CB, &tid_tx->state))
394 ieee80211_start_tx_ba_cb(sta, tid, tid_tx); 392 ieee80211_start_tx_ba_cb(sta, tid, tid_tx);
395 if (test_and_clear_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state)) 393 if (test_and_clear_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state))
396 ___ieee80211_stop_tx_ba_session(sta, tid, 394 ___ieee80211_stop_tx_ba_session(sta, tid,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 6372dbdadf53..d1978aa1c15d 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2012,6 +2012,7 @@ static inline bool ieee80211_can_run_worker(struct ieee80211_local *local)
2012} 2012}
2013 2013
2014int ieee80211_txq_setup_flows(struct ieee80211_local *local); 2014int ieee80211_txq_setup_flows(struct ieee80211_local *local);
2015void ieee80211_txq_set_params(struct ieee80211_local *local);
2015void ieee80211_txq_teardown_flows(struct ieee80211_local *local); 2016void ieee80211_txq_teardown_flows(struct ieee80211_local *local);
2016void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, 2017void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
2017 struct sta_info *sta, 2018 struct sta_info *sta,
@@ -2020,6 +2021,8 @@ void ieee80211_txq_purge(struct ieee80211_local *local,
2020 struct txq_info *txqi); 2021 struct txq_info *txqi);
2021void ieee80211_txq_remove_vlan(struct ieee80211_local *local, 2022void ieee80211_txq_remove_vlan(struct ieee80211_local *local,
2022 struct ieee80211_sub_if_data *sdata); 2023 struct ieee80211_sub_if_data *sdata);
2024void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats,
2025 struct txq_info *txqi);
2023void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, 2026void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
2024 u16 transaction, u16 auth_alg, u16 status, 2027 u16 transaction, u16 auth_alg, u16 status,
2025 const u8 *extra, size_t extra_len, const u8 *bssid, 2028 const u8 *extra, size_t extra_len, const u8 *bssid,
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 9ea17afaa237..4d2e797e3f16 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -565,6 +565,9 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
565 if (!ops->set_key) 565 if (!ops->set_key)
566 wiphy->flags |= WIPHY_FLAG_IBSS_RSN; 566 wiphy->flags |= WIPHY_FLAG_IBSS_RSN;
567 567
568 if (ops->wake_tx_queue)
569 wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_TXQS);
570
568 wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_RRM); 571 wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_RRM);
569 572
570 wiphy->bss_priv_size = sizeof(struct ieee80211_bss); 573 wiphy->bss_priv_size = sizeof(struct ieee80211_bss);
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 233068756502..a59187c016e0 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -864,7 +864,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
864 return; 864 return;
865 } 865 }
866 866
867 drv_mgd_prepare_tx(local, sdata); 867 drv_mgd_prepare_tx(local, sdata, 0);
868 868
869 IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; 869 IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
870 if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) 870 if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
@@ -2022,7 +2022,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
2022 */ 2022 */
2023 if (ieee80211_hw_check(&local->hw, DEAUTH_NEED_MGD_TX_PREP) && 2023 if (ieee80211_hw_check(&local->hw, DEAUTH_NEED_MGD_TX_PREP) &&
2024 !ifmgd->have_beacon) 2024 !ifmgd->have_beacon)
2025 drv_mgd_prepare_tx(sdata->local, sdata); 2025 drv_mgd_prepare_tx(sdata->local, sdata, 0);
2026 2026
2027 ieee80211_send_deauth_disassoc(sdata, ifmgd->bssid, stype, 2027 ieee80211_send_deauth_disassoc(sdata, ifmgd->bssid, stype,
2028 reason, tx, frame_buf); 2028 reason, tx, frame_buf);
@@ -2560,7 +2560,7 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
2560 if (!elems.challenge) 2560 if (!elems.challenge)
2561 return; 2561 return;
2562 auth_data->expected_transaction = 4; 2562 auth_data->expected_transaction = 4;
2563 drv_mgd_prepare_tx(sdata->local, sdata); 2563 drv_mgd_prepare_tx(sdata->local, sdata, 0);
2564 if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) 2564 if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
2565 tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | 2565 tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS |
2566 IEEE80211_TX_INTFL_MLME_CONN_TX; 2566 IEEE80211_TX_INTFL_MLME_CONN_TX;
@@ -3769,6 +3769,7 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata)
3769 u32 tx_flags = 0; 3769 u32 tx_flags = 0;
3770 u16 trans = 1; 3770 u16 trans = 1;
3771 u16 status = 0; 3771 u16 status = 0;
3772 u16 prepare_tx_duration = 0;
3772 3773
3773 sdata_assert_lock(sdata); 3774 sdata_assert_lock(sdata);
3774 3775
@@ -3790,7 +3791,11 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata)
3790 return -ETIMEDOUT; 3791 return -ETIMEDOUT;
3791 } 3792 }
3792 3793
3793 drv_mgd_prepare_tx(local, sdata); 3794 if (auth_data->algorithm == WLAN_AUTH_SAE)
3795 prepare_tx_duration =
3796 jiffies_to_msecs(IEEE80211_AUTH_TIMEOUT_SAE);
3797
3798 drv_mgd_prepare_tx(local, sdata, prepare_tx_duration);
3794 3799
3795 sdata_info(sdata, "send auth to %pM (try %d/%d)\n", 3800 sdata_info(sdata, "send auth to %pM (try %d/%d)\n",
3796 auth_data->bss->bssid, auth_data->tries, 3801 auth_data->bss->bssid, auth_data->tries,
@@ -4994,7 +4999,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
4994 req->bssid, req->reason_code, 4999 req->bssid, req->reason_code,
4995 ieee80211_get_reason_code_string(req->reason_code)); 5000 ieee80211_get_reason_code_string(req->reason_code));
4996 5001
4997 drv_mgd_prepare_tx(sdata->local, sdata); 5002 drv_mgd_prepare_tx(sdata->local, sdata, 0);
4998 ieee80211_send_deauth_disassoc(sdata, req->bssid, 5003 ieee80211_send_deauth_disassoc(sdata, req->bssid,
4999 IEEE80211_STYPE_DEAUTH, 5004 IEEE80211_STYPE_DEAUTH,
5000 req->reason_code, tx, 5005 req->reason_code, tx,
@@ -5014,7 +5019,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
5014 req->bssid, req->reason_code, 5019 req->bssid, req->reason_code,
5015 ieee80211_get_reason_code_string(req->reason_code)); 5020 ieee80211_get_reason_code_string(req->reason_code));
5016 5021
5017 drv_mgd_prepare_tx(sdata->local, sdata); 5022 drv_mgd_prepare_tx(sdata->local, sdata, 0);
5018 ieee80211_send_deauth_disassoc(sdata, req->bssid, 5023 ieee80211_send_deauth_disassoc(sdata, req->bssid,
5019 IEEE80211_STYPE_DEAUTH, 5024 IEEE80211_STYPE_DEAUTH,
5020 req->reason_code, tx, 5025 req->reason_code, tx,
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 03102aff0953..0a38cc1cbebc 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -5,6 +5,7 @@
5 * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> 5 * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net>
6 * Copyright 2013-2014 Intel Mobile Communications GmbH 6 * Copyright 2013-2014 Intel Mobile Communications GmbH
7 * Copyright(c) 2015 - 2017 Intel Deutschland GmbH 7 * Copyright(c) 2015 - 2017 Intel Deutschland GmbH
8 * Copyright (C) 2018 Intel Corporation
8 * 9 *
9 * This program is free software; you can redistribute it and/or modify 10 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as 11 * it under the terms of the GNU General Public License version 2 as
@@ -97,27 +98,27 @@ static u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
97 */ 98 */
98static void remove_monitor_info(struct sk_buff *skb, 99static void remove_monitor_info(struct sk_buff *skb,
99 unsigned int present_fcs_len, 100 unsigned int present_fcs_len,
100 unsigned int rtap_vendor_space) 101 unsigned int rtap_space)
101{ 102{
102 if (present_fcs_len) 103 if (present_fcs_len)
103 __pskb_trim(skb, skb->len - present_fcs_len); 104 __pskb_trim(skb, skb->len - present_fcs_len);
104 __pskb_pull(skb, rtap_vendor_space); 105 __pskb_pull(skb, rtap_space);
105} 106}
106 107
107static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len, 108static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len,
108 unsigned int rtap_vendor_space) 109 unsigned int rtap_space)
109{ 110{
110 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); 111 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
111 struct ieee80211_hdr *hdr; 112 struct ieee80211_hdr *hdr;
112 113
113 hdr = (void *)(skb->data + rtap_vendor_space); 114 hdr = (void *)(skb->data + rtap_space);
114 115
115 if (status->flag & (RX_FLAG_FAILED_FCS_CRC | 116 if (status->flag & (RX_FLAG_FAILED_FCS_CRC |
116 RX_FLAG_FAILED_PLCP_CRC | 117 RX_FLAG_FAILED_PLCP_CRC |
117 RX_FLAG_ONLY_MONITOR)) 118 RX_FLAG_ONLY_MONITOR))
118 return true; 119 return true;
119 120
120 if (unlikely(skb->len < 16 + present_fcs_len + rtap_vendor_space)) 121 if (unlikely(skb->len < 16 + present_fcs_len + rtap_space))
121 return true; 122 return true;
122 123
123 if (ieee80211_is_ctl(hdr->frame_control) && 124 if (ieee80211_is_ctl(hdr->frame_control) &&
@@ -199,7 +200,7 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
199 200
200static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata, 201static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata,
201 struct sk_buff *skb, 202 struct sk_buff *skb,
202 int rtap_vendor_space) 203 int rtap_space)
203{ 204{
204 struct { 205 struct {
205 struct ieee80211_hdr_3addr hdr; 206 struct ieee80211_hdr_3addr hdr;
@@ -212,14 +213,14 @@ static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata,
212 213
213 BUILD_BUG_ON(sizeof(action) != IEEE80211_MIN_ACTION_SIZE + 1); 214 BUILD_BUG_ON(sizeof(action) != IEEE80211_MIN_ACTION_SIZE + 1);
214 215
215 if (skb->len < rtap_vendor_space + sizeof(action) + 216 if (skb->len < rtap_space + sizeof(action) +
216 VHT_MUMIMO_GROUPS_DATA_LEN) 217 VHT_MUMIMO_GROUPS_DATA_LEN)
217 return; 218 return;
218 219
219 if (!is_valid_ether_addr(sdata->u.mntr.mu_follow_addr)) 220 if (!is_valid_ether_addr(sdata->u.mntr.mu_follow_addr))
220 return; 221 return;
221 222
222 skb_copy_bits(skb, rtap_vendor_space, &action, sizeof(action)); 223 skb_copy_bits(skb, rtap_space, &action, sizeof(action));
223 224
224 if (!ieee80211_is_action(action.hdr.frame_control)) 225 if (!ieee80211_is_action(action.hdr.frame_control))
225 return; 226 return;
@@ -545,7 +546,7 @@ static struct sk_buff *
545ieee80211_make_monitor_skb(struct ieee80211_local *local, 546ieee80211_make_monitor_skb(struct ieee80211_local *local,
546 struct sk_buff **origskb, 547 struct sk_buff **origskb,
547 struct ieee80211_rate *rate, 548 struct ieee80211_rate *rate,
548 int rtap_vendor_space, bool use_origskb) 549 int rtap_space, bool use_origskb)
549{ 550{
550 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(*origskb); 551 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(*origskb);
551 int rt_hdrlen, needed_headroom; 552 int rt_hdrlen, needed_headroom;
@@ -553,7 +554,7 @@ ieee80211_make_monitor_skb(struct ieee80211_local *local,
553 554
554 /* room for the radiotap header based on driver features */ 555 /* room for the radiotap header based on driver features */
555 rt_hdrlen = ieee80211_rx_radiotap_hdrlen(local, status, *origskb); 556 rt_hdrlen = ieee80211_rx_radiotap_hdrlen(local, status, *origskb);
556 needed_headroom = rt_hdrlen - rtap_vendor_space; 557 needed_headroom = rt_hdrlen - rtap_space;
557 558
558 if (use_origskb) { 559 if (use_origskb) {
559 /* only need to expand headroom if necessary */ 560 /* only need to expand headroom if necessary */
@@ -607,7 +608,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
607 struct ieee80211_sub_if_data *sdata; 608 struct ieee80211_sub_if_data *sdata;
608 struct sk_buff *monskb = NULL; 609 struct sk_buff *monskb = NULL;
609 int present_fcs_len = 0; 610 int present_fcs_len = 0;
610 unsigned int rtap_vendor_space = 0; 611 unsigned int rtap_space = 0;
611 struct ieee80211_sub_if_data *monitor_sdata = 612 struct ieee80211_sub_if_data *monitor_sdata =
612 rcu_dereference(local->monitor_sdata); 613 rcu_dereference(local->monitor_sdata);
613 bool only_monitor = false; 614 bool only_monitor = false;
@@ -615,7 +616,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
615 if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) { 616 if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) {
616 struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data; 617 struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data;
617 618
618 rtap_vendor_space = sizeof(*rtap) + rtap->len + rtap->pad; 619 rtap_space += sizeof(*rtap) + rtap->len + rtap->pad;
619 } 620 }
620 621
621 /* 622 /*
@@ -638,13 +639,12 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
638 } 639 }
639 640
640 /* ensure hdr->frame_control and vendor radiotap data are in skb head */ 641 /* ensure hdr->frame_control and vendor radiotap data are in skb head */
641 if (!pskb_may_pull(origskb, 2 + rtap_vendor_space)) { 642 if (!pskb_may_pull(origskb, 2 + rtap_space)) {
642 dev_kfree_skb(origskb); 643 dev_kfree_skb(origskb);
643 return NULL; 644 return NULL;
644 } 645 }
645 646
646 only_monitor = should_drop_frame(origskb, present_fcs_len, 647 only_monitor = should_drop_frame(origskb, present_fcs_len, rtap_space);
647 rtap_vendor_space);
648 648
649 if (!local->monitors || (status->flag & RX_FLAG_SKIP_MONITOR)) { 649 if (!local->monitors || (status->flag & RX_FLAG_SKIP_MONITOR)) {
650 if (only_monitor) { 650 if (only_monitor) {
@@ -652,12 +652,11 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
652 return NULL; 652 return NULL;
653 } 653 }
654 654
655 remove_monitor_info(origskb, present_fcs_len, 655 remove_monitor_info(origskb, present_fcs_len, rtap_space);
656 rtap_vendor_space);
657 return origskb; 656 return origskb;
658 } 657 }
659 658
660 ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_vendor_space); 659 ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_space);
661 660
662 list_for_each_entry_rcu(sdata, &local->mon_list, u.mntr.list) { 661 list_for_each_entry_rcu(sdata, &local->mon_list, u.mntr.list) {
663 bool last_monitor = list_is_last(&sdata->u.mntr.list, 662 bool last_monitor = list_is_last(&sdata->u.mntr.list,
@@ -665,8 +664,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
665 664
666 if (!monskb) 665 if (!monskb)
667 monskb = ieee80211_make_monitor_skb(local, &origskb, 666 monskb = ieee80211_make_monitor_skb(local, &origskb,
668 rate, 667 rate, rtap_space,
669 rtap_vendor_space,
670 only_monitor && 668 only_monitor &&
671 last_monitor); 669 last_monitor);
672 670
@@ -698,7 +696,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
698 if (!origskb) 696 if (!origskb)
699 return NULL; 697 return NULL;
700 698
701 remove_monitor_info(origskb, present_fcs_len, rtap_vendor_space); 699 remove_monitor_info(origskb, present_fcs_len, rtap_space);
702 return origskb; 700 return origskb;
703} 701}
704 702
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 655c3d8b0d80..6428f1ac37b6 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -3,6 +3,7 @@
3 * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> 3 * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
4 * Copyright 2013-2014 Intel Mobile Communications GmbH 4 * Copyright 2013-2014 Intel Mobile Communications GmbH
5 * Copyright (C) 2015 - 2017 Intel Deutschland GmbH 5 * Copyright (C) 2015 - 2017 Intel Deutschland GmbH
6 * Copyright (C) 2018 Intel Corporation
6 * 7 *
7 * This program is free software; you can redistribute it and/or modify 8 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
@@ -357,6 +358,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
357 358
358 sta->last_connected = ktime_get_seconds(); 359 sta->last_connected = ktime_get_seconds();
359 ewma_signal_init(&sta->rx_stats_avg.signal); 360 ewma_signal_init(&sta->rx_stats_avg.signal);
361 ewma_avg_signal_init(&sta->status_stats.avg_ack_signal);
360 for (i = 0; i < ARRAY_SIZE(sta->rx_stats_avg.chain_signal); i++) 362 for (i = 0; i < ARRAY_SIZE(sta->rx_stats_avg.chain_signal); i++)
361 ewma_signal_init(&sta->rx_stats_avg.chain_signal[i]); 363 ewma_signal_init(&sta->rx_stats_avg.chain_signal[i]);
362 364
@@ -1006,7 +1008,7 @@ static void __sta_info_destroy_part2(struct sta_info *sta)
1006 1008
1007 sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL); 1009 sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
1008 if (sinfo) 1010 if (sinfo)
1009 sta_set_sinfo(sta, sinfo); 1011 sta_set_sinfo(sta, sinfo, true);
1010 cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); 1012 cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL);
1011 kfree(sinfo); 1013 kfree(sinfo);
1012 1014
@@ -1992,7 +1994,6 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate,
1992 int band = STA_STATS_GET(LEGACY_BAND, rate); 1994 int band = STA_STATS_GET(LEGACY_BAND, rate);
1993 int rate_idx = STA_STATS_GET(LEGACY_IDX, rate); 1995 int rate_idx = STA_STATS_GET(LEGACY_IDX, rate);
1994 1996
1995 rinfo->flags = 0;
1996 sband = local->hw.wiphy->bands[band]; 1997 sband = local->hw.wiphy->bands[band];
1997 brate = sband->bitrates[rate_idx].bitrate; 1998 brate = sband->bitrates[rate_idx].bitrate;
1998 if (rinfo->bw == RATE_INFO_BW_5) 1999 if (rinfo->bw == RATE_INFO_BW_5)
@@ -2051,6 +2052,18 @@ static void sta_set_tidstats(struct sta_info *sta,
2051 tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED); 2052 tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED);
2052 tidstats->tx_msdu_failed = sta->status_stats.msdu_failed[tid]; 2053 tidstats->tx_msdu_failed = sta->status_stats.msdu_failed[tid];
2053 } 2054 }
2055
2056 if (local->ops->wake_tx_queue && tid < IEEE80211_NUM_TIDS) {
2057 spin_lock_bh(&local->fq.lock);
2058 rcu_read_lock();
2059
2060 tidstats->filled |= BIT(NL80211_TID_STATS_TXQ_STATS);
2061 ieee80211_fill_txq_stats(&tidstats->txq_stats,
2062 to_txq_info(sta->sta.txq[tid]));
2063
2064 rcu_read_unlock();
2065 spin_unlock_bh(&local->fq.lock);
2066 }
2054} 2067}
2055 2068
2056static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) 2069static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats)
@@ -2066,7 +2079,8 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats)
2066 return value; 2079 return value;
2067} 2080}
2068 2081
2069void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) 2082void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
2083 bool tidstats)
2070{ 2084{
2071 struct ieee80211_sub_if_data *sdata = sta->sdata; 2085 struct ieee80211_sub_if_data *sdata = sta->sdata;
2072 struct ieee80211_local *local = sdata->local; 2086 struct ieee80211_local *local = sdata->local;
@@ -2220,11 +2234,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
2220 sinfo->filled |= BIT(NL80211_STA_INFO_RX_BITRATE); 2234 sinfo->filled |= BIT(NL80211_STA_INFO_RX_BITRATE);
2221 } 2235 }
2222 2236
2223 sinfo->filled |= BIT(NL80211_STA_INFO_TID_STATS); 2237 if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) {
2224 for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) { 2238 for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) {
2225 struct cfg80211_tid_stats *tidstats = &sinfo->pertid[i]; 2239 struct cfg80211_tid_stats *tidstats = &sinfo->pertid[i];
2226 2240
2227 sta_set_tidstats(sta, tidstats, i); 2241 sta_set_tidstats(sta, tidstats, i);
2242 }
2228 } 2243 }
2229 2244
2230 if (ieee80211_vif_is_mesh(&sdata->vif)) { 2245 if (ieee80211_vif_is_mesh(&sdata->vif)) {
@@ -2294,6 +2309,15 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
2294 sinfo->ack_signal = sta->status_stats.last_ack_signal; 2309 sinfo->ack_signal = sta->status_stats.last_ack_signal;
2295 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL); 2310 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL);
2296 } 2311 }
2312
2313 if (ieee80211_hw_check(&sta->local->hw, REPORTS_TX_ACK_STATUS) &&
2314 !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG))) {
2315 sinfo->avg_ack_signal =
2316 -(s8)ewma_avg_signal_read(
2317 &sta->status_stats.avg_ack_signal);
2318 sinfo->filled |=
2319 BIT_ULL(NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG);
2320 }
2297} 2321}
2298 2322
2299u32 sta_get_expected_throughput(struct sta_info *sta) 2323u32 sta_get_expected_throughput(struct sta_info *sta)
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index f64eb86ca64b..81b35f623792 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -119,6 +119,7 @@ enum ieee80211_sta_info_flags {
119#define HT_AGG_STATE_START_CB 6 119#define HT_AGG_STATE_START_CB 6
120#define HT_AGG_STATE_STOP_CB 7 120#define HT_AGG_STATE_STOP_CB 7
121 121
122DECLARE_EWMA(avg_signal, 10, 8)
122enum ieee80211_agg_stop_reason { 123enum ieee80211_agg_stop_reason {
123 AGG_STOP_DECLINED, 124 AGG_STOP_DECLINED,
124 AGG_STOP_LOCAL_REQUEST, 125 AGG_STOP_LOCAL_REQUEST,
@@ -550,6 +551,7 @@ struct sta_info {
550 unsigned long last_ack; 551 unsigned long last_ack;
551 s8 last_ack_signal; 552 s8 last_ack_signal;
552 bool ack_signal_filled; 553 bool ack_signal_filled;
554 struct ewma_avg_signal avg_ack_signal;
553 } status_stats; 555 } status_stats;
554 556
555 /* Updated from TX path only, no locking requirements */ 557 /* Updated from TX path only, no locking requirements */
@@ -742,7 +744,8 @@ static inline int sta_info_flush(struct ieee80211_sub_if_data *sdata)
742void sta_set_rate_info_tx(struct sta_info *sta, 744void sta_set_rate_info_tx(struct sta_info *sta,
743 const struct ieee80211_tx_rate *rate, 745 const struct ieee80211_tx_rate *rate,
744 struct rate_info *rinfo); 746 struct rate_info *rinfo);
745void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo); 747void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
748 bool tidstats);
746 749
747u32 sta_get_expected_throughput(struct sta_info *sta); 750u32 sta_get_expected_throughput(struct sta_info *sta);
748 751
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 743e89c5926c..9a6d7208bf4f 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -195,6 +195,8 @@ static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb)
195 sta->status_stats.last_ack_signal = 195 sta->status_stats.last_ack_signal =
196 (s8)txinfo->status.ack_signal; 196 (s8)txinfo->status.ack_signal;
197 sta->status_stats.ack_signal_filled = true; 197 sta->status_stats.ack_signal_filled = true;
198 ewma_avg_signal_add(&sta->status_stats.avg_ack_signal,
199 -txinfo->status.ack_signal);
198 } 200 }
199 } 201 }
200 202
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 591ad02e1fa4..80a7edf8d314 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -2,6 +2,7 @@
2/* 2/*
3* Portions of this file 3* Portions of this file
4* Copyright(c) 2016 Intel Deutschland GmbH 4* Copyright(c) 2016 Intel Deutschland GmbH
5* Copyright (C) 2018 Intel Corporation
5*/ 6*/
6 7
7#if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ) 8#if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ)
@@ -1413,11 +1414,29 @@ DEFINE_EVENT(release_evt, drv_allow_buffered_frames,
1413 TP_ARGS(local, sta, tids, num_frames, reason, more_data) 1414 TP_ARGS(local, sta, tids, num_frames, reason, more_data)
1414); 1415);
1415 1416
1416DEFINE_EVENT(local_sdata_evt, drv_mgd_prepare_tx, 1417TRACE_EVENT(drv_mgd_prepare_tx,
1417 TP_PROTO(struct ieee80211_local *local, 1418 TP_PROTO(struct ieee80211_local *local,
1418 struct ieee80211_sub_if_data *sdata), 1419 struct ieee80211_sub_if_data *sdata,
1420 u16 duration),
1419 1421
1420 TP_ARGS(local, sdata) 1422 TP_ARGS(local, sdata, duration),
1423
1424 TP_STRUCT__entry(
1425 LOCAL_ENTRY
1426 VIF_ENTRY
1427 __field(u32, duration)
1428 ),
1429
1430 TP_fast_assign(
1431 LOCAL_ASSIGN;
1432 VIF_ASSIGN;
1433 __entry->duration = duration;
1434 ),
1435
1436 TP_printk(
1437 LOCAL_PR_FMT VIF_PR_FMT " duration: %u",
1438 LOCAL_PR_ARG, VIF_PR_ARG, __entry->duration
1439 )
1421); 1440);
1422 1441
1423DEFINE_EVENT(local_sdata_evt, drv_mgd_protect_tdls_discover, 1442DEFINE_EVENT(local_sdata_evt, drv_mgd_protect_tdls_discover,
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 05a265cd573d..44b5dfe8727d 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1460,6 +1460,24 @@ void ieee80211_txq_purge(struct ieee80211_local *local,
1460 ieee80211_purge_tx_queue(&local->hw, &txqi->frags); 1460 ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
1461} 1461}
1462 1462
1463void ieee80211_txq_set_params(struct ieee80211_local *local)
1464{
1465 if (local->hw.wiphy->txq_limit)
1466 local->fq.limit = local->hw.wiphy->txq_limit;
1467 else
1468 local->hw.wiphy->txq_limit = local->fq.limit;
1469
1470 if (local->hw.wiphy->txq_memory_limit)
1471 local->fq.memory_limit = local->hw.wiphy->txq_memory_limit;
1472 else
1473 local->hw.wiphy->txq_memory_limit = local->fq.memory_limit;
1474
1475 if (local->hw.wiphy->txq_quantum)
1476 local->fq.quantum = local->hw.wiphy->txq_quantum;
1477 else
1478 local->hw.wiphy->txq_quantum = local->fq.quantum;
1479}
1480
1463int ieee80211_txq_setup_flows(struct ieee80211_local *local) 1481int ieee80211_txq_setup_flows(struct ieee80211_local *local)
1464{ 1482{
1465 struct fq *fq = &local->fq; 1483 struct fq *fq = &local->fq;
@@ -1509,6 +1527,8 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local)
1509 for (i = 0; i < fq->flows_cnt; i++) 1527 for (i = 0; i < fq->flows_cnt; i++)
1510 codel_vars_init(&local->cvars[i]); 1528 codel_vars_init(&local->cvars[i]);
1511 1529
1530 ieee80211_txq_set_params(local);
1531
1512 return 0; 1532 return 0;
1513} 1533}
1514 1534
@@ -4085,6 +4105,31 @@ unlock:
4085} 4105}
4086EXPORT_SYMBOL(ieee80211_csa_update_counter); 4106EXPORT_SYMBOL(ieee80211_csa_update_counter);
4087 4107
4108void ieee80211_csa_set_counter(struct ieee80211_vif *vif, u8 counter)
4109{
4110 struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
4111 struct beacon_data *beacon = NULL;
4112
4113 rcu_read_lock();
4114
4115 if (sdata->vif.type == NL80211_IFTYPE_AP)
4116 beacon = rcu_dereference(sdata->u.ap.beacon);
4117 else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
4118 beacon = rcu_dereference(sdata->u.ibss.presp);
4119 else if (ieee80211_vif_is_mesh(&sdata->vif))
4120 beacon = rcu_dereference(sdata->u.mesh.beacon);
4121
4122 if (!beacon)
4123 goto unlock;
4124
4125 if (counter < beacon->csa_current_counter)
4126 beacon->csa_current_counter = counter;
4127
4128unlock:
4129 rcu_read_unlock();
4130}
4131EXPORT_SYMBOL(ieee80211_csa_set_counter);
4132
4088bool ieee80211_csa_is_complete(struct ieee80211_vif *vif) 4133bool ieee80211_csa_is_complete(struct ieee80211_vif *vif)
4089{ 4134{
4090 struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); 4135 struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 11f9cfc016d9..2d82c88efd0b 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2793,12 +2793,13 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
2793 2793
2794 memset(&ri, 0, sizeof(ri)); 2794 memset(&ri, 0, sizeof(ri));
2795 2795
2796 ri.bw = status->bw;
2797
2796 /* Fill cfg80211 rate info */ 2798 /* Fill cfg80211 rate info */
2797 switch (status->encoding) { 2799 switch (status->encoding) {
2798 case RX_ENC_HT: 2800 case RX_ENC_HT:
2799 ri.mcs = status->rate_idx; 2801 ri.mcs = status->rate_idx;
2800 ri.flags |= RATE_INFO_FLAGS_MCS; 2802 ri.flags |= RATE_INFO_FLAGS_MCS;
2801 ri.bw = status->bw;
2802 if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) 2803 if (status->enc_flags & RX_ENC_FLAG_SHORT_GI)
2803 ri.flags |= RATE_INFO_FLAGS_SHORT_GI; 2804 ri.flags |= RATE_INFO_FLAGS_SHORT_GI;
2804 break; 2805 break;
@@ -2806,7 +2807,6 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
2806 ri.flags |= RATE_INFO_FLAGS_VHT_MCS; 2807 ri.flags |= RATE_INFO_FLAGS_VHT_MCS;
2807 ri.mcs = status->rate_idx; 2808 ri.mcs = status->rate_idx;
2808 ri.nss = status->nss; 2809 ri.nss = status->nss;
2809 ri.bw = status->bw;
2810 if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) 2810 if (status->enc_flags & RX_ENC_FLAG_SHORT_GI)
2811 ri.flags |= RATE_INFO_FLAGS_SHORT_GI; 2811 ri.flags |= RATE_INFO_FLAGS_SHORT_GI;
2812 break; 2812 break;
@@ -2818,8 +2818,6 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
2818 int shift = 0; 2818 int shift = 0;
2819 int bitrate; 2819 int bitrate;
2820 2820
2821 ri.bw = status->bw;
2822
2823 switch (status->bw) { 2821 switch (status->bw) {
2824 case RATE_INFO_BW_10: 2822 case RATE_INFO_BW_10:
2825 shift = 1; 2823 shift = 1;
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 8da84312cd3b..8055e3965cef 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -68,15 +68,6 @@ enum {
68 NCSI_MODE_MAX 68 NCSI_MODE_MAX
69}; 69};
70 70
71enum {
72 NCSI_FILTER_BASE = 0,
73 NCSI_FILTER_VLAN = 0,
74 NCSI_FILTER_UC,
75 NCSI_FILTER_MC,
76 NCSI_FILTER_MIXED,
77 NCSI_FILTER_MAX
78};
79
80struct ncsi_channel_version { 71struct ncsi_channel_version {
81 u32 version; /* Supported BCD encoded NCSI version */ 72 u32 version; /* Supported BCD encoded NCSI version */
82 u32 alpha2; /* Supported BCD encoded NCSI version */ 73 u32 alpha2; /* Supported BCD encoded NCSI version */
@@ -98,11 +89,18 @@ struct ncsi_channel_mode {
98 u32 data[8]; /* Data entries */ 89 u32 data[8]; /* Data entries */
99}; 90};
100 91
101struct ncsi_channel_filter { 92struct ncsi_channel_mac_filter {
102 u32 index; /* Index of channel filters */ 93 u8 n_uc;
103 u32 total; /* Total entries in the filter table */ 94 u8 n_mc;
104 u64 bitmap; /* Bitmap of valid entries */ 95 u8 n_mixed;
105 u32 data[]; /* Data for the valid entries */ 96 u64 bitmap;
97 unsigned char *addrs;
98};
99
100struct ncsi_channel_vlan_filter {
101 u8 n_vids;
102 u64 bitmap;
103 u16 *vids;
106}; 104};
107 105
108struct ncsi_channel_stats { 106struct ncsi_channel_stats {
@@ -186,7 +184,9 @@ struct ncsi_channel {
186 struct ncsi_channel_version version; 184 struct ncsi_channel_version version;
187 struct ncsi_channel_cap caps[NCSI_CAP_MAX]; 185 struct ncsi_channel_cap caps[NCSI_CAP_MAX];
188 struct ncsi_channel_mode modes[NCSI_MODE_MAX]; 186 struct ncsi_channel_mode modes[NCSI_MODE_MAX];
189 struct ncsi_channel_filter *filters[NCSI_FILTER_MAX]; 187 /* Filtering Settings */
188 struct ncsi_channel_mac_filter mac_filter;
189 struct ncsi_channel_vlan_filter vlan_filter;
190 struct ncsi_channel_stats stats; 190 struct ncsi_channel_stats stats;
191 struct { 191 struct {
192 struct timer_list timer; 192 struct timer_list timer;
@@ -320,10 +320,6 @@ extern spinlock_t ncsi_dev_lock;
320 list_for_each_entry_rcu(nc, &np->channels, node) 320 list_for_each_entry_rcu(nc, &np->channels, node)
321 321
322/* Resources */ 322/* Resources */
323u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index);
324int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data);
325int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data);
326int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index);
327void ncsi_start_channel_monitor(struct ncsi_channel *nc); 323void ncsi_start_channel_monitor(struct ncsi_channel *nc);
328void ncsi_stop_channel_monitor(struct ncsi_channel *nc); 324void ncsi_stop_channel_monitor(struct ncsi_channel *nc);
329struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, 325struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np,
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index c3695ba0cf94..5561e221b71f 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -27,125 +27,6 @@
27LIST_HEAD(ncsi_dev_list); 27LIST_HEAD(ncsi_dev_list);
28DEFINE_SPINLOCK(ncsi_dev_lock); 28DEFINE_SPINLOCK(ncsi_dev_lock);
29 29
30static inline int ncsi_filter_size(int table)
31{
32 int sizes[] = { 2, 6, 6, 6 };
33
34 BUILD_BUG_ON(ARRAY_SIZE(sizes) != NCSI_FILTER_MAX);
35 if (table < NCSI_FILTER_BASE || table >= NCSI_FILTER_MAX)
36 return -EINVAL;
37
38 return sizes[table];
39}
40
41u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index)
42{
43 struct ncsi_channel_filter *ncf;
44 int size;
45
46 ncf = nc->filters[table];
47 if (!ncf)
48 return NULL;
49
50 size = ncsi_filter_size(table);
51 if (size < 0)
52 return NULL;
53
54 return ncf->data + size * index;
55}
56
57/* Find the first active filter in a filter table that matches the given
58 * data parameter. If data is NULL, this returns the first active filter.
59 */
60int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data)
61{
62 struct ncsi_channel_filter *ncf;
63 void *bitmap;
64 int index, size;
65 unsigned long flags;
66
67 ncf = nc->filters[table];
68 if (!ncf)
69 return -ENXIO;
70
71 size = ncsi_filter_size(table);
72 if (size < 0)
73 return size;
74
75 spin_lock_irqsave(&nc->lock, flags);
76 bitmap = (void *)&ncf->bitmap;
77 index = -1;
78 while ((index = find_next_bit(bitmap, ncf->total, index + 1))
79 < ncf->total) {
80 if (!data || !memcmp(ncf->data + size * index, data, size)) {
81 spin_unlock_irqrestore(&nc->lock, flags);
82 return index;
83 }
84 }
85 spin_unlock_irqrestore(&nc->lock, flags);
86
87 return -ENOENT;
88}
89
90int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data)
91{
92 struct ncsi_channel_filter *ncf;
93 int index, size;
94 void *bitmap;
95 unsigned long flags;
96
97 size = ncsi_filter_size(table);
98 if (size < 0)
99 return size;
100
101 index = ncsi_find_filter(nc, table, data);
102 if (index >= 0)
103 return index;
104
105 ncf = nc->filters[table];
106 if (!ncf)
107 return -ENODEV;
108
109 spin_lock_irqsave(&nc->lock, flags);
110 bitmap = (void *)&ncf->bitmap;
111 do {
112 index = find_next_zero_bit(bitmap, ncf->total, 0);
113 if (index >= ncf->total) {
114 spin_unlock_irqrestore(&nc->lock, flags);
115 return -ENOSPC;
116 }
117 } while (test_and_set_bit(index, bitmap));
118
119 memcpy(ncf->data + size * index, data, size);
120 spin_unlock_irqrestore(&nc->lock, flags);
121
122 return index;
123}
124
125int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index)
126{
127 struct ncsi_channel_filter *ncf;
128 int size;
129 void *bitmap;
130 unsigned long flags;
131
132 size = ncsi_filter_size(table);
133 if (size < 0)
134 return size;
135
136 ncf = nc->filters[table];
137 if (!ncf || index >= ncf->total)
138 return -ENODEV;
139
140 spin_lock_irqsave(&nc->lock, flags);
141 bitmap = (void *)&ncf->bitmap;
142 if (test_and_clear_bit(index, bitmap))
143 memset(ncf->data + size * index, 0, size);
144 spin_unlock_irqrestore(&nc->lock, flags);
145
146 return 0;
147}
148
149static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down) 30static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down)
150{ 31{
151 struct ncsi_dev *nd = &ndp->ndev; 32 struct ncsi_dev *nd = &ndp->ndev;
@@ -339,20 +220,13 @@ struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id)
339static void ncsi_remove_channel(struct ncsi_channel *nc) 220static void ncsi_remove_channel(struct ncsi_channel *nc)
340{ 221{
341 struct ncsi_package *np = nc->package; 222 struct ncsi_package *np = nc->package;
342 struct ncsi_channel_filter *ncf;
343 unsigned long flags; 223 unsigned long flags;
344 int i;
345 224
346 /* Release filters */
347 spin_lock_irqsave(&nc->lock, flags); 225 spin_lock_irqsave(&nc->lock, flags);
348 for (i = 0; i < NCSI_FILTER_MAX; i++) {
349 ncf = nc->filters[i];
350 if (!ncf)
351 continue;
352 226
353 nc->filters[i] = NULL; 227 /* Release filters */
354 kfree(ncf); 228 kfree(nc->mac_filter.addrs);
355 } 229 kfree(nc->vlan_filter.vids);
356 230
357 nc->state = NCSI_CHANNEL_INACTIVE; 231 nc->state = NCSI_CHANNEL_INACTIVE;
358 spin_unlock_irqrestore(&nc->lock, flags); 232 spin_unlock_irqrestore(&nc->lock, flags);
@@ -670,32 +544,26 @@ error:
670static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, 544static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
671 struct ncsi_cmd_arg *nca) 545 struct ncsi_cmd_arg *nca)
672{ 546{
547 struct ncsi_channel_vlan_filter *ncf;
548 unsigned long flags;
549 void *bitmap;
673 int index; 550 int index;
674 u32 *data;
675 u16 vid; 551 u16 vid;
676 552
677 index = ncsi_find_filter(nc, NCSI_FILTER_VLAN, NULL); 553 ncf = &nc->vlan_filter;
678 if (index < 0) { 554 bitmap = &ncf->bitmap;
679 /* Filter table empty */
680 return -1;
681 }
682 555
683 data = ncsi_get_filter(nc, NCSI_FILTER_VLAN, index); 556 spin_lock_irqsave(&nc->lock, flags);
684 if (!data) { 557 index = find_next_bit(bitmap, ncf->n_vids, 0);
685 netdev_err(ndp->ndev.dev, 558 if (index >= ncf->n_vids) {
686 "NCSI: failed to retrieve filter %d\n", index); 559 spin_unlock_irqrestore(&nc->lock, flags);
687 /* Set the VLAN id to 0 - this will still disable the entry in 560 return -1;
688 * the filter table, but we won't know what it was.
689 */
690 vid = 0;
691 } else {
692 vid = *(u16 *)data;
693 } 561 }
562 vid = ncf->vids[index];
694 563
695 netdev_printk(KERN_DEBUG, ndp->ndev.dev, 564 clear_bit(index, bitmap);
696 "NCSI: removed vlan tag %u at index %d\n", 565 ncf->vids[index] = 0;
697 vid, index + 1); 566 spin_unlock_irqrestore(&nc->lock, flags);
698 ncsi_remove_filter(nc, NCSI_FILTER_VLAN, index);
699 567
700 nca->type = NCSI_PKT_CMD_SVF; 568 nca->type = NCSI_PKT_CMD_SVF;
701 nca->words[1] = vid; 569 nca->words[1] = vid;
@@ -711,45 +579,55 @@ static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
711static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, 579static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
712 struct ncsi_cmd_arg *nca) 580 struct ncsi_cmd_arg *nca)
713{ 581{
582 struct ncsi_channel_vlan_filter *ncf;
714 struct vlan_vid *vlan = NULL; 583 struct vlan_vid *vlan = NULL;
715 int index = 0; 584 unsigned long flags;
585 int i, index;
586 void *bitmap;
587 u16 vid;
716 588
589 if (list_empty(&ndp->vlan_vids))
590 return -1;
591
592 ncf = &nc->vlan_filter;
593 bitmap = &ncf->bitmap;
594
595 spin_lock_irqsave(&nc->lock, flags);
596
597 rcu_read_lock();
717 list_for_each_entry_rcu(vlan, &ndp->vlan_vids, list) { 598 list_for_each_entry_rcu(vlan, &ndp->vlan_vids, list) {
718 index = ncsi_find_filter(nc, NCSI_FILTER_VLAN, &vlan->vid); 599 vid = vlan->vid;
719 if (index < 0) { 600 for (i = 0; i < ncf->n_vids; i++)
720 /* New tag to add */ 601 if (ncf->vids[i] == vid) {
721 netdev_printk(KERN_DEBUG, ndp->ndev.dev, 602 vid = 0;
722 "NCSI: new vlan id to set: %u\n", 603 break;
723 vlan->vid); 604 }
605 if (vid)
724 break; 606 break;
725 }
726 netdev_printk(KERN_DEBUG, ndp->ndev.dev,
727 "vid %u already at filter pos %d\n",
728 vlan->vid, index);
729 } 607 }
608 rcu_read_unlock();
730 609
731 if (!vlan || index >= 0) { 610 if (!vid) {
732 netdev_printk(KERN_DEBUG, ndp->ndev.dev, 611 /* No VLAN ID is not set */
733 "no vlan ids left to set\n"); 612 spin_unlock_irqrestore(&nc->lock, flags);
734 return -1; 613 return -1;
735 } 614 }
736 615
737 index = ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan->vid); 616 index = find_next_zero_bit(bitmap, ncf->n_vids, 0);
738 if (index < 0) { 617 if (index < 0 || index >= ncf->n_vids) {
739 netdev_err(ndp->ndev.dev, 618 netdev_err(ndp->ndev.dev,
740 "Failed to add new VLAN tag, error %d\n", index); 619 "Channel %u already has all VLAN filters set\n",
741 if (index == -ENOSPC) 620 nc->id);
742 netdev_err(ndp->ndev.dev, 621 spin_unlock_irqrestore(&nc->lock, flags);
743 "Channel %u already has all VLAN filters set\n",
744 nc->id);
745 return -1; 622 return -1;
746 } 623 }
747 624
748 netdev_printk(KERN_DEBUG, ndp->ndev.dev, 625 ncf->vids[index] = vid;
749 "NCSI: set vid %u in packet, index %u\n", 626 set_bit(index, bitmap);
750 vlan->vid, index + 1); 627 spin_unlock_irqrestore(&nc->lock, flags);
628
751 nca->type = NCSI_PKT_CMD_SVF; 629 nca->type = NCSI_PKT_CMD_SVF;
752 nca->words[1] = vlan->vid; 630 nca->words[1] = vid;
753 /* HW filter index starts at 1 */ 631 /* HW filter index starts at 1 */
754 nca->bytes[6] = index + 1; 632 nca->bytes[6] = index + 1;
755 nca->bytes[7] = 0x01; 633 nca->bytes[7] = 0x01;
diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
index 41cede4041d3..82e6edf9c5d9 100644
--- a/net/ncsi/ncsi-netlink.c
+++ b/net/ncsi/ncsi-netlink.c
@@ -58,10 +58,9 @@ static int ncsi_write_channel_info(struct sk_buff *skb,
58 struct ncsi_dev_priv *ndp, 58 struct ncsi_dev_priv *ndp,
59 struct ncsi_channel *nc) 59 struct ncsi_channel *nc)
60{ 60{
61 struct nlattr *vid_nest; 61 struct ncsi_channel_vlan_filter *ncf;
62 struct ncsi_channel_filter *ncf;
63 struct ncsi_channel_mode *m; 62 struct ncsi_channel_mode *m;
64 u32 *data; 63 struct nlattr *vid_nest;
65 int i; 64 int i;
66 65
67 nla_put_u32(skb, NCSI_CHANNEL_ATTR_ID, nc->id); 66 nla_put_u32(skb, NCSI_CHANNEL_ATTR_ID, nc->id);
@@ -79,18 +78,13 @@ static int ncsi_write_channel_info(struct sk_buff *skb,
79 vid_nest = nla_nest_start(skb, NCSI_CHANNEL_ATTR_VLAN_LIST); 78 vid_nest = nla_nest_start(skb, NCSI_CHANNEL_ATTR_VLAN_LIST);
80 if (!vid_nest) 79 if (!vid_nest)
81 return -ENOMEM; 80 return -ENOMEM;
82 ncf = nc->filters[NCSI_FILTER_VLAN]; 81 ncf = &nc->vlan_filter;
83 i = -1; 82 i = -1;
84 if (ncf) { 83 while ((i = find_next_bit((void *)&ncf->bitmap, ncf->n_vids,
85 while ((i = find_next_bit((void *)&ncf->bitmap, ncf->total, 84 i + 1)) < ncf->n_vids) {
86 i + 1)) < ncf->total) { 85 if (ncf->vids[i])
87 data = ncsi_get_filter(nc, NCSI_FILTER_VLAN, i);
88 /* Uninitialised channels will have 'zero' vlan ids */
89 if (!data || !*data)
90 continue;
91 nla_put_u16(skb, NCSI_CHANNEL_ATTR_VLAN_ID, 86 nla_put_u16(skb, NCSI_CHANNEL_ATTR_VLAN_ID,
92 *(u16 *)data); 87 ncf->vids[i]);
93 }
94 } 88 }
95 nla_nest_end(skb, vid_nest); 89 nla_nest_end(skb, vid_nest);
96 90
@@ -207,7 +201,6 @@ static int ncsi_pkg_info_nl(struct sk_buff *msg, struct genl_info *info)
207 return genlmsg_reply(skb, info); 201 return genlmsg_reply(skb, info);
208 202
209err: 203err:
210 genlmsg_cancel(skb, hdr);
211 kfree_skb(skb); 204 kfree_skb(skb);
212 return rc; 205 return rc;
213} 206}
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index efd933ff5570..930c1d3796f0 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -334,9 +334,9 @@ static int ncsi_rsp_handler_svf(struct ncsi_request *nr)
334 struct ncsi_rsp_pkt *rsp; 334 struct ncsi_rsp_pkt *rsp;
335 struct ncsi_dev_priv *ndp = nr->ndp; 335 struct ncsi_dev_priv *ndp = nr->ndp;
336 struct ncsi_channel *nc; 336 struct ncsi_channel *nc;
337 struct ncsi_channel_filter *ncf; 337 struct ncsi_channel_vlan_filter *ncf;
338 unsigned short vlan; 338 unsigned long flags;
339 int ret; 339 void *bitmap;
340 340
341 /* Find the package and channel */ 341 /* Find the package and channel */
342 rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); 342 rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
@@ -346,22 +346,23 @@ static int ncsi_rsp_handler_svf(struct ncsi_request *nr)
346 return -ENODEV; 346 return -ENODEV;
347 347
348 cmd = (struct ncsi_cmd_svf_pkt *)skb_network_header(nr->cmd); 348 cmd = (struct ncsi_cmd_svf_pkt *)skb_network_header(nr->cmd);
349 ncf = nc->filters[NCSI_FILTER_VLAN]; 349 ncf = &nc->vlan_filter;
350 if (!ncf) 350 if (cmd->index == 0 || cmd->index > ncf->n_vids)
351 return -ENOENT;
352 if (cmd->index >= ncf->total)
353 return -ERANGE; 351 return -ERANGE;
354 352
355 /* Add or remove the VLAN filter */ 353 /* Add or remove the VLAN filter. Remember HW indexes from 1 */
354 spin_lock_irqsave(&nc->lock, flags);
355 bitmap = &ncf->bitmap;
356 if (!(cmd->enable & 0x1)) { 356 if (!(cmd->enable & 0x1)) {
357 /* HW indexes from 1 */ 357 if (test_and_clear_bit(cmd->index - 1, bitmap))
358 ret = ncsi_remove_filter(nc, NCSI_FILTER_VLAN, cmd->index - 1); 358 ncf->vids[cmd->index - 1] = 0;
359 } else { 359 } else {
360 vlan = ntohs(cmd->vlan); 360 set_bit(cmd->index - 1, bitmap);
361 ret = ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan); 361 ncf->vids[cmd->index - 1] = ntohs(cmd->vlan);
362 } 362 }
363 spin_unlock_irqrestore(&nc->lock, flags);
363 364
364 return ret; 365 return 0;
365} 366}
366 367
367static int ncsi_rsp_handler_ev(struct ncsi_request *nr) 368static int ncsi_rsp_handler_ev(struct ncsi_request *nr)
@@ -422,8 +423,12 @@ static int ncsi_rsp_handler_sma(struct ncsi_request *nr)
422 struct ncsi_rsp_pkt *rsp; 423 struct ncsi_rsp_pkt *rsp;
423 struct ncsi_dev_priv *ndp = nr->ndp; 424 struct ncsi_dev_priv *ndp = nr->ndp;
424 struct ncsi_channel *nc; 425 struct ncsi_channel *nc;
425 struct ncsi_channel_filter *ncf; 426 struct ncsi_channel_mac_filter *ncf;
427 unsigned long flags;
426 void *bitmap; 428 void *bitmap;
429 bool enabled;
430 int index;
431
427 432
428 /* Find the package and channel */ 433 /* Find the package and channel */
429 rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); 434 rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
@@ -436,31 +441,24 @@ static int ncsi_rsp_handler_sma(struct ncsi_request *nr)
436 * isn't supported yet. 441 * isn't supported yet.
437 */ 442 */
438 cmd = (struct ncsi_cmd_sma_pkt *)skb_network_header(nr->cmd); 443 cmd = (struct ncsi_cmd_sma_pkt *)skb_network_header(nr->cmd);
439 switch (cmd->at_e >> 5) { 444 enabled = cmd->at_e & 0x1;
440 case 0x0: /* UC address */ 445 ncf = &nc->mac_filter;
441 ncf = nc->filters[NCSI_FILTER_UC]; 446 bitmap = &ncf->bitmap;
442 break;
443 case 0x1: /* MC address */
444 ncf = nc->filters[NCSI_FILTER_MC];
445 break;
446 default:
447 return -EINVAL;
448 }
449 447
450 /* Sanity check on the filter */ 448 if (cmd->index == 0 ||
451 if (!ncf) 449 cmd->index > ncf->n_uc + ncf->n_mc + ncf->n_mixed)
452 return -ENOENT;
453 else if (cmd->index >= ncf->total)
454 return -ERANGE; 450 return -ERANGE;
455 451
456 bitmap = &ncf->bitmap; 452 index = (cmd->index - 1) * ETH_ALEN;
457 if (cmd->at_e & 0x1) { 453 spin_lock_irqsave(&nc->lock, flags);
458 set_bit(cmd->index, bitmap); 454 if (enabled) {
459 memcpy(ncf->data + 6 * cmd->index, cmd->mac, 6); 455 set_bit(cmd->index - 1, bitmap);
456 memcpy(&ncf->addrs[index], cmd->mac, ETH_ALEN);
460 } else { 457 } else {
461 clear_bit(cmd->index, bitmap); 458 clear_bit(cmd->index - 1, bitmap);
462 memset(ncf->data + 6 * cmd->index, 0, 6); 459 memset(&ncf->addrs[index], 0, ETH_ALEN);
463 } 460 }
461 spin_unlock_irqrestore(&nc->lock, flags);
464 462
465 return 0; 463 return 0;
466} 464}
@@ -631,9 +629,7 @@ static int ncsi_rsp_handler_gc(struct ncsi_request *nr)
631 struct ncsi_rsp_gc_pkt *rsp; 629 struct ncsi_rsp_gc_pkt *rsp;
632 struct ncsi_dev_priv *ndp = nr->ndp; 630 struct ncsi_dev_priv *ndp = nr->ndp;
633 struct ncsi_channel *nc; 631 struct ncsi_channel *nc;
634 struct ncsi_channel_filter *ncf; 632 size_t size;
635 size_t size, entry_size;
636 int cnt, i;
637 633
638 /* Find the channel */ 634 /* Find the channel */
639 rsp = (struct ncsi_rsp_gc_pkt *)skb_network_header(nr->rsp); 635 rsp = (struct ncsi_rsp_gc_pkt *)skb_network_header(nr->rsp);
@@ -655,64 +651,40 @@ static int ncsi_rsp_handler_gc(struct ncsi_request *nr)
655 nc->caps[NCSI_CAP_VLAN].cap = rsp->vlan_mode & 651 nc->caps[NCSI_CAP_VLAN].cap = rsp->vlan_mode &
656 NCSI_CAP_VLAN_MASK; 652 NCSI_CAP_VLAN_MASK;
657 653
658 /* Build filters */ 654 size = (rsp->uc_cnt + rsp->mc_cnt + rsp->mixed_cnt) * ETH_ALEN;
659 for (i = 0; i < NCSI_FILTER_MAX; i++) { 655 nc->mac_filter.addrs = kzalloc(size, GFP_ATOMIC);
660 switch (i) { 656 if (!nc->mac_filter.addrs)
661 case NCSI_FILTER_VLAN: 657 return -ENOMEM;
662 cnt = rsp->vlan_cnt; 658 nc->mac_filter.n_uc = rsp->uc_cnt;
663 entry_size = 2; 659 nc->mac_filter.n_mc = rsp->mc_cnt;
664 break; 660 nc->mac_filter.n_mixed = rsp->mixed_cnt;
665 case NCSI_FILTER_MIXED: 661
666 cnt = rsp->mixed_cnt; 662 nc->vlan_filter.vids = kcalloc(rsp->vlan_cnt,
667 entry_size = 6; 663 sizeof(*nc->vlan_filter.vids),
668 break; 664 GFP_ATOMIC);
669 case NCSI_FILTER_MC: 665 if (!nc->vlan_filter.vids)
670 cnt = rsp->mc_cnt; 666 return -ENOMEM;
671 entry_size = 6; 667 /* Set VLAN filters active so they are cleared in the first
672 break; 668 * configuration state
673 case NCSI_FILTER_UC: 669 */
674 cnt = rsp->uc_cnt; 670 nc->vlan_filter.bitmap = U64_MAX;
675 entry_size = 6; 671 nc->vlan_filter.n_vids = rsp->vlan_cnt;
676 break;
677 default:
678 continue;
679 }
680
681 if (!cnt || nc->filters[i])
682 continue;
683
684 size = sizeof(*ncf) + cnt * entry_size;
685 ncf = kzalloc(size, GFP_ATOMIC);
686 if (!ncf) {
687 pr_warn("%s: Cannot alloc filter table (%d)\n",
688 __func__, i);
689 return -ENOMEM;
690 }
691
692 ncf->index = i;
693 ncf->total = cnt;
694 if (i == NCSI_FILTER_VLAN) {
695 /* Set VLAN filters active so they are cleared in
696 * first configuration state
697 */
698 ncf->bitmap = U64_MAX;
699 } else {
700 ncf->bitmap = 0x0ul;
701 }
702 nc->filters[i] = ncf;
703 }
704 672
705 return 0; 673 return 0;
706} 674}
707 675
708static int ncsi_rsp_handler_gp(struct ncsi_request *nr) 676static int ncsi_rsp_handler_gp(struct ncsi_request *nr)
709{ 677{
710 struct ncsi_rsp_gp_pkt *rsp; 678 struct ncsi_channel_vlan_filter *ncvf;
679 struct ncsi_channel_mac_filter *ncmf;
711 struct ncsi_dev_priv *ndp = nr->ndp; 680 struct ncsi_dev_priv *ndp = nr->ndp;
681 struct ncsi_rsp_gp_pkt *rsp;
712 struct ncsi_channel *nc; 682 struct ncsi_channel *nc;
713 unsigned short enable, vlan; 683 unsigned short enable;
714 unsigned char *pdata; 684 unsigned char *pdata;
715 int table, i; 685 unsigned long flags;
686 void *bitmap;
687 int i;
716 688
717 /* Find the channel */ 689 /* Find the channel */
718 rsp = (struct ncsi_rsp_gp_pkt *)skb_network_header(nr->rsp); 690 rsp = (struct ncsi_rsp_gp_pkt *)skb_network_header(nr->rsp);
@@ -746,36 +718,33 @@ static int ncsi_rsp_handler_gp(struct ncsi_request *nr)
746 /* MAC addresses filter table */ 718 /* MAC addresses filter table */
747 pdata = (unsigned char *)rsp + 48; 719 pdata = (unsigned char *)rsp + 48;
748 enable = rsp->mac_enable; 720 enable = rsp->mac_enable;
721 ncmf = &nc->mac_filter;
722 spin_lock_irqsave(&nc->lock, flags);
723 bitmap = &ncmf->bitmap;
749 for (i = 0; i < rsp->mac_cnt; i++, pdata += 6) { 724 for (i = 0; i < rsp->mac_cnt; i++, pdata += 6) {
750 if (i >= (nc->filters[NCSI_FILTER_UC]->total +
751 nc->filters[NCSI_FILTER_MC]->total))
752 table = NCSI_FILTER_MIXED;
753 else if (i >= nc->filters[NCSI_FILTER_UC]->total)
754 table = NCSI_FILTER_MC;
755 else
756 table = NCSI_FILTER_UC;
757
758 if (!(enable & (0x1 << i))) 725 if (!(enable & (0x1 << i)))
759 continue; 726 clear_bit(i, bitmap);
760 727 else
761 if (ncsi_find_filter(nc, table, pdata) >= 0) 728 set_bit(i, bitmap);
762 continue;
763 729
764 ncsi_add_filter(nc, table, pdata); 730 memcpy(&ncmf->addrs[i * ETH_ALEN], pdata, ETH_ALEN);
765 } 731 }
732 spin_unlock_irqrestore(&nc->lock, flags);
766 733
767 /* VLAN filter table */ 734 /* VLAN filter table */
768 enable = ntohs(rsp->vlan_enable); 735 enable = ntohs(rsp->vlan_enable);
736 ncvf = &nc->vlan_filter;
737 bitmap = &ncvf->bitmap;
738 spin_lock_irqsave(&nc->lock, flags);
769 for (i = 0; i < rsp->vlan_cnt; i++, pdata += 2) { 739 for (i = 0; i < rsp->vlan_cnt; i++, pdata += 2) {
770 if (!(enable & (0x1 << i))) 740 if (!(enable & (0x1 << i)))
771 continue; 741 clear_bit(i, bitmap);
772 742 else
773 vlan = ntohs(*(__be16 *)pdata); 743 set_bit(i, bitmap);
774 if (ncsi_find_filter(nc, NCSI_FILTER_VLAN, &vlan) >= 0)
775 continue;
776 744
777 ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan); 745 ncvf->vids[i] = ntohs(*(__be16 *)pdata);
778 } 746 }
747 spin_unlock_irqrestore(&nc->lock, flags);
779 748
780 return 0; 749 return 0;
781} 750}
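The reworked Get Parameters handler above drops the old dynamically sized per-type filter tables and instead mirrors the device's MAC/VLAN enable masks into fixed per-channel bitmaps (set_bit()/clear_bit() under the channel lock) while copying the entries in place. As a rough illustration of that pattern, here is a minimal user-space sketch in plain C; struct mac_filter and sync_mac_filter are illustrative names, not the kernel's NCSI structures:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define ETH_ALEN 6

    struct mac_filter {
        uint64_t bitmap;                     /* bit i set => entry i is active */
        unsigned char addrs[8 * ETH_ALEN];   /* entry i at addrs[i * ETH_ALEN] */
    };

    /* Mirror the device's enable mask and address list into the filter,
     * the way the new handler does with set_bit()/clear_bit() + memcpy(). */
    static void sync_mac_filter(struct mac_filter *f, uint16_t enable,
                                const unsigned char *pdata, int cnt)
    {
        for (int i = 0; i < cnt; i++, pdata += ETH_ALEN) {
            if (enable & (1u << i))
                f->bitmap |= 1ull << i;
            else
                f->bitmap &= ~(1ull << i);
            memcpy(&f->addrs[i * ETH_ALEN], pdata, ETH_ALEN);
        }
    }

    int main(void)
    {
        struct mac_filter f = { 0 };
        unsigned char macs[2 * ETH_ALEN] = {
            0x02, 0x00, 0x00, 0x00, 0x00, 0x01,
            0x02, 0x00, 0x00, 0x00, 0x00, 0x02,
        };

        sync_mac_filter(&f, 0x1 /* only entry 0 enabled */, macs, 2);
        printf("bitmap=%#llx\n", (unsigned long long)f.bitmap);
        return 0;
    }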
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 44d8a55e9721..dbd7d1fad277 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -433,17 +433,16 @@ config NF_NAT_TFTP
433 default NF_NAT && NF_CONNTRACK_TFTP 433 default NF_NAT && NF_CONNTRACK_TFTP
434 434
435config NF_NAT_REDIRECT 435config NF_NAT_REDIRECT
436 tristate "IPv4/IPv6 redirect support" 436 bool
437 depends on NF_NAT
438 help
439 This is the kernel functionality to redirect packets to local
440 machine through NAT.
441 437
442config NETFILTER_SYNPROXY 438config NETFILTER_SYNPROXY
443 tristate 439 tristate
444 440
445endif # NF_CONNTRACK 441endif # NF_CONNTRACK
446 442
443config NF_OSF
444 tristate
445
447config NF_TABLES 446config NF_TABLES
448 select NETFILTER_NETLINK 447 select NETFILTER_NETLINK
449 tristate "Netfilter nf_tables support" 448 tristate "Netfilter nf_tables support"
@@ -474,24 +473,6 @@ config NF_TABLES_NETDEV
474 help 473 help
475 This option enables support for the "netdev" table. 474 This option enables support for the "netdev" table.
476 475
477config NFT_EXTHDR
478 tristate "Netfilter nf_tables exthdr module"
479 help
480 This option adds the "exthdr" expression that you can use to match
481 IPv6 extension headers and tcp options.
482
483config NFT_META
484 tristate "Netfilter nf_tables meta module"
485 help
486 This option adds the "meta" expression that you can use to match and
487 to set packet metainformation such as the packet mark.
488
489config NFT_RT
490 tristate "Netfilter nf_tables routing module"
491 help
492 This option adds the "rt" expression that you can use to match
493 packet routing information such as the packet nexthop.
494
495config NFT_NUMGEN 476config NFT_NUMGEN
496 tristate "Netfilter nf_tables number generator module" 477 tristate "Netfilter nf_tables number generator module"
497 help 478 help
@@ -536,6 +517,15 @@ config NFT_COUNTER
536 This option adds the "counter" expression that you can use to 517 This option adds the "counter" expression that you can use to
537 include packet and byte counters in a rule. 518 include packet and byte counters in a rule.
538 519
520config NFT_CONNLIMIT
521 tristate "Netfilter nf_tables connlimit module"
522 depends on NF_CONNTRACK
523 depends on NETFILTER_ADVANCED
524 select NETFILTER_CONNCOUNT
525 help
526 This option adds the "connlimit" expression that you can use to
 527      limit the number of connections that can match a rule.
528
539config NFT_LOG 529config NFT_LOG
540 tristate "Netfilter nf_tables log module" 530 tristate "Netfilter nf_tables log module"
541 help 531 help
@@ -632,6 +622,15 @@ config NFT_FIB_INET
632 The lookup will be delegated to the IPv4 or IPv6 FIB depending 622 The lookup will be delegated to the IPv4 or IPv6 FIB depending
633 on the protocol of the packet. 623 on the protocol of the packet.
634 624
625config NFT_SOCKET
626 tristate "Netfilter nf_tables socket match support"
627 depends on IPV6 || IPV6=n
628 select NF_SOCKET_IPV4
629 select NF_SOCKET_IPV6 if IPV6
630 help
631 This option allows matching for the presence or absence of a
632 corresponding socket and its attributes.
633
635if NF_TABLES_NETDEV 634if NF_TABLES_NETDEV
636 635
637config NF_DUP_NETDEV 636config NF_DUP_NETDEV
@@ -667,8 +666,7 @@ endif # NF_TABLES
667 666
668config NF_FLOW_TABLE_INET 667config NF_FLOW_TABLE_INET
669 tristate "Netfilter flow table mixed IPv4/IPv6 module" 668 tristate "Netfilter flow table mixed IPv4/IPv6 module"
670 depends on NF_FLOW_TABLE_IPV4 669 depends on NF_FLOW_TABLE
671 depends on NF_FLOW_TABLE_IPV6
672 help 670 help
673 This option adds the flow table mixed IPv4/IPv6 support. 671 This option adds the flow table mixed IPv4/IPv6 support.
674 672
@@ -1000,6 +998,8 @@ config NETFILTER_XT_TARGET_TPROXY
1000 depends on IP_NF_MANGLE 998 depends on IP_NF_MANGLE
1001 select NF_DEFRAG_IPV4 999 select NF_DEFRAG_IPV4
1002 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n 1000 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
1001 select NF_TPROXY_IPV4
1002 select NF_TPROXY_IPV6 if IP6_NF_IPTABLES
1003 help 1003 help
1004 This option adds a `TPROXY' target, which is somewhat similar to 1004 This option adds a `TPROXY' target, which is somewhat similar to
1005 REDIRECT. It can only be used in the mangle table and is useful 1005 REDIRECT. It can only be used in the mangle table and is useful
@@ -1378,6 +1378,7 @@ config NETFILTER_XT_MATCH_NFACCT
1378config NETFILTER_XT_MATCH_OSF 1378config NETFILTER_XT_MATCH_OSF
1379 tristate '"osf" Passive OS fingerprint match' 1379 tristate '"osf" Passive OS fingerprint match'
1380 depends on NETFILTER_ADVANCED && NETFILTER_NETLINK 1380 depends on NETFILTER_ADVANCED && NETFILTER_NETLINK
1381 select NF_OSF
1381 help 1382 help
1382 This option selects the Passive OS Fingerprinting match module 1383 This option selects the Passive OS Fingerprinting match module
1383 that allows to passively match the remote operating system by 1384 that allows to passively match the remote operating system by
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index fd32bd2c9521..44449389e527 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -55,7 +55,7 @@ obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
55obj-$(CONFIG_NF_LOG_NETDEV) += nf_log_netdev.o 55obj-$(CONFIG_NF_LOG_NETDEV) += nf_log_netdev.o
56 56
57obj-$(CONFIG_NF_NAT) += nf_nat.o 57obj-$(CONFIG_NF_NAT) += nf_nat.o
58obj-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o 58nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
59 59
60# NAT helpers 60# NAT helpers
61obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o 61obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
@@ -76,13 +76,11 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o
76nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ 76nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
77 nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ 77 nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
78 nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ 78 nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
79 nft_dynset.o 79 nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o
80 80
81obj-$(CONFIG_NF_TABLES) += nf_tables.o 81obj-$(CONFIG_NF_TABLES) += nf_tables.o
82obj-$(CONFIG_NFT_COMPAT) += nft_compat.o 82obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
83obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o 83obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o
84obj-$(CONFIG_NFT_META) += nft_meta.o
85obj-$(CONFIG_NFT_RT) += nft_rt.o
86obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o 84obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o
87obj-$(CONFIG_NFT_CT) += nft_ct.o 85obj-$(CONFIG_NFT_CT) += nft_ct.o
88obj-$(CONFIG_NFT_FLOW_OFFLOAD) += nft_flow_offload.o 86obj-$(CONFIG_NFT_FLOW_OFFLOAD) += nft_flow_offload.o
@@ -104,6 +102,8 @@ obj-$(CONFIG_NFT_HASH) += nft_hash.o
104obj-$(CONFIG_NFT_FIB) += nft_fib.o 102obj-$(CONFIG_NFT_FIB) += nft_fib.o
105obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o 103obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o
106obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o 104obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o
105obj-$(CONFIG_NF_OSF) += nf_osf.o
106obj-$(CONFIG_NFT_SOCKET) += nft_socket.o
107 107
108# nf_tables netdev 108# nf_tables netdev
109obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o 109obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o
@@ -111,6 +111,8 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o
111 111
112# flow table infrastructure 112# flow table infrastructure
113obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o 113obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o
114nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o
115
114obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o 116obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
115 117
116# generic X tables 118# generic X tables
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 206fb2c4c319..168af54db975 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -138,11 +138,6 @@ nf_hook_entries_grow(const struct nf_hook_entries *old,
138 continue; 138 continue;
139 } 139 }
140 140
141 if (reg->nat_hook && orig_ops[i]->nat_hook) {
142 kvfree(new);
143 return ERR_PTR(-EBUSY);
144 }
145
146 if (inserted || reg->priority > orig_ops[i]->priority) { 141 if (inserted || reg->priority > orig_ops[i]->priority) {
147 new_ops[nhooks] = (void *)orig_ops[i]; 142 new_ops[nhooks] = (void *)orig_ops[i];
148 new->hooks[nhooks] = old->hooks[i]; 143 new->hooks[nhooks] = old->hooks[i];
@@ -186,9 +181,31 @@ static void hooks_validate(const struct nf_hook_entries *hooks)
186#endif 181#endif
187} 182}
188 183
184int nf_hook_entries_insert_raw(struct nf_hook_entries __rcu **pp,
185 const struct nf_hook_ops *reg)
186{
187 struct nf_hook_entries *new_hooks;
188 struct nf_hook_entries *p;
189
190 p = rcu_dereference_raw(*pp);
191 new_hooks = nf_hook_entries_grow(p, reg);
192 if (IS_ERR(new_hooks))
193 return PTR_ERR(new_hooks);
194
195 hooks_validate(new_hooks);
196
197 rcu_assign_pointer(*pp, new_hooks);
198
199 BUG_ON(p == new_hooks);
200 nf_hook_entries_free(p);
201 return 0;
202}
203EXPORT_SYMBOL_GPL(nf_hook_entries_insert_raw);
204
189/* 205/*
190 * __nf_hook_entries_try_shrink - try to shrink hook array 206 * __nf_hook_entries_try_shrink - try to shrink hook array
191 * 207 *
208 * @old -- current hook blob at @pp
192 * @pp -- location of hook blob 209 * @pp -- location of hook blob
193 * 210 *
194 * Hook unregistration must always succeed, so to-be-removed hooks 211 * Hook unregistration must always succeed, so to-be-removed hooks
@@ -201,14 +218,14 @@ static void hooks_validate(const struct nf_hook_entries *hooks)
201 * 218 *
202 * Returns address to free, or NULL. 219 * Returns address to free, or NULL.
203 */ 220 */
204static void *__nf_hook_entries_try_shrink(struct nf_hook_entries __rcu **pp) 221static void *__nf_hook_entries_try_shrink(struct nf_hook_entries *old,
222 struct nf_hook_entries __rcu **pp)
205{ 223{
206 struct nf_hook_entries *old, *new = NULL;
207 unsigned int i, j, skip = 0, hook_entries; 224 unsigned int i, j, skip = 0, hook_entries;
225 struct nf_hook_entries *new = NULL;
208 struct nf_hook_ops **orig_ops; 226 struct nf_hook_ops **orig_ops;
209 struct nf_hook_ops **new_ops; 227 struct nf_hook_ops **new_ops;
210 228
211 old = nf_entry_dereference(*pp);
212 if (WARN_ON_ONCE(!old)) 229 if (WARN_ON_ONCE(!old))
213 return NULL; 230 return NULL;
214 231
@@ -347,11 +364,10 @@ static int __nf_register_net_hook(struct net *net, int pf,
347 * This cannot fail, hook unregistration must always succeed. 364 * This cannot fail, hook unregistration must always succeed.
348 * Therefore replace the to-be-removed hook with a dummy hook. 365 * Therefore replace the to-be-removed hook with a dummy hook.
349 */ 366 */
350static void nf_remove_net_hook(struct nf_hook_entries *old, 367static bool nf_remove_net_hook(struct nf_hook_entries *old,
351 const struct nf_hook_ops *unreg, int pf) 368 const struct nf_hook_ops *unreg)
352{ 369{
353 struct nf_hook_ops **orig_ops; 370 struct nf_hook_ops **orig_ops;
354 bool found = false;
355 unsigned int i; 371 unsigned int i;
356 372
357 orig_ops = nf_hook_entries_get_hook_ops(old); 373 orig_ops = nf_hook_entries_get_hook_ops(old);
@@ -360,21 +376,10 @@ static void nf_remove_net_hook(struct nf_hook_entries *old,
360 continue; 376 continue;
361 WRITE_ONCE(old->hooks[i].hook, accept_all); 377 WRITE_ONCE(old->hooks[i].hook, accept_all);
362 WRITE_ONCE(orig_ops[i], &dummy_ops); 378 WRITE_ONCE(orig_ops[i], &dummy_ops);
363 found = true; 379 return true;
364 break;
365 } 380 }
366 381
367 if (found) { 382 return false;
368#ifdef CONFIG_NETFILTER_INGRESS
369 if (pf == NFPROTO_NETDEV && unreg->hooknum == NF_NETDEV_INGRESS)
370 net_dec_ingress_queue();
371#endif
372#ifdef HAVE_JUMP_LABEL
373 static_key_slow_dec(&nf_hooks_needed[pf][unreg->hooknum]);
374#endif
375 } else {
376 WARN_ONCE(1, "hook not found, pf %d num %d", pf, unreg->hooknum);
377 }
378} 383}
379 384
380static void __nf_unregister_net_hook(struct net *net, int pf, 385static void __nf_unregister_net_hook(struct net *net, int pf,
@@ -395,9 +400,19 @@ static void __nf_unregister_net_hook(struct net *net, int pf,
395 return; 400 return;
396 } 401 }
397 402
398 nf_remove_net_hook(p, reg, pf); 403 if (nf_remove_net_hook(p, reg)) {
404#ifdef CONFIG_NETFILTER_INGRESS
405 if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
406 net_dec_ingress_queue();
407#endif
408#ifdef HAVE_JUMP_LABEL
409 static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]);
410#endif
411 } else {
412 WARN_ONCE(1, "hook not found, pf %d num %d", pf, reg->hooknum);
413 }
399 414
400 p = __nf_hook_entries_try_shrink(pp); 415 p = __nf_hook_entries_try_shrink(p, pp);
401 mutex_unlock(&nf_hook_mutex); 416 mutex_unlock(&nf_hook_mutex);
402 if (!p) 417 if (!p)
403 return; 418 return;
@@ -417,6 +432,19 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
417} 432}
418EXPORT_SYMBOL(nf_unregister_net_hook); 433EXPORT_SYMBOL(nf_unregister_net_hook);
419 434
435void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp,
436 const struct nf_hook_ops *reg)
437{
438 struct nf_hook_entries *p;
439
440 p = rcu_dereference_raw(*pp);
441 if (nf_remove_net_hook(p, reg)) {
442 p = __nf_hook_entries_try_shrink(p, pp);
443 nf_hook_entries_free(p);
444 }
445}
446EXPORT_SYMBOL_GPL(nf_hook_entries_delete_raw);
447
420int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) 448int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
421{ 449{
422 int err; 450 int err;
@@ -535,6 +563,9 @@ EXPORT_SYMBOL(skb_make_writable);
535struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly; 563struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly;
536EXPORT_SYMBOL_GPL(nfnl_ct_hook); 564EXPORT_SYMBOL_GPL(nfnl_ct_hook);
537 565
566struct nf_ct_hook __rcu *nf_ct_hook __read_mostly;
567EXPORT_SYMBOL_GPL(nf_ct_hook);
568
538#if IS_ENABLED(CONFIG_NF_CONNTRACK) 569#if IS_ENABLED(CONFIG_NF_CONNTRACK)
539/* This does not belong here, but locally generated errors need it if connection 570/* This does not belong here, but locally generated errors need it if connection
540 tracking in use: without this, connection may not be in hash table, and hence 571 tracking in use: without this, connection may not be in hash table, and hence
@@ -543,6 +574,9 @@ void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *)
543 __rcu __read_mostly; 574 __rcu __read_mostly;
544EXPORT_SYMBOL(ip_ct_attach); 575EXPORT_SYMBOL(ip_ct_attach);
545 576
577struct nf_nat_hook __rcu *nf_nat_hook __read_mostly;
578EXPORT_SYMBOL_GPL(nf_nat_hook);
579
546void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb) 580void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
547{ 581{
548 void (*attach)(struct sk_buff *, const struct sk_buff *); 582 void (*attach)(struct sk_buff *, const struct sk_buff *);
@@ -557,17 +591,14 @@ void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
557} 591}
558EXPORT_SYMBOL(nf_ct_attach); 592EXPORT_SYMBOL(nf_ct_attach);
559 593
560void (*nf_ct_destroy)(struct nf_conntrack *) __rcu __read_mostly;
561EXPORT_SYMBOL(nf_ct_destroy);
562
563void nf_conntrack_destroy(struct nf_conntrack *nfct) 594void nf_conntrack_destroy(struct nf_conntrack *nfct)
564{ 595{
565 void (*destroy)(struct nf_conntrack *); 596 struct nf_ct_hook *ct_hook;
566 597
567 rcu_read_lock(); 598 rcu_read_lock();
568 destroy = rcu_dereference(nf_ct_destroy); 599 ct_hook = rcu_dereference(nf_ct_hook);
569 BUG_ON(destroy == NULL); 600 BUG_ON(ct_hook == NULL);
570 destroy(nfct); 601 ct_hook->destroy(nfct);
571 rcu_read_unlock(); 602 rcu_read_unlock();
572} 603}
573EXPORT_SYMBOL(nf_conntrack_destroy); 604EXPORT_SYMBOL(nf_conntrack_destroy);
@@ -580,11 +611,6 @@ const struct nf_conntrack_zone nf_ct_zone_dflt = {
580EXPORT_SYMBOL_GPL(nf_ct_zone_dflt); 611EXPORT_SYMBOL_GPL(nf_ct_zone_dflt);
581#endif /* CONFIG_NF_CONNTRACK */ 612#endif /* CONFIG_NF_CONNTRACK */
582 613
583#ifdef CONFIG_NF_NAT_NEEDED
584void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
585EXPORT_SYMBOL(nf_nat_decode_session_hook);
586#endif
587
588static void __net_init 614static void __net_init
589__netfilter_net_init(struct nf_hook_entries __rcu **e, int max) 615__netfilter_net_init(struct nf_hook_entries __rcu **e, int max)
590{ 616{
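The newly exported nf_hook_entries_insert_raw()/nf_hook_entries_delete_raw() follow the hook code's existing copy-modify-publish scheme: build a replacement blob, swap the published pointer, then free the old one. Below is a rough user-space analogue of that scheme, with a plain pointer assignment standing in for rcu_assign_pointer() and none of the kernel's deferred freeing; all names are illustrative only:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct entries {
        unsigned int num;
        int hooks[];            /* flexible array, like nf_hook_entries */
    };

    /* Grow-and-publish: allocate a copy with one extra slot, swap the
     * published pointer, then free the old blob (the kernel defers the
     * free until readers are done instead of freeing immediately). */
    static int insert_entry(struct entries **pp, int hook)
    {
        struct entries *old = *pp, *new;
        unsigned int n = old ? old->num : 0;

        new = malloc(sizeof(*new) + (n + 1) * sizeof(int));
        if (!new)
            return -1;
        new->num = n + 1;
        if (old)
            memcpy(new->hooks, old->hooks, n * sizeof(int));
        new->hooks[n] = hook;

        *pp = new;              /* "publish" the new blob */
        free(old);              /* readers are assumed gone at this point */
        return 0;
    }

    int main(void)
    {
        struct entries *e = NULL;

        insert_entry(&e, 1);
        insert_entry(&e, 2);
        printf("num=%u last=%d\n", e->num, e->hooks[e->num - 1]);
        free(e);
        return 0;
    }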
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index b32fb0dbe237..05dc1b77e466 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -225,6 +225,25 @@ config IP_VS_SH
225 If you want to compile it in kernel, say Y. To compile it as a 225 If you want to compile it in kernel, say Y. To compile it as a
226 module, choose M here. If unsure, say N. 226 module, choose M here. If unsure, say N.
227 227
228config IP_VS_MH
229 tristate "maglev hashing scheduling"
230 ---help---
 231	  The maglev consistent hashing scheduling algorithm provides
 232	  Google's Maglev hashing algorithm as an IPVS scheduler. It assigns
 233	  network connections to the servers through looking up a statically
 234	  assigned special hash table called the lookup table. Maglev hashing
 235	  assigns a preference list of all the lookup table positions
 236	  to each destination.
237
 238	  Through this operation, the Maglev hashing gives an almost equal
239 share of the lookup table to each of the destinations and provides
240 minimal disruption by using the lookup table. When the set of
241 destinations changes, a connection will likely be sent to the same
242 destination as it was before.
243
244 If you want to compile it in kernel, say Y. To compile it as a
245 module, choose M here. If unsure, say N.
246
228config IP_VS_SED 247config IP_VS_SED
229 tristate "shortest expected delay scheduling" 248 tristate "shortest expected delay scheduling"
230 ---help--- 249 ---help---
@@ -266,6 +285,24 @@ config IP_VS_SH_TAB_BITS
266 needs to be large enough to effectively fit all the destinations 285 needs to be large enough to effectively fit all the destinations
267 multiplied by their respective weights. 286 multiplied by their respective weights.
268 287
288comment 'IPVS MH scheduler'
289
290config IP_VS_MH_TAB_INDEX
291 int "IPVS maglev hashing table index of size (the prime numbers)"
292 range 8 17
293 default 12
294 ---help---
295 The maglev hashing scheduler maps source IPs to destinations
 296	  stored in a hash table. The table is filled from each destination's
 297	  preference list of positions until all slots in
 298	  the table are filled. The index selects the prime used as the size
 299	  of the table: 251, 509, 1021, 2039, 4093, 8191, 16381, 32749,
300 65521 or 131071. When using weights to allow destinations to
301 receive more connections, the table is assigned an amount
302 proportional to the weights specified. The table needs to be large
303 enough to effectively fit all the destinations multiplied by their
304 respective weights.
305
269comment 'IPVS application helper' 306comment 'IPVS application helper'
270 307
271config IP_VS_FTP 308config IP_VS_FTP
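The IP_VS_MH help text above summarizes Maglev's table fill: each destination derives an "offset" and a "skip" from two hashes, giving it a permutation of table slots, and destinations take turns claiming their next still-empty preferred slot until the table is full. Below is a compact user-space sketch of that fill loop, assuming a toy string hash and a small prime table size in place of the module's hsiphash keys and 251+ entry tables:

    #include <stdio.h>
    #include <string.h>

    #define M 13                    /* table size; the module uses a prime >= 251 */
    #define NDEST 3

    /* Toy string hash, standing in for hsiphash() in the real scheduler. */
    static unsigned int toy_hash(const char *s, unsigned int seed)
    {
        unsigned int h = seed;

        while (*s)
            h = h * 31 + (unsigned char)*s++;
        return h;
    }

    int main(void)
    {
        const char *dest[NDEST] = { "10.0.0.1", "10.0.0.2", "10.0.0.3" };
        unsigned int offset[NDEST], skip[NDEST], perm[NDEST];
        int table[M], filled = 0, i;

        for (i = 0; i < NDEST; i++) {
            offset[i] = toy_hash(dest[i], 0x9747b28c) % M;
            skip[i] = toy_hash(dest[i], 0x85ebca6b) % (M - 1) + 1;
            perm[i] = offset[i];
        }
        memset(table, -1, sizeof(table));   /* -1 means "slot empty" */

        /* Round-robin: each destination claims its next preferred free slot. */
        while (filled < M) {
            for (i = 0; i < NDEST && filled < M; i++) {
                unsigned int c = perm[i];

                while (table[c] != -1)
                    c = (c + skip[i]) % M;  /* walk this dest's permutation */
                table[c] = i;
                perm[i] = (c + skip[i]) % M;
                filled++;
            }
        }

        for (i = 0; i < M; i++)
            printf("slot %2d -> %s\n", i, dest[table[i]]);
        return 0;
    }

Because the table sizes are prime, any skip in 1..size-1 is coprime to the size, so the probe sequence visits every slot and the fill always terminates; that is why the module restricts the table size to the primes listed in the help text.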
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index c552993fa4b9..bfce2677fda2 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
33obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o 33obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
34obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o 34obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
35obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o 35obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
36obj-$(CONFIG_IP_VS_MH) += ip_vs_mh.o
36obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o 37obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
37obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o 38obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
38 39
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index c3db074fc1f7..7588aeaa605f 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -355,7 +355,8 @@ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
355} 355}
356 356
357static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, 357static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
358 struct ip_vs_app *app) 358 struct ip_vs_app *app,
359 struct ip_vs_iphdr *ipvsh)
359{ 360{
360 int diff; 361 int diff;
361 const unsigned int tcp_offset = ip_hdrlen(skb); 362 const unsigned int tcp_offset = ip_hdrlen(skb);
@@ -386,7 +387,7 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
386 if (app->pkt_out == NULL) 387 if (app->pkt_out == NULL)
387 return 1; 388 return 1;
388 389
389 if (!app->pkt_out(app, cp, skb, &diff)) 390 if (!app->pkt_out(app, cp, skb, &diff, ipvsh))
390 return 0; 391 return 0;
391 392
392 /* 393 /*
@@ -404,7 +405,8 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
404 * called by ipvs packet handler, assumes previously checked cp!=NULL 405 * called by ipvs packet handler, assumes previously checked cp!=NULL
405 * returns false if it can't handle packet (oom) 406 * returns false if it can't handle packet (oom)
406 */ 407 */
407int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) 408int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
409 struct ip_vs_iphdr *ipvsh)
408{ 410{
409 struct ip_vs_app *app; 411 struct ip_vs_app *app;
410 412
@@ -417,7 +419,7 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
417 419
418 /* TCP is complicated */ 420 /* TCP is complicated */
419 if (cp->protocol == IPPROTO_TCP) 421 if (cp->protocol == IPPROTO_TCP)
420 return app_tcp_pkt_out(cp, skb, app); 422 return app_tcp_pkt_out(cp, skb, app, ipvsh);
421 423
422 /* 424 /*
423 * Call private output hook function 425 * Call private output hook function
@@ -425,12 +427,13 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
425 if (app->pkt_out == NULL) 427 if (app->pkt_out == NULL)
426 return 1; 428 return 1;
427 429
428 return app->pkt_out(app, cp, skb, NULL); 430 return app->pkt_out(app, cp, skb, NULL, ipvsh);
429} 431}
430 432
431 433
432static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, 434static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
433 struct ip_vs_app *app) 435 struct ip_vs_app *app,
436 struct ip_vs_iphdr *ipvsh)
434{ 437{
435 int diff; 438 int diff;
436 const unsigned int tcp_offset = ip_hdrlen(skb); 439 const unsigned int tcp_offset = ip_hdrlen(skb);
@@ -461,7 +464,7 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
461 if (app->pkt_in == NULL) 464 if (app->pkt_in == NULL)
462 return 1; 465 return 1;
463 466
464 if (!app->pkt_in(app, cp, skb, &diff)) 467 if (!app->pkt_in(app, cp, skb, &diff, ipvsh))
465 return 0; 468 return 0;
466 469
467 /* 470 /*
@@ -479,7 +482,8 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
479 * called by ipvs packet handler, assumes previously checked cp!=NULL. 482 * called by ipvs packet handler, assumes previously checked cp!=NULL.
480 * returns false if can't handle packet (oom). 483 * returns false if can't handle packet (oom).
481 */ 484 */
482int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) 485int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
486 struct ip_vs_iphdr *ipvsh)
483{ 487{
484 struct ip_vs_app *app; 488 struct ip_vs_app *app;
485 489
@@ -492,7 +496,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
492 496
493 /* TCP is complicated */ 497 /* TCP is complicated */
494 if (cp->protocol == IPPROTO_TCP) 498 if (cp->protocol == IPPROTO_TCP)
495 return app_tcp_pkt_in(cp, skb, app); 499 return app_tcp_pkt_in(cp, skb, app, ipvsh);
496 500
497 /* 501 /*
498 * Call private input hook function 502 * Call private input hook function
@@ -500,7 +504,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
500 if (app->pkt_in == NULL) 504 if (app->pkt_in == NULL)
501 return 1; 505 return 1;
502 506
503 return app->pkt_in(app, cp, skb, NULL); 507 return app->pkt_in(app, cp, skb, NULL, ipvsh);
504} 508}
505 509
506 510
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 141b1509c948..0c03c0e16a96 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -821,6 +821,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
821 if (add && udest->af != svc->af) 821 if (add && udest->af != svc->af)
822 ipvs->mixed_address_family_dests++; 822 ipvs->mixed_address_family_dests++;
823 823
824 /* keep the last_weight with latest non-0 weight */
825 if (add || udest->weight != 0)
826 atomic_set(&dest->last_weight, udest->weight);
827
824 /* set the weight and the flags */ 828 /* set the weight and the flags */
825 atomic_set(&dest->weight, udest->weight); 829 atomic_set(&dest->weight, udest->weight);
826 conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; 830 conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 75f798f8e83b..07459e71d907 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -43,6 +43,7 @@
43#include <linux/module.h> 43#include <linux/module.h>
44#include <linux/kernel.h> 44#include <linux/kernel.h>
45#include <linux/skbuff.h> 45#include <linux/skbuff.h>
46#include <linux/hash.h>
46 47
47#include <net/ip_vs.h> 48#include <net/ip_vs.h>
48 49
@@ -81,7 +82,7 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *ad
81 addr_fold = addr->ip6[0]^addr->ip6[1]^ 82 addr_fold = addr->ip6[0]^addr->ip6[1]^
82 addr->ip6[2]^addr->ip6[3]; 83 addr->ip6[2]^addr->ip6[3];
83#endif 84#endif
84 return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK; 85 return hash_32(ntohl(addr_fold), IP_VS_DH_TAB_BITS);
85} 86}
86 87
87 88
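The dh scheduler (and lblc/lblcr below) replaces the open-coded Knuth multiplicative hash, which masked off the low bits, with hash_32(), which multiplies by a 32-bit golden-ratio constant and keeps the top bits. A stand-alone sketch of the difference follows; my_hash_32() re-implements the <linux/hash.h> behaviour here purely for illustration:

    #include <stdint.h>
    #include <stdio.h>

    #define TAB_BITS 8
    #define TAB_MASK ((1u << TAB_BITS) - 1)

    /* Old style: multiply and keep the LOW bits via a mask. */
    static uint32_t hash_mask(uint32_t v)
    {
        return (v * 2654435761u) & TAB_MASK;
    }

    /* hash_32() style: multiply by a golden-ratio constant and keep the
     * HIGH bits, which are the better-mixed ones. */
    static uint32_t my_hash_32(uint32_t v, unsigned int bits)
    {
        return (v * 0x61C88647u) >> (32 - bits);
    }

    int main(void)
    {
        uint32_t addr = 0xc0a80001;     /* 192.168.0.1 in host order */

        printf("mask: %u  hash_32: %u\n",
               hash_mask(addr), my_hash_32(addr, TAB_BITS));
        return 0;
    }

The multiplication pushes most of the mixing into the high bits, so taking the top bits tends to spread nearby addresses across the table better than masking off the low bits.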
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 58d5d05aec24..4398a72edec5 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -29,6 +29,8 @@
29#include <linux/moduleparam.h> 29#include <linux/moduleparam.h>
30#include <linux/kernel.h> 30#include <linux/kernel.h>
31#include <linux/skbuff.h> 31#include <linux/skbuff.h>
32#include <linux/ctype.h>
33#include <linux/inet.h>
32#include <linux/in.h> 34#include <linux/in.h>
33#include <linux/ip.h> 35#include <linux/ip.h>
34#include <linux/netfilter.h> 36#include <linux/netfilter.h>
@@ -44,9 +46,18 @@
44#include <net/ip_vs.h> 46#include <net/ip_vs.h>
45 47
46 48
47#define SERVER_STRING "227 " 49#define SERVER_STRING_PASV "227 "
48#define CLIENT_STRING "PORT" 50#define CLIENT_STRING_PORT "PORT"
51#define SERVER_STRING_EPSV "229 "
52#define CLIENT_STRING_EPRT "EPRT"
49 53
54enum {
55 IP_VS_FTP_ACTIVE = 0,
56 IP_VS_FTP_PORT = 0,
57 IP_VS_FTP_PASV,
58 IP_VS_FTP_EPRT,
59 IP_VS_FTP_EPSV,
60};
50 61
51/* 62/*
52 * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper 63 * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
@@ -58,9 +69,15 @@ module_param_array(ports, ushort, &ports_count, 0444);
58MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); 69MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
59 70
60 71
61/* Dummy variable */ 72static char *ip_vs_ftp_data_ptr(struct sk_buff *skb, struct ip_vs_iphdr *ipvsh)
62static int ip_vs_ftp_pasv; 73{
74 struct tcphdr *th = (struct tcphdr *)((char *)skb->data + ipvsh->len);
75
76 if ((th->doff << 2) < sizeof(struct tcphdr))
77 return NULL;
63 78
79 return (char *)th + (th->doff << 2);
80}
64 81
65static int 82static int
66ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) 83ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
@@ -78,20 +95,20 @@ ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
78} 95}
79 96
80 97
81/* 98/* Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
82 * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started 99 * with the "pattern". <addr,port> is in network order.
83 * with the "pattern", ignoring before "skip" and terminated with 100 * Parse extended format depending on ext. In this case addr can be pre-set.
84 * the "term" character.
85 * <addr,port> is in network order.
86 */ 101 */
87static int ip_vs_ftp_get_addrport(char *data, char *data_limit, 102static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
88 const char *pattern, size_t plen, 103 const char *pattern, size_t plen,
89 char skip, char term, 104 char skip, bool ext, int mode,
90 __be32 *addr, __be16 *port, 105 union nf_inet_addr *addr, __be16 *port,
91 char **start, char **end) 106 __u16 af, char **start, char **end)
92{ 107{
93 char *s, c; 108 char *s, c;
94 unsigned char p[6]; 109 unsigned char p[6];
110 char edelim;
111 __u16 hport;
95 int i = 0; 112 int i = 0;
96 113
97 if (data_limit - data < plen) { 114 if (data_limit - data < plen) {
@@ -113,6 +130,11 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
113 if (s == data_limit) 130 if (s == data_limit)
114 return -1; 131 return -1;
115 if (!found) { 132 if (!found) {
133 /* "(" is optional for non-extended format,
134 * so catch the start of IPv4 address
135 */
136 if (!ext && isdigit(*s))
137 break;
116 if (*s == skip) 138 if (*s == skip)
117 found = 1; 139 found = 1;
118 } else if (*s != skip) { 140 } else if (*s != skip) {
@@ -120,41 +142,102 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
120 } 142 }
121 } 143 }
122 } 144 }
145 /* Old IPv4-only format? */
146 if (!ext) {
147 p[0] = 0;
148 for (data = s; ; data++) {
149 if (data == data_limit)
150 return -1;
151 c = *data;
152 if (isdigit(c)) {
153 p[i] = p[i]*10 + c - '0';
154 } else if (c == ',' && i < 5) {
155 i++;
156 p[i] = 0;
157 } else {
158 /* unexpected character or terminator */
159 break;
160 }
161 }
123 162
124 for (data = s; ; data++) { 163 if (i != 5)
125 if (data == data_limit)
126 return -1; 164 return -1;
127 if (*data == term) 165
128 break; 166 *start = s;
167 *end = data;
168 addr->ip = get_unaligned((__be32 *) p);
169 *port = get_unaligned((__be16 *) (p + 4));
170 return 1;
129 } 171 }
130 *end = data; 172 if (s == data_limit)
173 return -1;
174 *start = s;
175 edelim = *s++;
176 if (edelim < 33 || edelim > 126)
177 return -1;
178 if (s == data_limit)
179 return -1;
180 if (*s == edelim) {
181 /* Address family is usually missing for EPSV response */
182 if (mode != IP_VS_FTP_EPSV)
183 return -1;
184 s++;
185 if (s == data_limit)
186 return -1;
187 /* Then address should be missing too */
188 if (*s != edelim)
189 return -1;
190 /* Caller can pre-set addr, if needed */
191 s++;
192 } else {
193 const char *ep;
131 194
132 memset(p, 0, sizeof(p)); 195 /* We allow address only from same family */
133 for (data = s; ; data++) { 196 if (af == AF_INET6 && *s != '2')
134 c = *data;
135 if (c == term)
136 break;
137 if (c >= '0' && c <= '9') {
138 p[i] = p[i]*10 + c - '0';
139 } else if (c == ',' && i < 5) {
140 i++;
141 } else {
142 /* unexpected character */
143 return -1; 197 return -1;
198 if (af == AF_INET && *s != '1')
199 return -1;
200 s++;
201 if (s == data_limit)
202 return -1;
203 if (*s != edelim)
204 return -1;
205 s++;
206 if (s == data_limit)
207 return -1;
208 if (af == AF_INET6) {
209 if (in6_pton(s, data_limit - s, (u8 *)addr, edelim,
210 &ep) <= 0)
211 return -1;
212 } else {
213 if (in4_pton(s, data_limit - s, (u8 *)addr, edelim,
214 &ep) <= 0)
215 return -1;
144 } 216 }
217 s = (char *) ep;
218 if (s == data_limit)
219 return -1;
220 if (*s != edelim)
221 return -1;
222 s++;
145 } 223 }
146 224 for (hport = 0; ; s++)
147 if (i != 5) 225 {
226 if (s == data_limit)
227 return -1;
228 if (!isdigit(*s))
229 break;
230 hport = hport * 10 + *s - '0';
231 }
232 if (s == data_limit || !hport || *s != edelim)
148 return -1; 233 return -1;
149 234 s++;
150 *start = s; 235 *end = s;
151 *addr = get_unaligned((__be32 *) p); 236 *port = htons(hport);
152 *port = get_unaligned((__be16 *) (p + 4));
153 return 1; 237 return 1;
154} 238}
155 239
156/* 240/* Look at outgoing ftp packets to catch the response to a PASV/EPSV command
157 * Look at outgoing ftp packets to catch the response to a PASV command
158 * from the server (inside-to-outside). 241 * from the server (inside-to-outside).
159 * When we see one, we build a connection entry with the client address, 242 * When we see one, we build a connection entry with the client address,
160 * client port 0 (unknown at the moment), the server address and the 243 * client port 0 (unknown at the moment), the server address and the
@@ -165,12 +248,13 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
165 * The outgoing packet should be something like 248 * The outgoing packet should be something like
166 * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". 249 * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
167 * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. 250 * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
251 * The extended format for EPSV response provides usually only port:
252 * "229 Entering Extended Passive Mode (|||ppp|)"
168 */ 253 */
169static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, 254static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
170 struct sk_buff *skb, int *diff) 255 struct sk_buff *skb, int *diff,
256 struct ip_vs_iphdr *ipvsh)
171{ 257{
172 struct iphdr *iph;
173 struct tcphdr *th;
174 char *data, *data_limit; 258 char *data, *data_limit;
175 char *start, *end; 259 char *start, *end;
176 union nf_inet_addr from; 260 union nf_inet_addr from;
@@ -184,14 +268,6 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
184 268
185 *diff = 0; 269 *diff = 0;
186 270
187#ifdef CONFIG_IP_VS_IPV6
188 /* This application helper doesn't work with IPv6 yet,
189 * so turn this into a no-op for IPv6 packets
190 */
191 if (cp->af == AF_INET6)
192 return 1;
193#endif
194
195 /* Only useful for established sessions */ 271 /* Only useful for established sessions */
196 if (cp->state != IP_VS_TCP_S_ESTABLISHED) 272 if (cp->state != IP_VS_TCP_S_ESTABLISHED)
197 return 1; 273 return 1;
@@ -200,53 +276,77 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
200 if (!skb_make_writable(skb, skb->len)) 276 if (!skb_make_writable(skb, skb->len))
201 return 0; 277 return 0;
202 278
203 if (cp->app_data == &ip_vs_ftp_pasv) { 279 if (cp->app_data == (void *) IP_VS_FTP_PASV) {
204 iph = ip_hdr(skb); 280 data = ip_vs_ftp_data_ptr(skb, ipvsh);
205 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
206 data = (char *)th + (th->doff << 2);
207 data_limit = skb_tail_pointer(skb); 281 data_limit = skb_tail_pointer(skb);
208 282
283 if (!data || data >= data_limit)
284 return 1;
285
209 if (ip_vs_ftp_get_addrport(data, data_limit, 286 if (ip_vs_ftp_get_addrport(data, data_limit,
210 SERVER_STRING, 287 SERVER_STRING_PASV,
211 sizeof(SERVER_STRING)-1, 288 sizeof(SERVER_STRING_PASV)-1,
212 '(', ')', 289 '(', false, IP_VS_FTP_PASV,
213 &from.ip, &port, 290 &from, &port, cp->af,
214 &start, &end) != 1) 291 &start, &end) != 1)
215 return 1; 292 return 1;
216 293
217 IP_VS_DBG(7, "PASV response (%pI4:%d) -> %pI4:%d detected\n", 294 IP_VS_DBG(7, "PASV response (%pI4:%u) -> %pI4:%u detected\n",
218 &from.ip, ntohs(port), &cp->caddr.ip, 0); 295 &from.ip, ntohs(port), &cp->caddr.ip, 0);
296 } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) {
297 data = ip_vs_ftp_data_ptr(skb, ipvsh);
298 data_limit = skb_tail_pointer(skb);
219 299
220 /* 300 if (!data || data >= data_limit)
221 * Now update or create an connection entry for it 301 return 1;
302
303 /* Usually, data address is not specified but
304 * we support different address, so pre-set it.
222 */ 305 */
223 { 306 from = cp->daddr;
224 struct ip_vs_conn_param p; 307 if (ip_vs_ftp_get_addrport(data, data_limit,
225 ip_vs_conn_fill_param(cp->ipvs, AF_INET, 308 SERVER_STRING_EPSV,
226 iph->protocol, &from, port, 309 sizeof(SERVER_STRING_EPSV)-1,
227 &cp->caddr, 0, &p); 310 '(', true, IP_VS_FTP_EPSV,
228 n_cp = ip_vs_conn_out_get(&p); 311 &from, &port, cp->af,
229 } 312 &start, &end) != 1)
230 if (!n_cp) { 313 return 1;
231 struct ip_vs_conn_param p;
232 ip_vs_conn_fill_param(cp->ipvs,
233 AF_INET, IPPROTO_TCP, &cp->caddr,
234 0, &cp->vaddr, port, &p);
235 /* As above, this is ipv4 only */
236 n_cp = ip_vs_conn_new(&p, AF_INET, &from, port,
237 IP_VS_CONN_F_NO_CPORT |
238 IP_VS_CONN_F_NFCT,
239 cp->dest, skb->mark);
240 if (!n_cp)
241 return 0;
242 314
243 /* add its controller */ 315 IP_VS_DBG_BUF(7, "EPSV response (%s:%u) -> %s:%u detected\n",
244 ip_vs_control_add(n_cp, cp); 316 IP_VS_DBG_ADDR(cp->af, &from), ntohs(port),
245 } 317 IP_VS_DBG_ADDR(cp->af, &cp->caddr), 0);
318 } else {
319 return 1;
320 }
246 321
247 /* 322 /* Now update or create a connection entry for it */
248 * Replace the old passive address with the new one 323 {
249 */ 324 struct ip_vs_conn_param p;
325
326 ip_vs_conn_fill_param(cp->ipvs, cp->af,
327 ipvsh->protocol, &from, port,
328 &cp->caddr, 0, &p);
329 n_cp = ip_vs_conn_out_get(&p);
330 }
331 if (!n_cp) {
332 struct ip_vs_conn_param p;
333
334 ip_vs_conn_fill_param(cp->ipvs,
335 cp->af, ipvsh->protocol, &cp->caddr,
336 0, &cp->vaddr, port, &p);
337 n_cp = ip_vs_conn_new(&p, cp->af, &from, port,
338 IP_VS_CONN_F_NO_CPORT |
339 IP_VS_CONN_F_NFCT,
340 cp->dest, skb->mark);
341 if (!n_cp)
342 return 0;
343
344 /* add its controller */
345 ip_vs_control_add(n_cp, cp);
346 }
347
348 /* Replace the old passive address with the new one */
349 if (cp->app_data == (void *) IP_VS_FTP_PASV) {
250 from.ip = n_cp->vaddr.ip; 350 from.ip = n_cp->vaddr.ip;
251 port = n_cp->vport; 351 port = n_cp->vport;
252 snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u", 352 snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u",
@@ -256,50 +356,54 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
256 ((unsigned char *)&from.ip)[3], 356 ((unsigned char *)&from.ip)[3],
257 ntohs(port) >> 8, 357 ntohs(port) >> 8,
258 ntohs(port) & 0xFF); 358 ntohs(port) & 0xFF);
359 } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) {
360 from = n_cp->vaddr;
361 port = n_cp->vport;
362 /* Only port, client will use VIP for the data connection */
363 snprintf(buf, sizeof(buf), "|||%u|",
364 ntohs(port));
365 } else {
366 *buf = 0;
367 }
368 buf_len = strlen(buf);
259 369
260 buf_len = strlen(buf); 370 ct = nf_ct_get(skb, &ctinfo);
261 371 if (ct) {
262 ct = nf_ct_get(skb, &ctinfo); 372 bool mangled;
263 if (ct) {
264 bool mangled;
265
266 /* If mangling fails this function will return 0
267 * which will cause the packet to be dropped.
268 * Mangling can only fail under memory pressure,
269 * hopefully it will succeed on the retransmitted
270 * packet.
271 */
272 mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
273 iph->ihl * 4,
274 start - data,
275 end - start,
276 buf, buf_len);
277 if (mangled) {
278 ip_vs_nfct_expect_related(skb, ct, n_cp,
279 IPPROTO_TCP, 0, 0);
280 if (skb->ip_summed == CHECKSUM_COMPLETE)
281 skb->ip_summed = CHECKSUM_UNNECESSARY;
282 /* csum is updated */
283 ret = 1;
284 }
285 }
286 373
287 /* 374 /* If mangling fails this function will return 0
288 * Not setting 'diff' is intentional, otherwise the sequence 375 * which will cause the packet to be dropped.
289 * would be adjusted twice. 376 * Mangling can only fail under memory pressure,
377 * hopefully it will succeed on the retransmitted
378 * packet.
290 */ 379 */
291 380 mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
292 cp->app_data = NULL; 381 ipvsh->len,
293 ip_vs_tcp_conn_listen(n_cp); 382 start - data,
294 ip_vs_conn_put(n_cp); 383 end - start,
295 return ret; 384 buf, buf_len);
385 if (mangled) {
386 ip_vs_nfct_expect_related(skb, ct, n_cp,
387 ipvsh->protocol, 0, 0);
388 if (skb->ip_summed == CHECKSUM_COMPLETE)
389 skb->ip_summed = CHECKSUM_UNNECESSARY;
390 /* csum is updated */
391 ret = 1;
392 }
296 } 393 }
297 return 1; 394
395 /* Not setting 'diff' is intentional, otherwise the sequence
396 * would be adjusted twice.
397 */
398
399 cp->app_data = (void *) IP_VS_FTP_ACTIVE;
400 ip_vs_tcp_conn_listen(n_cp);
401 ip_vs_conn_put(n_cp);
402 return ret;
298} 403}
299 404
300 405
301/* 406/* Look at incoming ftp packets to catch the PASV/PORT/EPRT/EPSV command
302 * Look at incoming ftp packets to catch the PASV/PORT command
303 * (outside-to-inside). 407 * (outside-to-inside).
304 * 408 *
305 * The incoming packet having the PORT command should be something like 409 * The incoming packet having the PORT command should be something like
@@ -308,12 +412,19 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
308 * In this case, we create a connection entry using the client address and 412 * In this case, we create a connection entry using the client address and
309 * port, so that the active ftp data connection from the server can reach 413 * port, so that the active ftp data connection from the server can reach
310 * the client. 414 * the client.
415 * Extended format:
416 * "EPSV\r\n" when client requests server address from same family
417 * "EPSV 1\r\n" when client requests IPv4 server address
418 * "EPSV 2\r\n" when client requests IPv6 server address
419 * "EPSV ALL\r\n" - not supported
420 * EPRT with specified delimiter (ASCII 33..126), "|" by default:
421 * "EPRT |1|IPv4ADDR|PORT|\r\n" when client provides IPv4 addrport
422 * "EPRT |2|IPv6ADDR|PORT|\r\n" when client provides IPv6 addrport
311 */ 423 */
312static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, 424static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
313 struct sk_buff *skb, int *diff) 425 struct sk_buff *skb, int *diff,
426 struct ip_vs_iphdr *ipvsh)
314{ 427{
315 struct iphdr *iph;
316 struct tcphdr *th;
317 char *data, *data_start, *data_limit; 428 char *data, *data_start, *data_limit;
318 char *start, *end; 429 char *start, *end;
319 union nf_inet_addr to; 430 union nf_inet_addr to;
@@ -323,14 +434,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
323 /* no diff required for incoming packets */ 434 /* no diff required for incoming packets */
324 *diff = 0; 435 *diff = 0;
325 436
326#ifdef CONFIG_IP_VS_IPV6
327 /* This application helper doesn't work with IPv6 yet,
328 * so turn this into a no-op for IPv6 packets
329 */
330 if (cp->af == AF_INET6)
331 return 1;
332#endif
333
334 /* Only useful for established sessions */ 437 /* Only useful for established sessions */
335 if (cp->state != IP_VS_TCP_S_ESTABLISHED) 438 if (cp->state != IP_VS_TCP_S_ESTABLISHED)
336 return 1; 439 return 1;
@@ -339,27 +442,48 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
339 if (!skb_make_writable(skb, skb->len)) 442 if (!skb_make_writable(skb, skb->len))
340 return 0; 443 return 0;
341 444
342 /* 445 data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh);
343 * Detecting whether it is passive
344 */
345 iph = ip_hdr(skb);
346 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
347
348 /* Since there may be OPTIONS in the TCP packet and the HLEN is
349 the length of the header in 32-bit multiples, it is accurate
350 to calculate data address by th+HLEN*4 */
351 data = data_start = (char *)th + (th->doff << 2);
352 data_limit = skb_tail_pointer(skb); 446 data_limit = skb_tail_pointer(skb);
447 if (!data || data >= data_limit)
448 return 1;
353 449
354 while (data <= data_limit - 6) { 450 while (data <= data_limit - 6) {
355 if (strncasecmp(data, "PASV\r\n", 6) == 0) { 451 if (cp->af == AF_INET &&
452 strncasecmp(data, "PASV\r\n", 6) == 0) {
356 /* Passive mode on */ 453 /* Passive mode on */
357 IP_VS_DBG(7, "got PASV at %td of %td\n", 454 IP_VS_DBG(7, "got PASV at %td of %td\n",
358 data - data_start, 455 data - data_start,
359 data_limit - data_start); 456 data_limit - data_start);
360 cp->app_data = &ip_vs_ftp_pasv; 457 cp->app_data = (void *) IP_VS_FTP_PASV;
361 return 1; 458 return 1;
362 } 459 }
460
461 /* EPSV or EPSV<space><net-prt> */
462 if (strncasecmp(data, "EPSV", 4) == 0 &&
463 (data[4] == ' ' || data[4] == '\r')) {
464 if (data[4] == ' ') {
465 char proto = data[5];
466
467 if (data > data_limit - 7 || data[6] != '\r')
468 return 1;
469
470#ifdef CONFIG_IP_VS_IPV6
471 if (cp->af == AF_INET6 && proto == '2') {
472 } else
473#endif
474 if (cp->af == AF_INET && proto == '1') {
475 } else {
476 return 1;
477 }
478 }
479 /* Extended Passive mode on */
480 IP_VS_DBG(7, "got EPSV at %td of %td\n",
481 data - data_start,
482 data_limit - data_start);
483 cp->app_data = (void *) IP_VS_FTP_EPSV;
484 return 1;
485 }
486
363 data++; 487 data++;
364 } 488 }
365 489
@@ -370,33 +494,52 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
370 * then create a new connection entry for the coming data 494 * then create a new connection entry for the coming data
371 * connection. 495 * connection.
372 */ 496 */
373 if (ip_vs_ftp_get_addrport(data_start, data_limit, 497 if (cp->af == AF_INET &&
374 CLIENT_STRING, sizeof(CLIENT_STRING)-1, 498 ip_vs_ftp_get_addrport(data_start, data_limit,
375 ' ', '\r', &to.ip, &port, 499 CLIENT_STRING_PORT,
376 &start, &end) != 1) 500 sizeof(CLIENT_STRING_PORT)-1,
501 ' ', false, IP_VS_FTP_PORT,
502 &to, &port, cp->af,
503 &start, &end) == 1) {
504
505 IP_VS_DBG(7, "PORT %pI4:%u detected\n", &to.ip, ntohs(port));
506
507 /* Now update or create a connection entry for it */
508 IP_VS_DBG(7, "protocol %s %pI4:%u %pI4:%u\n",
509 ip_vs_proto_name(ipvsh->protocol),
510 &to.ip, ntohs(port), &cp->vaddr.ip,
511 ntohs(cp->vport)-1);
512 } else if (ip_vs_ftp_get_addrport(data_start, data_limit,
513 CLIENT_STRING_EPRT,
514 sizeof(CLIENT_STRING_EPRT)-1,
515 ' ', true, IP_VS_FTP_EPRT,
516 &to, &port, cp->af,
517 &start, &end) == 1) {
518
519 IP_VS_DBG_BUF(7, "EPRT %s:%u detected\n",
520 IP_VS_DBG_ADDR(cp->af, &to), ntohs(port));
521
522 /* Now update or create a connection entry for it */
523 IP_VS_DBG_BUF(7, "protocol %s %s:%u %s:%u\n",
524 ip_vs_proto_name(ipvsh->protocol),
525 IP_VS_DBG_ADDR(cp->af, &to), ntohs(port),
526 IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
527 ntohs(cp->vport)-1);
528 } else {
377 return 1; 529 return 1;
378 530 }
379 IP_VS_DBG(7, "PORT %pI4:%d detected\n", &to.ip, ntohs(port));
380 531
381 /* Passive mode off */ 532 /* Passive mode off */
382 cp->app_data = NULL; 533 cp->app_data = (void *) IP_VS_FTP_ACTIVE;
383
384 /*
385 * Now update or create a connection entry for it
386 */
387 IP_VS_DBG(7, "protocol %s %pI4:%d %pI4:%d\n",
388 ip_vs_proto_name(iph->protocol),
389 &to.ip, ntohs(port), &cp->vaddr.ip, 0);
390 534
391 { 535 {
392 struct ip_vs_conn_param p; 536 struct ip_vs_conn_param p;
393 ip_vs_conn_fill_param(cp->ipvs, AF_INET, 537 ip_vs_conn_fill_param(cp->ipvs, cp->af,
394 iph->protocol, &to, port, &cp->vaddr, 538 ipvsh->protocol, &to, port, &cp->vaddr,
395 htons(ntohs(cp->vport)-1), &p); 539 htons(ntohs(cp->vport)-1), &p);
396 n_cp = ip_vs_conn_in_get(&p); 540 n_cp = ip_vs_conn_in_get(&p);
397 if (!n_cp) { 541 if (!n_cp) {
398 /* This is ipv4 only */ 542 n_cp = ip_vs_conn_new(&p, cp->af, &cp->daddr,
399 n_cp = ip_vs_conn_new(&p, AF_INET, &cp->daddr,
400 htons(ntohs(cp->dport)-1), 543 htons(ntohs(cp->dport)-1),
401 IP_VS_CONN_F_NFCT, cp->dest, 544 IP_VS_CONN_F_NFCT, cp->dest,
402 skb->mark); 545 skb->mark);
@@ -454,7 +597,7 @@ static int __net_init __ip_vs_ftp_init(struct net *net)
454 ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]); 597 ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]);
455 if (ret) 598 if (ret)
456 goto err_unreg; 599 goto err_unreg;
457 pr_info("%s: loaded support on port[%d] = %d\n", 600 pr_info("%s: loaded support on port[%d] = %u\n",
458 app->name, i, ports[i]); 601 app->name, i, ports[i]);
459 } 602 }
460 return 0; 603 return 0;
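The extended FTP formats parsed above are rigid: an EPSV reply carries only a data port between a repeated one-character delimiter, e.g. "229 Entering Extended Passive Mode (|||6446|)", while EPRT also carries the family and address. A small user-space sketch of the EPSV-reply parse under those assumptions; parse_epsv_port is an illustrative helper, not the kernel's ip_vs_ftp_get_addrport:

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    /* Parse "...(|||6446|)" -> port, using the first character after '('
     * as the delimiter; the EPSV format allows any of ASCII 33..126. */
    static int parse_epsv_port(const char *line, unsigned int *port)
    {
        const char *p = strchr(line, '(');
        char d;

        if (!p)
            return -1;
        d = *++p;                       /* delimiter, usually '|' */
        if (d < 33 || d > 126)
            return -1;
        if (p[1] != d || p[2] != d)     /* family and address are empty */
            return -1;
        p += 3;
        *port = 0;
        while (isdigit((unsigned char)*p))
            *port = *port * 10 + (*p++ - '0');
        if (*p != d || !*port || *port > 65535)
            return -1;
        return 0;
    }

    int main(void)
    {
        unsigned int port;

        if (!parse_epsv_port("229 Entering Extended Passive Mode (|||6446|)", &port))
            printf("data port %u\n", port);
        return 0;
    }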
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 3057e453bf31..b9f375e6dc93 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -48,6 +48,7 @@
48#include <linux/kernel.h> 48#include <linux/kernel.h>
49#include <linux/skbuff.h> 49#include <linux/skbuff.h>
50#include <linux/jiffies.h> 50#include <linux/jiffies.h>
51#include <linux/hash.h>
51 52
52/* for sysctl */ 53/* for sysctl */
53#include <linux/fs.h> 54#include <linux/fs.h>
@@ -160,7 +161,7 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
160 addr_fold = addr->ip6[0]^addr->ip6[1]^ 161 addr_fold = addr->ip6[0]^addr->ip6[1]^
161 addr->ip6[2]^addr->ip6[3]; 162 addr->ip6[2]^addr->ip6[3];
162#endif 163#endif
163 return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK; 164 return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS);
164} 165}
165 166
166 167
@@ -371,6 +372,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
371 tbl->counter = 1; 372 tbl->counter = 1;
372 tbl->dead = false; 373 tbl->dead = false;
373 tbl->svc = svc; 374 tbl->svc = svc;
375 atomic_set(&tbl->entries, 0);
374 376
375 /* 377 /*
376 * Hook periodic timer for garbage collection 378 * Hook periodic timer for garbage collection
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 92adc04557ed..542c4949937a 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -47,6 +47,7 @@
47#include <linux/jiffies.h> 47#include <linux/jiffies.h>
48#include <linux/list.h> 48#include <linux/list.h>
49#include <linux/slab.h> 49#include <linux/slab.h>
50#include <linux/hash.h>
50 51
51/* for sysctl */ 52/* for sysctl */
52#include <linux/fs.h> 53#include <linux/fs.h>
@@ -323,7 +324,7 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
323 addr_fold = addr->ip6[0]^addr->ip6[1]^ 324 addr_fold = addr->ip6[0]^addr->ip6[1]^
324 addr->ip6[2]^addr->ip6[3]; 325 addr->ip6[2]^addr->ip6[3];
325#endif 326#endif
326 return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; 327 return hash_32(ntohl(addr_fold), IP_VS_LBLCR_TAB_BITS);
327} 328}
328 329
329 330
@@ -534,6 +535,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
534 tbl->counter = 1; 535 tbl->counter = 1;
535 tbl->dead = false; 536 tbl->dead = false;
536 tbl->svc = svc; 537 tbl->svc = svc;
538 atomic_set(&tbl->entries, 0);
537 539
538 /* 540 /*
539 * Hook periodic timer for garbage collection 541 * Hook periodic timer for garbage collection
diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
new file mode 100644
index 000000000000..0f795b186eb3
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_mh.c
@@ -0,0 +1,540 @@
1// SPDX-License-Identifier: GPL-2.0
2/* IPVS: Maglev Hashing scheduling module
3 *
4 * Authors: Inju Song <inju.song@navercorp.com>
5 *
6 */
7
  8/* The mh algorithm assigns a preference list of all the lookup
  9 * table positions to each destination and populates the table with
 10 * the most-preferred position of destinations. Then it selects the
 11 * destination with the hash key of the source IP address by looking
 12 * up the lookup table.
13 *
14 * The algorithm is detailed in:
 15 * [3.4 Consistent Hashing]
 16 * https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
17 *
18 */
19
20#define KMSG_COMPONENT "IPVS"
21#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
22
23#include <linux/ip.h>
24#include <linux/slab.h>
25#include <linux/module.h>
26#include <linux/kernel.h>
27#include <linux/skbuff.h>
28
29#include <net/ip_vs.h>
30
31#include <linux/siphash.h>
32#include <linux/bitops.h>
33#include <linux/gcd.h>
34
35#define IP_VS_SVC_F_SCHED_MH_FALLBACK IP_VS_SVC_F_SCHED1 /* MH fallback */
36#define IP_VS_SVC_F_SCHED_MH_PORT IP_VS_SVC_F_SCHED2 /* MH use port */
37
38struct ip_vs_mh_lookup {
39 struct ip_vs_dest __rcu *dest; /* real server (cache) */
40};
41
42struct ip_vs_mh_dest_setup {
43 unsigned int offset; /* starting offset */
44 unsigned int skip; /* skip */
45 unsigned int perm; /* next_offset */
46 int turns; /* weight / gcd() and rshift */
47};
48
49/* Available prime numbers for MH table */
50static int primes[] = {251, 509, 1021, 2039, 4093,
51 8191, 16381, 32749, 65521, 131071};
52
53/* For IPVS MH entry hash table */
54#ifndef CONFIG_IP_VS_MH_TAB_INDEX
55#define CONFIG_IP_VS_MH_TAB_INDEX 12
56#endif
57#define IP_VS_MH_TAB_BITS (CONFIG_IP_VS_MH_TAB_INDEX / 2)
58#define IP_VS_MH_TAB_INDEX (CONFIG_IP_VS_MH_TAB_INDEX - 8)
59#define IP_VS_MH_TAB_SIZE primes[IP_VS_MH_TAB_INDEX]
60
61struct ip_vs_mh_state {
62 struct rcu_head rcu_head;
63 struct ip_vs_mh_lookup *lookup;
64 struct ip_vs_mh_dest_setup *dest_setup;
65 hsiphash_key_t hash1, hash2;
66 int gcd;
67 int rshift;
68};
69
70static inline void generate_hash_secret(hsiphash_key_t *hash1,
71 hsiphash_key_t *hash2)
72{
73 hash1->key[0] = 2654435761UL;
74 hash1->key[1] = 2654435761UL;
75
76 hash2->key[0] = 2654446892UL;
77 hash2->key[1] = 2654446892UL;
78}
79
80/* Helper function to determine if server is unavailable */
81static inline bool is_unavailable(struct ip_vs_dest *dest)
82{
83 return atomic_read(&dest->weight) <= 0 ||
84 dest->flags & IP_VS_DEST_F_OVERLOAD;
85}
86
87/* Returns hash value for IPVS MH entry */
88static inline unsigned int
89ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr,
90 __be16 port, hsiphash_key_t *key, unsigned int offset)
91{
92 unsigned int v;
93 __be32 addr_fold = addr->ip;
94
95#ifdef CONFIG_IP_VS_IPV6
96 if (af == AF_INET6)
97 addr_fold = addr->ip6[0] ^ addr->ip6[1] ^
98 addr->ip6[2] ^ addr->ip6[3];
99#endif
100 v = (offset + ntohs(port) + ntohl(addr_fold));
101 return hsiphash(&v, sizeof(v), key);
102}
103
104/* Reset all the hash buckets of the specified table. */
105static void ip_vs_mh_reset(struct ip_vs_mh_state *s)
106{
107 int i;
108 struct ip_vs_mh_lookup *l;
109 struct ip_vs_dest *dest;
110
111 l = &s->lookup[0];
112 for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
113 dest = rcu_dereference_protected(l->dest, 1);
114 if (dest) {
115 ip_vs_dest_put(dest);
116 RCU_INIT_POINTER(l->dest, NULL);
117 }
118 l++;
119 }
120}
121
122static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
123 struct ip_vs_service *svc)
124{
125 struct list_head *p;
126 struct ip_vs_mh_dest_setup *ds;
127 struct ip_vs_dest *dest;
128 int lw;
129
 130	/* If gcd is smaller than 1, the number of dests is zero or
 131	 * all dests have a last_weight of zero. So, skip the
 132	 * permutation for the dests.
133 */
134 if (s->gcd < 1)
135 return 0;
136
137 /* Set dest_setup for the dests permutation */
138 p = &svc->destinations;
139 ds = &s->dest_setup[0];
140 while ((p = p->next) != &svc->destinations) {
141 dest = list_entry(p, struct ip_vs_dest, n_list);
142
143 ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
144 dest->port, &s->hash1, 0) %
145 IP_VS_MH_TAB_SIZE;
146 ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
147 dest->port, &s->hash2, 0) %
148 (IP_VS_MH_TAB_SIZE - 1) + 1;
149 ds->perm = ds->offset;
150
151 lw = atomic_read(&dest->last_weight);
152 ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0);
153 ds++;
154 }
155
156 return 0;
157}
158
159static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
160 struct ip_vs_service *svc)
161{
162 int n, c, dt_count;
163 unsigned long *table;
164 struct list_head *p;
165 struct ip_vs_mh_dest_setup *ds;
166 struct ip_vs_dest *dest, *new_dest;
167
 168	/* If gcd is smaller than 1, the number of dests is zero or
 169	 * all dests have a last_weight of zero. So, skip the
 170	 * population for the dests and reset the lookup table.
171 */
172 if (s->gcd < 1) {
173 ip_vs_mh_reset(s);
174 return 0;
175 }
176
177 table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
178 sizeof(unsigned long), GFP_KERNEL);
179 if (!table)
180 return -ENOMEM;
181
182 p = &svc->destinations;
183 n = 0;
184 dt_count = 0;
185 while (n < IP_VS_MH_TAB_SIZE) {
186 if (p == &svc->destinations)
187 p = p->next;
188
189 ds = &s->dest_setup[0];
190 while (p != &svc->destinations) {
191 /* Ignore added server with zero weight */
192 if (ds->turns < 1) {
193 p = p->next;
194 ds++;
195 continue;
196 }
197
198 c = ds->perm;
199 while (test_bit(c, table)) {
200 /* Add skip, mod IP_VS_MH_TAB_SIZE */
201 ds->perm += ds->skip;
202 if (ds->perm >= IP_VS_MH_TAB_SIZE)
203 ds->perm -= IP_VS_MH_TAB_SIZE;
204 c = ds->perm;
205 }
206
207 __set_bit(c, table);
208
209 dest = rcu_dereference_protected(s->lookup[c].dest, 1);
210 new_dest = list_entry(p, struct ip_vs_dest, n_list);
211 if (dest != new_dest) {
212 if (dest)
213 ip_vs_dest_put(dest);
214 ip_vs_dest_hold(new_dest);
215 RCU_INIT_POINTER(s->lookup[c].dest, new_dest);
216 }
217
218 if (++n == IP_VS_MH_TAB_SIZE)
219 goto out;
220
221 if (++dt_count >= ds->turns) {
222 dt_count = 0;
223 p = p->next;
224 ds++;
225 }
226 }
227 }
228
229out:
230 kfree(table);
231 return 0;
232}
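
The loop above is the Maglev table-filling round: destinations take turns claiming the next free slot along their own permutation, and ds->turns lets heavier destinations claim several slots per round. A compact userspace sketch under the same idea, with toy sizes, perm initialised here rather than in a separate permutate step, and at least one destination assumed to have turns > 0 (which the gcd < 1 check above guarantees in the kernel):

#include <stdbool.h>
#include <string.h>

#define M	7	/* toy table size (prime) */
#define NDEST	3

struct dest_setup {
	unsigned int offset, skip, perm, turns;
};

static void populate(int lookup[M], struct dest_setup ds[NDEST])
{
	bool used[M];
	unsigned int n = 0, dt = 0, d, c;

	memset(used, 0, sizeof(used));
	for (d = 0; d < NDEST; d++)
		ds[d].perm = ds[d].offset;

	d = 0;
	while (n < M) {
		if (ds[d].turns) {
			/* next unclaimed slot along this dest's permutation */
			c = ds[d].perm;
			while (used[c]) {
				ds[d].perm = (ds[d].perm + ds[d].skip) % M;
				c = ds[d].perm;
			}
			used[c] = true;
			lookup[c] = d;
			n++;
		}
		/* move on once this dest has used up its turns for the round */
		if (ds[d].turns == 0 || ++dt >= ds[d].turns) {
			dt = 0;
			d = (d + 1) % NDEST;
		}
	}
}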
233
234/* Get ip_vs_dest associated with supplied parameters. */
235static inline struct ip_vs_dest *
236ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
237 const union nf_inet_addr *addr, __be16 port)
238{
239 unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
240 % IP_VS_MH_TAB_SIZE;
241 struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);
242
243 return (!dest || is_unavailable(dest)) ? NULL : dest;
244}
245
246/* As ip_vs_mh_get, but with fallback if selected server is unavailable */
247static inline struct ip_vs_dest *
248ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
249 const union nf_inet_addr *addr, __be16 port)
250{
251 unsigned int offset, roffset;
252 unsigned int hash, ihash;
253 struct ip_vs_dest *dest;
254
255 /* First try the dest it's supposed to go to */
256 ihash = ip_vs_mh_hashkey(svc->af, addr, port,
257 &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
258 dest = rcu_dereference(s->lookup[ihash].dest);
259 if (!dest)
260 return NULL;
261 if (!is_unavailable(dest))
262 return dest;
263
264 IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting",
265 IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
266
267 /* If the original dest is unavailable, loop around the table
268 * starting from ihash to find a new dest
269 */
270 for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
271 roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
272 hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
273 roffset) % IP_VS_MH_TAB_SIZE;
274 dest = rcu_dereference(s->lookup[hash].dest);
275 if (!dest)
276 break;
277 if (!is_unavailable(dest))
278 return dest;
279 IP_VS_DBG_BUF(6,
280 "MH: selected unavailable server %s:%u (offset %u), reselecting",
281 IP_VS_DBG_ADDR(dest->af, &dest->addr),
282 ntohs(dest->port), roffset);
283 }
284
285 return NULL;
286}
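
Reduced to its arithmetic, the fallback walk re-hashes with offsets taken relative to the originally selected slot until an available server turns up. A sketch with hypothetical hash() and available() callbacks standing in for ip_vs_mh_hashkey() and !is_unavailable():

static int mh_lookup_fallback(unsigned int ihash, unsigned int tab_size,
			      unsigned int (*hash)(unsigned int offset),
			      int (*available)(unsigned int slot))
{
	unsigned int offset, roffset, slot;

	for (offset = 0; offset < tab_size; offset++) {
		/* probe offsets relative to the originally selected slot */
		roffset = (offset + ihash) % tab_size;
		slot = hash(roffset) % tab_size;
		if (available(slot))
			return slot;
	}
	return -1;	/* every probed slot was unavailable */
}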
287
288/* Assign all the hash buckets of the specified table to the service's dests. */
289static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
290 struct ip_vs_service *svc)
291{
292 int ret;
293
294 if (svc->num_dests > IP_VS_MH_TAB_SIZE)
295 return -EINVAL;
296
297 if (svc->num_dests >= 1) {
298 s->dest_setup = kcalloc(svc->num_dests,
299 sizeof(struct ip_vs_mh_dest_setup),
300 GFP_KERNEL);
301 if (!s->dest_setup)
302 return -ENOMEM;
303 }
304
305 ip_vs_mh_permutate(s, svc);
306
307 ret = ip_vs_mh_populate(s, svc);
308 if (ret < 0)
309 goto out;
310
311 IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n",
312 IP_VS_DBG_ADDR(svc->af, &svc->addr),
313 ntohs(svc->port));
314
315out:
316 if (svc->num_dests >= 1) {
317 kfree(s->dest_setup);
318 s->dest_setup = NULL;
319 }
320 return ret;
321}
322
323static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
324{
325 struct ip_vs_dest *dest;
326 int weight;
327 int g = 0;
328
329 list_for_each_entry(dest, &svc->destinations, n_list) {
330 weight = atomic_read(&dest->last_weight);
331 if (weight > 0) {
332 if (g > 0)
333 g = gcd(weight, g);
334 else
335 g = weight;
336 }
337 }
338 return g;
339}
340
341/* To avoid assigning huge weights to the MH table entries,
342 * calculate a shift value from the gcd.
343 */
344static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
345{
346 struct ip_vs_dest *dest;
347 int new_weight, weight = 0;
348 int mw, shift;
349
350 /* If gcd is smaller than 1, either there are no dests or
351 * every dest has a last_weight of zero, so return a
352 * shift value of zero.
353 */
354 if (gcd < 1)
355 return 0;
356
357 list_for_each_entry(dest, &svc->destinations, n_list) {
358 new_weight = atomic_read(&dest->last_weight);
359 if (new_weight > weight)
360 weight = new_weight;
361 }
362
363 /* Because gcd is greater than zero, the maximum weight
364 * is also greater than zero, so the division below is safe.
365 */
366 mw = weight / gcd;
367
368 /* shift = occupied bits of weight/gcd - MH highest bits */
369 shift = fls(mw) - IP_VS_MH_TAB_BITS;
370 return (shift >= 0) ? shift : 0;
371}
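
A worked example of the gcd/rshift scaling as a standalone program; TAB_BITS here is an illustrative stand-in for IP_VS_MH_TAB_BITS, and the kernel additionally keeps at least one turn for any non-zero weight (the ?: fallback in ip_vs_mh_permutate).

#include <stdio.h>

#define TAB_BITS 8	/* stand-in for IP_VS_MH_TAB_BITS */

static int gcd_int(int a, int b)
{
	while (b) {
		int t = a % b;

		a = b;
		b = t;
	}
	return a;
}

static int fls32(unsigned int x)	/* highest set bit, 1-based */
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	int w[] = { 1000, 2000, 3000 }, n = 3, i, g = 0, mw = 0, shift;

	for (i = 0; i < n; i++)
		g = g ? gcd_int(w[i], g) : w[i];
	for (i = 0; i < n; i++)
		if (w[i] > mw)
			mw = w[i];
	shift = fls32(mw / g) - TAB_BITS;
	if (shift < 0)
		shift = 0;
	for (i = 0; i < n; i++)
		printf("weight %d -> turns %d\n", w[i], (w[i] / g) >> shift);
	/* gcd 1000, mw/gcd 3, shift 0: turns come out as 1, 2 and 3 */
	return 0;
}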
372
373static void ip_vs_mh_state_free(struct rcu_head *head)
374{
375 struct ip_vs_mh_state *s;
376
377 s = container_of(head, struct ip_vs_mh_state, rcu_head);
378 kfree(s->lookup);
379 kfree(s);
380}
381
382static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
383{
384 int ret;
385 struct ip_vs_mh_state *s;
386
387 /* Allocate the MH table for this service */
388 s = kzalloc(sizeof(*s), GFP_KERNEL);
389 if (!s)
390 return -ENOMEM;
391
392 s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
393 GFP_KERNEL);
394 if (!s->lookup) {
395 kfree(s);
396 return -ENOMEM;
397 }
398
399 generate_hash_secret(&s->hash1, &s->hash2);
400 s->gcd = ip_vs_mh_gcd_weight(svc);
401 s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
402
403 IP_VS_DBG(6,
404 "MH lookup table (memory=%zdbytes) allocated for current service\n",
405 sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
406
407 /* Assign the lookup table with current dests */
408 ret = ip_vs_mh_reassign(s, svc);
409 if (ret < 0) {
410 ip_vs_mh_reset(s);
411 ip_vs_mh_state_free(&s->rcu_head);
412 return ret;
413 }
414
415 /* No more failures, attach state */
416 svc->sched_data = s;
417 return 0;
418}
419
420static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
421{
422 struct ip_vs_mh_state *s = svc->sched_data;
423
424 /* Got to clean up lookup entry here */
425 ip_vs_mh_reset(s);
426
427 call_rcu(&s->rcu_head, ip_vs_mh_state_free);
428 IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
429 sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
430}
431
432static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
433 struct ip_vs_dest *dest)
434{
435 struct ip_vs_mh_state *s = svc->sched_data;
436
437 s->gcd = ip_vs_mh_gcd_weight(svc);
438 s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
439
440 /* Assign the lookup table with the updated service */
441 return ip_vs_mh_reassign(s, svc);
442}
443
444/* Helper function to get port number */
445static inline __be16
446ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
447{
448 __be16 _ports[2], *ports;
449
450 /* At this point we know that we have a valid packet of some kind.
451 * Because ICMP packets are only guaranteed to have the first 8
452 * bytes, let's just grab the ports. Fortunately they're in the
453 * same position for all three of the protocols we care about.
454 */
455 switch (iph->protocol) {
456 case IPPROTO_TCP:
457 case IPPROTO_UDP:
458 case IPPROTO_SCTP:
459 ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
460 &_ports);
461 if (unlikely(!ports))
462 return 0;
463
464 if (likely(!ip_vs_iph_inverse(iph)))
465 return ports[0];
466 else
467 return ports[1];
468 default:
469 return 0;
470 }
471}
472
473/* Maglev Hashing scheduling */
474static struct ip_vs_dest *
475ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
476 struct ip_vs_iphdr *iph)
477{
478 struct ip_vs_dest *dest;
479 struct ip_vs_mh_state *s;
480 __be16 port = 0;
481 const union nf_inet_addr *hash_addr;
482
483 hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;
484
485 IP_VS_DBG(6, "%s : Scheduling...\n", __func__);
486
487 if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT)
488 port = ip_vs_mh_get_port(skb, iph);
489
490 s = (struct ip_vs_mh_state *)svc->sched_data;
491
492 if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK)
493 dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port);
494 else
495 dest = ip_vs_mh_get(svc, s, hash_addr, port);
496
497 if (!dest) {
498 ip_vs_scheduler_err(svc, "no destination available");
499 return NULL;
500 }
501
502 IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n",
503 IP_VS_DBG_ADDR(svc->af, hash_addr),
504 ntohs(port),
505 IP_VS_DBG_ADDR(dest->af, &dest->addr),
506 ntohs(dest->port));
507
508 return dest;
509}
510
511/* IPVS MH Scheduler structure */
512static struct ip_vs_scheduler ip_vs_mh_scheduler = {
513 .name = "mh",
514 .refcnt = ATOMIC_INIT(0),
515 .module = THIS_MODULE,
516 .n_list = LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list),
517 .init_service = ip_vs_mh_init_svc,
518 .done_service = ip_vs_mh_done_svc,
519 .add_dest = ip_vs_mh_dest_changed,
520 .del_dest = ip_vs_mh_dest_changed,
521 .upd_dest = ip_vs_mh_dest_changed,
522 .schedule = ip_vs_mh_schedule,
523};
524
525static int __init ip_vs_mh_init(void)
526{
527 return register_ip_vs_scheduler(&ip_vs_mh_scheduler);
528}
529
530static void __exit ip_vs_mh_cleanup(void)
531{
532 unregister_ip_vs_scheduler(&ip_vs_mh_scheduler);
533 rcu_barrier();
534}
535
536module_init(ip_vs_mh_init);
537module_exit(ip_vs_mh_cleanup);
538MODULE_DESCRIPTION("Maglev hashing ipvs scheduler");
539MODULE_LICENSE("GPL v2");
540MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>");
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index 6cf3fd81a5ec..eb8b9c883889 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -67,15 +67,20 @@
67#include <net/netfilter/nf_conntrack_zones.h> 67#include <net/netfilter/nf_conntrack_zones.h>
68 68
69 69
70#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u" 70#define FMT_TUPLE "%s:%u->%s:%u/%u"
71#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \ 71#define ARG_TUPLE(T) IP_VS_DBG_ADDR((T)->src.l3num, &(T)->src.u3), \
72 &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \ 72 ntohs((T)->src.u.all), \
73 IP_VS_DBG_ADDR((T)->src.l3num, &(T)->dst.u3), \
74 ntohs((T)->dst.u.all), \
73 (T)->dst.protonum 75 (T)->dst.protonum
74 76
75#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u" 77#define FMT_CONN "%s:%u->%s:%u->%s:%u/%u:%u"
76#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \ 78#define ARG_CONN(C) IP_VS_DBG_ADDR((C)->af, &((C)->caddr)), \
77 &((C)->vaddr.ip), ntohs((C)->vport), \ 79 ntohs((C)->cport), \
78 &((C)->daddr.ip), ntohs((C)->dport), \ 80 IP_VS_DBG_ADDR((C)->af, &((C)->vaddr)), \
81 ntohs((C)->vport), \
82 IP_VS_DBG_ADDR((C)->daf, &((C)->daddr)), \
83 ntohs((C)->dport), \
79 (C)->protocol, (C)->state 84 (C)->protocol, (C)->state
80 85
81void 86void
@@ -127,13 +132,17 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
127 new_tuple.dst.protonum != IPPROTO_ICMPV6) 132 new_tuple.dst.protonum != IPPROTO_ICMPV6)
128 new_tuple.dst.u.tcp.port = cp->vport; 133 new_tuple.dst.u.tcp.port = cp->vport;
129 } 134 }
130 IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, " 135 IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
131 "ctinfo=%d, old reply=" FMT_TUPLE 136 "ctinfo=%d, old reply=" FMT_TUPLE "\n",
132 ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n", 137 __func__, ct, ct->status, ctinfo,
133 __func__, ct, ct->status, ctinfo, 138 ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple));
134 ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple), 139 IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
135 ARG_TUPLE(&new_tuple), ARG_CONN(cp)); 140 "ctinfo=%d, new reply=" FMT_TUPLE "\n",
141 __func__, ct, ct->status, ctinfo,
142 ARG_TUPLE(&new_tuple));
136 nf_conntrack_alter_reply(ct, &new_tuple); 143 nf_conntrack_alter_reply(ct, &new_tuple);
144 IP_VS_DBG_BUF(7, "%s: Updated conntrack ct=%p for cp=" FMT_CONN "\n",
145 __func__, ct, ARG_CONN(cp));
137} 146}
138 147
139int ip_vs_confirm_conntrack(struct sk_buff *skb) 148int ip_vs_confirm_conntrack(struct sk_buff *skb)
@@ -152,9 +161,6 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
152 struct ip_vs_conn_param p; 161 struct ip_vs_conn_param p;
153 struct net *net = nf_ct_net(ct); 162 struct net *net = nf_ct_net(ct);
154 163
155 if (exp->tuple.src.l3num != PF_INET)
156 return;
157
158 /* 164 /*
159 * We assume that no NF locks are held before this callback. 165 * We assume that no NF locks are held before this callback.
160 * ip_vs_conn_out_get and ip_vs_conn_in_get should match their 166 * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
@@ -171,19 +177,15 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
171 cp = ip_vs_conn_out_get(&p); 177 cp = ip_vs_conn_out_get(&p);
172 if (cp) { 178 if (cp) {
173 /* Change reply CLIENT->RS to CLIENT->VS */ 179 /* Change reply CLIENT->RS to CLIENT->VS */
180 IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found inout cp="
181 FMT_CONN "\n",
182 __func__, ct, ct->status, ARG_CONN(cp));
174 new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; 183 new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
175 IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " 184 IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple="
176 FMT_TUPLE ", found inout cp=" FMT_CONN "\n", 185 FMT_TUPLE "\n",
177 __func__, ct, ct->status, 186 __func__, ct, ARG_TUPLE(&new_reply));
178 ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
179 ARG_CONN(cp));
180 new_reply.dst.u3 = cp->vaddr; 187 new_reply.dst.u3 = cp->vaddr;
181 new_reply.dst.u.tcp.port = cp->vport; 188 new_reply.dst.u.tcp.port = cp->vport;
182 IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
183 ", inout cp=" FMT_CONN "\n",
184 __func__, ct,
185 ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
186 ARG_CONN(cp));
187 goto alter; 189 goto alter;
188 } 190 }
189 191
@@ -191,25 +193,21 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
191 cp = ip_vs_conn_in_get(&p); 193 cp = ip_vs_conn_in_get(&p);
192 if (cp) { 194 if (cp) {
193 /* Change reply VS->CLIENT to RS->CLIENT */ 195 /* Change reply VS->CLIENT to RS->CLIENT */
196 IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found outin cp="
197 FMT_CONN "\n",
198 __func__, ct, ct->status, ARG_CONN(cp));
194 new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; 199 new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
195 IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " 200 IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple="
196 FMT_TUPLE ", found outin cp=" FMT_CONN "\n", 201 FMT_TUPLE "\n",
197 __func__, ct, ct->status, 202 __func__, ct, ARG_TUPLE(&new_reply));
198 ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
199 ARG_CONN(cp));
200 new_reply.src.u3 = cp->daddr; 203 new_reply.src.u3 = cp->daddr;
201 new_reply.src.u.tcp.port = cp->dport; 204 new_reply.src.u.tcp.port = cp->dport;
202 IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
203 FMT_TUPLE ", outin cp=" FMT_CONN "\n",
204 __func__, ct,
205 ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
206 ARG_CONN(cp));
207 goto alter; 205 goto alter;
208 } 206 }
209 207
210 IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE 208 IP_VS_DBG_BUF(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
211 " - unknown expect\n", 209 " - unknown expect\n",
212 __func__, ct, ct->status, ARG_TUPLE(orig)); 210 __func__, ct, ct->status, ARG_TUPLE(orig));
213 return; 211 return;
214 212
215alter: 213alter:
@@ -247,8 +245,8 @@ void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
247 245
248 exp->expectfn = ip_vs_nfct_expect_callback; 246 exp->expectfn = ip_vs_nfct_expect_callback;
249 247
250 IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", 248 IP_VS_DBG_BUF(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
251 __func__, ct, ARG_TUPLE(&exp->tuple)); 249 __func__, ct, ARG_TUPLE(&exp->tuple));
252 nf_ct_expect_related(exp); 250 nf_ct_expect_related(exp);
253 nf_ct_expect_put(exp); 251 nf_ct_expect_put(exp);
254} 252}
@@ -274,26 +272,25 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
274 tuple.dst.u3 = cp->vaddr; 272 tuple.dst.u3 = cp->vaddr;
275 tuple.dst.u.all = cp->vport; 273 tuple.dst.u.all = cp->vport;
276 274
277 IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE 275 IP_VS_DBG_BUF(7, "%s: dropping conntrack for conn " FMT_CONN "\n",
278 " for conn " FMT_CONN "\n", 276 __func__, ARG_CONN(cp));
279 __func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
280 277
281 h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple); 278 h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple);
282 if (h) { 279 if (h) {
283 ct = nf_ct_tuplehash_to_ctrack(h); 280 ct = nf_ct_tuplehash_to_ctrack(h);
284 if (nf_ct_kill(ct)) { 281 if (nf_ct_kill(ct)) {
285 IP_VS_DBG(7, "%s: ct=%p, deleted conntrack for tuple=" 282 IP_VS_DBG_BUF(7, "%s: ct=%p deleted for tuple="
286 FMT_TUPLE "\n", 283 FMT_TUPLE "\n",
287 __func__, ct, ARG_TUPLE(&tuple)); 284 __func__, ct, ARG_TUPLE(&tuple));
288 } else { 285 } else {
289 IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple=" 286 IP_VS_DBG_BUF(7, "%s: ct=%p, no conntrack for tuple="
290 FMT_TUPLE "\n", 287 FMT_TUPLE "\n",
291 __func__, ct, ARG_TUPLE(&tuple)); 288 __func__, ct, ARG_TUPLE(&tuple));
292 } 289 }
293 nf_ct_put(ct); 290 nf_ct_put(ct);
294 } else { 291 } else {
295 IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", 292 IP_VS_DBG_BUF(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
296 __func__, ARG_TUPLE(&tuple)); 293 __func__, ARG_TUPLE(&tuple));
297 } 294 }
298} 295}
299 296
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index eff7569824e5..3250c4a1111e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -109,7 +109,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
109 return 0; 109 return 0;
110 110
111 /* Call application helper if needed */ 111 /* Call application helper if needed */
112 ret = ip_vs_app_pkt_out(cp, skb); 112 ret = ip_vs_app_pkt_out(cp, skb, iph);
113 if (ret == 0) 113 if (ret == 0)
114 return 0; 114 return 0;
115 /* ret=2: csum update is needed after payload mangling */ 115 /* ret=2: csum update is needed after payload mangling */
@@ -156,7 +156,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
156 return 0; 156 return 0;
157 157
158 /* Call application helper if needed */ 158 /* Call application helper if needed */
159 ret = ip_vs_app_pkt_in(cp, skb); 159 ret = ip_vs_app_pkt_in(cp, skb, iph);
160 if (ret == 0) 160 if (ret == 0)
161 return 0; 161 return 0;
162 /* ret=2: csum update is needed after payload mangling */ 162 /* ret=2: csum update is needed after payload mangling */
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index bcd9b7bde4ee..80d10ad12a15 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -170,7 +170,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
170 return 0; 170 return 0;
171 171
172 /* Call application helper if needed */ 172 /* Call application helper if needed */
173 if (!(ret = ip_vs_app_pkt_out(cp, skb))) 173 if (!(ret = ip_vs_app_pkt_out(cp, skb, iph)))
174 return 0; 174 return 0;
175 /* ret=2: csum update is needed after payload mangling */ 175 /* ret=2: csum update is needed after payload mangling */
176 if (ret == 1) 176 if (ret == 1)
@@ -251,7 +251,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
251 * Attempt ip_vs_app call. 251 * Attempt ip_vs_app call.
252 * It will fix ip_vs_conn and iph ack_seq stuff 252 * It will fix ip_vs_conn and iph ack_seq stuff
253 */ 253 */
254 if (!(ret = ip_vs_app_pkt_in(cp, skb))) 254 if (!(ret = ip_vs_app_pkt_in(cp, skb, iph)))
255 return 0; 255 return 0;
256 /* ret=2: csum update is needed after payload mangling */ 256 /* ret=2: csum update is needed after payload mangling */
257 if (ret == 1) 257 if (ret == 1)
@@ -436,7 +436,7 @@ static bool tcp_state_active(int state)
436 return tcp_state_active_table[state]; 436 return tcp_state_active_table[state];
437} 437}
438 438
439static struct tcp_states_t tcp_states [] = { 439static struct tcp_states_t tcp_states[] = {
440/* INPUT */ 440/* INPUT */
441/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ 441/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
442/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, 442/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
@@ -459,7 +459,7 @@ static struct tcp_states_t tcp_states [] = {
459/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, 459/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
460}; 460};
461 461
462static struct tcp_states_t tcp_states_dos [] = { 462static struct tcp_states_t tcp_states_dos[] = {
463/* INPUT */ 463/* INPUT */
464/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ 464/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
465/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, 465/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index c15ef7c2a1fa..e0ef11c3691e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -162,7 +162,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
162 /* 162 /*
163 * Call application helper if needed 163 * Call application helper if needed
164 */ 164 */
165 if (!(ret = ip_vs_app_pkt_out(cp, skb))) 165 if (!(ret = ip_vs_app_pkt_out(cp, skb, iph)))
166 return 0; 166 return 0;
167 /* ret=2: csum update is needed after payload mangling */ 167 /* ret=2: csum update is needed after payload mangling */
168 if (ret == 1) 168 if (ret == 1)
@@ -246,7 +246,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
246 * Attempt ip_vs_app call. 246 * Attempt ip_vs_app call.
247 * It will fix ip_vs_conn 247 * It will fix ip_vs_conn
248 */ 248 */
249 if (!(ret = ip_vs_app_pkt_in(cp, skb))) 249 if (!(ret = ip_vs_app_pkt_in(cp, skb, iph)))
250 return 0; 250 return 0;
251 /* ret=2: csum update is needed after payload mangling */ 251 /* ret=2: csum update is needed after payload mangling */
252 if (ret == 1) 252 if (ret == 1)
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 16aaac6eedc9..1e01c782583a 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -96,7 +96,8 @@ ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
96 addr_fold = addr->ip6[0]^addr->ip6[1]^ 96 addr_fold = addr->ip6[0]^addr->ip6[1]^
97 addr->ip6[2]^addr->ip6[3]; 97 addr->ip6[2]^addr->ip6[3];
98#endif 98#endif
99 return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) & 99 return (offset + hash_32(ntohs(port) + ntohl(addr_fold),
100 IP_VS_SH_TAB_BITS)) &
100 IP_VS_SH_TAB_MASK; 101 IP_VS_SH_TAB_MASK;
101} 102}
102 103
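
The practical difference in the hunk above is which bits of the multiplicative hash survive: the old code masked off the low bits of the product, while hash_32() keeps the top bits, which are far better mixed. A rough userspace illustration; the offset term is dropped for brevity, and the golden-ratio constant shown is the one hash_32() is generally built on (both are assumptions here, not taken from the patch):

#include <stdint.h>

#define TAB_BITS 8
#define TAB_MASK ((1u << TAB_BITS) - 1)

static uint32_t old_bucket(uint32_t v)
{
	return (v * 2654435761u) & TAB_MASK;		/* low bits of the product */
}

static uint32_t new_bucket(uint32_t v)
{
	return (v * 0x61C88647u) >> (32 - TAB_BITS);	/* top bits, hash_32() style */
}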
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 4527921b1c3a..ba0a0fd045c8 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -266,12 +266,13 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs,
266 266
267 /* check and decrement ttl */ 267 /* check and decrement ttl */
268 if (ipv6_hdr(skb)->hop_limit <= 1) { 268 if (ipv6_hdr(skb)->hop_limit <= 1) {
269 struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
270
269 /* Force OUTPUT device used as source address */ 271 /* Force OUTPUT device used as source address */
270 skb->dev = dst->dev; 272 skb->dev = dst->dev;
271 icmpv6_send(skb, ICMPV6_TIME_EXCEED, 273 icmpv6_send(skb, ICMPV6_TIME_EXCEED,
272 ICMPV6_EXC_HOPLIMIT, 0); 274 ICMPV6_EXC_HOPLIMIT, 0);
273 __IP6_INC_STATS(net, ip6_dst_idev(dst), 275 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
274 IPSTATS_MIB_INHDRERRORS);
275 276
276 return false; 277 return false;
277 } 278 }
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 153e690e2893..3b5059a8dcdd 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -79,7 +79,7 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
79 return memcmp(a, b, klen * sizeof(u32)); 79 return memcmp(a, b, klen * sizeof(u32));
80} 80}
81 81
82static bool add_hlist(struct hlist_head *head, 82bool nf_conncount_add(struct hlist_head *head,
83 const struct nf_conntrack_tuple *tuple) 83 const struct nf_conntrack_tuple *tuple)
84{ 84{
85 struct nf_conncount_tuple *conn; 85 struct nf_conncount_tuple *conn;
@@ -91,12 +91,12 @@ static bool add_hlist(struct hlist_head *head,
91 hlist_add_head(&conn->node, head); 91 hlist_add_head(&conn->node, head);
92 return true; 92 return true;
93} 93}
94EXPORT_SYMBOL_GPL(nf_conncount_add);
94 95
95static unsigned int check_hlist(struct net *net, 96unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
96 struct hlist_head *head, 97 const struct nf_conntrack_tuple *tuple,
97 const struct nf_conntrack_tuple *tuple, 98 const struct nf_conntrack_zone *zone,
98 const struct nf_conntrack_zone *zone, 99 bool *addit)
99 bool *addit)
100{ 100{
101 const struct nf_conntrack_tuple_hash *found; 101 const struct nf_conntrack_tuple_hash *found;
102 struct nf_conncount_tuple *conn; 102 struct nf_conncount_tuple *conn;
@@ -141,6 +141,7 @@ static unsigned int check_hlist(struct net *net,
141 141
142 return length; 142 return length;
143} 143}
144EXPORT_SYMBOL_GPL(nf_conncount_lookup);
144 145
145static void tree_nodes_free(struct rb_root *root, 146static void tree_nodes_free(struct rb_root *root,
146 struct nf_conncount_rb *gc_nodes[], 147 struct nf_conncount_rb *gc_nodes[],
@@ -187,13 +188,15 @@ count_tree(struct net *net, struct rb_root *root,
187 } else { 188 } else {
188 /* same source network -> be counted! */ 189 /* same source network -> be counted! */
189 unsigned int count; 190 unsigned int count;
190 count = check_hlist(net, &rbconn->hhead, tuple, zone, &addit); 191
192 count = nf_conncount_lookup(net, &rbconn->hhead, tuple,
193 zone, &addit);
191 194
192 tree_nodes_free(root, gc_nodes, gc_count); 195 tree_nodes_free(root, gc_nodes, gc_count);
193 if (!addit) 196 if (!addit)
194 return count; 197 return count;
195 198
196 if (!add_hlist(&rbconn->hhead, tuple)) 199 if (!nf_conncount_add(&rbconn->hhead, tuple))
197 return 0; /* hotdrop */ 200 return 0; /* hotdrop */
198 201
199 return count + 1; 202 return count + 1;
@@ -203,7 +206,7 @@ count_tree(struct net *net, struct rb_root *root,
203 continue; 206 continue;
204 207
205 /* only used for GC on hhead, retval and 'addit' ignored */ 208 /* only used for GC on hhead, retval and 'addit' ignored */
206 check_hlist(net, &rbconn->hhead, tuple, zone, &addit); 209 nf_conncount_lookup(net, &rbconn->hhead, tuple, zone, &addit);
207 if (hlist_empty(&rbconn->hhead)) 210 if (hlist_empty(&rbconn->hhead))
208 gc_nodes[gc_count++] = rbconn; 211 gc_nodes[gc_count++] = rbconn;
209 } 212 }
@@ -303,11 +306,19 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family
303} 306}
304EXPORT_SYMBOL_GPL(nf_conncount_init); 307EXPORT_SYMBOL_GPL(nf_conncount_init);
305 308
306static void destroy_tree(struct rb_root *r) 309void nf_conncount_cache_free(struct hlist_head *hhead)
307{ 310{
308 struct nf_conncount_tuple *conn; 311 struct nf_conncount_tuple *conn;
309 struct nf_conncount_rb *rbconn;
310 struct hlist_node *n; 312 struct hlist_node *n;
313
314 hlist_for_each_entry_safe(conn, n, hhead, node)
315 kmem_cache_free(conncount_conn_cachep, conn);
316}
317EXPORT_SYMBOL_GPL(nf_conncount_cache_free);
318
319static void destroy_tree(struct rb_root *r)
320{
321 struct nf_conncount_rb *rbconn;
311 struct rb_node *node; 322 struct rb_node *node;
312 323
313 while ((node = rb_first(r)) != NULL) { 324 while ((node = rb_first(r)) != NULL) {
@@ -315,8 +326,7 @@ static void destroy_tree(struct rb_root *r)
315 326
316 rb_erase(node, r); 327 rb_erase(node, r);
317 328
318 hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node) 329 nf_conncount_cache_free(&rbconn->hhead);
319 kmem_cache_free(conncount_conn_cachep, conn);
320 330
321 kmem_cache_free(conncount_rb_cachep, rbconn); 331 kmem_cache_free(conncount_rb_cachep, rbconn);
322 } 332 }
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 41ff04ee2554..3465da2a98bd 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -58,11 +58,6 @@
58 58
59#include "nf_internals.h" 59#include "nf_internals.h"
60 60
61int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
62 enum nf_nat_manip_type manip,
63 const struct nlattr *attr) __read_mostly;
64EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
65
66__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; 61__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
67EXPORT_SYMBOL_GPL(nf_conntrack_locks); 62EXPORT_SYMBOL_GPL(nf_conntrack_locks);
68 63
@@ -186,6 +181,7 @@ unsigned int nf_conntrack_htable_size __read_mostly;
186EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); 181EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
187 182
188unsigned int nf_conntrack_max __read_mostly; 183unsigned int nf_conntrack_max __read_mostly;
184EXPORT_SYMBOL_GPL(nf_conntrack_max);
189seqcount_t nf_conntrack_generation __read_mostly; 185seqcount_t nf_conntrack_generation __read_mostly;
190static unsigned int nf_conntrack_hash_rnd __read_mostly; 186static unsigned int nf_conntrack_hash_rnd __read_mostly;
191 187
@@ -1611,6 +1607,82 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
1611 nf_conntrack_get(skb_nfct(nskb)); 1607 nf_conntrack_get(skb_nfct(nskb));
1612} 1608}
1613 1609
1610static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
1611{
1612 const struct nf_conntrack_l3proto *l3proto;
1613 const struct nf_conntrack_l4proto *l4proto;
1614 struct nf_conntrack_tuple_hash *h;
1615 struct nf_conntrack_tuple tuple;
1616 enum ip_conntrack_info ctinfo;
1617 struct nf_nat_hook *nat_hook;
1618 unsigned int dataoff, status;
1619 struct nf_conn *ct;
1620 u16 l3num;
1621 u8 l4num;
1622
1623 ct = nf_ct_get(skb, &ctinfo);
1624 if (!ct || nf_ct_is_confirmed(ct))
1625 return 0;
1626
1627 l3num = nf_ct_l3num(ct);
1628 l3proto = nf_ct_l3proto_find_get(l3num);
1629
1630 if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
1631 &l4num) <= 0)
1632 return -1;
1633
1634 l4proto = nf_ct_l4proto_find_get(l3num, l4num);
1635
1636 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
1637 l4num, net, &tuple, l3proto, l4proto))
1638 return -1;
1639
1640 if (ct->status & IPS_SRC_NAT) {
1641 memcpy(tuple.src.u3.all,
1642 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
1643 sizeof(tuple.src.u3.all));
1644 tuple.src.u.all =
1645 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
1646 }
1647
1648 if (ct->status & IPS_DST_NAT) {
1649 memcpy(tuple.dst.u3.all,
1650 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
1651 sizeof(tuple.dst.u3.all));
1652 tuple.dst.u.all =
1653 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
1654 }
1655
1656 h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
1657 if (!h)
1658 return 0;
1659
1660 /* Store status bits of the conntrack that is clashing to re-do NAT
1661 * mangling according to what it has been done already to this packet.
1662 */
1663 status = ct->status;
1664
1665 nf_ct_put(ct);
1666 ct = nf_ct_tuplehash_to_ctrack(h);
1667 nf_ct_set(skb, ct, ctinfo);
1668
1669 nat_hook = rcu_dereference(nf_nat_hook);
1670 if (!nat_hook)
1671 return 0;
1672
1673 if (status & IPS_SRC_NAT &&
1674 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC,
1675 IP_CT_DIR_ORIGINAL) == NF_DROP)
1676 return -1;
1677
1678 if (status & IPS_DST_NAT &&
1679 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST,
1680 IP_CT_DIR_ORIGINAL) == NF_DROP)
1681 return -1;
1682
1683 return 0;
1684}
1685
1614/* Bring out ya dead! */ 1686/* Bring out ya dead! */
1615static struct nf_conn * 1687static struct nf_conn *
1616get_next_corpse(int (*iter)(struct nf_conn *i, void *data), 1688get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
@@ -1812,8 +1884,7 @@ void nf_conntrack_cleanup_start(void)
1812 1884
1813void nf_conntrack_cleanup_end(void) 1885void nf_conntrack_cleanup_end(void)
1814{ 1886{
1815 RCU_INIT_POINTER(nf_ct_destroy, NULL); 1887 RCU_INIT_POINTER(nf_ct_hook, NULL);
1816
1817 cancel_delayed_work_sync(&conntrack_gc_work.dwork); 1888 cancel_delayed_work_sync(&conntrack_gc_work.dwork);
1818 nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); 1889 nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
1819 1890
@@ -2130,11 +2201,16 @@ err_cachep:
2130 return ret; 2201 return ret;
2131} 2202}
2132 2203
2204static struct nf_ct_hook nf_conntrack_hook = {
2205 .update = nf_conntrack_update,
2206 .destroy = destroy_conntrack,
2207};
2208
2133void nf_conntrack_init_end(void) 2209void nf_conntrack_init_end(void)
2134{ 2210{
2135 /* For use by REJECT target */ 2211 /* For use by REJECT target */
2136 RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach); 2212 RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
2137 RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack); 2213 RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
2138} 2214}
2139 2215
2140/* 2216/*
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index f0e9a7511e1a..a11c304fb771 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -566,8 +566,7 @@ static const struct nf_conntrack_expect_policy ftp_exp_policy = {
566 .timeout = 5 * 60, 566 .timeout = 5 * 60,
567}; 567};
568 568
569/* don't make this __exit, since it's called from __init ! */ 569static void __exit nf_conntrack_ftp_fini(void)
570static void nf_conntrack_ftp_fini(void)
571{ 570{
572 nf_conntrack_helpers_unregister(ftp, ports_c * 2); 571 nf_conntrack_helpers_unregister(ftp, ports_c * 2);
573 kfree(ftp_buffer); 572 kfree(ftp_buffer);
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index 5523acce9d69..4099f4d79bae 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -232,8 +232,6 @@ static int help(struct sk_buff *skb, unsigned int protoff,
232static struct nf_conntrack_helper irc[MAX_PORTS] __read_mostly; 232static struct nf_conntrack_helper irc[MAX_PORTS] __read_mostly;
233static struct nf_conntrack_expect_policy irc_exp_policy; 233static struct nf_conntrack_expect_policy irc_exp_policy;
234 234
235static void nf_conntrack_irc_fini(void);
236
237static int __init nf_conntrack_irc_init(void) 235static int __init nf_conntrack_irc_init(void)
238{ 236{
239 int i, ret; 237 int i, ret;
@@ -276,9 +274,7 @@ static int __init nf_conntrack_irc_init(void)
276 return 0; 274 return 0;
277} 275}
278 276
279/* This function is intentionally _NOT_ defined as __exit, because 277static void __exit nf_conntrack_irc_fini(void)
280 * it is needed by the init function */
281static void nf_conntrack_irc_fini(void)
282{ 278{
283 nf_conntrack_helpers_unregister(irc, ports_c); 279 nf_conntrack_helpers_unregister(irc, ports_c);
284 kfree(irc_buffer); 280 kfree(irc_buffer);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 4c1d0c5bc268..39327a42879f 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1431,11 +1431,11 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
1431 enum nf_nat_manip_type manip, 1431 enum nf_nat_manip_type manip,
1432 const struct nlattr *attr) 1432 const struct nlattr *attr)
1433{ 1433{
1434 typeof(nfnetlink_parse_nat_setup_hook) parse_nat_setup; 1434 struct nf_nat_hook *nat_hook;
1435 int err; 1435 int err;
1436 1436
1437 parse_nat_setup = rcu_dereference(nfnetlink_parse_nat_setup_hook); 1437 nat_hook = rcu_dereference(nf_nat_hook);
1438 if (!parse_nat_setup) { 1438 if (!nat_hook) {
1439#ifdef CONFIG_MODULES 1439#ifdef CONFIG_MODULES
1440 rcu_read_unlock(); 1440 rcu_read_unlock();
1441 nfnl_unlock(NFNL_SUBSYS_CTNETLINK); 1441 nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
@@ -1446,13 +1446,13 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
1446 } 1446 }
1447 nfnl_lock(NFNL_SUBSYS_CTNETLINK); 1447 nfnl_lock(NFNL_SUBSYS_CTNETLINK);
1448 rcu_read_lock(); 1448 rcu_read_lock();
1449 if (nfnetlink_parse_nat_setup_hook) 1449 if (nat_hook->parse_nat_setup)
1450 return -EAGAIN; 1450 return -EAGAIN;
1451#endif 1451#endif
1452 return -EOPNOTSUPP; 1452 return -EOPNOTSUPP;
1453 } 1453 }
1454 1454
1455 err = parse_nat_setup(ct, manip, attr); 1455 err = nat_hook->parse_nat_setup(ct, manip, attr);
1456 if (err == -EAGAIN) { 1456 if (err == -EAGAIN) {
1457#ifdef CONFIG_MODULES 1457#ifdef CONFIG_MODULES
1458 rcu_read_unlock(); 1458 rcu_read_unlock();
@@ -2205,6 +2205,9 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
2205 if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks))) 2205 if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
2206 goto nla_put_failure; 2206 goto nla_put_failure;
2207 2207
2208 if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
2209 goto nla_put_failure;
2210
2208 nlmsg_end(skb, nlh); 2211 nlmsg_end(skb, nlh);
2209 return skb->len; 2212 return skb->len;
2210 2213
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index ae457f39d5ce..5072ff96ab33 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -173,8 +173,7 @@ static const struct nf_conntrack_expect_policy sane_exp_policy = {
173 .timeout = 5 * 60, 173 .timeout = 5 * 60,
174}; 174};
175 175
176/* don't make this __exit, since it's called from __init ! */ 176static void __exit nf_conntrack_sane_fini(void)
177static void nf_conntrack_sane_fini(void)
178{ 177{
179 nf_conntrack_helpers_unregister(sane, ports_c * 2); 178 nf_conntrack_helpers_unregister(sane, ports_c * 2);
180 kfree(sane_buffer); 179 kfree(sane_buffer);
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 908e51e2dc2b..c8d2b6688a2a 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -1617,7 +1617,7 @@ static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1
1617 }, 1617 },
1618}; 1618};
1619 1619
1620static void nf_conntrack_sip_fini(void) 1620static void __exit nf_conntrack_sip_fini(void)
1621{ 1621{
1622 nf_conntrack_helpers_unregister(sip, ports_c * 4); 1622 nf_conntrack_helpers_unregister(sip, ports_c * 4);
1623} 1623}
diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c
index 0ec6779fd5d9..548b673b3625 100644
--- a/net/netfilter/nf_conntrack_tftp.c
+++ b/net/netfilter/nf_conntrack_tftp.c
@@ -104,7 +104,7 @@ static const struct nf_conntrack_expect_policy tftp_exp_policy = {
104 .timeout = 5 * 60, 104 .timeout = 5 * 60,
105}; 105};
106 106
107static void nf_conntrack_tftp_fini(void) 107static void __exit nf_conntrack_tftp_fini(void)
108{ 108{
109 nf_conntrack_helpers_unregister(tftp, ports_c * 2); 109 nf_conntrack_helpers_unregister(tftp, ports_c * 2);
110} 110}
diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table_core.c
index ec410cae9307..eb0d1658ac05 100644
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -4,6 +4,8 @@
4#include <linux/netfilter.h> 4#include <linux/netfilter.h>
5#include <linux/rhashtable.h> 5#include <linux/rhashtable.h>
6#include <linux/netdevice.h> 6#include <linux/netdevice.h>
7#include <net/ip.h>
8#include <net/ip6_route.h>
7#include <net/netfilter/nf_tables.h> 9#include <net/netfilter/nf_tables.h>
8#include <net/netfilter/nf_flow_table.h> 10#include <net/netfilter/nf_flow_table.h>
9#include <net/netfilter/nf_conntrack.h> 11#include <net/netfilter/nf_conntrack.h>
@@ -16,6 +18,43 @@ struct flow_offload_entry {
16 struct rcu_head rcu_head; 18 struct rcu_head rcu_head;
17}; 19};
18 20
21static DEFINE_MUTEX(flowtable_lock);
22static LIST_HEAD(flowtables);
23
24static void
25flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
26 struct nf_flow_route *route,
27 enum flow_offload_tuple_dir dir)
28{
29 struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
30 struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
31 struct dst_entry *dst = route->tuple[dir].dst;
32
33 ft->dir = dir;
34
35 switch (ctt->src.l3num) {
36 case NFPROTO_IPV4:
37 ft->src_v4 = ctt->src.u3.in;
38 ft->dst_v4 = ctt->dst.u3.in;
39 ft->mtu = ip_dst_mtu_maybe_forward(dst, true);
40 break;
41 case NFPROTO_IPV6:
42 ft->src_v6 = ctt->src.u3.in6;
43 ft->dst_v6 = ctt->dst.u3.in6;
44 ft->mtu = ip6_dst_mtu_forward(dst);
45 break;
46 }
47
48 ft->l3proto = ctt->src.l3num;
49 ft->l4proto = ctt->dst.protonum;
50 ft->src_port = ctt->src.u.tcp.port;
51 ft->dst_port = ctt->dst.u.tcp.port;
52
53 ft->iifidx = route->tuple[dir].ifindex;
54 ft->oifidx = route->tuple[!dir].ifindex;
55 ft->dst_cache = dst;
56}
57
19struct flow_offload * 58struct flow_offload *
20flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route) 59flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
21{ 60{
@@ -40,69 +79,12 @@ flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
40 79
41 entry->ct = ct; 80 entry->ct = ct;
42 81
43 switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num) { 82 flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL);
44 case NFPROTO_IPV4: 83 flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY);
45 flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4 =
46 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in;
47 flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4 =
48 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in;
49 flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4 =
50 ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in;
51 flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4 =
52 ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in;
53 break;
54 case NFPROTO_IPV6:
55 flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6 =
56 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6;
57 flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6 =
58 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6;
59 flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6 =
60 ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in6;
61 flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6 =
62 ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in6;
63 break;
64 }
65
66 flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto =
67 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
68 flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto =
69 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
70 flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto =
71 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
72 flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto =
73 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
74
75 flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache =
76 route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst;
77 flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache =
78 route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst;
79
80 flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port =
81 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port;
82 flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port =
83 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
84 flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port =
85 ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.tcp.port;
86 flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port =
87 ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
88
89 flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dir =
90 FLOW_OFFLOAD_DIR_ORIGINAL;
91 flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dir =
92 FLOW_OFFLOAD_DIR_REPLY;
93
94 flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx =
95 route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex;
96 flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.oifidx =
97 route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex;
98 flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx =
99 route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex;
100 flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.oifidx =
101 route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex;
102 84
103 if (ct->status & IPS_SRC_NAT) 85 if (ct->status & IPS_SRC_NAT)
104 flow->flags |= FLOW_OFFLOAD_SNAT; 86 flow->flags |= FLOW_OFFLOAD_SNAT;
105 else if (ct->status & IPS_DST_NAT) 87 if (ct->status & IPS_DST_NAT)
106 flow->flags |= FLOW_OFFLOAD_DNAT; 88 flow->flags |= FLOW_OFFLOAD_DNAT;
107 89
108 return flow; 90 return flow;
@@ -118,6 +100,43 @@ err_ct_refcnt:
118} 100}
119EXPORT_SYMBOL_GPL(flow_offload_alloc); 101EXPORT_SYMBOL_GPL(flow_offload_alloc);
120 102
103static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
104{
105 tcp->state = TCP_CONNTRACK_ESTABLISHED;
106 tcp->seen[0].td_maxwin = 0;
107 tcp->seen[1].td_maxwin = 0;
108}
109
110static void flow_offload_fixup_ct_state(struct nf_conn *ct)
111{
112 const struct nf_conntrack_l4proto *l4proto;
113 struct net *net = nf_ct_net(ct);
114 unsigned int *timeouts;
115 unsigned int timeout;
116 int l4num;
117
118 l4num = nf_ct_protonum(ct);
119 if (l4num == IPPROTO_TCP)
120 flow_offload_fixup_tcp(&ct->proto.tcp);
121
122 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), l4num);
123 if (!l4proto)
124 return;
125
126 timeouts = l4proto->get_timeouts(net);
127 if (!timeouts)
128 return;
129
130 if (l4num == IPPROTO_TCP)
131 timeout = timeouts[TCP_CONNTRACK_ESTABLISHED];
132 else if (l4num == IPPROTO_UDP)
133 timeout = timeouts[UDP_CT_REPLIED];
134 else
135 return;
136
137 ct->timeout = nfct_time_stamp + timeout;
138}
139
121void flow_offload_free(struct flow_offload *flow) 140void flow_offload_free(struct flow_offload *flow)
122{ 141{
123 struct flow_offload_entry *e; 142 struct flow_offload_entry *e;
@@ -125,17 +144,46 @@ void flow_offload_free(struct flow_offload *flow)
125 dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache); 144 dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
126 dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache); 145 dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
127 e = container_of(flow, struct flow_offload_entry, flow); 146 e = container_of(flow, struct flow_offload_entry, flow);
128 nf_ct_delete(e->ct, 0, 0); 147 if (flow->flags & FLOW_OFFLOAD_DYING)
148 nf_ct_delete(e->ct, 0, 0);
129 nf_ct_put(e->ct); 149 nf_ct_put(e->ct);
130 kfree_rcu(e, rcu_head); 150 kfree_rcu(e, rcu_head);
131} 151}
132EXPORT_SYMBOL_GPL(flow_offload_free); 152EXPORT_SYMBOL_GPL(flow_offload_free);
133 153
134void flow_offload_dead(struct flow_offload *flow) 154static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
155{
156 const struct flow_offload_tuple *tuple = data;
157
158 return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
159}
160
161static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
162{
163 const struct flow_offload_tuple_rhash *tuplehash = data;
164
165 return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
166}
167
168static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
169 const void *ptr)
135{ 170{
136 flow->flags |= FLOW_OFFLOAD_DYING; 171 const struct flow_offload_tuple *tuple = arg->key;
172 const struct flow_offload_tuple_rhash *x = ptr;
173
174 if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
175 return 1;
176
177 return 0;
137} 178}
138EXPORT_SYMBOL_GPL(flow_offload_dead); 179
180static const struct rhashtable_params nf_flow_offload_rhash_params = {
181 .head_offset = offsetof(struct flow_offload_tuple_rhash, node),
182 .hashfn = flow_offload_hash,
183 .obj_hashfn = flow_offload_hash_obj,
184 .obj_cmpfn = flow_offload_hash_cmp,
185 .automatic_shrinking = true,
186};
139 187
140int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) 188int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
141{ 189{
@@ -143,10 +191,10 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
143 191
144 rhashtable_insert_fast(&flow_table->rhashtable, 192 rhashtable_insert_fast(&flow_table->rhashtable,
145 &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, 193 &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
146 *flow_table->type->params); 194 nf_flow_offload_rhash_params);
147 rhashtable_insert_fast(&flow_table->rhashtable, 195 rhashtable_insert_fast(&flow_table->rhashtable,
148 &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, 196 &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
149 *flow_table->type->params); 197 nf_flow_offload_rhash_params);
150 return 0; 198 return 0;
151} 199}
152EXPORT_SYMBOL_GPL(flow_offload_add); 200EXPORT_SYMBOL_GPL(flow_offload_add);
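
The rhashtable parameters introduced in this hunk key the lookup on the leading fields of the flow tuple, hashing and comparing the struct only up to offsetof(..., dir). A small sketch of that struct-prefix keying, with an illustrative struct and an FNV-style stand-in for jhash(); the key region must contain no uninitialised padding for byte-wise hashing and comparison to be safe:

#include <stddef.h>
#include <stdint.h>

struct demo_tuple {
	uint32_t src, dst;
	uint16_t sport, dport;
	int dir;			/* excluded from the lookup key */
};

static uint32_t hash_prefix(const struct demo_tuple *t, uint32_t seed)
{
	const uint8_t *p = (const uint8_t *)t;
	size_t i, keylen = offsetof(struct demo_tuple, dir);
	uint32_t h = seed;

	for (i = 0; i < keylen; i++) {	/* FNV-1a style stand-in for jhash() */
		h ^= p[i];
		h *= 16777619u;
	}
	return h;
}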
@@ -154,22 +202,51 @@ EXPORT_SYMBOL_GPL(flow_offload_add);
154static void flow_offload_del(struct nf_flowtable *flow_table, 202static void flow_offload_del(struct nf_flowtable *flow_table,
155 struct flow_offload *flow) 203 struct flow_offload *flow)
156{ 204{
205 struct flow_offload_entry *e;
206
157 rhashtable_remove_fast(&flow_table->rhashtable, 207 rhashtable_remove_fast(&flow_table->rhashtable,
158 &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, 208 &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
159 *flow_table->type->params); 209 nf_flow_offload_rhash_params);
160 rhashtable_remove_fast(&flow_table->rhashtable, 210 rhashtable_remove_fast(&flow_table->rhashtable,
161 &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, 211 &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
162 *flow_table->type->params); 212 nf_flow_offload_rhash_params);
213
214 e = container_of(flow, struct flow_offload_entry, flow);
215 clear_bit(IPS_OFFLOAD_BIT, &e->ct->status);
163 216
164 flow_offload_free(flow); 217 flow_offload_free(flow);
165} 218}
166 219
220void flow_offload_teardown(struct flow_offload *flow)
221{
222 struct flow_offload_entry *e;
223
224 flow->flags |= FLOW_OFFLOAD_TEARDOWN;
225
226 e = container_of(flow, struct flow_offload_entry, flow);
227 flow_offload_fixup_ct_state(e->ct);
228}
229EXPORT_SYMBOL_GPL(flow_offload_teardown);
230
167struct flow_offload_tuple_rhash * 231struct flow_offload_tuple_rhash *
168flow_offload_lookup(struct nf_flowtable *flow_table, 232flow_offload_lookup(struct nf_flowtable *flow_table,
169 struct flow_offload_tuple *tuple) 233 struct flow_offload_tuple *tuple)
170{ 234{
171 return rhashtable_lookup_fast(&flow_table->rhashtable, tuple, 235 struct flow_offload_tuple_rhash *tuplehash;
172 *flow_table->type->params); 236 struct flow_offload *flow;
237 int dir;
238
239 tuplehash = rhashtable_lookup_fast(&flow_table->rhashtable, tuple,
240 nf_flow_offload_rhash_params);
241 if (!tuplehash)
242 return NULL;
243
244 dir = tuplehash->tuple.dir;
245 flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
246 if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))
247 return NULL;
248
249 return tuplehash;
173} 250}
174EXPORT_SYMBOL_GPL(flow_offload_lookup); 251EXPORT_SYMBOL_GPL(flow_offload_lookup);
175 252
@@ -216,11 +293,6 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow)
216 return (__s32)(flow->timeout - (u32)jiffies) <= 0; 293 return (__s32)(flow->timeout - (u32)jiffies) <= 0;
217} 294}
218 295
219static inline bool nf_flow_is_dying(const struct flow_offload *flow)
220{
221 return flow->flags & FLOW_OFFLOAD_DYING;
222}
223
224static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table) 296static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
225{ 297{
226 struct flow_offload_tuple_rhash *tuplehash; 298 struct flow_offload_tuple_rhash *tuplehash;
@@ -248,7 +320,8 @@ static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
248 flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); 320 flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
249 321
250 if (nf_flow_has_expired(flow) || 322 if (nf_flow_has_expired(flow) ||
251 nf_flow_is_dying(flow)) 323 (flow->flags & (FLOW_OFFLOAD_DYING |
324 FLOW_OFFLOAD_TEARDOWN)))
252 flow_offload_del(flow_table, flow); 325 flow_offload_del(flow_table, flow);
253 } 326 }
254out: 327out:
@@ -258,7 +331,7 @@ out:
258 return 1; 331 return 1;
259} 332}
260 333
261void nf_flow_offload_work_gc(struct work_struct *work) 334static void nf_flow_offload_work_gc(struct work_struct *work)
262{ 335{
263 struct nf_flowtable *flow_table; 336 struct nf_flowtable *flow_table;
264 337
@@ -266,42 +339,6 @@ void nf_flow_offload_work_gc(struct work_struct *work)
266 nf_flow_offload_gc_step(flow_table); 339 nf_flow_offload_gc_step(flow_table);
267 queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); 340 queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
268} 341}
269EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc);
270
271static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
272{
273 const struct flow_offload_tuple *tuple = data;
274
275 return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
276}
277
278static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
279{
280 const struct flow_offload_tuple_rhash *tuplehash = data;
281
282 return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
283}
284
285static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
286 const void *ptr)
287{
288 const struct flow_offload_tuple *tuple = arg->key;
289 const struct flow_offload_tuple_rhash *x = ptr;
290
291 if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
292 return 1;
293
294 return 0;
295}
296
297const struct rhashtable_params nf_flow_offload_rhash_params = {
298 .head_offset = offsetof(struct flow_offload_tuple_rhash, node),
299 .hashfn = flow_offload_hash,
300 .obj_hashfn = flow_offload_hash_obj,
301 .obj_cmpfn = flow_offload_hash_cmp,
302 .automatic_shrinking = true,
303};
304EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params);
305 342
306static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, 343static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
307 __be16 port, __be16 new_port) 344 __be16 port, __be16 new_port)
@@ -419,33 +456,69 @@ int nf_flow_dnat_port(const struct flow_offload *flow,
419} 456}
420EXPORT_SYMBOL_GPL(nf_flow_dnat_port); 457EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
421 458
459int nf_flow_table_init(struct nf_flowtable *flowtable)
460{
461 int err;
462
463 INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
464
465 err = rhashtable_init(&flowtable->rhashtable,
466 &nf_flow_offload_rhash_params);
467 if (err < 0)
468 return err;
469
470 queue_delayed_work(system_power_efficient_wq,
471 &flowtable->gc_work, HZ);
472
473 mutex_lock(&flowtable_lock);
474 list_add(&flowtable->list, &flowtables);
475 mutex_unlock(&flowtable_lock);
476
477 return 0;
478}
479EXPORT_SYMBOL_GPL(nf_flow_table_init);
480
422static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) 481static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
423{ 482{
424 struct net_device *dev = data; 483 struct net_device *dev = data;
425 484
426 if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex) 485 if (!dev) {
486 flow_offload_teardown(flow);
427 return; 487 return;
488 }
428 489
429 flow_offload_dead(flow); 490 if (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
491 flow->tuplehash[1].tuple.iifidx == dev->ifindex)
492 flow_offload_dead(flow);
430} 493}
431 494
432static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, 495static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
433 void *data) 496 struct net_device *dev)
434{ 497{
435 nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, data); 498 nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
436 flush_delayed_work(&flowtable->gc_work); 499 flush_delayed_work(&flowtable->gc_work);
437} 500}
438 501
439void nf_flow_table_cleanup(struct net *net, struct net_device *dev) 502void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
440{ 503{
441 nft_flow_table_iterate(net, nf_flow_table_iterate_cleanup, dev); 504 struct nf_flowtable *flowtable;
505
506 mutex_lock(&flowtable_lock);
507 list_for_each_entry(flowtable, &flowtables, list)
508 nf_flow_table_iterate_cleanup(flowtable, dev);
509 mutex_unlock(&flowtable_lock);
442} 510}
443EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); 511EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
444 512
445void nf_flow_table_free(struct nf_flowtable *flow_table) 513void nf_flow_table_free(struct nf_flowtable *flow_table)
446{ 514{
515 mutex_lock(&flowtable_lock);
516 list_del(&flow_table->list);
517 mutex_unlock(&flowtable_lock);
518 cancel_delayed_work_sync(&flow_table->gc_work);
447 nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); 519 nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
448 WARN_ON(!nf_flow_offload_gc_step(flow_table)); 520 WARN_ON(!nf_flow_offload_gc_step(flow_table));
521 rhashtable_destroy(&flow_table->rhashtable);
449} 522}
450EXPORT_SYMBOL_GPL(nf_flow_table_free); 523EXPORT_SYMBOL_GPL(nf_flow_table_free);
451 524
diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
index 375a1881d93d..99771aa7e7ea 100644
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -22,8 +22,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
 
 static struct nf_flowtable_type flowtable_inet = {
 	.family		= NFPROTO_INET,
-	.params		= &nf_flow_offload_rhash_params,
-	.gc		= nf_flow_offload_work_gc,
+	.init		= nf_flow_table_init,
 	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_inet_hook,
 	.owner		= THIS_MODULE,
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
new file mode 100644
index 000000000000..15ed91309992
--- /dev/null
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -0,0 +1,489 @@
1#include <linux/kernel.h>
2#include <linux/init.h>
3#include <linux/module.h>
4#include <linux/netfilter.h>
5#include <linux/rhashtable.h>
6#include <linux/ip.h>
7#include <linux/ipv6.h>
8#include <linux/netdevice.h>
9#include <net/ip.h>
10#include <net/ipv6.h>
11#include <net/ip6_route.h>
12#include <net/neighbour.h>
13#include <net/netfilter/nf_flow_table.h>
14/* For layer 4 checksum field offset. */
15#include <linux/tcp.h>
16#include <linux/udp.h>
17
18static int nf_flow_state_check(struct flow_offload *flow, int proto,
19 struct sk_buff *skb, unsigned int thoff)
20{
21 struct tcphdr *tcph;
22
23 if (proto != IPPROTO_TCP)
24 return 0;
25
26 if (!pskb_may_pull(skb, thoff + sizeof(*tcph)))
27 return -1;
28
29 tcph = (void *)(skb_network_header(skb) + thoff);
30 if (unlikely(tcph->fin || tcph->rst)) {
31 flow_offload_teardown(flow);
32 return -1;
33 }
34
35 return 0;
36}
37
38static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
39 __be32 addr, __be32 new_addr)
40{
41 struct tcphdr *tcph;
42
43 if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
44 skb_try_make_writable(skb, thoff + sizeof(*tcph)))
45 return -1;
46
47 tcph = (void *)(skb_network_header(skb) + thoff);
48 inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
49
50 return 0;
51}
52
53static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
54 __be32 addr, __be32 new_addr)
55{
56 struct udphdr *udph;
57
58 if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
59 skb_try_make_writable(skb, thoff + sizeof(*udph)))
60 return -1;
61
62 udph = (void *)(skb_network_header(skb) + thoff);
63 if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
64 inet_proto_csum_replace4(&udph->check, skb, addr,
65 new_addr, true);
66 if (!udph->check)
67 udph->check = CSUM_MANGLED_0;
68 }
69
70 return 0;
71}
72
73static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
74 unsigned int thoff, __be32 addr,
75 __be32 new_addr)
76{
77 switch (iph->protocol) {
78 case IPPROTO_TCP:
79 if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0)
80 return NF_DROP;
81 break;
82 case IPPROTO_UDP:
83 if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0)
84 return NF_DROP;
85 break;
86 }
87
88 return 0;
89}
90
91static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
92 struct iphdr *iph, unsigned int thoff,
93 enum flow_offload_tuple_dir dir)
94{
95 __be32 addr, new_addr;
96
97 switch (dir) {
98 case FLOW_OFFLOAD_DIR_ORIGINAL:
99 addr = iph->saddr;
100 new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
101 iph->saddr = new_addr;
102 break;
103 case FLOW_OFFLOAD_DIR_REPLY:
104 addr = iph->daddr;
105 new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
106 iph->daddr = new_addr;
107 break;
108 default:
109 return -1;
110 }
111 csum_replace4(&iph->check, addr, new_addr);
112
113 return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
114}
115
116static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
117 struct iphdr *iph, unsigned int thoff,
118 enum flow_offload_tuple_dir dir)
119{
120 __be32 addr, new_addr;
121
122 switch (dir) {
123 case FLOW_OFFLOAD_DIR_ORIGINAL:
124 addr = iph->daddr;
125 new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
126 iph->daddr = new_addr;
127 break;
128 case FLOW_OFFLOAD_DIR_REPLY:
129 addr = iph->saddr;
130 new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
131 iph->saddr = new_addr;
132 break;
133 default:
134 return -1;
135 }
136 csum_replace4(&iph->check, addr, new_addr);
137
138 return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
139}
140
141static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
142 unsigned int thoff, enum flow_offload_tuple_dir dir)
143{
144 struct iphdr *iph = ip_hdr(skb);
145
146 if (flow->flags & FLOW_OFFLOAD_SNAT &&
147 (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
148 nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
149 return -1;
150 if (flow->flags & FLOW_OFFLOAD_DNAT &&
151 (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
152 nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
153 return -1;
154
155 return 0;
156}
157
158static bool ip_has_options(unsigned int thoff)
159{
160 return thoff != sizeof(struct iphdr);
161}
162
163static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
164 struct flow_offload_tuple *tuple)
165{
166 struct flow_ports *ports;
167 unsigned int thoff;
168 struct iphdr *iph;
169
170 if (!pskb_may_pull(skb, sizeof(*iph)))
171 return -1;
172
173 iph = ip_hdr(skb);
174 thoff = iph->ihl * 4;
175
176 if (ip_is_fragment(iph) ||
177 unlikely(ip_has_options(thoff)))
178 return -1;
179
180 if (iph->protocol != IPPROTO_TCP &&
181 iph->protocol != IPPROTO_UDP)
182 return -1;
183
184 thoff = iph->ihl * 4;
185 if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
186 return -1;
187
188 ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
189
190 tuple->src_v4.s_addr = iph->saddr;
191 tuple->dst_v4.s_addr = iph->daddr;
192 tuple->src_port = ports->source;
193 tuple->dst_port = ports->dest;
194 tuple->l3proto = AF_INET;
195 tuple->l4proto = iph->protocol;
196 tuple->iifidx = dev->ifindex;
197
198 return 0;
199}
200
201/* Based on ip_exceeds_mtu(). */
202static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
203{
204 if (skb->len <= mtu)
205 return false;
206
207 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
208 return false;
209
210 return true;
211}
212
213unsigned int
214nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
215 const struct nf_hook_state *state)
216{
217 struct flow_offload_tuple_rhash *tuplehash;
218 struct nf_flowtable *flow_table = priv;
219 struct flow_offload_tuple tuple = {};
220 enum flow_offload_tuple_dir dir;
221 struct flow_offload *flow;
222 struct net_device *outdev;
223 struct rtable *rt;
224 unsigned int thoff;
225 struct iphdr *iph;
226 __be32 nexthop;
227
228 if (skb->protocol != htons(ETH_P_IP))
229 return NF_ACCEPT;
230
231 if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
232 return NF_ACCEPT;
233
234 tuplehash = flow_offload_lookup(flow_table, &tuple);
235 if (tuplehash == NULL)
236 return NF_ACCEPT;
237
238 outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
239 if (!outdev)
240 return NF_ACCEPT;
241
242 dir = tuplehash->tuple.dir;
243 flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
244 rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
245
246 if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) &&
247 (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0)
248 return NF_ACCEPT;
249
250 if (skb_try_make_writable(skb, sizeof(*iph)))
251 return NF_DROP;
252
253 thoff = ip_hdr(skb)->ihl * 4;
254 if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff))
255 return NF_ACCEPT;
256
257 if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
258 nf_flow_nat_ip(flow, skb, thoff, dir) < 0)
259 return NF_DROP;
260
261 flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
262 iph = ip_hdr(skb);
263 ip_decrease_ttl(iph);
264
265 skb->dev = outdev;
266 nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
267 skb_dst_set_noref(skb, &rt->dst);
268 neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
269
270 return NF_STOLEN;
271}
272EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
273
274static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
275 struct in6_addr *addr,
276 struct in6_addr *new_addr)
277{
278 struct tcphdr *tcph;
279
280 if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
281 skb_try_make_writable(skb, thoff + sizeof(*tcph)))
282 return -1;
283
284 tcph = (void *)(skb_network_header(skb) + thoff);
285 inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32,
286 new_addr->s6_addr32, true);
287
288 return 0;
289}
290
291static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
292 struct in6_addr *addr,
293 struct in6_addr *new_addr)
294{
295 struct udphdr *udph;
296
297 if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
298 skb_try_make_writable(skb, thoff + sizeof(*udph)))
299 return -1;
300
301 udph = (void *)(skb_network_header(skb) + thoff);
302 if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
303 inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32,
304 new_addr->s6_addr32, true);
305 if (!udph->check)
306 udph->check = CSUM_MANGLED_0;
307 }
308
309 return 0;
310}
311
312static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
313 unsigned int thoff, struct in6_addr *addr,
314 struct in6_addr *new_addr)
315{
316 switch (ip6h->nexthdr) {
317 case IPPROTO_TCP:
318 if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0)
319 return NF_DROP;
320 break;
321 case IPPROTO_UDP:
322 if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0)
323 return NF_DROP;
324 break;
325 }
326
327 return 0;
328}
329
330static int nf_flow_snat_ipv6(const struct flow_offload *flow,
331 struct sk_buff *skb, struct ipv6hdr *ip6h,
332 unsigned int thoff,
333 enum flow_offload_tuple_dir dir)
334{
335 struct in6_addr addr, new_addr;
336
337 switch (dir) {
338 case FLOW_OFFLOAD_DIR_ORIGINAL:
339 addr = ip6h->saddr;
340 new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6;
341 ip6h->saddr = new_addr;
342 break;
343 case FLOW_OFFLOAD_DIR_REPLY:
344 addr = ip6h->daddr;
345 new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6;
346 ip6h->daddr = new_addr;
347 break;
348 default:
349 return -1;
350 }
351
352 return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
353}
354
355static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
356 struct sk_buff *skb, struct ipv6hdr *ip6h,
357 unsigned int thoff,
358 enum flow_offload_tuple_dir dir)
359{
360 struct in6_addr addr, new_addr;
361
362 switch (dir) {
363 case FLOW_OFFLOAD_DIR_ORIGINAL:
364 addr = ip6h->daddr;
365 new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6;
366 ip6h->daddr = new_addr;
367 break;
368 case FLOW_OFFLOAD_DIR_REPLY:
369 addr = ip6h->saddr;
370 new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6;
371 ip6h->saddr = new_addr;
372 break;
373 default:
374 return -1;
375 }
376
377 return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
378}
379
380static int nf_flow_nat_ipv6(const struct flow_offload *flow,
381 struct sk_buff *skb,
382 enum flow_offload_tuple_dir dir)
383{
384 struct ipv6hdr *ip6h = ipv6_hdr(skb);
385 unsigned int thoff = sizeof(*ip6h);
386
387 if (flow->flags & FLOW_OFFLOAD_SNAT &&
388 (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
389 nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
390 return -1;
391 if (flow->flags & FLOW_OFFLOAD_DNAT &&
392 (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
393 nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
394 return -1;
395
396 return 0;
397}
398
399static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
400 struct flow_offload_tuple *tuple)
401{
402 struct flow_ports *ports;
403 struct ipv6hdr *ip6h;
404 unsigned int thoff;
405
406 if (!pskb_may_pull(skb, sizeof(*ip6h)))
407 return -1;
408
409 ip6h = ipv6_hdr(skb);
410
411 if (ip6h->nexthdr != IPPROTO_TCP &&
412 ip6h->nexthdr != IPPROTO_UDP)
413 return -1;
414
415 thoff = sizeof(*ip6h);
416 if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
417 return -1;
418
419 ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
420
421 tuple->src_v6 = ip6h->saddr;
422 tuple->dst_v6 = ip6h->daddr;
423 tuple->src_port = ports->source;
424 tuple->dst_port = ports->dest;
425 tuple->l3proto = AF_INET6;
426 tuple->l4proto = ip6h->nexthdr;
427 tuple->iifidx = dev->ifindex;
428
429 return 0;
430}
431
432unsigned int
433nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
434 const struct nf_hook_state *state)
435{
436 struct flow_offload_tuple_rhash *tuplehash;
437 struct nf_flowtable *flow_table = priv;
438 struct flow_offload_tuple tuple = {};
439 enum flow_offload_tuple_dir dir;
440 struct flow_offload *flow;
441 struct net_device *outdev;
442 struct in6_addr *nexthop;
443 struct ipv6hdr *ip6h;
444 struct rt6_info *rt;
445
446 if (skb->protocol != htons(ETH_P_IPV6))
447 return NF_ACCEPT;
448
449 if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0)
450 return NF_ACCEPT;
451
452 tuplehash = flow_offload_lookup(flow_table, &tuple);
453 if (tuplehash == NULL)
454 return NF_ACCEPT;
455
456 outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
457 if (!outdev)
458 return NF_ACCEPT;
459
460 dir = tuplehash->tuple.dir;
461 flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
462 rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
463
464 if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
465 return NF_ACCEPT;
466
467 if (nf_flow_state_check(flow, ipv6_hdr(skb)->nexthdr, skb,
468 sizeof(*ip6h)))
469 return NF_ACCEPT;
470
471 if (skb_try_make_writable(skb, sizeof(*ip6h)))
472 return NF_DROP;
473
474 if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
475 nf_flow_nat_ipv6(flow, skb, dir) < 0)
476 return NF_DROP;
477
478 flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
479 ip6h = ipv6_hdr(skb);
480 ip6h->hop_limit--;
481
482 skb->dev = outdev;
483 nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
484 skb_dst_set_noref(skb, &rt->dst);
485 neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
486
487 return NF_STOLEN;
488}
489EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
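Both hooks above implement the software fast path: NF_ACCEPT hands the packet back to the normal stack (no flow entry, oversized packet, TCP FIN/RST), NF_DROP rejects it when a NAT mangle fails, and NF_STOLEN means the packet was already forwarded via neigh_xmit(). In the nftables flowtable model they run from the netdev ingress hook of each device attached to the flowtable; a hedged sketch of that attachment (ops layout and priority are assumptions here, the actual registration lives in the nf_tables flowtable code, not in this file):

static const struct nf_hook_ops flow_offload_ingress_example = {
	.hook		= nf_flow_offload_ip_hook,
	.pf		= NFPROTO_NETDEV,
	.hooknum	= NF_NETDEV_INGRESS,
	.priority	= 0,	/* assumed; the real value comes from the flowtable */
	/* .priv is set to the struct nf_flowtable used for lookups,
	 * .dev to the ingress net_device being attached.
	 */
};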
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 18f6d7ae995b..e15779fd58e3 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -15,4 +15,9 @@ void nf_queue_nf_hook_drop(struct net *net);
15/* nf_log.c */ 15/* nf_log.c */
16int __init netfilter_log_init(void); 16int __init netfilter_log_init(void);
17 17
18/* core.c */
19void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp,
20 const struct nf_hook_ops *reg);
21int nf_hook_entries_insert_raw(struct nf_hook_entries __rcu **pp,
22 const struct nf_hook_ops *reg);
18#endif 23#endif
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 617693ff9f4c..b7df32a56e7e 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -32,6 +32,8 @@
32#include <net/netfilter/nf_conntrack_zones.h> 32#include <net/netfilter/nf_conntrack_zones.h>
33#include <linux/netfilter/nf_nat.h> 33#include <linux/netfilter/nf_nat.h>
34 34
35#include "nf_internals.h"
36
35static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; 37static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];
36 38
37static DEFINE_MUTEX(nf_nat_proto_mutex); 39static DEFINE_MUTEX(nf_nat_proto_mutex);
@@ -39,11 +41,27 @@ static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
39 __read_mostly; 41 __read_mostly;
40static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] 42static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
41 __read_mostly; 43 __read_mostly;
44static unsigned int nat_net_id __read_mostly;
42 45
43static struct hlist_head *nf_nat_bysource __read_mostly; 46static struct hlist_head *nf_nat_bysource __read_mostly;
44static unsigned int nf_nat_htable_size __read_mostly; 47static unsigned int nf_nat_htable_size __read_mostly;
45static unsigned int nf_nat_hash_rnd __read_mostly; 48static unsigned int nf_nat_hash_rnd __read_mostly;
46 49
50struct nf_nat_lookup_hook_priv {
51 struct nf_hook_entries __rcu *entries;
52
53 struct rcu_head rcu_head;
54};
55
56struct nf_nat_hooks_net {
57 struct nf_hook_ops *nat_hook_ops;
58 unsigned int users;
59};
60
61struct nat_net {
62 struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO];
63};
64
47inline const struct nf_nat_l3proto * 65inline const struct nf_nat_l3proto *
48__nf_nat_l3proto_find(u8 family) 66__nf_nat_l3proto_find(u8 family)
49{ 67{
@@ -157,7 +175,7 @@ EXPORT_SYMBOL(nf_nat_used_tuple);
157static int in_range(const struct nf_nat_l3proto *l3proto, 175static int in_range(const struct nf_nat_l3proto *l3proto,
158 const struct nf_nat_l4proto *l4proto, 176 const struct nf_nat_l4proto *l4proto,
159 const struct nf_conntrack_tuple *tuple, 177 const struct nf_conntrack_tuple *tuple,
160 const struct nf_nat_range *range) 178 const struct nf_nat_range2 *range)
161{ 179{
162 /* If we are supposed to map IPs, then we must be in the 180 /* If we are supposed to map IPs, then we must be in the
163 * range specified, otherwise let this drag us onto a new src IP. 181 * range specified, otherwise let this drag us onto a new src IP.
@@ -194,7 +212,7 @@ find_appropriate_src(struct net *net,
194 const struct nf_nat_l4proto *l4proto, 212 const struct nf_nat_l4proto *l4proto,
195 const struct nf_conntrack_tuple *tuple, 213 const struct nf_conntrack_tuple *tuple,
196 struct nf_conntrack_tuple *result, 214 struct nf_conntrack_tuple *result,
197 const struct nf_nat_range *range) 215 const struct nf_nat_range2 *range)
198{ 216{
199 unsigned int h = hash_by_src(net, tuple); 217 unsigned int h = hash_by_src(net, tuple);
200 const struct nf_conn *ct; 218 const struct nf_conn *ct;
@@ -224,7 +242,7 @@ find_appropriate_src(struct net *net,
224static void 242static void
225find_best_ips_proto(const struct nf_conntrack_zone *zone, 243find_best_ips_proto(const struct nf_conntrack_zone *zone,
226 struct nf_conntrack_tuple *tuple, 244 struct nf_conntrack_tuple *tuple,
227 const struct nf_nat_range *range, 245 const struct nf_nat_range2 *range,
228 const struct nf_conn *ct, 246 const struct nf_conn *ct,
229 enum nf_nat_manip_type maniptype) 247 enum nf_nat_manip_type maniptype)
230{ 248{
@@ -298,7 +316,7 @@ find_best_ips_proto(const struct nf_conntrack_zone *zone,
298static void 316static void
299get_unique_tuple(struct nf_conntrack_tuple *tuple, 317get_unique_tuple(struct nf_conntrack_tuple *tuple,
300 const struct nf_conntrack_tuple *orig_tuple, 318 const struct nf_conntrack_tuple *orig_tuple,
301 const struct nf_nat_range *range, 319 const struct nf_nat_range2 *range,
302 struct nf_conn *ct, 320 struct nf_conn *ct,
303 enum nf_nat_manip_type maniptype) 321 enum nf_nat_manip_type maniptype)
304{ 322{
@@ -349,9 +367,10 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
 	/* Only bother mapping if it's not already in range and unique */
 	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
 		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
-			if (l4proto->in_range(tuple, maniptype,
-					      &range->min_proto,
-					      &range->max_proto) &&
+			if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
+			    l4proto->in_range(tuple, maniptype,
+					      &range->min_proto,
+					      &range->max_proto) &&
 			    (range->min_proto.all == range->max_proto.all ||
 			     !nf_nat_used_tuple(tuple, ct)))
 				goto out;
@@ -360,7 +379,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
 		}
 	}
 
-	/* Last change: get protocol to try to obtain unique tuple. */
+	/* Last chance: get protocol to try to obtain unique tuple. */
 	l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
 out:
 	rcu_read_unlock();
@@ -381,7 +400,7 @@ EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);
381 400
382unsigned int 401unsigned int
383nf_nat_setup_info(struct nf_conn *ct, 402nf_nat_setup_info(struct nf_conn *ct,
384 const struct nf_nat_range *range, 403 const struct nf_nat_range2 *range,
385 enum nf_nat_manip_type maniptype) 404 enum nf_nat_manip_type maniptype)
386{ 405{
387 struct net *net = nf_ct_net(ct); 406 struct net *net = nf_ct_net(ct);
@@ -459,7 +478,7 @@ __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
459 (manip == NF_NAT_MANIP_SRC ? 478 (manip == NF_NAT_MANIP_SRC ?
460 ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : 479 ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
461 ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); 480 ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
462 struct nf_nat_range range = { 481 struct nf_nat_range2 range = {
463 .flags = NF_NAT_RANGE_MAP_IPS, 482 .flags = NF_NAT_RANGE_MAP_IPS,
464 .min_addr = ip, 483 .min_addr = ip,
465 .max_addr = ip, 484 .max_addr = ip,
@@ -474,17 +493,36 @@ nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
474} 493}
475EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding); 494EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);
476 495
496static unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct,
497 enum nf_nat_manip_type mtype,
498 enum ip_conntrack_dir dir)
499{
500 const struct nf_nat_l3proto *l3proto;
501 const struct nf_nat_l4proto *l4proto;
502 struct nf_conntrack_tuple target;
503
504 /* We are aiming to look like inverse of other direction. */
505 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
506
507 l3proto = __nf_nat_l3proto_find(target.src.l3num);
508 l4proto = __nf_nat_l4proto_find(target.src.l3num,
509 target.dst.protonum);
510 if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype))
511 return NF_DROP;
512
513 return NF_ACCEPT;
514}
515
477/* Do packet manipulations according to nf_nat_setup_info. */ 516/* Do packet manipulations according to nf_nat_setup_info. */
478unsigned int nf_nat_packet(struct nf_conn *ct, 517unsigned int nf_nat_packet(struct nf_conn *ct,
479 enum ip_conntrack_info ctinfo, 518 enum ip_conntrack_info ctinfo,
480 unsigned int hooknum, 519 unsigned int hooknum,
481 struct sk_buff *skb) 520 struct sk_buff *skb)
482{ 521{
483 const struct nf_nat_l3proto *l3proto; 522 enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
484 const struct nf_nat_l4proto *l4proto;
485 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 523 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
524 unsigned int verdict = NF_ACCEPT;
486 unsigned long statusbit; 525 unsigned long statusbit;
487 enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
488 526
489 if (mtype == NF_NAT_MANIP_SRC) 527 if (mtype == NF_NAT_MANIP_SRC)
490 statusbit = IPS_SRC_NAT; 528 statusbit = IPS_SRC_NAT;
@@ -496,21 +534,87 @@ unsigned int nf_nat_packet(struct nf_conn *ct,
496 statusbit ^= IPS_NAT_MASK; 534 statusbit ^= IPS_NAT_MASK;
497 535
498 /* Non-atomic: these bits don't change. */ 536 /* Non-atomic: these bits don't change. */
499 if (ct->status & statusbit) { 537 if (ct->status & statusbit)
500 struct nf_conntrack_tuple target; 538 verdict = nf_nat_manip_pkt(skb, ct, mtype, dir);
539
540 return verdict;
541}
542EXPORT_SYMBOL_GPL(nf_nat_packet);
543
544unsigned int
545nf_nat_inet_fn(void *priv, struct sk_buff *skb,
546 const struct nf_hook_state *state)
547{
548 struct nf_conn *ct;
549 enum ip_conntrack_info ctinfo;
550 struct nf_conn_nat *nat;
551 /* maniptype == SRC for postrouting. */
552 enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
553
554 ct = nf_ct_get(skb, &ctinfo);
555 /* Can't track? It's not due to stress, or conntrack would
 556	 * have dropped it. Hence it's the user's responsibility to
557 * packet filter it out, or implement conntrack/NAT for that
558 * protocol. 8) --RR
559 */
560 if (!ct)
561 return NF_ACCEPT;
501 562
502 /* We are aiming to look like inverse of other direction. */ 563 nat = nfct_nat(ct);
503 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
504 564
505 l3proto = __nf_nat_l3proto_find(target.src.l3num); 565 switch (ctinfo) {
506 l4proto = __nf_nat_l4proto_find(target.src.l3num, 566 case IP_CT_RELATED:
507 target.dst.protonum); 567 case IP_CT_RELATED_REPLY:
508 if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype)) 568 /* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */
509 return NF_DROP; 569 case IP_CT_NEW:
570 /* Seen it before? This can happen for loopback, retrans,
571 * or local packets.
572 */
573 if (!nf_nat_initialized(ct, maniptype)) {
574 struct nf_nat_lookup_hook_priv *lpriv = priv;
575 struct nf_hook_entries *e = rcu_dereference(lpriv->entries);
576 unsigned int ret;
577 int i;
578
579 if (!e)
580 goto null_bind;
581
582 for (i = 0; i < e->num_hook_entries; i++) {
583 ret = e->hooks[i].hook(e->hooks[i].priv, skb,
584 state);
585 if (ret != NF_ACCEPT)
586 return ret;
587 if (nf_nat_initialized(ct, maniptype))
588 goto do_nat;
589 }
590null_bind:
591 ret = nf_nat_alloc_null_binding(ct, state->hook);
592 if (ret != NF_ACCEPT)
593 return ret;
594 } else {
595 pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n",
596 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
597 ct, ct->status);
598 if (nf_nat_oif_changed(state->hook, ctinfo, nat,
599 state->out))
600 goto oif_changed;
601 }
602 break;
603 default:
604 /* ESTABLISHED */
605 WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
606 ctinfo != IP_CT_ESTABLISHED_REPLY);
607 if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
608 goto oif_changed;
510 } 609 }
511 return NF_ACCEPT; 610do_nat:
611 return nf_nat_packet(ct, ctinfo, state->hook, skb);
612
613oif_changed:
614 nf_ct_kill_acct(ct, ctinfo, skb);
615 return NF_DROP;
512} 616}
513EXPORT_SYMBOL_GPL(nf_nat_packet); 617EXPORT_SYMBOL_GPL(nf_nat_inet_fn);
514 618
515struct nf_nat_proto_clean { 619struct nf_nat_proto_clean {
516 u8 l3proto; 620 u8 l3proto;
@@ -702,7 +806,7 @@ static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
702 806
703static int nfnetlink_parse_nat_proto(struct nlattr *attr, 807static int nfnetlink_parse_nat_proto(struct nlattr *attr,
704 const struct nf_conn *ct, 808 const struct nf_conn *ct,
705 struct nf_nat_range *range) 809 struct nf_nat_range2 *range)
706{ 810{
707 struct nlattr *tb[CTA_PROTONAT_MAX+1]; 811 struct nlattr *tb[CTA_PROTONAT_MAX+1];
708 const struct nf_nat_l4proto *l4proto; 812 const struct nf_nat_l4proto *l4proto;
@@ -730,7 +834,7 @@ static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
730 834
731static int 835static int
732nfnetlink_parse_nat(const struct nlattr *nat, 836nfnetlink_parse_nat(const struct nlattr *nat,
733 const struct nf_conn *ct, struct nf_nat_range *range, 837 const struct nf_conn *ct, struct nf_nat_range2 *range,
734 const struct nf_nat_l3proto *l3proto) 838 const struct nf_nat_l3proto *l3proto)
735{ 839{
736 struct nlattr *tb[CTA_NAT_MAX+1]; 840 struct nlattr *tb[CTA_NAT_MAX+1];
@@ -758,7 +862,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
758 enum nf_nat_manip_type manip, 862 enum nf_nat_manip_type manip,
759 const struct nlattr *attr) 863 const struct nlattr *attr)
760{ 864{
761 struct nf_nat_range range; 865 struct nf_nat_range2 range;
762 const struct nf_nat_l3proto *l3proto; 866 const struct nf_nat_l3proto *l3proto;
763 int err; 867 int err;
764 868
@@ -800,6 +904,146 @@ static struct nf_ct_helper_expectfn follow_master_nat = {
800 .expectfn = nf_nat_follow_master, 904 .expectfn = nf_nat_follow_master,
801}; 905};
802 906
907int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops,
908 const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count)
909{
910 struct nat_net *nat_net = net_generic(net, nat_net_id);
911 struct nf_nat_hooks_net *nat_proto_net;
912 struct nf_nat_lookup_hook_priv *priv;
913 unsigned int hooknum = ops->hooknum;
914 struct nf_hook_ops *nat_ops;
915 int i, ret;
916
917 if (WARN_ON_ONCE(ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net)))
918 return -EINVAL;
919
920 nat_proto_net = &nat_net->nat_proto_net[ops->pf];
921
922 for (i = 0; i < ops_count; i++) {
923 if (WARN_ON(orig_nat_ops[i].pf != ops->pf))
924 return -EINVAL;
925 if (orig_nat_ops[i].hooknum == hooknum) {
926 hooknum = i;
927 break;
928 }
929 }
930
931 if (WARN_ON_ONCE(i == ops_count))
932 return -EINVAL;
933
934 mutex_lock(&nf_nat_proto_mutex);
935 if (!nat_proto_net->nat_hook_ops) {
936 WARN_ON(nat_proto_net->users != 0);
937
938 nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL);
939 if (!nat_ops) {
940 mutex_unlock(&nf_nat_proto_mutex);
941 return -ENOMEM;
942 }
943
944 for (i = 0; i < ops_count; i++) {
945 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
946 if (priv) {
947 nat_ops[i].priv = priv;
948 continue;
949 }
950 mutex_unlock(&nf_nat_proto_mutex);
951 while (i)
952 kfree(nat_ops[--i].priv);
953 kfree(nat_ops);
954 return -ENOMEM;
955 }
956
957 ret = nf_register_net_hooks(net, nat_ops, ops_count);
958 if (ret < 0) {
959 mutex_unlock(&nf_nat_proto_mutex);
960 for (i = 0; i < ops_count; i++)
961 kfree(nat_ops[i].priv);
962 kfree(nat_ops);
963 return ret;
964 }
965
966 nat_proto_net->nat_hook_ops = nat_ops;
967 }
968
969 nat_ops = nat_proto_net->nat_hook_ops;
970 priv = nat_ops[hooknum].priv;
971 if (WARN_ON_ONCE(!priv)) {
972 mutex_unlock(&nf_nat_proto_mutex);
973 return -EOPNOTSUPP;
974 }
975
976 ret = nf_hook_entries_insert_raw(&priv->entries, ops);
977 if (ret == 0)
978 nat_proto_net->users++;
979
980 mutex_unlock(&nf_nat_proto_mutex);
981 return ret;
982}
983EXPORT_SYMBOL_GPL(nf_nat_register_fn);
984
985void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops,
986 unsigned int ops_count)
987{
988 struct nat_net *nat_net = net_generic(net, nat_net_id);
989 struct nf_nat_hooks_net *nat_proto_net;
990 struct nf_nat_lookup_hook_priv *priv;
991 struct nf_hook_ops *nat_ops;
992 int hooknum = ops->hooknum;
993 int i;
994
995 if (ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net))
996 return;
997
998 nat_proto_net = &nat_net->nat_proto_net[ops->pf];
999
1000 mutex_lock(&nf_nat_proto_mutex);
1001 if (WARN_ON(nat_proto_net->users == 0))
1002 goto unlock;
1003
1004 nat_proto_net->users--;
1005
1006 nat_ops = nat_proto_net->nat_hook_ops;
1007 for (i = 0; i < ops_count; i++) {
1008 if (nat_ops[i].hooknum == hooknum) {
1009 hooknum = i;
1010 break;
1011 }
1012 }
1013 if (WARN_ON_ONCE(i == ops_count))
1014 goto unlock;
1015 priv = nat_ops[hooknum].priv;
1016 nf_hook_entries_delete_raw(&priv->entries, ops);
1017
1018 if (nat_proto_net->users == 0) {
1019 nf_unregister_net_hooks(net, nat_ops, ops_count);
1020
1021 for (i = 0; i < ops_count; i++) {
1022 priv = nat_ops[i].priv;
1023 kfree_rcu(priv, rcu_head);
1024 }
1025
1026 nat_proto_net->nat_hook_ops = NULL;
1027 kfree(nat_ops);
1028 }
1029unlock:
1030 mutex_unlock(&nf_nat_proto_mutex);
1031}
1032EXPORT_SYMBOL_GPL(nf_nat_unregister_fn);
1033
1034static struct pernet_operations nat_net_ops = {
1035 .id = &nat_net_id,
1036 .size = sizeof(struct nat_net),
1037};
1038
1039static struct nf_nat_hook nat_hook = {
1040 .parse_nat_setup = nfnetlink_parse_nat_setup,
1041#ifdef CONFIG_XFRM
1042 .decode_session = __nf_nat_decode_session,
1043#endif
1044 .manip_pkt = nf_nat_manip_pkt,
1045};
1046
803static int __init nf_nat_init(void) 1047static int __init nf_nat_init(void)
804{ 1048{
805 int ret, i; 1049 int ret, i;
@@ -823,15 +1067,17 @@ static int __init nf_nat_init(void)
823 for (i = 0; i < CONNTRACK_LOCKS; i++) 1067 for (i = 0; i < CONNTRACK_LOCKS; i++)
824 spin_lock_init(&nf_nat_locks[i]); 1068 spin_lock_init(&nf_nat_locks[i]);
825 1069
1070 ret = register_pernet_subsys(&nat_net_ops);
1071 if (ret < 0) {
1072 nf_ct_extend_unregister(&nat_extend);
1073 return ret;
1074 }
1075
826 nf_ct_helper_expectfn_register(&follow_master_nat); 1076 nf_ct_helper_expectfn_register(&follow_master_nat);
827 1077
828 BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); 1078 WARN_ON(nf_nat_hook != NULL);
829 RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, 1079 RCU_INIT_POINTER(nf_nat_hook, &nat_hook);
830 nfnetlink_parse_nat_setup); 1080
831#ifdef CONFIG_XFRM
832 BUG_ON(nf_nat_decode_session_hook != NULL);
833 RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session);
834#endif
835 return 0; 1081 return 0;
836} 1082}
837 1083
@@ -844,16 +1090,15 @@ static void __exit nf_nat_cleanup(void)
844 1090
845 nf_ct_extend_unregister(&nat_extend); 1091 nf_ct_extend_unregister(&nat_extend);
846 nf_ct_helper_expectfn_unregister(&follow_master_nat); 1092 nf_ct_helper_expectfn_unregister(&follow_master_nat);
847 RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL); 1093 RCU_INIT_POINTER(nf_nat_hook, NULL);
848#ifdef CONFIG_XFRM 1094
849 RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL);
850#endif
851 synchronize_rcu(); 1095 synchronize_rcu();
852 1096
853 for (i = 0; i < NFPROTO_NUMPROTO; i++) 1097 for (i = 0; i < NFPROTO_NUMPROTO; i++)
854 kfree(nf_nat_l4protos[i]); 1098 kfree(nf_nat_l4protos[i]);
855 synchronize_net(); 1099 synchronize_net();
856 nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); 1100 nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
1101 unregister_pernet_subsys(&nat_net_ops);
857} 1102}
858 1103
859MODULE_LICENSE("GPL"); 1104MODULE_LICENSE("GPL");
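The register/unregister pair above makes NAT hook registration reference-counted per (family, hook number): the first user installs the backend's real nf_hook_ops, whose private data is an nf_hook_entries list, and later users are only appended to that list, which nf_nat_inet_fn() walks until one of them sets up a binding. A hedged sketch of how an L3 backend would feed its hook array into this API (all names below are illustrative, not the in-tree IPv4 backend):

/* Illustrative only: a per-family NAT backend keeps one "real" hook per
 * hook point and forwards per-chain registrations to the refcounted core.
 */
static const struct nf_hook_ops example_nat_ipv4_ops[] = {
	{
		.hook		= nf_nat_inet_fn,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_PRE_ROUTING,
		.priority	= NF_IP_PRI_NAT_DST,
	},
	{
		.hook		= nf_nat_inet_fn,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_POST_ROUTING,
		.priority	= NF_IP_PRI_NAT_SRC,
	},
};

static int example_nat_ipv4_register_fn(struct net *net,
					const struct nf_hook_ops *ops)
{
	return nf_nat_register_fn(net, ops, example_nat_ipv4_ops,
				  ARRAY_SIZE(example_nat_ipv4_ops));
}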
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 607a373379b4..99606baedda4 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -191,7 +191,7 @@ EXPORT_SYMBOL(nf_nat_mangle_udp_packet);
191void nf_nat_follow_master(struct nf_conn *ct, 191void nf_nat_follow_master(struct nf_conn *ct,
192 struct nf_conntrack_expect *exp) 192 struct nf_conntrack_expect *exp)
193{ 193{
194 struct nf_nat_range range; 194 struct nf_nat_range2 range;
195 195
196 /* This must be a fresh one. */ 196 /* This must be a fresh one. */
197 BUG_ON(ct->status & IPS_NAT_DONE_MASK); 197 BUG_ON(ct->status & IPS_NAT_DONE_MASK);
diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c
index 7d7466dbf663..5d849d835561 100644
--- a/net/netfilter/nf_nat_proto_common.c
+++ b/net/netfilter/nf_nat_proto_common.c
@@ -36,7 +36,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_in_range);
36 36
37void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, 37void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
38 struct nf_conntrack_tuple *tuple, 38 struct nf_conntrack_tuple *tuple,
39 const struct nf_nat_range *range, 39 const struct nf_nat_range2 *range,
40 enum nf_nat_manip_type maniptype, 40 enum nf_nat_manip_type maniptype,
41 const struct nf_conn *ct, 41 const struct nf_conn *ct,
42 u16 *rover) 42 u16 *rover)
@@ -83,6 +83,8 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
83 : tuple->src.u.all); 83 : tuple->src.u.all);
84 } else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) { 84 } else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) {
85 off = prandom_u32(); 85 off = prandom_u32();
86 } else if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) {
87 off = (ntohs(*portptr) - ntohs(range->base_proto.all));
86 } else { 88 } else {
87 off = *rover; 89 off = *rover;
88 } 90 }
@@ -91,7 +93,8 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
91 *portptr = htons(min + off % range_size); 93 *portptr = htons(min + off % range_size);
92 if (++i != range_size && nf_nat_used_tuple(tuple, ct)) 94 if (++i != range_size && nf_nat_used_tuple(tuple, ct))
93 continue; 95 continue;
94 if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) 96 if (!(range->flags & (NF_NAT_RANGE_PROTO_RANDOM_ALL|
97 NF_NAT_RANGE_PROTO_OFFSET)))
95 *rover = off; 98 *rover = off;
96 return; 99 return;
97 } 100 }
@@ -100,7 +103,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple);
100 103
101#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 104#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
102int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], 105int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
103 struct nf_nat_range *range) 106 struct nf_nat_range2 *range)
104{ 107{
105 if (tb[CTA_PROTONAT_PORT_MIN]) { 108 if (tb[CTA_PROTONAT_PORT_MIN]) {
106 range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); 109 range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
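The new NF_NAT_RANGE_PROTO_OFFSET branch keeps each connection's position inside the original port range: the selected port becomes min_proto + (orig_port - base_proto) % range_size, and the rover is deliberately left untouched for such mappings. A hedged sketch of a range using it (the numbers are illustrative):

/* Illustrative: shift ports 1000-1999 to 11000-11999 while preserving
 * the offset of the original port inside the range.
 */
struct nf_nat_range2 range = {
	.flags		= NF_NAT_RANGE_PROTO_SPECIFIED |
			  NF_NAT_RANGE_PROTO_OFFSET,
	.min_proto.all	= htons(11000),
	.max_proto.all	= htons(11999),
	.base_proto.all	= htons(1000),	/* start of the original range */
};
/* e.g. original port 1234 maps to 11000 + (1234 - 1000) % 1000 = 11234 */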
diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c
index 269fcd5dc34c..67ea0d83aa5a 100644
--- a/net/netfilter/nf_nat_proto_dccp.c
+++ b/net/netfilter/nf_nat_proto_dccp.c
@@ -23,7 +23,7 @@ static u_int16_t dccp_port_rover;
23static void 23static void
24dccp_unique_tuple(const struct nf_nat_l3proto *l3proto, 24dccp_unique_tuple(const struct nf_nat_l3proto *l3proto,
25 struct nf_conntrack_tuple *tuple, 25 struct nf_conntrack_tuple *tuple,
26 const struct nf_nat_range *range, 26 const struct nf_nat_range2 *range,
27 enum nf_nat_manip_type maniptype, 27 enum nf_nat_manip_type maniptype,
28 const struct nf_conn *ct) 28 const struct nf_conn *ct)
29{ 29{
diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
index c57ee3240b1d..1c5d9b65fbba 100644
--- a/net/netfilter/nf_nat_proto_sctp.c
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -17,7 +17,7 @@ static u_int16_t nf_sctp_port_rover;
17static void 17static void
18sctp_unique_tuple(const struct nf_nat_l3proto *l3proto, 18sctp_unique_tuple(const struct nf_nat_l3proto *l3proto,
19 struct nf_conntrack_tuple *tuple, 19 struct nf_conntrack_tuple *tuple,
20 const struct nf_nat_range *range, 20 const struct nf_nat_range2 *range,
21 enum nf_nat_manip_type maniptype, 21 enum nf_nat_manip_type maniptype,
22 const struct nf_conn *ct) 22 const struct nf_conn *ct)
23{ 23{
diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c
index 4f8820fc5148..f15fcd475f98 100644
--- a/net/netfilter/nf_nat_proto_tcp.c
+++ b/net/netfilter/nf_nat_proto_tcp.c
@@ -23,7 +23,7 @@ static u16 tcp_port_rover;
23static void 23static void
24tcp_unique_tuple(const struct nf_nat_l3proto *l3proto, 24tcp_unique_tuple(const struct nf_nat_l3proto *l3proto,
25 struct nf_conntrack_tuple *tuple, 25 struct nf_conntrack_tuple *tuple,
26 const struct nf_nat_range *range, 26 const struct nf_nat_range2 *range,
27 enum nf_nat_manip_type maniptype, 27 enum nf_nat_manip_type maniptype,
28 const struct nf_conn *ct) 28 const struct nf_conn *ct)
29{ 29{
diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c
index edd4a77dc09a..5790f70a83b2 100644
--- a/net/netfilter/nf_nat_proto_udp.c
+++ b/net/netfilter/nf_nat_proto_udp.c
@@ -22,7 +22,7 @@ static u16 udp_port_rover;
22static void 22static void
23udp_unique_tuple(const struct nf_nat_l3proto *l3proto, 23udp_unique_tuple(const struct nf_nat_l3proto *l3proto,
24 struct nf_conntrack_tuple *tuple, 24 struct nf_conntrack_tuple *tuple,
25 const struct nf_nat_range *range, 25 const struct nf_nat_range2 *range,
26 enum nf_nat_manip_type maniptype, 26 enum nf_nat_manip_type maniptype,
27 const struct nf_conn *ct) 27 const struct nf_conn *ct)
28{ 28{
@@ -100,7 +100,7 @@ static bool udplite_manip_pkt(struct sk_buff *skb,
100static void 100static void
101udplite_unique_tuple(const struct nf_nat_l3proto *l3proto, 101udplite_unique_tuple(const struct nf_nat_l3proto *l3proto,
102 struct nf_conntrack_tuple *tuple, 102 struct nf_conntrack_tuple *tuple,
103 const struct nf_nat_range *range, 103 const struct nf_nat_range2 *range,
104 enum nf_nat_manip_type maniptype, 104 enum nf_nat_manip_type maniptype,
105 const struct nf_conn *ct) 105 const struct nf_conn *ct)
106{ 106{
diff --git a/net/netfilter/nf_nat_proto_unknown.c b/net/netfilter/nf_nat_proto_unknown.c
index 6e494d584412..c5db3e251232 100644
--- a/net/netfilter/nf_nat_proto_unknown.c
+++ b/net/netfilter/nf_nat_proto_unknown.c
@@ -27,7 +27,7 @@ static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
27 27
28static void unknown_unique_tuple(const struct nf_nat_l3proto *l3proto, 28static void unknown_unique_tuple(const struct nf_nat_l3proto *l3proto,
29 struct nf_conntrack_tuple *tuple, 29 struct nf_conntrack_tuple *tuple,
30 const struct nf_nat_range *range, 30 const struct nf_nat_range2 *range,
31 enum nf_nat_manip_type maniptype, 31 enum nf_nat_manip_type maniptype,
32 const struct nf_conn *ct) 32 const struct nf_conn *ct)
33{ 33{
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index 25b06b959118..adee04af8d43 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -15,7 +15,6 @@
15#include <linux/inetdevice.h> 15#include <linux/inetdevice.h>
16#include <linux/ip.h> 16#include <linux/ip.h>
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/module.h>
19#include <linux/netdevice.h> 18#include <linux/netdevice.h>
20#include <linux/netfilter.h> 19#include <linux/netfilter.h>
21#include <linux/types.h> 20#include <linux/types.h>
@@ -36,7 +35,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb,
36 struct nf_conn *ct; 35 struct nf_conn *ct;
37 enum ip_conntrack_info ctinfo; 36 enum ip_conntrack_info ctinfo;
38 __be32 newdst; 37 __be32 newdst;
39 struct nf_nat_range newrange; 38 struct nf_nat_range2 newrange;
40 39
41 WARN_ON(hooknum != NF_INET_PRE_ROUTING && 40 WARN_ON(hooknum != NF_INET_PRE_ROUTING &&
42 hooknum != NF_INET_LOCAL_OUT); 41 hooknum != NF_INET_LOCAL_OUT);
@@ -82,10 +81,10 @@ EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4);
82static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; 81static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
83 82
84unsigned int 83unsigned int
85nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, 84nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
86 unsigned int hooknum) 85 unsigned int hooknum)
87{ 86{
88 struct nf_nat_range newrange; 87 struct nf_nat_range2 newrange;
89 struct in6_addr newdst; 88 struct in6_addr newdst;
90 enum ip_conntrack_info ctinfo; 89 enum ip_conntrack_info ctinfo;
91 struct nf_conn *ct; 90 struct nf_conn *ct;
@@ -124,6 +123,3 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
124 return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); 123 return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
125} 124}
126EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6); 125EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6);
127
128MODULE_LICENSE("GPL");
129MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index 791fac4fd745..1f3086074981 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c
@@ -316,7 +316,7 @@ static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff,
316static void nf_nat_sip_expected(struct nf_conn *ct, 316static void nf_nat_sip_expected(struct nf_conn *ct,
317 struct nf_conntrack_expect *exp) 317 struct nf_conntrack_expect *exp)
318{ 318{
319 struct nf_nat_range range; 319 struct nf_nat_range2 range;
320 320
321 /* This must be a fresh one. */ 321 /* This must be a fresh one. */
322 BUG_ON(ct->status & IPS_NAT_DONE_MASK); 322 BUG_ON(ct->status & IPS_NAT_DONE_MASK);
diff --git a/net/netfilter/nf_osf.c b/net/netfilter/nf_osf.c
new file mode 100644
index 000000000000..5ba5c7bef2f9
--- /dev/null
+++ b/net/netfilter/nf_osf.c
@@ -0,0 +1,218 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2#include <linux/module.h>
3#include <linux/kernel.h>
4
5#include <linux/capability.h>
6#include <linux/if.h>
7#include <linux/inetdevice.h>
8#include <linux/ip.h>
9#include <linux/list.h>
10#include <linux/rculist.h>
11#include <linux/skbuff.h>
12#include <linux/slab.h>
13#include <linux/tcp.h>
14
15#include <net/ip.h>
16#include <net/tcp.h>
17
18#include <linux/netfilter/nfnetlink.h>
19#include <linux/netfilter/x_tables.h>
20#include <net/netfilter/nf_log.h>
21#include <linux/netfilter/nf_osf.h>
22
23static inline int nf_osf_ttl(const struct sk_buff *skb,
24 const struct nf_osf_info *info,
25 unsigned char f_ttl)
26{
27 const struct iphdr *ip = ip_hdr(skb);
28
29 if (info->flags & NF_OSF_TTL) {
30 if (info->ttl == NF_OSF_TTL_TRUE)
31 return ip->ttl == f_ttl;
32 if (info->ttl == NF_OSF_TTL_NOCHECK)
33 return 1;
34 else if (ip->ttl <= f_ttl)
35 return 1;
36 else {
37 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
38 int ret = 0;
39
40 for_ifa(in_dev) {
41 if (inet_ifa_match(ip->saddr, ifa)) {
42 ret = (ip->ttl == f_ttl);
43 break;
44 }
45 }
46 endfor_ifa(in_dev);
47
48 return ret;
49 }
50 }
51
52 return ip->ttl == f_ttl;
53}
54
55bool
56nf_osf_match(const struct sk_buff *skb, u_int8_t family,
57 int hooknum, struct net_device *in, struct net_device *out,
58 const struct nf_osf_info *info, struct net *net,
59 const struct list_head *nf_osf_fingers)
60{
61 const unsigned char *optp = NULL, *_optp = NULL;
62 unsigned int optsize = 0, check_WSS = 0;
63 int fmatch = FMATCH_WRONG, fcount = 0;
64 const struct iphdr *ip = ip_hdr(skb);
65 const struct nf_osf_user_finger *f;
66 unsigned char opts[MAX_IPOPTLEN];
67 const struct nf_osf_finger *kf;
68 u16 window, totlen, mss = 0;
69 const struct tcphdr *tcp;
70 struct tcphdr _tcph;
71 bool df;
72
73 tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
74 if (!tcp)
75 return false;
76
77 if (!tcp->syn)
78 return false;
79
80 totlen = ntohs(ip->tot_len);
81 df = ntohs(ip->frag_off) & IP_DF;
82 window = ntohs(tcp->window);
83
84 if (tcp->doff * 4 > sizeof(struct tcphdr)) {
85 optsize = tcp->doff * 4 - sizeof(struct tcphdr);
86
87 _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) +
88 sizeof(struct tcphdr), optsize, opts);
89 }
90
91 list_for_each_entry_rcu(kf, &nf_osf_fingers[df], finger_entry) {
92 int foptsize, optnum;
93
94 f = &kf->finger;
95
96 if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre))
97 continue;
98
99 optp = _optp;
100 fmatch = FMATCH_WRONG;
101
102 if (totlen != f->ss || !nf_osf_ttl(skb, info, f->ttl))
103 continue;
104
105 /*
106 * Should not happen if userspace parser was written correctly.
107 */
108 if (f->wss.wc >= OSF_WSS_MAX)
109 continue;
110
111 /* Check options */
112
113 foptsize = 0;
114 for (optnum = 0; optnum < f->opt_num; ++optnum)
115 foptsize += f->opt[optnum].length;
116
117 if (foptsize > MAX_IPOPTLEN ||
118 optsize > MAX_IPOPTLEN ||
119 optsize != foptsize)
120 continue;
121
122 check_WSS = f->wss.wc;
123
124 for (optnum = 0; optnum < f->opt_num; ++optnum) {
125 if (f->opt[optnum].kind == (*optp)) {
126 __u32 len = f->opt[optnum].length;
127 const __u8 *optend = optp + len;
128
129 fmatch = FMATCH_OK;
130
131 switch (*optp) {
132 case OSFOPT_MSS:
133 mss = optp[3];
134 mss <<= 8;
135 mss |= optp[2];
136
137 mss = ntohs((__force __be16)mss);
138 break;
139 case OSFOPT_TS:
140 break;
141 }
142
143 optp = optend;
144 } else
145 fmatch = FMATCH_OPT_WRONG;
146
147 if (fmatch != FMATCH_OK)
148 break;
149 }
150
151 if (fmatch != FMATCH_OPT_WRONG) {
152 fmatch = FMATCH_WRONG;
153
154 switch (check_WSS) {
155 case OSF_WSS_PLAIN:
156 if (f->wss.val == 0 || window == f->wss.val)
157 fmatch = FMATCH_OK;
158 break;
159 case OSF_WSS_MSS:
160 /*
 161						 * Some smart modems decrease/mangle the MSS to
162 * SMART_MSS_2, so we check standard, decreased
163 * and the one provided in the fingerprint MSS
164 * values.
165 */
166#define SMART_MSS_1 1460
167#define SMART_MSS_2 1448
168 if (window == f->wss.val * mss ||
169 window == f->wss.val * SMART_MSS_1 ||
170 window == f->wss.val * SMART_MSS_2)
171 fmatch = FMATCH_OK;
172 break;
173 case OSF_WSS_MTU:
174 if (window == f->wss.val * (mss + 40) ||
175 window == f->wss.val * (SMART_MSS_1 + 40) ||
176 window == f->wss.val * (SMART_MSS_2 + 40))
177 fmatch = FMATCH_OK;
178 break;
179 case OSF_WSS_MODULO:
180 if ((window % f->wss.val) == 0)
181 fmatch = FMATCH_OK;
182 break;
183 }
184 }
185
186 if (fmatch != FMATCH_OK)
187 continue;
188
189 fcount++;
190
191 if (info->flags & NF_OSF_LOG)
192 nf_log_packet(net, family, hooknum, skb,
193 in, out, NULL,
194 "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
195 f->genre, f->version, f->subtype,
196 &ip->saddr, ntohs(tcp->source),
197 &ip->daddr, ntohs(tcp->dest),
198 f->ttl - ip->ttl);
199
200 if ((info->flags & NF_OSF_LOG) &&
201 info->loglevel == NF_OSF_LOGLEVEL_FIRST)
202 break;
203 }
204
205 if (!fcount && (info->flags & NF_OSF_LOG))
206 nf_log_packet(net, family, hooknum, skb, in, out, NULL,
207 "Remote OS is not known: %pI4:%u -> %pI4:%u\n",
208 &ip->saddr, ntohs(tcp->source),
209 &ip->daddr, ntohs(tcp->dest));
210
211 if (fcount)
212 fmatch = FMATCH_OK;
213
214 return fmatch == FMATCH_OK;
215}
216EXPORT_SYMBOL_GPL(nf_osf_match);
217
218MODULE_LICENSE("GPL");
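The window-size check selected by check_WSS interprets the fingerprint's wss.val in different units (plain value, multiple of the MSS, multiple of MTU, or modulo). As a worked example for the OSF_WSS_MSS case (numbers are illustrative): a SYN advertising window 5840 with an MSS option of 1460 matches a fingerprint with wss.val == 4, since 5840 == 4 * 1460; the SMART_MSS_1/SMART_MSS_2 fallbacks cover modems that rewrite the MSS option in transit. A minimal sketch of that one case:

/* Sketch of the OSF_WSS_MSS comparison only (see nf_osf_match() above). */
static bool example_wss_mss_match(u16 window, u16 mss, u32 wss_val)
{
	return window == wss_val * mss ||
	       window == wss_val * 1460 ||	/* SMART_MSS_1 */
	       window == wss_val * 1448;	/* SMART_MSS_2 */
}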
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 501e48a7965b..ca4c4d994ddb 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -28,6 +28,42 @@ static LIST_HEAD(nf_tables_objects);
28static LIST_HEAD(nf_tables_flowtables); 28static LIST_HEAD(nf_tables_flowtables);
29static u64 table_handle; 29static u64 table_handle;
30 30
31enum {
32 NFT_VALIDATE_SKIP = 0,
33 NFT_VALIDATE_NEED,
34 NFT_VALIDATE_DO,
35};
36
37static u32 nft_chain_hash(const void *data, u32 len, u32 seed);
38static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed);
39static int nft_chain_hash_cmp(struct rhashtable_compare_arg *, const void *);
40
41static const struct rhashtable_params nft_chain_ht_params = {
42 .head_offset = offsetof(struct nft_chain, rhlhead),
43 .key_offset = offsetof(struct nft_chain, name),
44 .hashfn = nft_chain_hash,
45 .obj_hashfn = nft_chain_hash_obj,
46 .obj_cmpfn = nft_chain_hash_cmp,
47 .locks_mul = 1,
48 .automatic_shrinking = true,
49};
50
51static void nft_validate_state_update(struct net *net, u8 new_validate_state)
52{
53 switch (net->nft.validate_state) {
54 case NFT_VALIDATE_SKIP:
55 WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO);
56 break;
57 case NFT_VALIDATE_NEED:
58 break;
59 case NFT_VALIDATE_DO:
60 if (new_validate_state == NFT_VALIDATE_NEED)
61 return;
62 }
63
64 net->nft.validate_state = new_validate_state;
65}
66
31static void nft_ctx_init(struct nft_ctx *ctx, 67static void nft_ctx_init(struct nft_ctx *ctx,
32 struct net *net, 68 struct net *net,
33 const struct sk_buff *skb, 69 const struct sk_buff *skb,
@@ -74,88 +110,43 @@ static void nft_trans_destroy(struct nft_trans *trans)
74 kfree(trans); 110 kfree(trans);
75} 111}
76 112
77/* removal requests are queued in the commit_list, but not acted upon
78 * until after all new rules are in place.
79 *
80 * Therefore, nf_register_net_hook(net, &nat_hook) runs before pending
81 * nf_unregister_net_hook().
82 *
83 * nf_register_net_hook thus fails if a nat hook is already in place
84 * even if the conflicting hook is about to be removed.
85 *
86 * If collision is detected, search commit_log for DELCHAIN matching
87 * the new nat hooknum; if we find one collision is temporary:
88 *
89 * Either transaction is aborted (new/colliding hook is removed), or
90 * transaction is committed (old hook is removed).
91 */
92static bool nf_tables_allow_nat_conflict(const struct net *net,
93 const struct nf_hook_ops *ops)
94{
95 const struct nft_trans *trans;
96 bool ret = false;
97
98 if (!ops->nat_hook)
99 return false;
100
101 list_for_each_entry(trans, &net->nft.commit_list, list) {
102 const struct nf_hook_ops *pending_ops;
103 const struct nft_chain *pending;
104
105 if (trans->msg_type != NFT_MSG_NEWCHAIN &&
106 trans->msg_type != NFT_MSG_DELCHAIN)
107 continue;
108
109 pending = trans->ctx.chain;
110 if (!nft_is_base_chain(pending))
111 continue;
112
113 pending_ops = &nft_base_chain(pending)->ops;
114 if (pending_ops->nat_hook &&
115 pending_ops->pf == ops->pf &&
116 pending_ops->hooknum == ops->hooknum) {
117 /* other hook registration already pending? */
118 if (trans->msg_type == NFT_MSG_NEWCHAIN)
119 return false;
120
121 ret = true;
122 }
123 }
124
125 return ret;
126}
127
128static int nf_tables_register_hook(struct net *net, 113static int nf_tables_register_hook(struct net *net,
129 const struct nft_table *table, 114 const struct nft_table *table,
130 struct nft_chain *chain) 115 struct nft_chain *chain)
131{ 116{
132 struct nf_hook_ops *ops; 117 const struct nft_base_chain *basechain;
133 int ret; 118 const struct nf_hook_ops *ops;
134 119
135 if (table->flags & NFT_TABLE_F_DORMANT || 120 if (table->flags & NFT_TABLE_F_DORMANT ||
136 !nft_is_base_chain(chain)) 121 !nft_is_base_chain(chain))
137 return 0; 122 return 0;
138 123
139 ops = &nft_base_chain(chain)->ops; 124 basechain = nft_base_chain(chain);
140 ret = nf_register_net_hook(net, ops); 125 ops = &basechain->ops;
141 if (ret == -EBUSY && nf_tables_allow_nat_conflict(net, ops)) {
142 ops->nat_hook = false;
143 ret = nf_register_net_hook(net, ops);
144 ops->nat_hook = true;
145 }
146 126
147 return ret; 127 if (basechain->type->ops_register)
128 return basechain->type->ops_register(net, ops);
129
130 return nf_register_net_hook(net, ops);
148} 131}
149 132
150static void nf_tables_unregister_hook(struct net *net, 133static void nf_tables_unregister_hook(struct net *net,
151 const struct nft_table *table, 134 const struct nft_table *table,
152 struct nft_chain *chain) 135 struct nft_chain *chain)
153{ 136{
137 const struct nft_base_chain *basechain;
138 const struct nf_hook_ops *ops;
139
154 if (table->flags & NFT_TABLE_F_DORMANT || 140 if (table->flags & NFT_TABLE_F_DORMANT ||
155 !nft_is_base_chain(chain)) 141 !nft_is_base_chain(chain))
156 return; 142 return;
143 basechain = nft_base_chain(chain);
144 ops = &basechain->ops;
145
146 if (basechain->type->ops_unregister)
147 return basechain->type->ops_unregister(net, ops);
157 148
158 nf_unregister_net_hook(net, &nft_base_chain(chain)->ops); 149 nf_unregister_net_hook(net, ops);
159} 150}
160 151
161static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) 152static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
@@ -415,13 +406,17 @@ static struct nft_table *nft_table_lookup(const struct net *net,
415{ 406{
416 struct nft_table *table; 407 struct nft_table *table;
417 408
418 list_for_each_entry(table, &net->nft.tables, list) { 409 if (nla == NULL)
410 return ERR_PTR(-EINVAL);
411
412 list_for_each_entry_rcu(table, &net->nft.tables, list) {
419 if (!nla_strcmp(nla, table->name) && 413 if (!nla_strcmp(nla, table->name) &&
420 table->family == family && 414 table->family == family &&
421 nft_active_genmask(table, genmask)) 415 nft_active_genmask(table, genmask))
422 return table; 416 return table;
423 } 417 }
424 return NULL; 418
419 return ERR_PTR(-ENOENT);
425} 420}
426 421
427static struct nft_table *nft_table_lookup_byhandle(const struct net *net, 422static struct nft_table *nft_table_lookup_byhandle(const struct net *net,
@@ -435,37 +430,6 @@ static struct nft_table *nft_table_lookup_byhandle(const struct net *net,
435 nft_active_genmask(table, genmask)) 430 nft_active_genmask(table, genmask))
436 return table; 431 return table;
437 } 432 }
438 return NULL;
439}
440
441static struct nft_table *nf_tables_table_lookup(const struct net *net,
442 const struct nlattr *nla,
443 u8 family, u8 genmask)
444{
445 struct nft_table *table;
446
447 if (nla == NULL)
448 return ERR_PTR(-EINVAL);
449
450 table = nft_table_lookup(net, nla, family, genmask);
451 if (table != NULL)
452 return table;
453
454 return ERR_PTR(-ENOENT);
455}
456
457static struct nft_table *nf_tables_table_lookup_byhandle(const struct net *net,
458 const struct nlattr *nla,
459 u8 genmask)
460{
461 struct nft_table *table;
462
463 if (nla == NULL)
464 return ERR_PTR(-EINVAL);
465
466 table = nft_table_lookup_byhandle(net, nla, genmask);
467 if (table != NULL)
468 return table;
469 433
470 return ERR_PTR(-ENOENT); 434 return ERR_PTR(-ENOENT);
471} 435}
@@ -618,6 +582,24 @@ done:
618 return skb->len; 582 return skb->len;
619} 583}
620 584
585static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb,
586 const struct nlmsghdr *nlh,
587 struct netlink_dump_control *c)
588{
589 int err;
590
591 if (!try_module_get(THIS_MODULE))
592 return -EINVAL;
593
594 rcu_read_unlock();
595 err = netlink_dump_start(nlsk, skb, nlh, c);
596 rcu_read_lock();
597 module_put(THIS_MODULE);
598
599 return err;
600}
601
602/* called with rcu_read_lock held */
621static int nf_tables_gettable(struct net *net, struct sock *nlsk, 603static int nf_tables_gettable(struct net *net, struct sock *nlsk,
622 struct sk_buff *skb, const struct nlmsghdr *nlh, 604 struct sk_buff *skb, const struct nlmsghdr *nlh,
623 const struct nlattr * const nla[], 605 const struct nlattr * const nla[],
@@ -633,16 +615,19 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
633 if (nlh->nlmsg_flags & NLM_F_DUMP) { 615 if (nlh->nlmsg_flags & NLM_F_DUMP) {
634 struct netlink_dump_control c = { 616 struct netlink_dump_control c = {
635 .dump = nf_tables_dump_tables, 617 .dump = nf_tables_dump_tables,
618 .module = THIS_MODULE,
636 }; 619 };
637 return netlink_dump_start(nlsk, skb, nlh, &c); 620
621 return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
638 } 622 }
639 623
640 table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], family, 624 table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask);
641 genmask); 625 if (IS_ERR(table)) {
642 if (IS_ERR(table)) 626 NL_SET_BAD_ATTR(extack, nla[NFTA_TABLE_NAME]);
643 return PTR_ERR(table); 627 return PTR_ERR(table);
628 }
644 629
645 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 630 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
646 if (!skb2) 631 if (!skb2)
647 return -ENOMEM; 632 return -ENOMEM;
648 633
@@ -749,6 +734,29 @@ err:
749 return ret; 734 return ret;
750} 735}
751 736
737static u32 nft_chain_hash(const void *data, u32 len, u32 seed)
738{
739 const char *name = data;
740
741 return jhash(name, strlen(name), seed);
742}
743
744static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed)
745{
746 const struct nft_chain *chain = data;
747
748 return nft_chain_hash(chain->name, 0, seed);
749}
750
751static int nft_chain_hash_cmp(struct rhashtable_compare_arg *arg,
752 const void *ptr)
753{
754 const struct nft_chain *chain = ptr;
755 const char *name = arg->key;
756
757 return strcmp(chain->name, name);
758}
759
752static int nf_tables_newtable(struct net *net, struct sock *nlsk, 760static int nf_tables_newtable(struct net *net, struct sock *nlsk,
753 struct sk_buff *skb, const struct nlmsghdr *nlh, 761 struct sk_buff *skb, const struct nlmsghdr *nlh,
754 const struct nlattr * const nla[], 762 const struct nlattr * const nla[],
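The three callbacks just added (nft_chain_hash, nft_chain_hash_obj, nft_chain_hash_cmp) back a new per-table rhltable that indexes chains by name, so nft_chain_lookup() no longer has to walk table->chains linearly. The nft_chain_ht_params definition that ties them together falls outside the hunks shown here; the sketch below is the assumed wiring, and the non-callback field values are assumptions rather than quoted patch text:

	/* Assumed wiring (not quoted from this excerpt): chains hash by name,
	 * chained off the rhlist head embedded in struct nft_chain. */
	static const struct rhashtable_params nft_chain_ht_params = {
		.head_offset		= offsetof(struct nft_chain, rhlhead),
		.key_offset		= offsetof(struct nft_chain, name),
		.hashfn			= nft_chain_hash,
		.obj_hashfn		= nft_chain_hash_obj,
		.obj_cmpfn		= nft_chain_hash_cmp,
		.automatic_shrinking	= true,
	};
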
@@ -756,21 +764,23 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
756{ 764{
757 const struct nfgenmsg *nfmsg = nlmsg_data(nlh); 765 const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
758 u8 genmask = nft_genmask_next(net); 766 u8 genmask = nft_genmask_next(net);
759 const struct nlattr *name;
760 struct nft_table *table;
761 int family = nfmsg->nfgen_family; 767 int family = nfmsg->nfgen_family;
768 const struct nlattr *attr;
769 struct nft_table *table;
762 u32 flags = 0; 770 u32 flags = 0;
763 struct nft_ctx ctx; 771 struct nft_ctx ctx;
764 int err; 772 int err;
765 773
766 name = nla[NFTA_TABLE_NAME]; 774 attr = nla[NFTA_TABLE_NAME];
767 table = nf_tables_table_lookup(net, name, family, genmask); 775 table = nft_table_lookup(net, attr, family, genmask);
768 if (IS_ERR(table)) { 776 if (IS_ERR(table)) {
769 if (PTR_ERR(table) != -ENOENT) 777 if (PTR_ERR(table) != -ENOENT)
770 return PTR_ERR(table); 778 return PTR_ERR(table);
771 } else { 779 } else {
772 if (nlh->nlmsg_flags & NLM_F_EXCL) 780 if (nlh->nlmsg_flags & NLM_F_EXCL) {
781 NL_SET_BAD_ATTR(extack, attr);
773 return -EEXIST; 782 return -EEXIST;
783 }
774 if (nlh->nlmsg_flags & NLM_F_REPLACE) 784 if (nlh->nlmsg_flags & NLM_F_REPLACE)
775 return -EOPNOTSUPP; 785 return -EOPNOTSUPP;
776 786
@@ -789,10 +799,14 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
789 if (table == NULL) 799 if (table == NULL)
790 goto err_kzalloc; 800 goto err_kzalloc;
791 801
792 table->name = nla_strdup(name, GFP_KERNEL); 802 table->name = nla_strdup(attr, GFP_KERNEL);
793 if (table->name == NULL) 803 if (table->name == NULL)
794 goto err_strdup; 804 goto err_strdup;
795 805
806 err = rhltable_init(&table->chains_ht, &nft_chain_ht_params);
807 if (err)
808 goto err_chain_ht;
809
796 INIT_LIST_HEAD(&table->chains); 810 INIT_LIST_HEAD(&table->chains);
797 INIT_LIST_HEAD(&table->sets); 811 INIT_LIST_HEAD(&table->sets);
798 INIT_LIST_HEAD(&table->objects); 812 INIT_LIST_HEAD(&table->objects);
@@ -809,6 +823,8 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
809 list_add_tail_rcu(&table->list, &net->nft.tables); 823 list_add_tail_rcu(&table->list, &net->nft.tables);
810 return 0; 824 return 0;
811err_trans: 825err_trans:
826 rhltable_destroy(&table->chains_ht);
827err_chain_ht:
812 kfree(table->name); 828 kfree(table->name);
813err_strdup: 829err_strdup:
814 kfree(table); 830 kfree(table);
@@ -912,8 +928,9 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
912{ 928{
913 const struct nfgenmsg *nfmsg = nlmsg_data(nlh); 929 const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
914 u8 genmask = nft_genmask_next(net); 930 u8 genmask = nft_genmask_next(net);
915 struct nft_table *table;
916 int family = nfmsg->nfgen_family; 931 int family = nfmsg->nfgen_family;
932 const struct nlattr *attr;
933 struct nft_table *table;
917 struct nft_ctx ctx; 934 struct nft_ctx ctx;
918 935
919 nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla); 936 nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla);
@@ -921,16 +938,18 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
921 (!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE])) 938 (!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE]))
922 return nft_flush(&ctx, family); 939 return nft_flush(&ctx, family);
923 940
924 if (nla[NFTA_TABLE_HANDLE]) 941 if (nla[NFTA_TABLE_HANDLE]) {
925 table = nf_tables_table_lookup_byhandle(net, 942 attr = nla[NFTA_TABLE_HANDLE];
926 nla[NFTA_TABLE_HANDLE], 943 table = nft_table_lookup_byhandle(net, attr, genmask);
927 genmask); 944 } else {
928 else 945 attr = nla[NFTA_TABLE_NAME];
929 table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], 946 table = nft_table_lookup(net, attr, family, genmask);
930 family, genmask); 947 }
931 948
932 if (IS_ERR(table)) 949 if (IS_ERR(table)) {
950 NL_SET_BAD_ATTR(extack, attr);
933 return PTR_ERR(table); 951 return PTR_ERR(table);
952 }
934 953
935 if (nlh->nlmsg_flags & NLM_F_NONREC && 954 if (nlh->nlmsg_flags & NLM_F_NONREC &&
936 table->use > 0) 955 table->use > 0)
@@ -946,6 +965,7 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx)
946{ 965{
947 BUG_ON(ctx->table->use > 0); 966 BUG_ON(ctx->table->use > 0);
948 967
968 rhltable_destroy(&ctx->table->chains_ht);
949 kfree(ctx->table->name); 969 kfree(ctx->table->name);
950 kfree(ctx->table); 970 kfree(ctx->table);
951} 971}
@@ -978,8 +998,7 @@ EXPORT_SYMBOL_GPL(nft_unregister_chain_type);
978 */ 998 */
979 999
980static struct nft_chain * 1000static struct nft_chain *
981nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle, 1001nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask)
982 u8 genmask)
983{ 1002{
984 struct nft_chain *chain; 1003 struct nft_chain *chain;
985 1004
@@ -992,22 +1011,35 @@ nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle,
992 return ERR_PTR(-ENOENT); 1011 return ERR_PTR(-ENOENT);
993} 1012}
994 1013
995static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table, 1014static struct nft_chain *nft_chain_lookup(struct nft_table *table,
996 const struct nlattr *nla, 1015 const struct nlattr *nla, u8 genmask)
997 u8 genmask)
998{ 1016{
1017 char search[NFT_CHAIN_MAXNAMELEN + 1];
1018 struct rhlist_head *tmp, *list;
999 struct nft_chain *chain; 1019 struct nft_chain *chain;
1000 1020
1001 if (nla == NULL) 1021 if (nla == NULL)
1002 return ERR_PTR(-EINVAL); 1022 return ERR_PTR(-EINVAL);
1003 1023
1004 list_for_each_entry(chain, &table->chains, list) { 1024 nla_strlcpy(search, nla, sizeof(search));
1005 if (!nla_strcmp(nla, chain->name) &&
1006 nft_active_genmask(chain, genmask))
1007 return chain;
1008 }
1009 1025
1010 return ERR_PTR(-ENOENT); 1026 WARN_ON(!rcu_read_lock_held() &&
1027 !lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
1028
1029 chain = ERR_PTR(-ENOENT);
1030 rcu_read_lock();
1031 list = rhltable_lookup(&table->chains_ht, search, nft_chain_ht_params);
1032 if (!list)
1033 goto out_unlock;
1034
1035 rhl_for_each_entry_rcu(chain, tmp, list, rhlhead) {
1036 if (nft_active_genmask(chain, genmask))
1037 goto out_unlock;
1038 }
1039 chain = ERR_PTR(-ENOENT);
1040out_unlock:
1041 rcu_read_unlock();
1042 return chain;
1011} 1043}
1012 1044
1013static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { 1045static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
@@ -1203,6 +1235,7 @@ done:
1203 return skb->len; 1235 return skb->len;
1204} 1236}
1205 1237
1238/* called with rcu_read_lock held */
1206static int nf_tables_getchain(struct net *net, struct sock *nlsk, 1239static int nf_tables_getchain(struct net *net, struct sock *nlsk,
1207 struct sk_buff *skb, const struct nlmsghdr *nlh, 1240 struct sk_buff *skb, const struct nlmsghdr *nlh,
1208 const struct nlattr * const nla[], 1241 const struct nlattr * const nla[],
@@ -1210,8 +1243,8 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
1210{ 1243{
1211 const struct nfgenmsg *nfmsg = nlmsg_data(nlh); 1244 const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
1212 u8 genmask = nft_genmask_cur(net); 1245 u8 genmask = nft_genmask_cur(net);
1213 const struct nft_table *table;
1214 const struct nft_chain *chain; 1246 const struct nft_chain *chain;
1247 struct nft_table *table;
1215 struct sk_buff *skb2; 1248 struct sk_buff *skb2;
1216 int family = nfmsg->nfgen_family; 1249 int family = nfmsg->nfgen_family;
1217 int err; 1250 int err;
@@ -1219,20 +1252,25 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
1219 if (nlh->nlmsg_flags & NLM_F_DUMP) { 1252 if (nlh->nlmsg_flags & NLM_F_DUMP) {
1220 struct netlink_dump_control c = { 1253 struct netlink_dump_control c = {
1221 .dump = nf_tables_dump_chains, 1254 .dump = nf_tables_dump_chains,
1255 .module = THIS_MODULE,
1222 }; 1256 };
1223 return netlink_dump_start(nlsk, skb, nlh, &c); 1257
1258 return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
1224 } 1259 }
1225 1260
1226 table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, 1261 table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
1227 genmask); 1262 if (IS_ERR(table)) {
1228 if (IS_ERR(table)) 1263 NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
1229 return PTR_ERR(table); 1264 return PTR_ERR(table);
1265 }
1230 1266
1231 chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); 1267 chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
1232 if (IS_ERR(chain)) 1268 if (IS_ERR(chain)) {
1269 NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
1233 return PTR_ERR(chain); 1270 return PTR_ERR(chain);
1271 }
1234 1272
1235 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 1273 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
1236 if (!skb2) 1274 if (!skb2)
1237 return -ENOMEM; 1275 return -ENOMEM;
1238 1276
@@ -1304,17 +1342,32 @@ static void nft_chain_stats_replace(struct nft_base_chain *chain,
1304 } 1342 }
1305} 1343}
1306 1344
1345static void nf_tables_chain_free_chain_rules(struct nft_chain *chain)
1346{
1347 struct nft_rule **g0 = rcu_dereference_raw(chain->rules_gen_0);
1348 struct nft_rule **g1 = rcu_dereference_raw(chain->rules_gen_1);
1349
1350 if (g0 != g1)
1351 kvfree(g1);
1352 kvfree(g0);
1353
1354 /* should be NULL either via abort or via successful commit */
1355 WARN_ON_ONCE(chain->rules_next);
1356 kvfree(chain->rules_next);
1357}
1358
1307static void nf_tables_chain_destroy(struct nft_ctx *ctx) 1359static void nf_tables_chain_destroy(struct nft_ctx *ctx)
1308{ 1360{
1309 struct nft_chain *chain = ctx->chain; 1361 struct nft_chain *chain = ctx->chain;
1310 1362
1311 BUG_ON(chain->use > 0); 1363 BUG_ON(chain->use > 0);
1312 1364
1365 /* no concurrent access possible anymore */
1366 nf_tables_chain_free_chain_rules(chain);
1367
1313 if (nft_is_base_chain(chain)) { 1368 if (nft_is_base_chain(chain)) {
1314 struct nft_base_chain *basechain = nft_base_chain(chain); 1369 struct nft_base_chain *basechain = nft_base_chain(chain);
1315 1370
1316 if (basechain->type->free)
1317 basechain->type->free(ctx);
1318 module_put(basechain->type->owner); 1371 module_put(basechain->type->owner);
1319 free_percpu(basechain->stats); 1372 free_percpu(basechain->stats);
1320 if (basechain->stats) 1373 if (basechain->stats)
@@ -1404,6 +1457,27 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook)
1404 module_put(hook->type->owner); 1457 module_put(hook->type->owner);
1405} 1458}
1406 1459
1460struct nft_rules_old {
1461 struct rcu_head h;
1462 struct nft_rule **start;
1463};
1464
1465static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *chain,
1466 unsigned int alloc)
1467{
1468 if (alloc > INT_MAX)
1469 return NULL;
1470
1471 alloc += 1; /* NULL, ends rules */
1472 if (sizeof(struct nft_rule *) > INT_MAX / alloc)
1473 return NULL;
1474
1475 alloc *= sizeof(struct nft_rule *);
1476 alloc += sizeof(struct nft_rules_old);
1477
1478 return kvmalloc(alloc, GFP_KERNEL);
1479}
1480
1407static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, 1481static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
1408 u8 policy, bool create) 1482 u8 policy, bool create)
1409{ 1483{
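nf_tables_chain_alloc_rules() above sizes a single kvmalloc() blob holding the NULL-terminated array of rule pointers for one generation plus a trailing struct nft_rules_old, which is later used to free the previous generation via RCU. A standalone sketch of the same size computation and its two overflow guards (ordinary userspace C, illustrating only the arithmetic):

	#include <limits.h>
	#include <stdio.h>

	struct rules_old_demo {		/* stand-in for struct nft_rules_old */
		void *rcu_head;		/* placeholder for struct rcu_head */
		void **start;
	};

	/* Mirrors the guards in nf_tables_chain_alloc_rules(): refuse rule counts
	 * whose pointer array would overflow, then add room for the trailer. */
	static size_t rules_blob_size(unsigned long long nrules)
	{
		unsigned long long alloc = nrules;

		if (alloc > INT_MAX)
			return 0;
		alloc += 1;				/* trailing NULL sentinel */
		if (sizeof(void *) > INT_MAX / alloc)
			return 0;
		alloc *= sizeof(void *);
		alloc += sizeof(struct rules_old_demo);
		return (size_t)alloc;
	}

	int main(void)
	{
		printf("blob for 4 rules: %zu bytes\n", rules_blob_size(4));
		printf("blob for an absurd count: %zu bytes\n",
		       rules_blob_size(1ULL << 40));
		return 0;
	}
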
@@ -1413,6 +1487,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
1413 struct nft_stats __percpu *stats; 1487 struct nft_stats __percpu *stats;
1414 struct net *net = ctx->net; 1488 struct net *net = ctx->net;
1415 struct nft_chain *chain; 1489 struct nft_chain *chain;
1490 struct nft_rule **rules;
1416 int err; 1491 int err;
1417 1492
1418 if (table->use == UINT_MAX) 1493 if (table->use == UINT_MAX)
@@ -1447,9 +1522,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
1447 } 1522 }
1448 1523
1449 basechain->type = hook.type; 1524 basechain->type = hook.type;
1450 if (basechain->type->init)
1451 basechain->type->init(ctx);
1452
1453 chain = &basechain->chain; 1525 chain = &basechain->chain;
1454 1526
1455 ops = &basechain->ops; 1527 ops = &basechain->ops;
@@ -1460,9 +1532,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
1460 ops->hook = hook.type->hooks[ops->hooknum]; 1532 ops->hook = hook.type->hooks[ops->hooknum];
1461 ops->dev = hook.dev; 1533 ops->dev = hook.dev;
1462 1534
1463 if (basechain->type->type == NFT_CHAIN_T_NAT)
1464 ops->nat_hook = true;
1465
1466 chain->flags |= NFT_BASE_CHAIN; 1535 chain->flags |= NFT_BASE_CHAIN;
1467 basechain->policy = policy; 1536 basechain->policy = policy;
1468 } else { 1537 } else {
@@ -1481,13 +1550,31 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
1481 goto err1; 1550 goto err1;
1482 } 1551 }
1483 1552
1553 rules = nf_tables_chain_alloc_rules(chain, 0);
1554 if (!rules) {
1555 err = -ENOMEM;
1556 goto err1;
1557 }
1558
1559 *rules = NULL;
1560 rcu_assign_pointer(chain->rules_gen_0, rules);
1561 rcu_assign_pointer(chain->rules_gen_1, rules);
1562
1484 err = nf_tables_register_hook(net, table, chain); 1563 err = nf_tables_register_hook(net, table, chain);
1485 if (err < 0) 1564 if (err < 0)
1486 goto err1; 1565 goto err1;
1487 1566
1567 err = rhltable_insert_key(&table->chains_ht, chain->name,
1568 &chain->rhlhead, nft_chain_ht_params);
1569 if (err)
1570 goto err2;
1571
1488 err = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN); 1572 err = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN);
1489 if (err < 0) 1573 if (err < 0) {
1574 rhltable_remove(&table->chains_ht, &chain->rhlhead,
1575 nft_chain_ht_params);
1490 goto err2; 1576 goto err2;
1577 }
1491 1578
1492 table->use++; 1579 table->use++;
1493 list_add_tail_rcu(&chain->list, &table->chains); 1580 list_add_tail_rcu(&chain->list, &table->chains);
@@ -1544,8 +1631,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
1544 nla[NFTA_CHAIN_NAME]) { 1631 nla[NFTA_CHAIN_NAME]) {
1545 struct nft_chain *chain2; 1632 struct nft_chain *chain2;
1546 1633
1547 chain2 = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], 1634 chain2 = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
1548 genmask);
1549 if (!IS_ERR(chain2)) 1635 if (!IS_ERR(chain2))
1550 return -EEXIST; 1636 return -EEXIST;
1551 } 1637 }
@@ -1595,9 +1681,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
1595 struct netlink_ext_ack *extack) 1681 struct netlink_ext_ack *extack)
1596{ 1682{
1597 const struct nfgenmsg *nfmsg = nlmsg_data(nlh); 1683 const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
1598 const struct nlattr * uninitialized_var(name);
1599 u8 genmask = nft_genmask_next(net); 1684 u8 genmask = nft_genmask_next(net);
1600 int family = nfmsg->nfgen_family; 1685 int family = nfmsg->nfgen_family;
1686 const struct nlattr *attr;
1601 struct nft_table *table; 1687 struct nft_table *table;
1602 struct nft_chain *chain; 1688 struct nft_chain *chain;
1603 u8 policy = NF_ACCEPT; 1689 u8 policy = NF_ACCEPT;
@@ -1607,36 +1693,46 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
1607 1693
1608 create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; 1694 create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
1609 1695
1610 table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, 1696 table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
1611 genmask); 1697 if (IS_ERR(table)) {
1612 if (IS_ERR(table)) 1698 NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
1613 return PTR_ERR(table); 1699 return PTR_ERR(table);
1700 }
1614 1701
1615 chain = NULL; 1702 chain = NULL;
1616 name = nla[NFTA_CHAIN_NAME]; 1703 attr = nla[NFTA_CHAIN_NAME];
1617 1704
1618 if (nla[NFTA_CHAIN_HANDLE]) { 1705 if (nla[NFTA_CHAIN_HANDLE]) {
1619 handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); 1706 handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
1620 chain = nf_tables_chain_lookup_byhandle(table, handle, genmask); 1707 chain = nft_chain_lookup_byhandle(table, handle, genmask);
1621 if (IS_ERR(chain)) 1708 if (IS_ERR(chain)) {
1709 NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_HANDLE]);
1622 return PTR_ERR(chain); 1710 return PTR_ERR(chain);
1711 }
1712 attr = nla[NFTA_CHAIN_HANDLE];
1623 } else { 1713 } else {
1624 chain = nf_tables_chain_lookup(table, name, genmask); 1714 chain = nft_chain_lookup(table, attr, genmask);
1625 if (IS_ERR(chain)) { 1715 if (IS_ERR(chain)) {
1626 if (PTR_ERR(chain) != -ENOENT) 1716 if (PTR_ERR(chain) != -ENOENT) {
1717 NL_SET_BAD_ATTR(extack, attr);
1627 return PTR_ERR(chain); 1718 return PTR_ERR(chain);
1719 }
1628 chain = NULL; 1720 chain = NULL;
1629 } 1721 }
1630 } 1722 }
1631 1723
1632 if (nla[NFTA_CHAIN_POLICY]) { 1724 if (nla[NFTA_CHAIN_POLICY]) {
1633 if (chain != NULL && 1725 if (chain != NULL &&
1634 !nft_is_base_chain(chain)) 1726 !nft_is_base_chain(chain)) {
1727 NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]);
1635 return -EOPNOTSUPP; 1728 return -EOPNOTSUPP;
1729 }
1636 1730
1637 if (chain == NULL && 1731 if (chain == NULL &&
1638 nla[NFTA_CHAIN_HOOK] == NULL) 1732 nla[NFTA_CHAIN_HOOK] == NULL) {
1733 NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]);
1639 return -EOPNOTSUPP; 1734 return -EOPNOTSUPP;
1735 }
1640 1736
1641 policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY])); 1737 policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY]));
1642 switch (policy) { 1738 switch (policy) {
@@ -1651,8 +1747,10 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
1651 nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); 1747 nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
1652 1748
1653 if (chain != NULL) { 1749 if (chain != NULL) {
1654 if (nlh->nlmsg_flags & NLM_F_EXCL) 1750 if (nlh->nlmsg_flags & NLM_F_EXCL) {
1751 NL_SET_BAD_ATTR(extack, attr);
1655 return -EEXIST; 1752 return -EEXIST;
1753 }
1656 if (nlh->nlmsg_flags & NLM_F_REPLACE) 1754 if (nlh->nlmsg_flags & NLM_F_REPLACE)
1657 return -EOPNOTSUPP; 1755 return -EOPNOTSUPP;
1658 1756
@@ -1669,28 +1767,34 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
1669{ 1767{
1670 const struct nfgenmsg *nfmsg = nlmsg_data(nlh); 1768 const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
1671 u8 genmask = nft_genmask_next(net); 1769 u8 genmask = nft_genmask_next(net);
1770 int family = nfmsg->nfgen_family;
1771 const struct nlattr *attr;
1672 struct nft_table *table; 1772 struct nft_table *table;
1673 struct nft_chain *chain; 1773 struct nft_chain *chain;
1674 struct nft_rule *rule; 1774 struct nft_rule *rule;
1675 int family = nfmsg->nfgen_family;
1676 struct nft_ctx ctx; 1775 struct nft_ctx ctx;
1677 u64 handle; 1776 u64 handle;
1678 u32 use; 1777 u32 use;
1679 int err; 1778 int err;
1680 1779
1681 table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, 1780 table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
1682 genmask); 1781 if (IS_ERR(table)) {
1683 if (IS_ERR(table)) 1782 NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
1684 return PTR_ERR(table); 1783 return PTR_ERR(table);
1784 }
1685 1785
1686 if (nla[NFTA_CHAIN_HANDLE]) { 1786 if (nla[NFTA_CHAIN_HANDLE]) {
1687 handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); 1787 attr = nla[NFTA_CHAIN_HANDLE];
1688 chain = nf_tables_chain_lookup_byhandle(table, handle, genmask); 1788 handle = be64_to_cpu(nla_get_be64(attr));
1789 chain = nft_chain_lookup_byhandle(table, handle, genmask);
1689 } else { 1790 } else {
1690 chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); 1791 attr = nla[NFTA_CHAIN_NAME];
1792 chain = nft_chain_lookup(table, attr, genmask);
1691 } 1793 }
1692 if (IS_ERR(chain)) 1794 if (IS_ERR(chain)) {
1795 NL_SET_BAD_ATTR(extack, attr);
1693 return PTR_ERR(chain); 1796 return PTR_ERR(chain);
1797 }
1694 1798
1695 if (nlh->nlmsg_flags & NLM_F_NONREC && 1799 if (nlh->nlmsg_flags & NLM_F_NONREC &&
1696 chain->use > 0) 1800 chain->use > 0)
@@ -1712,8 +1816,10 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
1712 /* There are rules and elements that are still holding references to us, 1816 /* There are rules and elements that are still holding references to us,
1713 * we cannot do a recursive removal in this case. 1817 * we cannot do a recursive removal in this case.
1714 */ 1818 */
1715 if (use > 0) 1819 if (use > 0) {
1820 NL_SET_BAD_ATTR(extack, attr);
1716 return -EBUSY; 1821 return -EBUSY;
1822 }
1717 1823
1718 return nft_delchain(&ctx); 1824 return nft_delchain(&ctx);
1719} 1825}
@@ -1905,19 +2011,7 @@ static int nf_tables_newexpr(const struct nft_ctx *ctx,
1905 goto err1; 2011 goto err1;
1906 } 2012 }
1907 2013
1908 if (ops->validate) {
1909 const struct nft_data *data = NULL;
1910
1911 err = ops->validate(ctx, expr, &data);
1912 if (err < 0)
1913 goto err2;
1914 }
1915
1916 return 0; 2014 return 0;
1917
1918err2:
1919 if (ops->destroy)
1920 ops->destroy(ctx, expr);
1921err1: 2015err1:
1922 expr->ops = NULL; 2016 expr->ops = NULL;
1923 return err; 2017 return err;
@@ -1970,13 +2064,13 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
1970 * Rules 2064 * Rules
1971 */ 2065 */
1972 2066
1973static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain, 2067static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain,
1974 u64 handle) 2068 u64 handle)
1975{ 2069{
1976 struct nft_rule *rule; 2070 struct nft_rule *rule;
1977 2071
1978 // FIXME: this sucks 2072 // FIXME: this sucks
1979 list_for_each_entry(rule, &chain->rules, list) { 2073 list_for_each_entry_rcu(rule, &chain->rules, list) {
1980 if (handle == rule->handle) 2074 if (handle == rule->handle)
1981 return rule; 2075 return rule;
1982 } 2076 }
@@ -1984,13 +2078,13 @@ static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain,
1984 return ERR_PTR(-ENOENT); 2078 return ERR_PTR(-ENOENT);
1985} 2079}
1986 2080
1987static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain, 2081static struct nft_rule *nft_rule_lookup(const struct nft_chain *chain,
1988 const struct nlattr *nla) 2082 const struct nlattr *nla)
1989{ 2083{
1990 if (nla == NULL) 2084 if (nla == NULL)
1991 return ERR_PTR(-EINVAL); 2085 return ERR_PTR(-EINVAL);
1992 2086
1993 return __nf_tables_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla))); 2087 return __nft_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla)));
1994} 2088}
1995 2089
1996static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { 2090static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
@@ -2172,6 +2266,7 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb)
2172 return 0; 2266 return 0;
2173} 2267}
2174 2268
2269/* called with rcu_read_lock held */
2175static int nf_tables_getrule(struct net *net, struct sock *nlsk, 2270static int nf_tables_getrule(struct net *net, struct sock *nlsk,
2176 struct sk_buff *skb, const struct nlmsghdr *nlh, 2271 struct sk_buff *skb, const struct nlmsghdr *nlh,
2177 const struct nlattr * const nla[], 2272 const struct nlattr * const nla[],
@@ -2179,9 +2274,9 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
2179{ 2274{
2180 const struct nfgenmsg *nfmsg = nlmsg_data(nlh); 2275 const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
2181 u8 genmask = nft_genmask_cur(net); 2276 u8 genmask = nft_genmask_cur(net);
2182 const struct nft_table *table;
2183 const struct nft_chain *chain; 2277 const struct nft_chain *chain;
2184 const struct nft_rule *rule; 2278 const struct nft_rule *rule;
2279 struct nft_table *table;
2185 struct sk_buff *skb2; 2280 struct sk_buff *skb2;
2186 int family = nfmsg->nfgen_family; 2281 int family = nfmsg->nfgen_family;
2187 int err; 2282 int err;
@@ -2190,18 +2285,19 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
2190 struct netlink_dump_control c = { 2285 struct netlink_dump_control c = {
2191 .dump = nf_tables_dump_rules, 2286 .dump = nf_tables_dump_rules,
2192 .done = nf_tables_dump_rules_done, 2287 .done = nf_tables_dump_rules_done,
2288 .module = THIS_MODULE,
2193 }; 2289 };
2194 2290
2195 if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) { 2291 if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) {
2196 struct nft_rule_dump_ctx *ctx; 2292 struct nft_rule_dump_ctx *ctx;
2197 2293
2198 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 2294 ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
2199 if (!ctx) 2295 if (!ctx)
2200 return -ENOMEM; 2296 return -ENOMEM;
2201 2297
2202 if (nla[NFTA_RULE_TABLE]) { 2298 if (nla[NFTA_RULE_TABLE]) {
2203 ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], 2299 ctx->table = nla_strdup(nla[NFTA_RULE_TABLE],
2204 GFP_KERNEL); 2300 GFP_ATOMIC);
2205 if (!ctx->table) { 2301 if (!ctx->table) {
2206 kfree(ctx); 2302 kfree(ctx);
2207 return -ENOMEM; 2303 return -ENOMEM;
@@ -2209,7 +2305,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
2209 } 2305 }
2210 if (nla[NFTA_RULE_CHAIN]) { 2306 if (nla[NFTA_RULE_CHAIN]) {
2211 ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], 2307 ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN],
2212 GFP_KERNEL); 2308 GFP_ATOMIC);
2213 if (!ctx->chain) { 2309 if (!ctx->chain) {
2214 kfree(ctx->table); 2310 kfree(ctx->table);
2215 kfree(ctx); 2311 kfree(ctx);
@@ -2219,23 +2315,28 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
2219 c.data = ctx; 2315 c.data = ctx;
2220 } 2316 }
2221 2317
2222 return netlink_dump_start(nlsk, skb, nlh, &c); 2318 return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
2223 } 2319 }
2224 2320
2225 table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, 2321 table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
2226 genmask); 2322 if (IS_ERR(table)) {
2227 if (IS_ERR(table)) 2323 NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
2228 return PTR_ERR(table); 2324 return PTR_ERR(table);
2325 }
2229 2326
2230 chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); 2327 chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
2231 if (IS_ERR(chain)) 2328 if (IS_ERR(chain)) {
2329 NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
2232 return PTR_ERR(chain); 2330 return PTR_ERR(chain);
2331 }
2233 2332
2234 rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); 2333 rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
2235 if (IS_ERR(rule)) 2334 if (IS_ERR(rule)) {
2335 NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
2236 return PTR_ERR(rule); 2336 return PTR_ERR(rule);
2337 }
2237 2338
2238 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2339 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
2239 if (!skb2) 2340 if (!skb2)
2240 return -ENOMEM; 2341 return -ENOMEM;
2241 2342
@@ -2276,6 +2377,53 @@ static void nf_tables_rule_release(const struct nft_ctx *ctx,
2276 nf_tables_rule_destroy(ctx, rule); 2377 nf_tables_rule_destroy(ctx, rule);
2277} 2378}
2278 2379
2380int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
2381{
2382 struct nft_expr *expr, *last;
2383 const struct nft_data *data;
2384 struct nft_rule *rule;
2385 int err;
2386
2387 list_for_each_entry(rule, &chain->rules, list) {
2388 if (!nft_is_active_next(ctx->net, rule))
2389 continue;
2390
2391 nft_rule_for_each_expr(expr, last, rule) {
2392 if (!expr->ops->validate)
2393 continue;
2394
2395 err = expr->ops->validate(ctx, expr, &data);
2396 if (err < 0)
2397 return err;
2398 }
2399 }
2400
2401 return 0;
2402}
2403EXPORT_SYMBOL_GPL(nft_chain_validate);
2404
2405static int nft_table_validate(struct net *net, const struct nft_table *table)
2406{
2407 struct nft_chain *chain;
2408 struct nft_ctx ctx = {
2409 .net = net,
2410 .family = table->family,
2411 };
2412 int err;
2413
2414 list_for_each_entry(chain, &table->chains, list) {
2415 if (!nft_is_base_chain(chain))
2416 continue;
2417
2418 ctx.chain = chain;
2419 err = nft_chain_validate(&ctx, chain);
2420 if (err < 0)
2421 return err;
2422 }
2423
2424 return 0;
2425}
2426
2279#define NFT_RULE_MAXEXPRS 128 2427#define NFT_RULE_MAXEXPRS 128
2280 2428
2281static struct nft_expr_info *info; 2429static struct nft_expr_info *info;
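nft_chain_validate() and nft_table_validate() replace the per-expression ->validate() call that nf_tables_newexpr() used to make (removed earlier in this patch): an expression that needs checking now only flags the netns via nft_validate_state_update(net, NFT_VALIDATE_NEED), and the whole table is walked once the state reaches NFT_VALIDATE_DO (see the end of nf_tables_newrule() below). The enum and the update helper are defined outside this excerpt; the following is an assumed sketch of their shape, not the literal patch text:

	/* Assumed definitions (not quoted from this diff): a per-netns tri-state
	 * that batches validation instead of validating each expression inline. */
	enum nft_validation_states {
		NFT_VALIDATE_SKIP	= 0,	/* nothing pending */
		NFT_VALIDATE_NEED,		/* some expression requested validation */
		NFT_VALIDATE_DO,		/* validate the affected tables now */
	};

	static void nft_validate_state_update(struct net *net, u8 new_validate_state)
	{
		/* sketch: never step back from DO to NEED; the table walk resets to SKIP */
		if (net->nft.validate_state > new_validate_state)
			return;

		net->nft.validate_state = new_validate_state;
	}
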
@@ -2303,23 +2451,30 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
2303 2451
2304 create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; 2452 create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
2305 2453
2306 table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, 2454 table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
2307 genmask); 2455 if (IS_ERR(table)) {
2308 if (IS_ERR(table)) 2456 NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
2309 return PTR_ERR(table); 2457 return PTR_ERR(table);
2458 }
2310 2459
2311 chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); 2460 chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
2312 if (IS_ERR(chain)) 2461 if (IS_ERR(chain)) {
2462 NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
2313 return PTR_ERR(chain); 2463 return PTR_ERR(chain);
2464 }
2314 2465
2315 if (nla[NFTA_RULE_HANDLE]) { 2466 if (nla[NFTA_RULE_HANDLE]) {
2316 handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE])); 2467 handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE]));
2317 rule = __nf_tables_rule_lookup(chain, handle); 2468 rule = __nft_rule_lookup(chain, handle);
2318 if (IS_ERR(rule)) 2469 if (IS_ERR(rule)) {
2470 NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
2319 return PTR_ERR(rule); 2471 return PTR_ERR(rule);
2472 }
2320 2473
2321 if (nlh->nlmsg_flags & NLM_F_EXCL) 2474 if (nlh->nlmsg_flags & NLM_F_EXCL) {
2475 NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
2322 return -EEXIST; 2476 return -EEXIST;
2477 }
2323 if (nlh->nlmsg_flags & NLM_F_REPLACE) 2478 if (nlh->nlmsg_flags & NLM_F_REPLACE)
2324 old_rule = rule; 2479 old_rule = rule;
2325 else 2480 else
@@ -2338,9 +2493,11 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
2338 return -EOPNOTSUPP; 2493 return -EOPNOTSUPP;
2339 2494
2340 pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); 2495 pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION]));
2341 old_rule = __nf_tables_rule_lookup(chain, pos_handle); 2496 old_rule = __nft_rule_lookup(chain, pos_handle);
2342 if (IS_ERR(old_rule)) 2497 if (IS_ERR(old_rule)) {
2498 NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]);
2343 return PTR_ERR(old_rule); 2499 return PTR_ERR(old_rule);
2500 }
2344 } 2501 }
2345 2502
2346 nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); 2503 nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
@@ -2394,6 +2551,10 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
2394 err = nf_tables_newexpr(&ctx, &info[i], expr); 2551 err = nf_tables_newexpr(&ctx, &info[i], expr);
2395 if (err < 0) 2552 if (err < 0)
2396 goto err2; 2553 goto err2;
2554
2555 if (info[i].ops->validate)
2556 nft_validate_state_update(net, NFT_VALIDATE_NEED);
2557
2397 info[i].ops = NULL; 2558 info[i].ops = NULL;
2398 expr = nft_expr_next(expr); 2559 expr = nft_expr_next(expr);
2399 } 2560 }
@@ -2437,8 +2598,11 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
2437 } 2598 }
2438 } 2599 }
2439 chain->use++; 2600 chain->use++;
2440 return 0;
2441 2601
2602 if (net->nft.validate_state == NFT_VALIDATE_DO)
2603 return nft_table_validate(net, table);
2604
2605 return 0;
2442err2: 2606err2:
2443 nf_tables_rule_release(&ctx, rule); 2607 nf_tables_rule_release(&ctx, rule);
2444err1: 2608err1:
@@ -2478,32 +2642,37 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
2478 int family = nfmsg->nfgen_family, err = 0; 2642 int family = nfmsg->nfgen_family, err = 0;
2479 struct nft_ctx ctx; 2643 struct nft_ctx ctx;
2480 2644
2481 table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, 2645 table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
2482 genmask); 2646 if (IS_ERR(table)) {
2483 if (IS_ERR(table)) 2647 NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
2484 return PTR_ERR(table); 2648 return PTR_ERR(table);
2649 }
2485 2650
2486 if (nla[NFTA_RULE_CHAIN]) { 2651 if (nla[NFTA_RULE_CHAIN]) {
2487 chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], 2652 chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
2488 genmask); 2653 if (IS_ERR(chain)) {
2489 if (IS_ERR(chain)) 2654 NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
2490 return PTR_ERR(chain); 2655 return PTR_ERR(chain);
2656 }
2491 } 2657 }
2492 2658
2493 nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); 2659 nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
2494 2660
2495 if (chain) { 2661 if (chain) {
2496 if (nla[NFTA_RULE_HANDLE]) { 2662 if (nla[NFTA_RULE_HANDLE]) {
2497 rule = nf_tables_rule_lookup(chain, 2663 rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
2498 nla[NFTA_RULE_HANDLE]); 2664 if (IS_ERR(rule)) {
2499 if (IS_ERR(rule)) 2665 NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
2500 return PTR_ERR(rule); 2666 return PTR_ERR(rule);
2667 }
2501 2668
2502 err = nft_delrule(&ctx, rule); 2669 err = nft_delrule(&ctx, rule);
2503 } else if (nla[NFTA_RULE_ID]) { 2670 } else if (nla[NFTA_RULE_ID]) {
2504 rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]); 2671 rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]);
2505 if (IS_ERR(rule)) 2672 if (IS_ERR(rule)) {
2673 NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_ID]);
2506 return PTR_ERR(rule); 2674 return PTR_ERR(rule);
2675 }
2507 2676
2508 err = nft_delrule(&ctx, rule); 2677 err = nft_delrule(&ctx, rule);
2509 } else { 2678 } else {
@@ -2548,14 +2717,12 @@ void nft_unregister_set(struct nft_set_type *type)
2548EXPORT_SYMBOL_GPL(nft_unregister_set); 2717EXPORT_SYMBOL_GPL(nft_unregister_set);
2549 2718
2550#define NFT_SET_FEATURES (NFT_SET_INTERVAL | NFT_SET_MAP | \ 2719#define NFT_SET_FEATURES (NFT_SET_INTERVAL | NFT_SET_MAP | \
2551 NFT_SET_TIMEOUT | NFT_SET_OBJECT) 2720 NFT_SET_TIMEOUT | NFT_SET_OBJECT | \
2721 NFT_SET_EVAL)
2552 2722
2553static bool nft_set_ops_candidate(const struct nft_set_ops *ops, u32 flags) 2723static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags)
2554{ 2724{
2555 if ((flags & NFT_SET_EVAL) && !ops->update) 2725 return (flags & type->features) == (flags & NFT_SET_FEATURES);
2556 return false;
2557
2558 return (flags & ops->features) == (flags & NFT_SET_FEATURES);
2559} 2726}
2560 2727
2561/* 2728/*
@@ -2592,14 +2759,9 @@ nft_select_set_ops(const struct nft_ctx *ctx,
2592 best.space = ~0; 2759 best.space = ~0;
2593 2760
2594 list_for_each_entry(type, &nf_tables_set_types, list) { 2761 list_for_each_entry(type, &nf_tables_set_types, list) {
2595 if (!type->select_ops) 2762 ops = &type->ops;
2596 ops = type->ops;
2597 else
2598 ops = type->select_ops(ctx, desc, flags);
2599 if (!ops)
2600 continue;
2601 2763
2602 if (!nft_set_ops_candidate(ops, flags)) 2764 if (!nft_set_ops_candidate(type, flags))
2603 continue; 2765 continue;
2604 if (!ops->estimate(desc, flags, &est)) 2766 if (!ops->estimate(desc, flags, &est))
2605 continue; 2767 continue;
@@ -2630,7 +2792,7 @@ nft_select_set_ops(const struct nft_ctx *ctx,
2630 if (!try_module_get(type->owner)) 2792 if (!try_module_get(type->owner))
2631 continue; 2793 continue;
2632 if (bops != NULL) 2794 if (bops != NULL)
2633 module_put(bops->type->owner); 2795 module_put(to_set_type(bops)->owner);
2634 2796
2635 bops = ops; 2797 bops = ops;
2636 best = est; 2798 best = est;
@@ -2671,6 +2833,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
2671 const struct sk_buff *skb, 2833 const struct sk_buff *skb,
2672 const struct nlmsghdr *nlh, 2834 const struct nlmsghdr *nlh,
2673 const struct nlattr * const nla[], 2835 const struct nlattr * const nla[],
2836 struct netlink_ext_ack *extack,
2674 u8 genmask) 2837 u8 genmask)
2675{ 2838{
2676 const struct nfgenmsg *nfmsg = nlmsg_data(nlh); 2839 const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
@@ -2678,25 +2841,27 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
2678 struct nft_table *table = NULL; 2841 struct nft_table *table = NULL;
2679 2842
2680 if (nla[NFTA_SET_TABLE] != NULL) { 2843 if (nla[NFTA_SET_TABLE] != NULL) {
2681 table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], 2844 table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family,
2682 family, genmask); 2845 genmask);
2683 if (IS_ERR(table)) 2846 if (IS_ERR(table)) {
2847 NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
2684 return PTR_ERR(table); 2848 return PTR_ERR(table);
2849 }
2685 } 2850 }
2686 2851
2687 nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla); 2852 nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla);
2688 return 0; 2853 return 0;
2689} 2854}
2690 2855
2691static struct nft_set *nf_tables_set_lookup(const struct nft_table *table, 2856static struct nft_set *nft_set_lookup(const struct nft_table *table,
2692 const struct nlattr *nla, u8 genmask) 2857 const struct nlattr *nla, u8 genmask)
2693{ 2858{
2694 struct nft_set *set; 2859 struct nft_set *set;
2695 2860
2696 if (nla == NULL) 2861 if (nla == NULL)
2697 return ERR_PTR(-EINVAL); 2862 return ERR_PTR(-EINVAL);
2698 2863
2699 list_for_each_entry(set, &table->sets, list) { 2864 list_for_each_entry_rcu(set, &table->sets, list) {
2700 if (!nla_strcmp(nla, set->name) && 2865 if (!nla_strcmp(nla, set->name) &&
2701 nft_active_genmask(set, genmask)) 2866 nft_active_genmask(set, genmask))
2702 return set; 2867 return set;
@@ -2704,14 +2869,12 @@ static struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
2704 return ERR_PTR(-ENOENT); 2869 return ERR_PTR(-ENOENT);
2705} 2870}
2706 2871
2707static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *table, 2872static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table,
2708 const struct nlattr *nla, u8 genmask) 2873 const struct nlattr *nla,
2874 u8 genmask)
2709{ 2875{
2710 struct nft_set *set; 2876 struct nft_set *set;
2711 2877
2712 if (nla == NULL)
2713 return ERR_PTR(-EINVAL);
2714
2715 list_for_each_entry(set, &table->sets, list) { 2878 list_for_each_entry(set, &table->sets, list) {
2716 if (be64_to_cpu(nla_get_be64(nla)) == set->handle && 2879 if (be64_to_cpu(nla_get_be64(nla)) == set->handle &&
2717 nft_active_genmask(set, genmask)) 2880 nft_active_genmask(set, genmask))
@@ -2720,9 +2883,8 @@ static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *tab
2720 return ERR_PTR(-ENOENT); 2883 return ERR_PTR(-ENOENT);
2721} 2884}
2722 2885
2723static struct nft_set *nf_tables_set_lookup_byid(const struct net *net, 2886static struct nft_set *nft_set_lookup_byid(const struct net *net,
2724 const struct nlattr *nla, 2887 const struct nlattr *nla, u8 genmask)
2725 u8 genmask)
2726{ 2888{
2727 struct nft_trans *trans; 2889 struct nft_trans *trans;
2728 u32 id = ntohl(nla_get_be32(nla)); 2890 u32 id = ntohl(nla_get_be32(nla));
@@ -2746,12 +2908,12 @@ struct nft_set *nft_set_lookup_global(const struct net *net,
2746{ 2908{
2747 struct nft_set *set; 2909 struct nft_set *set;
2748 2910
2749 set = nf_tables_set_lookup(table, nla_set_name, genmask); 2911 set = nft_set_lookup(table, nla_set_name, genmask);
2750 if (IS_ERR(set)) { 2912 if (IS_ERR(set)) {
2751 if (!nla_set_id) 2913 if (!nla_set_id)
2752 return set; 2914 return set;
2753 2915
2754 set = nf_tables_set_lookup_byid(net, nla_set_id, genmask); 2916 set = nft_set_lookup_byid(net, nla_set_id, genmask);
2755 } 2917 }
2756 return set; 2918 return set;
2757} 2919}
@@ -2811,6 +2973,27 @@ cont:
2811 return 0; 2973 return 0;
2812} 2974}
2813 2975
2976static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
2977{
2978 u64 ms = be64_to_cpu(nla_get_be64(nla));
2979 u64 max = (u64)(~((u64)0));
2980
2981 max = div_u64(max, NSEC_PER_MSEC);
2982 if (ms >= max)
2983 return -ERANGE;
2984
2985 ms *= NSEC_PER_MSEC;
2986 *result = nsecs_to_jiffies64(ms);
2987 return 0;
2988}
2989
2990static __be64 nf_jiffies64_to_msecs(u64 input)
2991{
2992 u64 ms = jiffies64_to_nsecs(input);
2993
2994 return cpu_to_be64(div_u64(ms, NSEC_PER_MSEC));
2995}
2996
2814static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, 2997static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
2815 const struct nft_set *set, u16 event, u16 flags) 2998 const struct nft_set *set, u16 event, u16 flags)
2816{ 2999{
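nf_msecs_to_jiffies64() and nf_jiffies64_to_msecs() replace the msecs_to_jiffies()/jiffies_to_msecs() conversions used for set and element timeouts with 64-bit variants, and reject userspace timeouts that would overflow once multiplied by NSEC_PER_MSEC: values at or above U64_MAX / NSEC_PER_MSEC milliseconds (roughly 584 years) return -ERANGE. A standalone check of that bound (plain userspace C, arithmetic only):

	#include <stdint.h>
	#include <stdio.h>
	#include <inttypes.h>

	#define NSEC_PER_MSEC 1000000ULL

	int main(void)
	{
		/* Same bound the helper enforces: ms * NSEC_PER_MSEC must fit in a u64. */
		uint64_t max_ms = UINT64_MAX / NSEC_PER_MSEC;
		double years = (double)max_ms / 1000.0 / 3600.0 / 24.0 / 365.25;

		printf("timeouts must stay below %" PRIu64 " ms (~%.0f years)\n",
		       max_ms, years);
		return 0;
	}
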
@@ -2858,7 +3041,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
2858 3041
2859 if (set->timeout && 3042 if (set->timeout &&
2860 nla_put_be64(skb, NFTA_SET_TIMEOUT, 3043 nla_put_be64(skb, NFTA_SET_TIMEOUT,
2861 cpu_to_be64(jiffies_to_msecs(set->timeout)), 3044 nf_jiffies64_to_msecs(set->timeout),
2862 NFTA_SET_PAD)) 3045 NFTA_SET_PAD))
2863 goto nla_put_failure; 3046 goto nla_put_failure;
2864 if (set->gc_int && 3047 if (set->gc_int &&
@@ -2983,6 +3166,7 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb)
2983 return 0; 3166 return 0;
2984} 3167}
2985 3168
3169/* called with rcu_read_lock held */
2986static int nf_tables_getset(struct net *net, struct sock *nlsk, 3170static int nf_tables_getset(struct net *net, struct sock *nlsk,
2987 struct sk_buff *skb, const struct nlmsghdr *nlh, 3171 struct sk_buff *skb, const struct nlmsghdr *nlh,
2988 const struct nlattr * const nla[], 3172 const struct nlattr * const nla[],
@@ -2996,7 +3180,8 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
2996 int err; 3180 int err;
2997 3181
2998 /* Verify existence before starting dump */ 3182 /* Verify existence before starting dump */
2999 err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask); 3183 err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack,
3184 genmask);
3000 if (err < 0) 3185 if (err < 0)
3001 return err; 3186 return err;
3002 3187
@@ -3004,17 +3189,18 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
3004 struct netlink_dump_control c = { 3189 struct netlink_dump_control c = {
3005 .dump = nf_tables_dump_sets, 3190 .dump = nf_tables_dump_sets,
3006 .done = nf_tables_dump_sets_done, 3191 .done = nf_tables_dump_sets_done,
3192 .module = THIS_MODULE,
3007 }; 3193 };
3008 struct nft_ctx *ctx_dump; 3194 struct nft_ctx *ctx_dump;
3009 3195
3010 ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_KERNEL); 3196 ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_ATOMIC);
3011 if (ctx_dump == NULL) 3197 if (ctx_dump == NULL)
3012 return -ENOMEM; 3198 return -ENOMEM;
3013 3199
3014 *ctx_dump = ctx; 3200 *ctx_dump = ctx;
3015 c.data = ctx_dump; 3201 c.data = ctx_dump;
3016 3202
3017 return netlink_dump_start(nlsk, skb, nlh, &c); 3203 return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
3018 } 3204 }
3019 3205
3020 /* Only accept unspec with dump */ 3206 /* Only accept unspec with dump */
@@ -3023,11 +3209,11 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
3023 if (!nla[NFTA_SET_TABLE]) 3209 if (!nla[NFTA_SET_TABLE])
3024 return -EINVAL; 3210 return -EINVAL;
3025 3211
3026 set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); 3212 set = nft_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
3027 if (IS_ERR(set)) 3213 if (IS_ERR(set))
3028 return PTR_ERR(set); 3214 return PTR_ERR(set);
3029 3215
3030 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 3216 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
3031 if (skb2 == NULL) 3217 if (skb2 == NULL)
3032 return -ENOMEM; 3218 return -ENOMEM;
3033 3219
@@ -3153,8 +3339,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
3153 if (nla[NFTA_SET_TIMEOUT] != NULL) { 3339 if (nla[NFTA_SET_TIMEOUT] != NULL) {
3154 if (!(flags & NFT_SET_TIMEOUT)) 3340 if (!(flags & NFT_SET_TIMEOUT))
3155 return -EINVAL; 3341 return -EINVAL;
3156 timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64( 3342
3157 nla[NFTA_SET_TIMEOUT]))); 3343 err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &timeout);
3344 if (err)
3345 return err;
3158 } 3346 }
3159 gc_int = 0; 3347 gc_int = 0;
3160 if (nla[NFTA_SET_GC_INTERVAL] != NULL) { 3348 if (nla[NFTA_SET_GC_INTERVAL] != NULL) {
@@ -3175,22 +3363,28 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
3175 3363
3176 create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; 3364 create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
3177 3365
3178 table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], family, 3366 table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask);
3179 genmask); 3367 if (IS_ERR(table)) {
3180 if (IS_ERR(table)) 3368 NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
3181 return PTR_ERR(table); 3369 return PTR_ERR(table);
3370 }
3182 3371
3183 nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); 3372 nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
3184 3373
3185 set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME], genmask); 3374 set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask);
3186 if (IS_ERR(set)) { 3375 if (IS_ERR(set)) {
3187 if (PTR_ERR(set) != -ENOENT) 3376 if (PTR_ERR(set) != -ENOENT) {
3377 NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
3188 return PTR_ERR(set); 3378 return PTR_ERR(set);
3379 }
3189 } else { 3380 } else {
3190 if (nlh->nlmsg_flags & NLM_F_EXCL) 3381 if (nlh->nlmsg_flags & NLM_F_EXCL) {
3382 NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
3191 return -EEXIST; 3383 return -EEXIST;
3384 }
3192 if (nlh->nlmsg_flags & NLM_F_REPLACE) 3385 if (nlh->nlmsg_flags & NLM_F_REPLACE)
3193 return -EOPNOTSUPP; 3386 return -EOPNOTSUPP;
3387
3194 return 0; 3388 return 0;
3195 } 3389 }
3196 3390
@@ -3233,6 +3427,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
3233 } 3427 }
3234 3428
3235 INIT_LIST_HEAD(&set->bindings); 3429 INIT_LIST_HEAD(&set->bindings);
3430 set->table = table;
3431 write_pnet(&set->net, net);
3236 set->ops = ops; 3432 set->ops = ops;
3237 set->ktype = ktype; 3433 set->ktype = ktype;
3238 set->klen = desc.klen; 3434 set->klen = desc.klen;
@@ -3267,14 +3463,14 @@ err3:
3267err2: 3463err2:
3268 kvfree(set); 3464 kvfree(set);
3269err1: 3465err1:
3270 module_put(ops->type->owner); 3466 module_put(to_set_type(ops)->owner);
3271 return err; 3467 return err;
3272} 3468}
3273 3469
3274static void nft_set_destroy(struct nft_set *set) 3470static void nft_set_destroy(struct nft_set *set)
3275{ 3471{
3276 set->ops->destroy(set); 3472 set->ops->destroy(set);
3277 module_put(set->ops->type->owner); 3473 module_put(to_set_type(set->ops)->owner);
3278 kfree(set->name); 3474 kfree(set->name);
3279 kvfree(set); 3475 kvfree(set);
3280} 3476}
@@ -3293,6 +3489,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
3293{ 3489{
3294 const struct nfgenmsg *nfmsg = nlmsg_data(nlh); 3490 const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
3295 u8 genmask = nft_genmask_next(net); 3491 u8 genmask = nft_genmask_next(net);
3492 const struct nlattr *attr;
3296 struct nft_set *set; 3493 struct nft_set *set;
3297 struct nft_ctx ctx; 3494 struct nft_ctx ctx;
3298 int err; 3495 int err;
@@ -3302,20 +3499,28 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
3302 if (nla[NFTA_SET_TABLE] == NULL) 3499 if (nla[NFTA_SET_TABLE] == NULL)
3303 return -EINVAL; 3500 return -EINVAL;
3304 3501
3305 err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask); 3502 err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack,
3503 genmask);
3306 if (err < 0) 3504 if (err < 0)
3307 return err; 3505 return err;
3308 3506
3309 if (nla[NFTA_SET_HANDLE]) 3507 if (nla[NFTA_SET_HANDLE]) {
3310 set = nf_tables_set_lookup_byhandle(ctx.table, nla[NFTA_SET_HANDLE], genmask); 3508 attr = nla[NFTA_SET_HANDLE];
3311 else 3509 set = nft_set_lookup_byhandle(ctx.table, attr, genmask);
3312 set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); 3510 } else {
3313 if (IS_ERR(set)) 3511 attr = nla[NFTA_SET_NAME];
3314 return PTR_ERR(set); 3512 set = nft_set_lookup(ctx.table, attr, genmask);
3513 }
3315 3514
3515 if (IS_ERR(set)) {
3516 NL_SET_BAD_ATTR(extack, attr);
3517 return PTR_ERR(set);
3518 }
3316 if (!list_empty(&set->bindings) || 3519 if (!list_empty(&set->bindings) ||
3317 (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0)) 3520 (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0)) {
3521 NL_SET_BAD_ATTR(extack, attr);
3318 return -EBUSY; 3522 return -EBUSY;
3523 }
3319 3524
3320 return nft_delset(&ctx, set); 3525 return nft_delset(&ctx, set);
3321} 3526}
@@ -3405,8 +3610,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
3405 .align = __alignof__(u64), 3610 .align = __alignof__(u64),
3406 }, 3611 },
3407 [NFT_SET_EXT_EXPIRATION] = { 3612 [NFT_SET_EXT_EXPIRATION] = {
3408 .len = sizeof(unsigned long), 3613 .len = sizeof(u64),
3409 .align = __alignof__(unsigned long), 3614 .align = __alignof__(u64),
3410 }, 3615 },
3411 [NFT_SET_EXT_USERDATA] = { 3616 [NFT_SET_EXT_USERDATA] = {
3412 .len = sizeof(struct nft_userdata), 3617 .len = sizeof(struct nft_userdata),
@@ -3443,16 +3648,19 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
3443 const struct sk_buff *skb, 3648 const struct sk_buff *skb,
3444 const struct nlmsghdr *nlh, 3649 const struct nlmsghdr *nlh,
3445 const struct nlattr * const nla[], 3650 const struct nlattr * const nla[],
3651 struct netlink_ext_ack *extack,
3446 u8 genmask) 3652 u8 genmask)
3447{ 3653{
3448 const struct nfgenmsg *nfmsg = nlmsg_data(nlh); 3654 const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
3449 int family = nfmsg->nfgen_family; 3655 int family = nfmsg->nfgen_family;
3450 struct nft_table *table; 3656 struct nft_table *table;
3451 3657
3452 table = nf_tables_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], 3658 table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
3453 family, genmask); 3659 genmask);
3454 if (IS_ERR(table)) 3660 if (IS_ERR(table)) {
3661 NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
3455 return PTR_ERR(table); 3662 return PTR_ERR(table);
3663 }
3456 3664
3457 nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla); 3665 nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla);
3458 return 0; 3666 return 0;
@@ -3496,22 +3704,21 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
3496 3704
3497 if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && 3705 if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
3498 nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, 3706 nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT,
3499 cpu_to_be64(jiffies_to_msecs( 3707 nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)),
3500 *nft_set_ext_timeout(ext))),
3501 NFTA_SET_ELEM_PAD)) 3708 NFTA_SET_ELEM_PAD))
3502 goto nla_put_failure; 3709 goto nla_put_failure;
3503 3710
3504 if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { 3711 if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
3505 unsigned long expires, now = jiffies; 3712 u64 expires, now = get_jiffies_64();
3506 3713
3507 expires = *nft_set_ext_expiration(ext); 3714 expires = *nft_set_ext_expiration(ext);
3508 if (time_before(now, expires)) 3715 if (time_before64(now, expires))
3509 expires -= now; 3716 expires -= now;
3510 else 3717 else
3511 expires = 0; 3718 expires = 0;
3512 3719
3513 if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, 3720 if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION,
3514 cpu_to_be64(jiffies_to_msecs(expires)), 3721 nf_jiffies64_to_msecs(expires),
3515 NFTA_SET_ELEM_PAD)) 3722 NFTA_SET_ELEM_PAD))
3516 goto nla_put_failure; 3723 goto nla_put_failure;
3517 } 3724 }
@@ -3749,7 +3956,7 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
3749 ext = nft_set_elem_ext(set, &elem); 3956 ext = nft_set_elem_ext(set, &elem);
3750 3957
3751 err = -ENOMEM; 3958 err = -ENOMEM;
3752 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); 3959 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
3753 if (skb == NULL) 3960 if (skb == NULL)
3754 goto err1; 3961 goto err1;
3755 3962
@@ -3771,6 +3978,7 @@ err1:
3771 return err == -EAGAIN ? -ENOBUFS : err; 3978 return err == -EAGAIN ? -ENOBUFS : err;
3772} 3979}
3773 3980
3981/* called with rcu_read_lock held */
3774static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, 3982static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
3775 struct sk_buff *skb, const struct nlmsghdr *nlh, 3983 struct sk_buff *skb, const struct nlmsghdr *nlh,
3776 const struct nlattr * const nla[], 3984 const struct nlattr * const nla[],
@@ -3782,12 +3990,12 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
3782 struct nft_ctx ctx; 3990 struct nft_ctx ctx;
3783 int rem, err = 0; 3991 int rem, err = 0;
3784 3992
3785 err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); 3993 err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
3994 genmask);
3786 if (err < 0) 3995 if (err < 0)
3787 return err; 3996 return err;
3788 3997
3789 set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], 3998 set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
3790 genmask);
3791 if (IS_ERR(set)) 3999 if (IS_ERR(set))
3792 return PTR_ERR(set); 4000 return PTR_ERR(set);
3793 4001
@@ -3795,10 +4003,11 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
3795 struct netlink_dump_control c = { 4003 struct netlink_dump_control c = {
3796 .dump = nf_tables_dump_set, 4004 .dump = nf_tables_dump_set,
3797 .done = nf_tables_dump_set_done, 4005 .done = nf_tables_dump_set_done,
4006 .module = THIS_MODULE,
3798 }; 4007 };
3799 struct nft_set_dump_ctx *dump_ctx; 4008 struct nft_set_dump_ctx *dump_ctx;
3800 4009
3801 dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_KERNEL); 4010 dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_ATOMIC);
3802 if (!dump_ctx) 4011 if (!dump_ctx)
3803 return -ENOMEM; 4012 return -ENOMEM;
3804 4013
@@ -3806,7 +4015,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
3806 dump_ctx->ctx = ctx; 4015 dump_ctx->ctx = ctx;
3807 4016
3808 c.data = dump_ctx; 4017 c.data = dump_ctx;
3809 return netlink_dump_start(nlsk, skb, nlh, &c); 4018 return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
3810 } 4019 }
3811 4020
3812 if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) 4021 if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
@@ -3886,7 +4095,7 @@ void *nft_set_elem_init(const struct nft_set *set,
3886 memcpy(nft_set_ext_data(ext), data, set->dlen); 4095 memcpy(nft_set_ext_data(ext), data, set->dlen);
3887 if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) 4096 if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION))
3888 *nft_set_ext_expiration(ext) = 4097 *nft_set_ext_expiration(ext) =
3889 jiffies + timeout; 4098 get_jiffies_64() + timeout;
3890 if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) 4099 if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
3891 *nft_set_ext_timeout(ext) = timeout; 4100 *nft_set_ext_timeout(ext) = timeout;
3892 4101
@@ -3897,12 +4106,24 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
3897 bool destroy_expr) 4106 bool destroy_expr)
3898{ 4107{
3899 struct nft_set_ext *ext = nft_set_elem_ext(set, elem); 4108 struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
4109 struct nft_ctx ctx = {
4110 .net = read_pnet(&set->net),
4111 .family = set->table->family,
4112 };
3900 4113
3901 nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE); 4114 nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);
3902 if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) 4115 if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
3903 nft_data_release(nft_set_ext_data(ext), set->dtype); 4116 nft_data_release(nft_set_ext_data(ext), set->dtype);
3904 if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) 4117 if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) {
3905 nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext)); 4118 struct nft_expr *expr = nft_set_ext_expr(ext);
4119
4120 if (expr->ops->destroy_clone) {
4121 expr->ops->destroy_clone(&ctx, expr);
4122 module_put(expr->ops->type->owner);
4123 } else {
4124 nf_tables_expr_destroy(&ctx, expr);
4125 }
4126 }
3906 if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) 4127 if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
3907 (*nft_set_ext_obj(ext))->use--; 4128 (*nft_set_ext_obj(ext))->use--;
3908 kfree(elem); 4129 kfree(elem);
@@ -3912,12 +4133,13 @@ EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
3912/* Only called from commit path, nft_set_elem_deactivate() already deals with 4133/* Only called from commit path, nft_set_elem_deactivate() already deals with
3913 * the refcounting from the preparation phase. 4134 * the refcounting from the preparation phase.
3914 */ 4135 */
3915static void nf_tables_set_elem_destroy(const struct nft_set *set, void *elem) 4136static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
4137 const struct nft_set *set, void *elem)
3916{ 4138{
3917 struct nft_set_ext *ext = nft_set_elem_ext(set, elem); 4139 struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
3918 4140
3919 if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) 4141 if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
3920 nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext)); 4142 nf_tables_expr_destroy(ctx, nft_set_ext_expr(ext));
3921 kfree(elem); 4143 kfree(elem);
3922} 4144}
3923 4145
@@ -3973,8 +4195,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
3973 if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) { 4195 if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) {
3974 if (!(set->flags & NFT_SET_TIMEOUT)) 4196 if (!(set->flags & NFT_SET_TIMEOUT))
3975 return -EINVAL; 4197 return -EINVAL;
3976 timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64( 4198 err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_TIMEOUT],
3977 nla[NFTA_SET_ELEM_TIMEOUT]))); 4199 &timeout);
4200 if (err)
4201 return err;
3978 } else if (set->flags & NFT_SET_TIMEOUT) { 4202 } else if (set->flags & NFT_SET_TIMEOUT) {
3979 timeout = set->timeout; 4203 timeout = set->timeout;
3980 } 4204 }
@@ -3999,8 +4223,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
3999 err = -EINVAL; 4223 err = -EINVAL;
4000 goto err2; 4224 goto err2;
4001 } 4225 }
4002 obj = nf_tables_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF], 4226 obj = nft_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF],
4003 set->objtype, genmask); 4227 set->objtype, genmask);
4004 if (IS_ERR(obj)) { 4228 if (IS_ERR(obj)) {
4005 err = PTR_ERR(obj); 4229 err = PTR_ERR(obj);
4006 goto err2; 4230 goto err2;
@@ -4035,6 +4259,12 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
4035 d2.type, d2.len); 4259 d2.type, d2.len);
4036 if (err < 0) 4260 if (err < 0)
4037 goto err3; 4261 goto err3;
4262
4263 if (d2.type == NFT_DATA_VERDICT &&
4264 (data.verdict.code == NFT_GOTO ||
4265 data.verdict.code == NFT_JUMP))
4266 nft_validate_state_update(ctx->net,
4267 NFT_VALIDATE_NEED);
4038 } 4268 }
4039 4269
4040 nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len); 4270 nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len);
@@ -4134,12 +4364,13 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
4134 const struct nlattr *attr; 4364 const struct nlattr *attr;
4135 struct nft_set *set; 4365 struct nft_set *set;
4136 struct nft_ctx ctx; 4366 struct nft_ctx ctx;
4137 int rem, err = 0; 4367 int rem, err;
4138 4368
4139 if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) 4369 if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
4140 return -EINVAL; 4370 return -EINVAL;
4141 4371
4142 err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); 4372 err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
4373 genmask);
4143 if (err < 0) 4374 if (err < 0)
4144 return err; 4375 return err;
4145 4376
@@ -4154,9 +4385,13 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
4154 nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { 4385 nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
4155 err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags); 4386 err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags);
4156 if (err < 0) 4387 if (err < 0)
4157 break; 4388 return err;
4158 } 4389 }
4159 return err; 4390
4391 if (net->nft.validate_state == NFT_VALIDATE_DO)
4392 return nft_table_validate(net, ctx.table);
4393
4394 return 0;
4160} 4395}
4161 4396
4162/** 4397/**
@@ -4327,12 +4562,12 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
4327 struct nft_ctx ctx; 4562 struct nft_ctx ctx;
4328 int rem, err = 0; 4563 int rem, err = 0;
4329 4564
4330 err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); 4565 err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
4566 genmask);
4331 if (err < 0) 4567 if (err < 0)
4332 return err; 4568 return err;
4333 4569
4334 set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], 4570 set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
4335 genmask);
4336 if (IS_ERR(set)) 4571 if (IS_ERR(set))
4337 return PTR_ERR(set); 4572 return PTR_ERR(set);
4338 if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) 4573 if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
@@ -4420,13 +4655,13 @@ void nft_unregister_obj(struct nft_object_type *obj_type)
4420} 4655}
4421EXPORT_SYMBOL_GPL(nft_unregister_obj); 4656EXPORT_SYMBOL_GPL(nft_unregister_obj);
4422 4657
4423struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, 4658struct nft_object *nft_obj_lookup(const struct nft_table *table,
4424 const struct nlattr *nla, 4659 const struct nlattr *nla, u32 objtype,
4425 u32 objtype, u8 genmask) 4660 u8 genmask)
4426{ 4661{
4427 struct nft_object *obj; 4662 struct nft_object *obj;
4428 4663
4429 list_for_each_entry(obj, &table->objects, list) { 4664 list_for_each_entry_rcu(obj, &table->objects, list) {
4430 if (!nla_strcmp(nla, obj->name) && 4665 if (!nla_strcmp(nla, obj->name) &&
4431 objtype == obj->ops->type->type && 4666 objtype == obj->ops->type->type &&
4432 nft_active_genmask(obj, genmask)) 4667 nft_active_genmask(obj, genmask))
@@ -4434,11 +4669,11 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
4434 } 4669 }
4435 return ERR_PTR(-ENOENT); 4670 return ERR_PTR(-ENOENT);
4436} 4671}
4437EXPORT_SYMBOL_GPL(nf_tables_obj_lookup); 4672EXPORT_SYMBOL_GPL(nft_obj_lookup);
4438 4673
4439static struct nft_object *nf_tables_obj_lookup_byhandle(const struct nft_table *table, 4674static struct nft_object *nft_obj_lookup_byhandle(const struct nft_table *table,
4440 const struct nlattr *nla, 4675 const struct nlattr *nla,
4441 u32 objtype, u8 genmask) 4676 u32 objtype, u8 genmask)
4442{ 4677{
4443 struct nft_object *obj; 4678 struct nft_object *obj;
4444 4679
@@ -4582,22 +4817,25 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
4582 !nla[NFTA_OBJ_DATA]) 4817 !nla[NFTA_OBJ_DATA])
4583 return -EINVAL; 4818 return -EINVAL;
4584 4819
4585 table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, 4820 table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
4586 genmask); 4821 if (IS_ERR(table)) {
4587 if (IS_ERR(table)) 4822 NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
4588 return PTR_ERR(table); 4823 return PTR_ERR(table);
4824 }
4589 4825
4590 objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); 4826 objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
4591 obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); 4827 obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
4592 if (IS_ERR(obj)) { 4828 if (IS_ERR(obj)) {
4593 err = PTR_ERR(obj); 4829 err = PTR_ERR(obj);
4594 if (err != -ENOENT) 4830 if (err != -ENOENT) {
4831 NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
4595 return err; 4832 return err;
4596 4833 }
4597 } else { 4834 } else {
4598 if (nlh->nlmsg_flags & NLM_F_EXCL) 4835 if (nlh->nlmsg_flags & NLM_F_EXCL) {
4836 NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
4599 return -EEXIST; 4837 return -EEXIST;
4600 4838 }
4601 return 0; 4839 return 0;
4602 } 4840 }
4603 4841
@@ -4632,7 +4870,7 @@ err3:
4632 kfree(obj->name); 4870 kfree(obj->name);
4633err2: 4871err2:
4634 if (obj->ops->destroy) 4872 if (obj->ops->destroy)
4635 obj->ops->destroy(obj); 4873 obj->ops->destroy(&ctx, obj);
4636 kfree(obj); 4874 kfree(obj);
4637err1: 4875err1:
4638 module_put(type->owner); 4876 module_put(type->owner);
@@ -4753,12 +4991,12 @@ nft_obj_filter_alloc(const struct nlattr * const nla[])
4753{ 4991{
4754 struct nft_obj_filter *filter; 4992 struct nft_obj_filter *filter;
4755 4993
4756 filter = kzalloc(sizeof(*filter), GFP_KERNEL); 4994 filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
4757 if (!filter) 4995 if (!filter)
4758 return ERR_PTR(-ENOMEM); 4996 return ERR_PTR(-ENOMEM);
4759 4997
4760 if (nla[NFTA_OBJ_TABLE]) { 4998 if (nla[NFTA_OBJ_TABLE]) {
4761 filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_KERNEL); 4999 filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
4762 if (!filter->table) { 5000 if (!filter->table) {
4763 kfree(filter); 5001 kfree(filter);
4764 return ERR_PTR(-ENOMEM); 5002 return ERR_PTR(-ENOMEM);
@@ -4770,6 +5008,7 @@ nft_obj_filter_alloc(const struct nlattr * const nla[])
4770 return filter; 5008 return filter;
4771} 5009}
4772 5010
5011/* called with rcu_read_lock held */
4773static int nf_tables_getobj(struct net *net, struct sock *nlsk, 5012static int nf_tables_getobj(struct net *net, struct sock *nlsk,
4774 struct sk_buff *skb, const struct nlmsghdr *nlh, 5013 struct sk_buff *skb, const struct nlmsghdr *nlh,
4775 const struct nlattr * const nla[], 5014 const struct nlattr * const nla[],
@@ -4789,6 +5028,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
4789 struct netlink_dump_control c = { 5028 struct netlink_dump_control c = {
4790 .dump = nf_tables_dump_obj, 5029 .dump = nf_tables_dump_obj,
4791 .done = nf_tables_dump_obj_done, 5030 .done = nf_tables_dump_obj_done,
5031 .module = THIS_MODULE,
4792 }; 5032 };
4793 5033
4794 if (nla[NFTA_OBJ_TABLE] || 5034 if (nla[NFTA_OBJ_TABLE] ||
@@ -4801,24 +5041,27 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
4801 5041
4802 c.data = filter; 5042 c.data = filter;
4803 } 5043 }
4804 return netlink_dump_start(nlsk, skb, nlh, &c); 5044 return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
4805 } 5045 }
4806 5046
4807 if (!nla[NFTA_OBJ_NAME] || 5047 if (!nla[NFTA_OBJ_NAME] ||
4808 !nla[NFTA_OBJ_TYPE]) 5048 !nla[NFTA_OBJ_TYPE])
4809 return -EINVAL; 5049 return -EINVAL;
4810 5050
4811 table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, 5051 table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
4812 genmask); 5052 if (IS_ERR(table)) {
4813 if (IS_ERR(table)) 5053 NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
4814 return PTR_ERR(table); 5054 return PTR_ERR(table);
5055 }
4815 5056
4816 objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); 5057 objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
4817 obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); 5058 obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
4818 if (IS_ERR(obj)) 5059 if (IS_ERR(obj)) {
5060 NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
4819 return PTR_ERR(obj); 5061 return PTR_ERR(obj);
5062 }
4820 5063
4821 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 5064 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
4822 if (!skb2) 5065 if (!skb2)
4823 return -ENOMEM; 5066 return -ENOMEM;
4824 5067
@@ -4837,10 +5080,10 @@ err:
4837 return err; 5080 return err;
4838} 5081}
4839 5082
4840static void nft_obj_destroy(struct nft_object *obj) 5083static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
4841{ 5084{
4842 if (obj->ops->destroy) 5085 if (obj->ops->destroy)
4843 obj->ops->destroy(obj); 5086 obj->ops->destroy(ctx, obj);
4844 5087
4845 module_put(obj->ops->type->owner); 5088 module_put(obj->ops->type->owner);
4846 kfree(obj->name); 5089 kfree(obj->name);
@@ -4855,6 +5098,7 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
4855 const struct nfgenmsg *nfmsg = nlmsg_data(nlh); 5098 const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
4856 u8 genmask = nft_genmask_next(net); 5099 u8 genmask = nft_genmask_next(net);
4857 int family = nfmsg->nfgen_family; 5100 int family = nfmsg->nfgen_family;
5101 const struct nlattr *attr;
4858 struct nft_table *table; 5102 struct nft_table *table;
4859 struct nft_object *obj; 5103 struct nft_object *obj;
4860 struct nft_ctx ctx; 5104 struct nft_ctx ctx;
@@ -4864,22 +5108,29 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
4864 (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE])) 5108 (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE]))
4865 return -EINVAL; 5109 return -EINVAL;
4866 5110
4867 table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, 5111 table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
4868 genmask); 5112 if (IS_ERR(table)) {
4869 if (IS_ERR(table)) 5113 NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
4870 return PTR_ERR(table); 5114 return PTR_ERR(table);
5115 }
4871 5116
4872 objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); 5117 objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
4873 if (nla[NFTA_OBJ_HANDLE]) 5118 if (nla[NFTA_OBJ_HANDLE]) {
4874 obj = nf_tables_obj_lookup_byhandle(table, nla[NFTA_OBJ_HANDLE], 5119 attr = nla[NFTA_OBJ_HANDLE];
4875 objtype, genmask); 5120 obj = nft_obj_lookup_byhandle(table, attr, objtype, genmask);
4876 else 5121 } else {
4877 obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], 5122 attr = nla[NFTA_OBJ_NAME];
4878 objtype, genmask); 5123 obj = nft_obj_lookup(table, attr, objtype, genmask);
4879 if (IS_ERR(obj)) 5124 }
5125
5126 if (IS_ERR(obj)) {
5127 NL_SET_BAD_ATTR(extack, attr);
4880 return PTR_ERR(obj); 5128 return PTR_ERR(obj);
4881 if (obj->use > 0) 5129 }
5130 if (obj->use > 0) {
5131 NL_SET_BAD_ATTR(extack, attr);
4882 return -EBUSY; 5132 return -EBUSY;
5133 }
4883 5134
4884 nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); 5135 nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
4885 5136
@@ -4950,24 +5201,23 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = {
4950 [NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 }, 5201 [NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 },
4951}; 5202};
4952 5203
4953struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, 5204struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table,
4954 const struct nlattr *nla, 5205 const struct nlattr *nla, u8 genmask)
4955 u8 genmask)
4956{ 5206{
4957 struct nft_flowtable *flowtable; 5207 struct nft_flowtable *flowtable;
4958 5208
4959 list_for_each_entry(flowtable, &table->flowtables, list) { 5209 list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
4960 if (!nla_strcmp(nla, flowtable->name) && 5210 if (!nla_strcmp(nla, flowtable->name) &&
4961 nft_active_genmask(flowtable, genmask)) 5211 nft_active_genmask(flowtable, genmask))
4962 return flowtable; 5212 return flowtable;
4963 } 5213 }
4964 return ERR_PTR(-ENOENT); 5214 return ERR_PTR(-ENOENT);
4965} 5215}
4966EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup); 5216EXPORT_SYMBOL_GPL(nft_flowtable_lookup);
4967 5217
4968static struct nft_flowtable * 5218static struct nft_flowtable *
4969nf_tables_flowtable_lookup_byhandle(const struct nft_table *table, 5219nft_flowtable_lookup_byhandle(const struct nft_table *table,
4970 const struct nlattr *nla, u8 genmask) 5220 const struct nlattr *nla, u8 genmask)
4971{ 5221{
4972 struct nft_flowtable *flowtable; 5222 struct nft_flowtable *flowtable;
4973 5223
@@ -5066,7 +5316,7 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
5066 flowtable->ops[i].pf = NFPROTO_NETDEV; 5316 flowtable->ops[i].pf = NFPROTO_NETDEV;
5067 flowtable->ops[i].hooknum = hooknum; 5317 flowtable->ops[i].hooknum = hooknum;
5068 flowtable->ops[i].priority = priority; 5318 flowtable->ops[i].priority = priority;
5069 flowtable->ops[i].priv = &flowtable->data.rhashtable; 5319 flowtable->ops[i].priv = &flowtable->data;
5070 flowtable->ops[i].hook = flowtable->data.type->hook; 5320 flowtable->ops[i].hook = flowtable->data.type->hook;
5071 flowtable->ops[i].dev = dev_array[i]; 5321 flowtable->ops[i].dev = dev_array[i];
5072 flowtable->dev_name[i] = kstrdup(dev_array[i]->name, 5322 flowtable->dev_name[i] = kstrdup(dev_array[i]->name,
@@ -5107,23 +5357,6 @@ static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family)
5107 return ERR_PTR(-ENOENT); 5357 return ERR_PTR(-ENOENT);
5108} 5358}
5109 5359
5110void nft_flow_table_iterate(struct net *net,
5111 void (*iter)(struct nf_flowtable *flowtable, void *data),
5112 void *data)
5113{
5114 struct nft_flowtable *flowtable;
5115 const struct nft_table *table;
5116
5117 nfnl_lock(NFNL_SUBSYS_NFTABLES);
5118 list_for_each_entry(table, &net->nft.tables, list) {
5119 list_for_each_entry(flowtable, &table->flowtables, list) {
5120 iter(&flowtable->data, data);
5121 }
5122 }
5123 nfnl_unlock(NFNL_SUBSYS_NFTABLES);
5124}
5125EXPORT_SYMBOL_GPL(nft_flow_table_iterate);
5126
5127static void nft_unregister_flowtable_net_hooks(struct net *net, 5360static void nft_unregister_flowtable_net_hooks(struct net *net,
5128 struct nft_flowtable *flowtable) 5361 struct nft_flowtable *flowtable)
5129{ 5362{
@@ -5157,20 +5390,26 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
5157 !nla[NFTA_FLOWTABLE_HOOK]) 5390 !nla[NFTA_FLOWTABLE_HOOK])
5158 return -EINVAL; 5391 return -EINVAL;
5159 5392
5160 table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], 5393 table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
5161 family, genmask); 5394 genmask);
5162 if (IS_ERR(table)) 5395 if (IS_ERR(table)) {
5396 NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
5163 return PTR_ERR(table); 5397 return PTR_ERR(table);
5398 }
5164 5399
5165 flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], 5400 flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
5166 genmask); 5401 genmask);
5167 if (IS_ERR(flowtable)) { 5402 if (IS_ERR(flowtable)) {
5168 err = PTR_ERR(flowtable); 5403 err = PTR_ERR(flowtable);
5169 if (err != -ENOENT) 5404 if (err != -ENOENT) {
5405 NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
5170 return err; 5406 return err;
5407 }
5171 } else { 5408 } else {
5172 if (nlh->nlmsg_flags & NLM_F_EXCL) 5409 if (nlh->nlmsg_flags & NLM_F_EXCL) {
5410 NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
5173 return -EEXIST; 5411 return -EEXIST;
5412 }
5174 5413
5175 return 0; 5414 return 0;
5176 } 5415 }
@@ -5197,14 +5436,14 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
5197 } 5436 }
5198 5437
5199 flowtable->data.type = type; 5438 flowtable->data.type = type;
5200 err = rhashtable_init(&flowtable->data.rhashtable, type->params); 5439 err = type->init(&flowtable->data);
5201 if (err < 0) 5440 if (err < 0)
5202 goto err3; 5441 goto err3;
5203 5442
5204 err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], 5443 err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK],
5205 flowtable); 5444 flowtable);
5206 if (err < 0) 5445 if (err < 0)
5207 goto err3; 5446 goto err4;
5208 5447
5209 for (i = 0; i < flowtable->ops_len; i++) { 5448 for (i = 0; i < flowtable->ops_len; i++) {
5210 if (!flowtable->ops[i].dev) 5449 if (!flowtable->ops[i].dev)
@@ -5218,37 +5457,35 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
5218 if (flowtable->ops[i].dev == ft->ops[k].dev && 5457 if (flowtable->ops[i].dev == ft->ops[k].dev &&
5219 flowtable->ops[i].pf == ft->ops[k].pf) { 5458 flowtable->ops[i].pf == ft->ops[k].pf) {
5220 err = -EBUSY; 5459 err = -EBUSY;
5221 goto err4; 5460 goto err5;
5222 } 5461 }
5223 } 5462 }
5224 } 5463 }
5225 5464
5226 err = nf_register_net_hook(net, &flowtable->ops[i]); 5465 err = nf_register_net_hook(net, &flowtable->ops[i]);
5227 if (err < 0) 5466 if (err < 0)
5228 goto err4; 5467 goto err5;
5229 } 5468 }
5230 5469
5231 err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); 5470 err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
5232 if (err < 0) 5471 if (err < 0)
5233 goto err5; 5472 goto err6;
5234
5235 INIT_DEFERRABLE_WORK(&flowtable->data.gc_work, type->gc);
5236 queue_delayed_work(system_power_efficient_wq,
5237 &flowtable->data.gc_work, HZ);
5238 5473
5239 list_add_tail_rcu(&flowtable->list, &table->flowtables); 5474 list_add_tail_rcu(&flowtable->list, &table->flowtables);
5240 table->use++; 5475 table->use++;
5241 5476
5242 return 0; 5477 return 0;
5243err5: 5478err6:
5244 i = flowtable->ops_len; 5479 i = flowtable->ops_len;
5245err4: 5480err5:
5246 for (k = i - 1; k >= 0; k--) { 5481 for (k = i - 1; k >= 0; k--) {
5247 kfree(flowtable->dev_name[k]); 5482 kfree(flowtable->dev_name[k]);
5248 nf_unregister_net_hook(net, &flowtable->ops[k]); 5483 nf_unregister_net_hook(net, &flowtable->ops[k]);
5249 } 5484 }
5250 5485
5251 kfree(flowtable->ops); 5486 kfree(flowtable->ops);
5487err4:
5488 flowtable->data.type->free(&flowtable->data);
5252err3: 5489err3:
5253 module_put(type->owner); 5490 module_put(type->owner);
5254err2: 5491err2:
@@ -5268,6 +5505,7 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
5268 u8 genmask = nft_genmask_next(net); 5505 u8 genmask = nft_genmask_next(net);
5269 int family = nfmsg->nfgen_family; 5506 int family = nfmsg->nfgen_family;
5270 struct nft_flowtable *flowtable; 5507 struct nft_flowtable *flowtable;
5508 const struct nlattr *attr;
5271 struct nft_table *table; 5509 struct nft_table *table;
5272 struct nft_ctx ctx; 5510 struct nft_ctx ctx;
5273 5511
@@ -5276,23 +5514,29 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
5276 !nla[NFTA_FLOWTABLE_HANDLE])) 5514 !nla[NFTA_FLOWTABLE_HANDLE]))
5277 return -EINVAL; 5515 return -EINVAL;
5278 5516
5279 table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], 5517 table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
5280 family, genmask); 5518 genmask);
5281 if (IS_ERR(table)) 5519 if (IS_ERR(table)) {
5520 NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
5282 return PTR_ERR(table); 5521 return PTR_ERR(table);
5522 }
5283 5523
5284 if (nla[NFTA_FLOWTABLE_HANDLE]) 5524 if (nla[NFTA_FLOWTABLE_HANDLE]) {
5285 flowtable = nf_tables_flowtable_lookup_byhandle(table, 5525 attr = nla[NFTA_FLOWTABLE_HANDLE];
5286 nla[NFTA_FLOWTABLE_HANDLE], 5526 flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask);
5287 genmask); 5527 } else {
5288 else 5528 attr = nla[NFTA_FLOWTABLE_NAME];
5289 flowtable = nf_tables_flowtable_lookup(table, 5529 flowtable = nft_flowtable_lookup(table, attr, genmask);
5290 nla[NFTA_FLOWTABLE_NAME], 5530 }
5291 genmask); 5531
5292 if (IS_ERR(flowtable)) 5532 if (IS_ERR(flowtable)) {
5293 return PTR_ERR(flowtable); 5533 NL_SET_BAD_ATTR(extack, attr);
5294 if (flowtable->use > 0) 5534 return PTR_ERR(flowtable);
5535 }
5536 if (flowtable->use > 0) {
5537 NL_SET_BAD_ATTR(extack, attr);
5295 return -EBUSY; 5538 return -EBUSY;
5539 }
5296 5540
5297 nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); 5541 nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
5298 5542
@@ -5423,13 +5667,13 @@ nft_flowtable_filter_alloc(const struct nlattr * const nla[])
5423{ 5667{
5424 struct nft_flowtable_filter *filter; 5668 struct nft_flowtable_filter *filter;
5425 5669
5426 filter = kzalloc(sizeof(*filter), GFP_KERNEL); 5670 filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
5427 if (!filter) 5671 if (!filter)
5428 return ERR_PTR(-ENOMEM); 5672 return ERR_PTR(-ENOMEM);
5429 5673
5430 if (nla[NFTA_FLOWTABLE_TABLE]) { 5674 if (nla[NFTA_FLOWTABLE_TABLE]) {
5431 filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE], 5675 filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE],
5432 GFP_KERNEL); 5676 GFP_ATOMIC);
5433 if (!filter->table) { 5677 if (!filter->table) {
5434 kfree(filter); 5678 kfree(filter);
5435 return ERR_PTR(-ENOMEM); 5679 return ERR_PTR(-ENOMEM);
@@ -5438,6 +5682,7 @@ nft_flowtable_filter_alloc(const struct nlattr * const nla[])
5438 return filter; 5682 return filter;
5439} 5683}
5440 5684
5685/* called with rcu_read_lock held */
5441static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, 5686static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
5442 struct sk_buff *skb, 5687 struct sk_buff *skb,
5443 const struct nlmsghdr *nlh, 5688 const struct nlmsghdr *nlh,
@@ -5456,6 +5701,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
5456 struct netlink_dump_control c = { 5701 struct netlink_dump_control c = {
5457 .dump = nf_tables_dump_flowtable, 5702 .dump = nf_tables_dump_flowtable,
5458 .done = nf_tables_dump_flowtable_done, 5703 .done = nf_tables_dump_flowtable_done,
5704 .module = THIS_MODULE,
5459 }; 5705 };
5460 5706
5461 if (nla[NFTA_FLOWTABLE_TABLE]) { 5707 if (nla[NFTA_FLOWTABLE_TABLE]) {
@@ -5467,23 +5713,23 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
5467 5713
5468 c.data = filter; 5714 c.data = filter;
5469 } 5715 }
5470 return netlink_dump_start(nlsk, skb, nlh, &c); 5716 return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
5471 } 5717 }
5472 5718
5473 if (!nla[NFTA_FLOWTABLE_NAME]) 5719 if (!nla[NFTA_FLOWTABLE_NAME])
5474 return -EINVAL; 5720 return -EINVAL;
5475 5721
5476 table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], 5722 table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
5477 family, genmask); 5723 genmask);
5478 if (IS_ERR(table)) 5724 if (IS_ERR(table))
5479 return PTR_ERR(table); 5725 return PTR_ERR(table);
5480 5726
5481 flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], 5727 flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
5482 genmask); 5728 genmask);
5483 if (IS_ERR(flowtable)) 5729 if (IS_ERR(flowtable))
5484 return PTR_ERR(flowtable); 5730 return PTR_ERR(flowtable);
5485 5731
5486 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 5732 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
5487 if (!skb2) 5733 if (!skb2)
5488 return -ENOMEM; 5734 return -ENOMEM;
5489 5735
@@ -5532,11 +5778,9 @@ err:
5532 5778
5533static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) 5779static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
5534{ 5780{
5535 cancel_delayed_work_sync(&flowtable->data.gc_work);
5536 kfree(flowtable->ops); 5781 kfree(flowtable->ops);
5537 kfree(flowtable->name); 5782 kfree(flowtable->name);
5538 flowtable->data.type->free(&flowtable->data); 5783 flowtable->data.type->free(&flowtable->data);
5539 rhashtable_destroy(&flowtable->data.rhashtable);
5540 module_put(flowtable->data.type->owner); 5784 module_put(flowtable->data.type->owner);
5541} 5785}
5542 5786
@@ -5649,7 +5893,7 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk,
5649 struct sk_buff *skb2; 5893 struct sk_buff *skb2;
5650 int err; 5894 int err;
5651 5895
5652 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 5896 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
5653 if (skb2 == NULL) 5897 if (skb2 == NULL)
5654 return -ENOMEM; 5898 return -ENOMEM;
5655 5899
@@ -5671,7 +5915,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
5671 .policy = nft_table_policy, 5915 .policy = nft_table_policy,
5672 }, 5916 },
5673 [NFT_MSG_GETTABLE] = { 5917 [NFT_MSG_GETTABLE] = {
5674 .call = nf_tables_gettable, 5918 .call_rcu = nf_tables_gettable,
5675 .attr_count = NFTA_TABLE_MAX, 5919 .attr_count = NFTA_TABLE_MAX,
5676 .policy = nft_table_policy, 5920 .policy = nft_table_policy,
5677 }, 5921 },
@@ -5686,7 +5930,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
5686 .policy = nft_chain_policy, 5930 .policy = nft_chain_policy,
5687 }, 5931 },
5688 [NFT_MSG_GETCHAIN] = { 5932 [NFT_MSG_GETCHAIN] = {
5689 .call = nf_tables_getchain, 5933 .call_rcu = nf_tables_getchain,
5690 .attr_count = NFTA_CHAIN_MAX, 5934 .attr_count = NFTA_CHAIN_MAX,
5691 .policy = nft_chain_policy, 5935 .policy = nft_chain_policy,
5692 }, 5936 },
@@ -5701,7 +5945,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
5701 .policy = nft_rule_policy, 5945 .policy = nft_rule_policy,
5702 }, 5946 },
5703 [NFT_MSG_GETRULE] = { 5947 [NFT_MSG_GETRULE] = {
5704 .call = nf_tables_getrule, 5948 .call_rcu = nf_tables_getrule,
5705 .attr_count = NFTA_RULE_MAX, 5949 .attr_count = NFTA_RULE_MAX,
5706 .policy = nft_rule_policy, 5950 .policy = nft_rule_policy,
5707 }, 5951 },
@@ -5716,7 +5960,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
5716 .policy = nft_set_policy, 5960 .policy = nft_set_policy,
5717 }, 5961 },
5718 [NFT_MSG_GETSET] = { 5962 [NFT_MSG_GETSET] = {
5719 .call = nf_tables_getset, 5963 .call_rcu = nf_tables_getset,
5720 .attr_count = NFTA_SET_MAX, 5964 .attr_count = NFTA_SET_MAX,
5721 .policy = nft_set_policy, 5965 .policy = nft_set_policy,
5722 }, 5966 },
@@ -5731,7 +5975,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
5731 .policy = nft_set_elem_list_policy, 5975 .policy = nft_set_elem_list_policy,
5732 }, 5976 },
5733 [NFT_MSG_GETSETELEM] = { 5977 [NFT_MSG_GETSETELEM] = {
5734 .call = nf_tables_getsetelem, 5978 .call_rcu = nf_tables_getsetelem,
5735 .attr_count = NFTA_SET_ELEM_LIST_MAX, 5979 .attr_count = NFTA_SET_ELEM_LIST_MAX,
5736 .policy = nft_set_elem_list_policy, 5980 .policy = nft_set_elem_list_policy,
5737 }, 5981 },
@@ -5741,7 +5985,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
5741 .policy = nft_set_elem_list_policy, 5985 .policy = nft_set_elem_list_policy,
5742 }, 5986 },
5743 [NFT_MSG_GETGEN] = { 5987 [NFT_MSG_GETGEN] = {
5744 .call = nf_tables_getgen, 5988 .call_rcu = nf_tables_getgen,
5745 }, 5989 },
5746 [NFT_MSG_NEWOBJ] = { 5990 [NFT_MSG_NEWOBJ] = {
5747 .call_batch = nf_tables_newobj, 5991 .call_batch = nf_tables_newobj,
@@ -5749,7 +5993,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
5749 .policy = nft_obj_policy, 5993 .policy = nft_obj_policy,
5750 }, 5994 },
5751 [NFT_MSG_GETOBJ] = { 5995 [NFT_MSG_GETOBJ] = {
5752 .call = nf_tables_getobj, 5996 .call_rcu = nf_tables_getobj,
5753 .attr_count = NFTA_OBJ_MAX, 5997 .attr_count = NFTA_OBJ_MAX,
5754 .policy = nft_obj_policy, 5998 .policy = nft_obj_policy,
5755 }, 5999 },
@@ -5759,7 +6003,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
5759 .policy = nft_obj_policy, 6003 .policy = nft_obj_policy,
5760 }, 6004 },
5761 [NFT_MSG_GETOBJ_RESET] = { 6005 [NFT_MSG_GETOBJ_RESET] = {
5762 .call = nf_tables_getobj, 6006 .call_rcu = nf_tables_getobj,
5763 .attr_count = NFTA_OBJ_MAX, 6007 .attr_count = NFTA_OBJ_MAX,
5764 .policy = nft_obj_policy, 6008 .policy = nft_obj_policy,
5765 }, 6009 },
@@ -5769,7 +6013,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
5769 .policy = nft_flowtable_policy, 6013 .policy = nft_flowtable_policy,
5770 }, 6014 },
5771 [NFT_MSG_GETFLOWTABLE] = { 6015 [NFT_MSG_GETFLOWTABLE] = {
5772 .call = nf_tables_getflowtable, 6016 .call_rcu = nf_tables_getflowtable,
5773 .attr_count = NFTA_FLOWTABLE_MAX, 6017 .attr_count = NFTA_FLOWTABLE_MAX,
5774 .policy = nft_flowtable_policy, 6018 .policy = nft_flowtable_policy,
5775 }, 6019 },
@@ -5780,12 +6024,41 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
5780 }, 6024 },
5781}; 6025};
5782 6026
6027static int nf_tables_validate(struct net *net)
6028{
6029 struct nft_table *table;
6030
6031 switch (net->nft.validate_state) {
6032 case NFT_VALIDATE_SKIP:
6033 break;
6034 case NFT_VALIDATE_NEED:
6035 nft_validate_state_update(net, NFT_VALIDATE_DO);
6036 /* fall through */
6037 case NFT_VALIDATE_DO:
6038 list_for_each_entry(table, &net->nft.tables, list) {
6039 if (nft_table_validate(net, table) < 0)
6040 return -EAGAIN;
6041 }
6042 break;
6043 }
6044
6045 return 0;
6046}
6047
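The function above is driven by a per-netns three-state latch: writes that may introduce jump/goto loops flip it to NFT_VALIDATE_NEED, and the commit path then re-validates every table before publishing the new generation. A tiny userspace sketch of the same latch, assuming hypothetical mark_needs_validation()/ruleset_validate() helpers rather than the kernel API:

#include <stdio.h>

enum validate_state { VALIDATE_SKIP, VALIDATE_NEED, VALIDATE_DO };

static enum validate_state state = VALIDATE_SKIP;

/* Called whenever a change might introduce a jump/goto loop. */
static void mark_needs_validation(void) { state = VALIDATE_NEED; }

/* Called once per batch commit; returns 0 on success, -1 to roll back. */
static int ruleset_validate(int (*validate_table)(int idx), int ntables)
{
    switch (state) {
    case VALIDATE_SKIP:
        break;
    case VALIDATE_NEED:
        state = VALIDATE_DO;
        /* fall through */
    case VALIDATE_DO:
        for (int i = 0; i < ntables; i++)
            if (validate_table(i) < 0)
                return -1;
        break;
    }
    return 0;
}

static int ok(int idx) { (void)idx; return 0; }

int main(void)
{
    mark_needs_validation();
    printf("validate: %d\n", ruleset_validate(ok, 3)); /* prints 0 */
    return 0;
}
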
5783static void nft_chain_commit_update(struct nft_trans *trans) 6048static void nft_chain_commit_update(struct nft_trans *trans)
5784{ 6049{
5785 struct nft_base_chain *basechain; 6050 struct nft_base_chain *basechain;
5786 6051
5787 if (nft_trans_chain_name(trans)) 6052 if (nft_trans_chain_name(trans)) {
6053 rhltable_remove(&trans->ctx.table->chains_ht,
6054 &trans->ctx.chain->rhlhead,
6055 nft_chain_ht_params);
5788 swap(trans->ctx.chain->name, nft_trans_chain_name(trans)); 6056 swap(trans->ctx.chain->name, nft_trans_chain_name(trans));
6057 rhltable_insert_key(&trans->ctx.table->chains_ht,
6058 trans->ctx.chain->name,
6059 &trans->ctx.chain->rhlhead,
6060 nft_chain_ht_params);
6061 }
5789 6062
5790 if (!nft_is_base_chain(trans->ctx.chain)) 6063 if (!nft_is_base_chain(trans->ctx.chain))
5791 return; 6064 return;
@@ -5817,11 +6090,12 @@ static void nft_commit_release(struct nft_trans *trans)
5817 nft_set_destroy(nft_trans_set(trans)); 6090 nft_set_destroy(nft_trans_set(trans));
5818 break; 6091 break;
5819 case NFT_MSG_DELSETELEM: 6092 case NFT_MSG_DELSETELEM:
5820 nf_tables_set_elem_destroy(nft_trans_elem_set(trans), 6093 nf_tables_set_elem_destroy(&trans->ctx,
6094 nft_trans_elem_set(trans),
5821 nft_trans_elem(trans).priv); 6095 nft_trans_elem(trans).priv);
5822 break; 6096 break;
5823 case NFT_MSG_DELOBJ: 6097 case NFT_MSG_DELOBJ:
5824 nft_obj_destroy(nft_trans_obj(trans)); 6098 nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
5825 break; 6099 break;
5826 case NFT_MSG_DELFLOWTABLE: 6100 case NFT_MSG_DELFLOWTABLE:
5827 nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); 6101 nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
@@ -5845,21 +6119,175 @@ static void nf_tables_commit_release(struct net *net)
5845 } 6119 }
5846} 6120}
5847 6121
6122static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain)
6123{
6124 struct nft_rule *rule;
6125 unsigned int alloc = 0;
6126 int i;
6127
6128 /* already handled or inactive chain? */
6129 if (chain->rules_next || !nft_is_active_next(net, chain))
6130 return 0;
6131
6132 rule = list_entry(&chain->rules, struct nft_rule, list);
6133 i = 0;
6134
6135 list_for_each_entry_continue(rule, &chain->rules, list) {
6136 if (nft_is_active_next(net, rule))
6137 alloc++;
6138 }
6139
6140 chain->rules_next = nf_tables_chain_alloc_rules(chain, alloc);
6141 if (!chain->rules_next)
6142 return -ENOMEM;
6143
6144 list_for_each_entry_continue(rule, &chain->rules, list) {
6145 if (nft_is_active_next(net, rule))
6146 chain->rules_next[i++] = rule;
6147 }
6148
6149 chain->rules_next[i] = NULL;
6150 return 0;
6151}
6152
6153static void nf_tables_commit_chain_prepare_cancel(struct net *net)
6154{
6155 struct nft_trans *trans, *next;
6156
6157 list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
6158 struct nft_chain *chain = trans->ctx.chain;
6159
6160 if (trans->msg_type == NFT_MSG_NEWRULE ||
6161 trans->msg_type == NFT_MSG_DELRULE) {
6162 kvfree(chain->rules_next);
6163 chain->rules_next = NULL;
6164 }
6165 }
6166}
6167
6168static void __nf_tables_commit_chain_free_rules_old(struct rcu_head *h)
6169{
6170 struct nft_rules_old *o = container_of(h, struct nft_rules_old, h);
6171
6172 kvfree(o->start);
6173}
6174
6175static void nf_tables_commit_chain_free_rules_old(struct nft_rule **rules)
6176{
6177 struct nft_rule **r = rules;
6178 struct nft_rules_old *old;
6179
6180 while (*r)
6181 r++;
6182
6183 r++; /* rcu_head is after end marker */
6184 old = (void *) r;
6185 old->start = rules;
6186
6187 call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old);
6188}
6189
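The helper above relies on the rules array being allocated with spare room behind its NULL end marker, so the bookkeeping needed to free the old generation travels with the array itself and no extra allocation can fail at commit time. A minimal userspace sketch of that layout trick; the struct names and the plain free() standing in for call_rcu() are illustrative:

#include <stdio.h>
#include <stdlib.h>

struct rule { int id; };

/* Bookkeeping stored past the NULL end marker of the rules array. */
struct rules_old {
    struct rule **start;    /* what to release once readers are done */
};

/* Allocate room for n rules, the NULL terminator and the trailer. */
static struct rule **alloc_rules(unsigned int n)
{
    size_t sz = (n + 1) * sizeof(struct rule *) + sizeof(struct rules_old);

    return malloc(sz);
}

/* Walk to the end marker, step past it and fill in the trailer; the
 * kernel hands the trailer to call_rcu(), here we just free directly. */
static void retire_rules(struct rule **rules)
{
    struct rule **r = rules;
    struct rules_old *old;

    while (*r)
        r++;
    r++;                    /* trailer lives after the NULL marker */
    old = (struct rules_old *)r;
    old->start = rules;
    free(old->start);       /* stands in for the RCU callback */
}

int main(void)
{
    struct rule a = { 1 }, b = { 2 };
    struct rule **rules = alloc_rules(2);

    if (!rules)
        return 1;
    rules[0] = &a;
    rules[1] = &b;
    rules[2] = NULL;
    retire_rules(rules);
    puts("old generation released");
    return 0;
}
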
6190static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *chain)
6191{
6192 struct nft_rule **g0, **g1;
6193 bool next_genbit;
6194
6195 next_genbit = nft_gencursor_next(net);
6196
6197 g0 = rcu_dereference_protected(chain->rules_gen_0,
6198 lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
6199 g1 = rcu_dereference_protected(chain->rules_gen_1,
6200 lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
6201
6202 /* No changes to this chain? */
6203 if (chain->rules_next == NULL) {
6204 /* chain had no change in last or next generation */
6205 if (g0 == g1)
6206 return;
6207 /*
6208 * chain had no change in this generation; make sure next
6209 * one uses same rules as current generation.
6210 */
6211 if (next_genbit) {
6212 rcu_assign_pointer(chain->rules_gen_1, g0);
6213 nf_tables_commit_chain_free_rules_old(g1);
6214 } else {
6215 rcu_assign_pointer(chain->rules_gen_0, g1);
6216 nf_tables_commit_chain_free_rules_old(g0);
6217 }
6218
6219 return;
6220 }
6221
6222 if (next_genbit)
6223 rcu_assign_pointer(chain->rules_gen_1, chain->rules_next);
6224 else
6225 rcu_assign_pointer(chain->rules_gen_0, chain->rules_next);
6226
6227 chain->rules_next = NULL;
6228
6229 if (g0 == g1)
6230 return;
6231
6232 if (next_genbit)
6233 nf_tables_commit_chain_free_rules_old(g1);
6234 else
6235 nf_tables_commit_chain_free_rules_old(g0);
6236}
6237
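nf_tables_commit_chain_active() above is a double-buffer swap keyed on the generation bit: the slot the next generation will read gets the staged array, while the current slot stays untouched for in-flight lookups. A stripped-down model of that swap, with plain assignments standing in for rcu_assign_pointer() and the freeing of the superseded array omitted:

#include <stdio.h>

struct chain {
    const char **rules_gen_0;   /* array seen when gencursor == 0 */
    const char **rules_gen_1;   /* array seen when gencursor == 1 */
    const char **rules_next;    /* staged array for the next generation */
};

/* Point the next-generation slot at the staged array; the slot packets
 * are currently reading is left alone. */
static void publish_rules(struct chain *c, int next_genbit)
{
    if (!c->rules_next)
        return;
    if (next_genbit)
        c->rules_gen_1 = c->rules_next;
    else
        c->rules_gen_0 = c->rules_next;
    c->rules_next = NULL;
}

int main(void)
{
    static const char *cur[] = { "rule-a", NULL };
    static const char *next[] = { "rule-a", "rule-b", NULL };
    struct chain c = { cur, cur, next };

    publish_rules(&c, 1);       /* next generation reads gen_1 */
    printf("gen_0[0]=%s gen_1[1]=%s\n", c.rules_gen_0[0], c.rules_gen_1[1]);
    return 0;
}
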
6238static void nft_chain_del(struct nft_chain *chain)
6239{
6240 struct nft_table *table = chain->table;
6241
6242 WARN_ON_ONCE(rhltable_remove(&table->chains_ht, &chain->rhlhead,
6243 nft_chain_ht_params));
6244 list_del_rcu(&chain->list);
6245}
6246
5848static int nf_tables_commit(struct net *net, struct sk_buff *skb) 6247static int nf_tables_commit(struct net *net, struct sk_buff *skb)
5849{ 6248{
5850 struct nft_trans *trans, *next; 6249 struct nft_trans *trans, *next;
5851 struct nft_trans_elem *te; 6250 struct nft_trans_elem *te;
6251 struct nft_chain *chain;
6252 struct nft_table *table;
5852 6253
5853 /* Bump generation counter, invalidate any dump in progress */ 6254 /* 0. Validate ruleset, otherwise roll back for error reporting. */
5854 while (++net->nft.base_seq == 0); 6255 if (nf_tables_validate(net) < 0)
6256 return -EAGAIN;
5855 6257
5856 /* A new generation has just started */ 6258 /* 1. Allocate space for next generation rules_gen_X[] */
5857 net->nft.gencursor = nft_gencursor_next(net); 6259 list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
6260 int ret;
6261
6262 if (trans->msg_type == NFT_MSG_NEWRULE ||
6263 trans->msg_type == NFT_MSG_DELRULE) {
6264 chain = trans->ctx.chain;
6265
6266 ret = nf_tables_commit_chain_prepare(net, chain);
6267 if (ret < 0) {
6268 nf_tables_commit_chain_prepare_cancel(net);
6269 return ret;
6270 }
6271 }
6272 }
6273
6274 /* step 2. Make rules_gen_X visible to packet path */
6275 list_for_each_entry(table, &net->nft.tables, list) {
6276 list_for_each_entry(chain, &table->chains, list) {
6277 if (!nft_is_active_next(net, chain))
6278 continue;
6279 nf_tables_commit_chain_active(net, chain);
6280 }
6281 }
5858 6282
5859 /* Make sure all packets have left the previous generation before 6283 /*
5860 * purging old rules. 6284 * Bump generation counter, invalidate any dump in progress.
6285 * Cannot fail after this point.
5861 */ 6286 */
5862 synchronize_rcu(); 6287 while (++net->nft.base_seq == 0);
6288
6289 /* step 3. Start new generation, rules_gen_X now in use. */
6290 net->nft.gencursor = nft_gencursor_next(net);
5863 6291
5864 list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { 6292 list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
5865 switch (trans->msg_type) { 6293 switch (trans->msg_type) {
@@ -5890,7 +6318,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
5890 nft_trans_destroy(trans); 6318 nft_trans_destroy(trans);
5891 break; 6319 break;
5892 case NFT_MSG_DELCHAIN: 6320 case NFT_MSG_DELCHAIN:
5893 list_del_rcu(&trans->ctx.chain->list); 6321 nft_chain_del(trans->ctx.chain);
5894 nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); 6322 nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN);
5895 nf_tables_unregister_hook(trans->ctx.net, 6323 nf_tables_unregister_hook(trans->ctx.net,
5896 trans->ctx.table, 6324 trans->ctx.table,
@@ -6001,7 +6429,7 @@ static void nf_tables_abort_release(struct nft_trans *trans)
6001 nft_trans_elem(trans).priv, true); 6429 nft_trans_elem(trans).priv, true);
6002 break; 6430 break;
6003 case NFT_MSG_NEWOBJ: 6431 case NFT_MSG_NEWOBJ:
6004 nft_obj_destroy(nft_trans_obj(trans)); 6432 nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
6005 break; 6433 break;
6006 case NFT_MSG_NEWFLOWTABLE: 6434 case NFT_MSG_NEWFLOWTABLE:
6007 nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); 6435 nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
@@ -6041,7 +6469,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
6041 nft_trans_destroy(trans); 6469 nft_trans_destroy(trans);
6042 } else { 6470 } else {
6043 trans->ctx.table->use--; 6471 trans->ctx.table->use--;
6044 list_del_rcu(&trans->ctx.chain->list); 6472 nft_chain_del(trans->ctx.chain);
6045 nf_tables_unregister_hook(trans->ctx.net, 6473 nf_tables_unregister_hook(trans->ctx.net,
6046 trans->ctx.table, 6474 trans->ctx.table,
6047 trans->ctx.chain); 6475 trans->ctx.chain);
@@ -6121,6 +6549,11 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
6121 return 0; 6549 return 0;
6122} 6550}
6123 6551
6552static void nf_tables_cleanup(struct net *net)
6553{
6554 nft_validate_state_update(net, NFT_VALIDATE_SKIP);
6555}
6556
6124static bool nf_tables_valid_genid(struct net *net, u32 genid) 6557static bool nf_tables_valid_genid(struct net *net, u32 genid)
6125{ 6558{
6126 return net->nft.base_seq == genid; 6559 return net->nft.base_seq == genid;
@@ -6133,6 +6566,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
6133 .cb = nf_tables_cb, 6566 .cb = nf_tables_cb,
6134 .commit = nf_tables_commit, 6567 .commit = nf_tables_commit,
6135 .abort = nf_tables_abort, 6568 .abort = nf_tables_abort,
6569 .cleanup = nf_tables_cleanup,
6136 .valid_genid = nf_tables_valid_genid, 6570 .valid_genid = nf_tables_valid_genid,
6137}; 6571};
6138 6572
@@ -6216,19 +6650,18 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
6216 6650
6217 list_for_each_entry(rule, &chain->rules, list) { 6651 list_for_each_entry(rule, &chain->rules, list) {
6218 nft_rule_for_each_expr(expr, last, rule) { 6652 nft_rule_for_each_expr(expr, last, rule) {
6219 const struct nft_data *data = NULL; 6653 struct nft_immediate_expr *priv;
6654 const struct nft_data *data;
6220 int err; 6655 int err;
6221 6656
6222 if (!expr->ops->validate) 6657 if (strcmp(expr->ops->type->name, "immediate"))
6223 continue; 6658 continue;
6224 6659
6225 err = expr->ops->validate(ctx, expr, &data); 6660 priv = nft_expr_priv(expr);
6226 if (err < 0) 6661 if (priv->dreg != NFT_REG_VERDICT)
6227 return err;
6228
6229 if (data == NULL)
6230 continue; 6662 continue;
6231 6663
6664 data = &priv->data;
6232 switch (data->verdict.code) { 6665 switch (data->verdict.code) {
6233 case NFT_JUMP: 6666 case NFT_JUMP:
6234 case NFT_GOTO: 6667 case NFT_GOTO:
@@ -6461,8 +6894,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
6461 case NFT_GOTO: 6894 case NFT_GOTO:
6462 if (!tb[NFTA_VERDICT_CHAIN]) 6895 if (!tb[NFTA_VERDICT_CHAIN])
6463 return -EINVAL; 6896 return -EINVAL;
6464 chain = nf_tables_chain_lookup(ctx->table, 6897 chain = nft_chain_lookup(ctx->table, tb[NFTA_VERDICT_CHAIN],
6465 tb[NFTA_VERDICT_CHAIN], genmask); 6898 genmask);
6466 if (IS_ERR(chain)) 6899 if (IS_ERR(chain))
6467 return PTR_ERR(chain); 6900 return PTR_ERR(chain);
6468 if (nft_is_base_chain(chain)) 6901 if (nft_is_base_chain(chain))
@@ -6638,7 +7071,7 @@ int __nft_release_basechain(struct nft_ctx *ctx)
6638 ctx->chain->use--; 7071 ctx->chain->use--;
6639 nf_tables_rule_release(ctx, rule); 7072 nf_tables_rule_release(ctx, rule);
6640 } 7073 }
6641 list_del(&ctx->chain->list); 7074 nft_chain_del(ctx->chain);
6642 ctx->table->use--; 7075 ctx->table->use--;
6643 nf_tables_chain_destroy(ctx); 7076 nf_tables_chain_destroy(ctx);
6644 7077
@@ -6690,11 +7123,11 @@ static void __nft_release_tables(struct net *net)
6690 list_for_each_entry_safe(obj, ne, &table->objects, list) { 7123 list_for_each_entry_safe(obj, ne, &table->objects, list) {
6691 list_del(&obj->list); 7124 list_del(&obj->list);
6692 table->use--; 7125 table->use--;
6693 nft_obj_destroy(obj); 7126 nft_obj_destroy(&ctx, obj);
6694 } 7127 }
6695 list_for_each_entry_safe(chain, nc, &table->chains, list) { 7128 list_for_each_entry_safe(chain, nc, &table->chains, list) {
6696 ctx.chain = chain; 7129 ctx.chain = chain;
6697 list_del(&chain->list); 7130 nft_chain_del(chain);
6698 table->use--; 7131 table->use--;
6699 nf_tables_chain_destroy(&ctx); 7132 nf_tables_chain_destroy(&ctx);
6700 } 7133 }
@@ -6708,6 +7141,8 @@ static int __net_init nf_tables_init_net(struct net *net)
6708 INIT_LIST_HEAD(&net->nft.tables); 7141 INIT_LIST_HEAD(&net->nft.tables);
6709 INIT_LIST_HEAD(&net->nft.commit_list); 7142 INIT_LIST_HEAD(&net->nft.commit_list);
6710 net->nft.base_seq = 1; 7143 net->nft.base_seq = 1;
7144 net->nft.validate_state = NFT_VALIDATE_SKIP;
7145
6711 return 0; 7146 return 0;
6712} 7147}
6713 7148
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 40e744572283..deff10adef9c 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -23,25 +23,9 @@
23#include <net/netfilter/nf_tables.h> 23#include <net/netfilter/nf_tables.h>
24#include <net/netfilter/nf_log.h> 24#include <net/netfilter/nf_log.h>
25 25
26static const char *const comments[__NFT_TRACETYPE_MAX] = {
27 [NFT_TRACETYPE_POLICY] = "policy",
28 [NFT_TRACETYPE_RETURN] = "return",
29 [NFT_TRACETYPE_RULE] = "rule",
30};
31
32static const struct nf_loginfo trace_loginfo = {
33 .type = NF_LOG_TYPE_LOG,
34 .u = {
35 .log = {
36 .level = LOGLEVEL_WARNING,
37 .logflags = NF_LOG_DEFAULT_MASK,
38 },
39 },
40};
41
42static noinline void __nft_trace_packet(struct nft_traceinfo *info, 26static noinline void __nft_trace_packet(struct nft_traceinfo *info,
43 const struct nft_chain *chain, 27 const struct nft_chain *chain,
44 int rulenum, enum nft_trace_types type) 28 enum nft_trace_types type)
45{ 29{
46 const struct nft_pktinfo *pkt = info->pkt; 30 const struct nft_pktinfo *pkt = info->pkt;
47 31
@@ -52,22 +36,16 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info,
52 info->type = type; 36 info->type = type;
53 37
54 nft_trace_notify(info); 38 nft_trace_notify(info);
55
56 nf_log_trace(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb,
57 nft_in(pkt), nft_out(pkt), &trace_loginfo,
58 "TRACE: %s:%s:%s:%u ",
59 chain->table->name, chain->name, comments[type], rulenum);
60} 39}
61 40
62static inline void nft_trace_packet(struct nft_traceinfo *info, 41static inline void nft_trace_packet(struct nft_traceinfo *info,
63 const struct nft_chain *chain, 42 const struct nft_chain *chain,
64 const struct nft_rule *rule, 43 const struct nft_rule *rule,
65 int rulenum,
66 enum nft_trace_types type) 44 enum nft_trace_types type)
67{ 45{
68 if (static_branch_unlikely(&nft_trace_enabled)) { 46 if (static_branch_unlikely(&nft_trace_enabled)) {
69 info->rule = rule; 47 info->rule = rule;
70 __nft_trace_packet(info, chain, rulenum, type); 48 __nft_trace_packet(info, chain, type);
71 } 49 }
72} 50}
73 51
@@ -139,8 +117,7 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain,
139 117
140struct nft_jumpstack { 118struct nft_jumpstack {
141 const struct nft_chain *chain; 119 const struct nft_chain *chain;
142 const struct nft_rule *rule; 120 struct nft_rule *const *rules;
143 int rulenum;
144}; 121};
145 122
146unsigned int 123unsigned int
@@ -148,31 +125,29 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv)
148{ 125{
149 const struct nft_chain *chain = priv, *basechain = chain; 126 const struct nft_chain *chain = priv, *basechain = chain;
150 const struct net *net = nft_net(pkt); 127 const struct net *net = nft_net(pkt);
128 struct nft_rule *const *rules;
151 const struct nft_rule *rule; 129 const struct nft_rule *rule;
152 const struct nft_expr *expr, *last; 130 const struct nft_expr *expr, *last;
153 struct nft_regs regs; 131 struct nft_regs regs;
154 unsigned int stackptr = 0; 132 unsigned int stackptr = 0;
155 struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; 133 struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
156 int rulenum; 134 bool genbit = READ_ONCE(net->nft.gencursor);
157 unsigned int gencursor = nft_genmask_cur(net);
158 struct nft_traceinfo info; 135 struct nft_traceinfo info;
159 136
160 info.trace = false; 137 info.trace = false;
161 if (static_branch_unlikely(&nft_trace_enabled)) 138 if (static_branch_unlikely(&nft_trace_enabled))
162 nft_trace_init(&info, pkt, &regs.verdict, basechain); 139 nft_trace_init(&info, pkt, &regs.verdict, basechain);
163do_chain: 140do_chain:
164 rulenum = 0; 141 if (genbit)
165 rule = list_entry(&chain->rules, struct nft_rule, list); 142 rules = rcu_dereference(chain->rules_gen_1);
143 else
144 rules = rcu_dereference(chain->rules_gen_0);
145
166next_rule: 146next_rule:
147 rule = *rules;
167 regs.verdict.code = NFT_CONTINUE; 148 regs.verdict.code = NFT_CONTINUE;
168 list_for_each_entry_continue_rcu(rule, &chain->rules, list) { 149 for (; *rules ; rules++) {
169 150 rule = *rules;
170 /* This rule is not active, skip. */
171 if (unlikely(rule->genmask & gencursor))
172 continue;
173
174 rulenum++;
175
176 nft_rule_for_each_expr(expr, last, rule) { 151 nft_rule_for_each_expr(expr, last, rule) {
177 if (expr->ops == &nft_cmp_fast_ops) 152 if (expr->ops == &nft_cmp_fast_ops)
178 nft_cmp_fast_eval(expr, &regs); 153 nft_cmp_fast_eval(expr, &regs);
@@ -190,7 +165,7 @@ next_rule:
190 continue; 165 continue;
191 case NFT_CONTINUE: 166 case NFT_CONTINUE:
192 nft_trace_packet(&info, chain, rule, 167 nft_trace_packet(&info, chain, rule,
193 rulenum, NFT_TRACETYPE_RULE); 168 NFT_TRACETYPE_RULE);
194 continue; 169 continue;
195 } 170 }
196 break; 171 break;
@@ -202,7 +177,7 @@ next_rule:
202 case NF_QUEUE: 177 case NF_QUEUE:
203 case NF_STOLEN: 178 case NF_STOLEN:
204 nft_trace_packet(&info, chain, rule, 179 nft_trace_packet(&info, chain, rule,
205 rulenum, NFT_TRACETYPE_RULE); 180 NFT_TRACETYPE_RULE);
206 return regs.verdict.code; 181 return regs.verdict.code;
207 } 182 }
208 183
@@ -210,22 +185,20 @@ next_rule:
210 case NFT_JUMP: 185 case NFT_JUMP:
211 BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE); 186 BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE);
212 jumpstack[stackptr].chain = chain; 187 jumpstack[stackptr].chain = chain;
213 jumpstack[stackptr].rule = rule; 188 jumpstack[stackptr].rules = rules + 1;
214 jumpstack[stackptr].rulenum = rulenum;
215 stackptr++; 189 stackptr++;
216 /* fall through */ 190 /* fall through */
217 case NFT_GOTO: 191 case NFT_GOTO:
218 nft_trace_packet(&info, chain, rule, 192 nft_trace_packet(&info, chain, rule,
219 rulenum, NFT_TRACETYPE_RULE); 193 NFT_TRACETYPE_RULE);
220 194
221 chain = regs.verdict.chain; 195 chain = regs.verdict.chain;
222 goto do_chain; 196 goto do_chain;
223 case NFT_CONTINUE: 197 case NFT_CONTINUE:
224 rulenum++;
225 /* fall through */ 198 /* fall through */
226 case NFT_RETURN: 199 case NFT_RETURN:
227 nft_trace_packet(&info, chain, rule, 200 nft_trace_packet(&info, chain, rule,
228 rulenum, NFT_TRACETYPE_RETURN); 201 NFT_TRACETYPE_RETURN);
229 break; 202 break;
230 default: 203 default:
231 WARN_ON(1); 204 WARN_ON(1);
@@ -234,13 +207,11 @@ next_rule:
234 if (stackptr > 0) { 207 if (stackptr > 0) {
235 stackptr--; 208 stackptr--;
236 chain = jumpstack[stackptr].chain; 209 chain = jumpstack[stackptr].chain;
237 rule = jumpstack[stackptr].rule; 210 rules = jumpstack[stackptr].rules;
238 rulenum = jumpstack[stackptr].rulenum;
239 goto next_rule; 211 goto next_rule;
240 } 212 }
241 213
242 nft_trace_packet(&info, basechain, NULL, -1, 214 nft_trace_packet(&info, basechain, NULL, NFT_TRACETYPE_POLICY);
243 NFT_TRACETYPE_POLICY);
244 215
245 if (static_branch_unlikely(&nft_counters_enabled)) 216 if (static_branch_unlikely(&nft_counters_enabled))
246 nft_update_chain_stats(basechain, pkt); 217 nft_update_chain_stats(basechain, pkt);
@@ -258,6 +229,9 @@ static struct nft_expr_type *nft_basic_types[] = {
258 &nft_payload_type, 229 &nft_payload_type,
259 &nft_dynset_type, 230 &nft_dynset_type,
260 &nft_range_type, 231 &nft_range_type,
232 &nft_meta_type,
233 &nft_rt_type,
234 &nft_exthdr_type,
261}; 235};
262 236
263int __init nf_tables_core_module_init(void) 237int __init nf_tables_core_module_init(void)
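
Taken together, the nf_tables_core.c hunks above swap the rule list walk (with per-rule genmask checks) for a flat NULL-terminated array chosen once per chain via the generation bit, and the jump stack now records a position in that array instead of a rule pointer plus rulenum. A self-contained sketch of that traversal; the verdict names, evaluation step and stack size are illustrative, not the kernel API:

#include <stdio.h>

enum { CONTINUE, JUMP, ACCEPT };

struct rule { int verdict; const struct rule *const *target; };

#define STACK_SIZE 16

static int do_chain(const struct rule *const *rules)
{
    struct { const struct rule *const *rules; } stack[STACK_SIZE];
    unsigned int sp = 0;

next_rule:
    for (; *rules; rules++) {
        const struct rule *r = *rules;

        switch (r->verdict) {
        case CONTINUE:
            continue;
        case JUMP:
            if (sp < STACK_SIZE)
                stack[sp++].rules = rules + 1;  /* resume after this rule */
            rules = r->target;
            goto next_rule;
        case ACCEPT:
            return ACCEPT;
        }
    }
    if (sp > 0) {
        rules = stack[--sp].rules;              /* return from the jump */
        goto next_rule;
    }
    return ACCEPT;                              /* chain policy */
}

int main(void)
{
    static const struct rule accept = { ACCEPT, NULL };
    static const struct rule *const sub[] = { &accept, NULL };
    static const struct rule jump = { JUMP, sub };
    static const struct rule *const base[] = { &jump, NULL };

    printf("verdict=%d\n", do_chain(base));
    return 0;
}
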
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 03ead8a9e90c..4d0da7042aff 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -25,6 +25,7 @@
25#include <linux/uaccess.h> 25#include <linux/uaccess.h>
26#include <net/sock.h> 26#include <net/sock.h>
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/sched/signal.h>
28 29
29#include <net/netlink.h> 30#include <net/netlink.h>
30#include <linux/netfilter/nfnetlink.h> 31#include <linux/netfilter/nfnetlink.h>
@@ -37,6 +38,8 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
37 rcu_dereference_protected(table[(id)].subsys, \ 38 rcu_dereference_protected(table[(id)].subsys, \
38 lockdep_nfnl_is_held((id))) 39 lockdep_nfnl_is_held((id)))
39 40
41#define NFNL_MAX_ATTR_COUNT 32
42
40static struct { 43static struct {
41 struct mutex mutex; 44 struct mutex mutex;
42 const struct nfnetlink_subsystem __rcu *subsys; 45 const struct nfnetlink_subsystem __rcu *subsys;
@@ -76,6 +79,13 @@ EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held);
76 79
77int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n) 80int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)
78{ 81{
82 u8 cb_id;
83
84 /* Sanity-check attr_count size to avoid stack buffer overflow. */
85 for (cb_id = 0; cb_id < n->cb_count; cb_id++)
86 if (WARN_ON(n->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT))
87 return -EINVAL;
88
79 nfnl_lock(n->subsys_id); 89 nfnl_lock(n->subsys_id);
80 if (table[n->subsys_id].subsys) { 90 if (table[n->subsys_id].subsys) {
81 nfnl_unlock(n->subsys_id); 91 nfnl_unlock(n->subsys_id);
@@ -185,11 +195,17 @@ replay:
185 { 195 {
186 int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); 196 int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
187 u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); 197 u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
188 struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; 198 struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
189 struct nlattr *attr = (void *)nlh + min_len; 199 struct nlattr *attr = (void *)nlh + min_len;
190 int attrlen = nlh->nlmsg_len - min_len; 200 int attrlen = nlh->nlmsg_len - min_len;
191 __u8 subsys_id = NFNL_SUBSYS_ID(type); 201 __u8 subsys_id = NFNL_SUBSYS_ID(type);
192 202
203 /* Sanity-check NFNL_MAX_ATTR_COUNT */
204 if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
205 rcu_read_unlock();
206 return -ENOMEM;
207 }
208
193 err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, attrlen, 209 err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, attrlen,
194 ss->cb[cb_id].policy, extack); 210 ss->cb[cb_id].policy, extack);
195 if (err < 0) { 211 if (err < 0) {
@@ -330,6 +346,13 @@ replay:
330 while (skb->len >= nlmsg_total_size(0)) { 346 while (skb->len >= nlmsg_total_size(0)) {
331 int msglen, type; 347 int msglen, type;
332 348
349 if (fatal_signal_pending(current)) {
350 nfnl_err_reset(&err_list);
351 err = -EINTR;
352 status = NFNL_BATCH_FAILURE;
353 goto done;
354 }
355
333 memset(&extack, 0, sizeof(extack)); 356 memset(&extack, 0, sizeof(extack));
334 nlh = nlmsg_hdr(skb); 357 nlh = nlmsg_hdr(skb);
335 err = 0; 358 err = 0;
@@ -379,10 +402,16 @@ replay:
379 { 402 {
380 int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); 403 int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
381 u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); 404 u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
382 struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; 405 struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
383 struct nlattr *attr = (void *)nlh + min_len; 406 struct nlattr *attr = (void *)nlh + min_len;
384 int attrlen = nlh->nlmsg_len - min_len; 407 int attrlen = nlh->nlmsg_len - min_len;
385 408
409 /* Sanity-check NFTA_MAX_ATTR */
410 if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
411 err = -ENOMEM;
412 goto ack;
413 }
414
386 err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, 415 err = nla_parse(cda, ss->cb[cb_id].attr_count, attr,
387 attrlen, ss->cb[cb_id].policy, NULL); 416 attrlen, ss->cb[cb_id].policy, NULL);
388 if (err < 0) 417 if (err < 0)
@@ -441,10 +470,19 @@ done:
441 kfree_skb(skb); 470 kfree_skb(skb);
442 goto replay; 471 goto replay;
443 } else if (status == NFNL_BATCH_DONE) { 472 } else if (status == NFNL_BATCH_DONE) {
444 ss->commit(net, oskb); 473 err = ss->commit(net, oskb);
474 if (err == -EAGAIN) {
475 status |= NFNL_BATCH_REPLAY;
476 goto done;
477 } else if (err) {
478 ss->abort(net, oskb);
479 netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL);
480 }
445 } else { 481 } else {
446 ss->abort(net, oskb); 482 ss->abort(net, oskb);
447 } 483 }
484 if (ss->cleanup)
485 ss->cleanup(net);
448 486
449 nfnl_err_deliver(&err_list, oskb); 487 nfnl_err_deliver(&err_list, oskb);
450 nfnl_unlock(subsys_id); 488 nfnl_unlock(subsys_id);
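
With the nfnetlink.c changes above, ->commit() may now fail: -EAGAIN requests a replay of the whole batch (nf_tables uses this when the ruleset has to be re-validated), any other error is aborted and acked, and the optional ->cleanup() runs either way. A rough userspace model of that control flow; the callback names echo the subsystem ops but everything else is illustrative:

#include <stdio.h>
#include <errno.h>

struct subsys {
    int (*commit)(void *batch);
    void (*abort_cb)(void *batch);
    void (*cleanup)(void *batch);
};

/* Retry the batch while commit asks for a replay, abort on hard errors. */
static int process_batch(const struct subsys *ss, void *batch)
{
    int err;

replay:
    /* ... parse and stage all messages in the batch here ... */
    err = ss->commit(batch);
    if (err == -EAGAIN)
        goto replay;            /* e.g. ruleset needed re-validation */
    if (err)
        ss->abort_cb(batch);    /* roll back staged changes */
    if (ss->cleanup)
        ss->cleanup(batch);     /* e.g. reset validation state */
    return err;
}

static int attempts;
static int fake_commit(void *batch) { (void)batch; return attempts++ ? 0 : -EAGAIN; }
static void fake_abort(void *batch) { (void)batch; }
static void fake_cleanup(void *batch) { (void)batch; }

int main(void)
{
    struct subsys ss = { fake_commit, fake_abort, fake_cleanup };

    printf("batch result: %d (after %d commit attempts)\n",
           process_batch(&ss, NULL), attempts);
    return 0;
}
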
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index c14822b9729f..332c69d27b47 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -37,7 +37,6 @@
37#include <net/sock.h> 37#include <net/sock.h>
38#include <net/netfilter/nf_log.h> 38#include <net/netfilter/nf_log.h>
39#include <net/netns/generic.h> 39#include <net/netns/generic.h>
40#include <net/netfilter/nfnetlink_log.h>
41 40
42#include <linux/atomic.h> 41#include <linux/atomic.h>
43#include <linux/refcount.h> 42#include <linux/refcount.h>
@@ -47,6 +46,7 @@
47#include "../bridge/br_private.h" 46#include "../bridge/br_private.h"
48#endif 47#endif
49 48
49#define NFULNL_COPY_DISABLED 0xff
50#define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE 50#define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE
51#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */ 51#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */
52#define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */ 52#define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */
@@ -618,7 +618,7 @@ static const struct nf_loginfo default_loginfo = {
618}; 618};
619 619
620/* log handler for internal netfilter logging api */ 620/* log handler for internal netfilter logging api */
621void 621static void
622nfulnl_log_packet(struct net *net, 622nfulnl_log_packet(struct net *net,
623 u_int8_t pf, 623 u_int8_t pf,
624 unsigned int hooknum, 624 unsigned int hooknum,
@@ -633,7 +633,7 @@ nfulnl_log_packet(struct net *net,
633 struct nfulnl_instance *inst; 633 struct nfulnl_instance *inst;
634 const struct nf_loginfo *li; 634 const struct nf_loginfo *li;
635 unsigned int qthreshold; 635 unsigned int qthreshold;
636 unsigned int plen; 636 unsigned int plen = 0;
637 struct nfnl_log_net *log = nfnl_log_pernet(net); 637 struct nfnl_log_net *log = nfnl_log_pernet(net);
638 const struct nfnl_ct_hook *nfnl_ct = NULL; 638 const struct nfnl_ct_hook *nfnl_ct = NULL;
639 struct nf_conn *ct = NULL; 639 struct nf_conn *ct = NULL;
@@ -648,7 +648,6 @@ nfulnl_log_packet(struct net *net,
648 if (!inst) 648 if (!inst)
649 return; 649 return;
650 650
651 plen = 0;
652 if (prefix) 651 if (prefix)
653 plen = strlen(prefix) + 1; 652 plen = strlen(prefix) + 1;
654 653
@@ -760,7 +759,6 @@ alloc_failure:
760 /* FIXME: statistics */ 759 /* FIXME: statistics */
761 goto unlock_and_release; 760 goto unlock_and_release;
762} 761}
763EXPORT_SYMBOL_GPL(nfulnl_log_packet);
764 762
765static int 763static int
766nfulnl_rcv_nl_event(struct notifier_block *this, 764nfulnl_rcv_nl_event(struct notifier_block *this,
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 494a9ab35cb6..4ccd2988f9db 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -227,6 +227,25 @@ find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
227 return entry; 227 return entry;
228} 228}
229 229
230static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
231{
232 struct nf_ct_hook *ct_hook;
233 int err;
234
235 if (verdict == NF_ACCEPT ||
236 verdict == NF_STOP) {
237 rcu_read_lock();
238 ct_hook = rcu_dereference(nf_ct_hook);
239 if (ct_hook) {
240 err = ct_hook->update(entry->state.net, entry->skb);
241 if (err < 0)
242 verdict = NF_DROP;
243 }
244 rcu_read_unlock();
245 }
246 nf_reinject(entry, verdict);
247}
248
230static void 249static void
231nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) 250nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
232{ 251{
@@ -237,7 +256,7 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
237 if (!cmpfn || cmpfn(entry, data)) { 256 if (!cmpfn || cmpfn(entry, data)) {
238 list_del(&entry->list); 257 list_del(&entry->list);
239 queue->queue_total--; 258 queue->queue_total--;
240 nf_reinject(entry, NF_DROP); 259 nfqnl_reinject(entry, NF_DROP);
241 } 260 }
242 } 261 }
243 spin_unlock_bh(&queue->lock); 262 spin_unlock_bh(&queue->lock);
@@ -686,7 +705,7 @@ err_out_free_nskb:
686err_out_unlock: 705err_out_unlock:
687 spin_unlock_bh(&queue->lock); 706 spin_unlock_bh(&queue->lock);
688 if (failopen) 707 if (failopen)
689 nf_reinject(entry, NF_ACCEPT); 708 nfqnl_reinject(entry, NF_ACCEPT);
690err_out: 709err_out:
691 return err; 710 return err;
692} 711}
@@ -1085,7 +1104,8 @@ static int nfqnl_recv_verdict_batch(struct net *net, struct sock *ctnl,
1085 list_for_each_entry_safe(entry, tmp, &batch_list, list) { 1104 list_for_each_entry_safe(entry, tmp, &batch_list, list) {
1086 if (nfqa[NFQA_MARK]) 1105 if (nfqa[NFQA_MARK])
1087 entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); 1106 entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
1088 nf_reinject(entry, verdict); 1107
1108 nfqnl_reinject(entry, verdict);
1089 } 1109 }
1090 return 0; 1110 return 0;
1091} 1111}
@@ -1208,7 +1228,7 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
1208 if (nfqa[NFQA_MARK]) 1228 if (nfqa[NFQA_MARK])
1209 entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); 1229 entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
1210 1230
1211 nf_reinject(entry, verdict); 1231 nfqnl_reinject(entry, verdict);
1212 return 0; 1232 return 0;
1213} 1233}
1214 1234
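
[Editor note, not part of the diff] The nfnetlink_queue.c change routes every verdict through the new nfqnl_reinject() wrapper, which gives the conntrack hook a chance to update state for accepted packets and downgrades the verdict to drop if that update fails. A hedged user-space sketch of that decision, with the hook modelled as a plain callback and all names illustrative:

    #include <stdio.h>

    enum verdict { V_DROP, V_ACCEPT, V_STOP };

    /* Stand-in for the registered conntrack hook; pretend the update failed. */
    static int failing_ct_update(void *pkt)
    {
        (void)pkt;
        return -1;
    }

    /* Accepted (or stopped) packets get a conntrack update first; a failed
     * update downgrades the verdict to drop before reinjection. */
    static enum verdict reinject(void *pkt, enum verdict v,
                                 int (*ct_update)(void *))
    {
        if ((v == V_ACCEPT || v == V_STOP) && ct_update && ct_update(pkt) < 0)
            v = V_DROP;
        return v;	/* nf_reinject(entry, v) happens here in the kernel */
    }

    int main(void)
    {
        printf("%d\n", reinject(NULL, V_ACCEPT, failing_ct_update));	/* 0: drop   */
        printf("%d\n", reinject(NULL, V_ACCEPT, NULL));			/* 1: accept */
        return 0;
    }
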
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 1d99a1efdafc..8d1ff654e5af 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -611,10 +611,10 @@ nla_put_failure:
611 return -1; 611 return -1;
612} 612}
613 613
614static int nfnl_compat_get(struct net *net, struct sock *nfnl, 614static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
615 struct sk_buff *skb, const struct nlmsghdr *nlh, 615 struct sk_buff *skb, const struct nlmsghdr *nlh,
616 const struct nlattr * const tb[], 616 const struct nlattr * const tb[],
617 struct netlink_ext_ack *extack) 617 struct netlink_ext_ack *extack)
618{ 618{
619 int ret = 0, target; 619 int ret = 0, target;
620 struct nfgenmsg *nfmsg; 620 struct nfgenmsg *nfmsg;
@@ -653,16 +653,21 @@ static int nfnl_compat_get(struct net *net, struct sock *nfnl,
653 return -EINVAL; 653 return -EINVAL;
654 } 654 }
655 655
656 if (!try_module_get(THIS_MODULE))
657 return -EINVAL;
658
659 rcu_read_unlock();
656 try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name, 660 try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name,
657 rev, target, &ret), 661 rev, target, &ret),
658 fmt, name); 662 fmt, name);
659
660 if (ret < 0) 663 if (ret < 0)
661 return ret; 664 goto out_put;
662 665
663 skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 666 skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
664 if (skb2 == NULL) 667 if (skb2 == NULL) {
665 return -ENOMEM; 668 ret = -ENOMEM;
669 goto out_put;
670 }
666 671
667 /* include the best revision for this extension in the message */ 672 /* include the best revision for this extension in the message */
668 if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid, 673 if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid,
@@ -672,14 +677,16 @@ static int nfnl_compat_get(struct net *net, struct sock *nfnl,
672 nfmsg->nfgen_family, 677 nfmsg->nfgen_family,
673 name, ret, target) <= 0) { 678 name, ret, target) <= 0) {
674 kfree_skb(skb2); 679 kfree_skb(skb2);
675 return -ENOSPC; 680 goto out_put;
676 } 681 }
677 682
678 ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, 683 ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
679 MSG_DONTWAIT); 684 MSG_DONTWAIT);
680 if (ret > 0) 685 if (ret > 0)
681 ret = 0; 686 ret = 0;
682 687out_put:
688 rcu_read_lock();
689 module_put(THIS_MODULE);
683 return ret == -EAGAIN ? -ENOBUFS : ret; 690 return ret == -EAGAIN ? -ENOBUFS : ret;
684} 691}
685 692
@@ -691,7 +698,7 @@ static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = {
691}; 698};
692 699
693static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = { 700static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = {
694 [NFNL_MSG_COMPAT_GET] = { .call = nfnl_compat_get, 701 [NFNL_MSG_COMPAT_GET] = { .call_rcu = nfnl_compat_get_rcu,
695 .attr_count = NFTA_COMPAT_MAX, 702 .attr_count = NFTA_COMPAT_MAX,
696 .policy = nfnl_compat_policy_get }, 703 .policy = nfnl_compat_policy_get },
697}; 704};
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
new file mode 100644
index 000000000000..50c068d660e5
--- /dev/null
+++ b/net/netfilter/nft_connlimit.c
@@ -0,0 +1,297 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#include <linux/kernel.h>
3#include <linux/init.h>
4#include <linux/module.h>
5#include <linux/spinlock.h>
6#include <linux/netlink.h>
7#include <linux/netfilter.h>
8#include <linux/netfilter/nf_tables.h>
9#include <net/netfilter/nf_tables.h>
10#include <net/netfilter/nf_conntrack.h>
11#include <net/netfilter/nf_conntrack_count.h>
12#include <net/netfilter/nf_conntrack_core.h>
13#include <net/netfilter/nf_conntrack_tuple.h>
14#include <net/netfilter/nf_conntrack_zones.h>
15
16struct nft_connlimit {
17 spinlock_t lock;
18 struct hlist_head hhead;
19 u32 limit;
20 bool invert;
21};
22
23static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
24 struct nft_regs *regs,
25 const struct nft_pktinfo *pkt,
26 const struct nft_set_ext *ext)
27{
28 const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
29 const struct nf_conntrack_tuple *tuple_ptr;
30 struct nf_conntrack_tuple tuple;
31 enum ip_conntrack_info ctinfo;
32 const struct nf_conn *ct;
33 unsigned int count;
34 bool addit;
35
36 tuple_ptr = &tuple;
37
38 ct = nf_ct_get(pkt->skb, &ctinfo);
39 if (ct != NULL) {
40 tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
41 zone = nf_ct_zone(ct);
42 } else if (!nf_ct_get_tuplepr(pkt->skb, skb_network_offset(pkt->skb),
43 nft_pf(pkt), nft_net(pkt), &tuple)) {
44 regs->verdict.code = NF_DROP;
45 return;
46 }
47
48 spin_lock_bh(&priv->lock);
49 count = nf_conncount_lookup(nft_net(pkt), &priv->hhead, tuple_ptr, zone,
50 &addit);
51
52 if (!addit)
53 goto out;
54
55 if (!nf_conncount_add(&priv->hhead, tuple_ptr)) {
56 regs->verdict.code = NF_DROP;
57 spin_unlock_bh(&priv->lock);
58 return;
59 }
60 count++;
61out:
62 spin_unlock_bh(&priv->lock);
63
64 if ((count > priv->limit) ^ priv->invert) {
65 regs->verdict.code = NFT_BREAK;
66 return;
67 }
68}
69
70static int nft_connlimit_do_init(const struct nft_ctx *ctx,
71 const struct nlattr * const tb[],
72 struct nft_connlimit *priv)
73{
74 bool invert = false;
75 u32 flags, limit;
76
77 if (!tb[NFTA_CONNLIMIT_COUNT])
78 return -EINVAL;
79
80 limit = ntohl(nla_get_be32(tb[NFTA_CONNLIMIT_COUNT]));
81
82 if (tb[NFTA_CONNLIMIT_FLAGS]) {
83 flags = ntohl(nla_get_be32(tb[NFTA_CONNLIMIT_FLAGS]));
84 if (flags & ~NFT_CONNLIMIT_F_INV)
85 return -EOPNOTSUPP;
86 if (flags & NFT_CONNLIMIT_F_INV)
87 invert = true;
88 }
89
90 spin_lock_init(&priv->lock);
91 INIT_HLIST_HEAD(&priv->hhead);
92 priv->limit = limit;
93 priv->invert = invert;
94
95 return nf_ct_netns_get(ctx->net, ctx->family);
96}
97
98static void nft_connlimit_do_destroy(const struct nft_ctx *ctx,
99 struct nft_connlimit *priv)
100{
101 nf_ct_netns_put(ctx->net, ctx->family);
102 nf_conncount_cache_free(&priv->hhead);
103}
104
105static int nft_connlimit_do_dump(struct sk_buff *skb,
106 struct nft_connlimit *priv)
107{
108 if (nla_put_be32(skb, NFTA_CONNLIMIT_COUNT, htonl(priv->limit)))
109 goto nla_put_failure;
110 if (priv->invert &&
111 nla_put_be32(skb, NFTA_CONNLIMIT_FLAGS, htonl(NFT_CONNLIMIT_F_INV)))
112 goto nla_put_failure;
113
114 return 0;
115
116nla_put_failure:
117 return -1;
118}
119
120static inline void nft_connlimit_obj_eval(struct nft_object *obj,
121 struct nft_regs *regs,
122 const struct nft_pktinfo *pkt)
123{
124 struct nft_connlimit *priv = nft_obj_data(obj);
125
126 nft_connlimit_do_eval(priv, regs, pkt, NULL);
127}
128
129static int nft_connlimit_obj_init(const struct nft_ctx *ctx,
130 const struct nlattr * const tb[],
131 struct nft_object *obj)
132{
133 struct nft_connlimit *priv = nft_obj_data(obj);
134
135 return nft_connlimit_do_init(ctx, tb, priv);
136}
137
138static void nft_connlimit_obj_destroy(const struct nft_ctx *ctx,
139 struct nft_object *obj)
140{
141 struct nft_connlimit *priv = nft_obj_data(obj);
142
143 nft_connlimit_do_destroy(ctx, priv);
144}
145
146static int nft_connlimit_obj_dump(struct sk_buff *skb,
147 struct nft_object *obj, bool reset)
148{
149 struct nft_connlimit *priv = nft_obj_data(obj);
150
151 return nft_connlimit_do_dump(skb, priv);
152}
153
154static const struct nla_policy nft_connlimit_policy[NFTA_CONNLIMIT_MAX + 1] = {
155 [NFTA_CONNLIMIT_COUNT] = { .type = NLA_U32 },
156 [NFTA_CONNLIMIT_FLAGS] = { .type = NLA_U32 },
157};
158
159static struct nft_object_type nft_connlimit_obj_type;
160static const struct nft_object_ops nft_connlimit_obj_ops = {
161 .type = &nft_connlimit_obj_type,
162 .size = sizeof(struct nft_connlimit),
163 .eval = nft_connlimit_obj_eval,
164 .init = nft_connlimit_obj_init,
165 .destroy = nft_connlimit_obj_destroy,
166 .dump = nft_connlimit_obj_dump,
167};
168
169static struct nft_object_type nft_connlimit_obj_type __read_mostly = {
170 .type = NFT_OBJECT_CONNLIMIT,
171 .ops = &nft_connlimit_obj_ops,
172 .maxattr = NFTA_CONNLIMIT_MAX,
173 .policy = nft_connlimit_policy,
174 .owner = THIS_MODULE,
175};
176
177static void nft_connlimit_eval(const struct nft_expr *expr,
178 struct nft_regs *regs,
179 const struct nft_pktinfo *pkt)
180{
181 struct nft_connlimit *priv = nft_expr_priv(expr);
182
183 nft_connlimit_do_eval(priv, regs, pkt, NULL);
184}
185
186static int nft_connlimit_dump(struct sk_buff *skb, const struct nft_expr *expr)
187{
188 struct nft_connlimit *priv = nft_expr_priv(expr);
189
190 return nft_connlimit_do_dump(skb, priv);
191}
192
193static int nft_connlimit_init(const struct nft_ctx *ctx,
194 const struct nft_expr *expr,
195 const struct nlattr * const tb[])
196{
197 struct nft_connlimit *priv = nft_expr_priv(expr);
198
199 return nft_connlimit_do_init(ctx, tb, priv);
200}
201
202static void nft_connlimit_destroy(const struct nft_ctx *ctx,
203 const struct nft_expr *expr)
204{
205 struct nft_connlimit *priv = nft_expr_priv(expr);
206
207 nft_connlimit_do_destroy(ctx, priv);
208}
209
210static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
211{
212 struct nft_connlimit *priv_dst = nft_expr_priv(dst);
213 struct nft_connlimit *priv_src = nft_expr_priv(src);
214
215 spin_lock_init(&priv_dst->lock);
216 INIT_HLIST_HEAD(&priv_dst->hhead);
217 priv_dst->limit = priv_src->limit;
218 priv_dst->invert = priv_src->invert;
219
220 return 0;
221}
222
223static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx,
224 const struct nft_expr *expr)
225{
226 struct nft_connlimit *priv = nft_expr_priv(expr);
227
228 nf_conncount_cache_free(&priv->hhead);
229}
230
231static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
232{
233 struct nft_connlimit *priv = nft_expr_priv(expr);
234 bool addit, ret;
235
236 spin_lock_bh(&priv->lock);
237 nf_conncount_lookup(net, &priv->hhead, NULL, &nf_ct_zone_dflt, &addit);
238
239 ret = hlist_empty(&priv->hhead);
240 spin_unlock_bh(&priv->lock);
241
242 return ret;
243}
244
245static struct nft_expr_type nft_connlimit_type;
246static const struct nft_expr_ops nft_connlimit_ops = {
247 .type = &nft_connlimit_type,
248 .size = NFT_EXPR_SIZE(sizeof(struct nft_connlimit)),
249 .eval = nft_connlimit_eval,
250 .init = nft_connlimit_init,
251 .destroy = nft_connlimit_destroy,
252 .clone = nft_connlimit_clone,
253 .destroy_clone = nft_connlimit_destroy_clone,
254 .dump = nft_connlimit_dump,
255 .gc = nft_connlimit_gc,
256};
257
258static struct nft_expr_type nft_connlimit_type __read_mostly = {
259 .name = "connlimit",
260 .ops = &nft_connlimit_ops,
261 .policy = nft_connlimit_policy,
262 .maxattr = NFTA_CONNLIMIT_MAX,
263 .flags = NFT_EXPR_STATEFUL | NFT_EXPR_GC,
264 .owner = THIS_MODULE,
265};
266
267static int __init nft_connlimit_module_init(void)
268{
269 int err;
270
271 err = nft_register_obj(&nft_connlimit_obj_type);
272 if (err < 0)
273 return err;
274
275 err = nft_register_expr(&nft_connlimit_type);
276 if (err < 0)
277 goto err1;
278
279 return 0;
280err1:
281 nft_unregister_obj(&nft_connlimit_obj_type);
282 return err;
283}
284
285static void __exit nft_connlimit_module_exit(void)
286{
287 nft_unregister_expr(&nft_connlimit_type);
288 nft_unregister_obj(&nft_connlimit_obj_type);
289}
290
291module_init(nft_connlimit_module_init);
292module_exit(nft_connlimit_module_exit);
293
294MODULE_LICENSE("GPL");
295MODULE_AUTHOR("Pablo Neira Ayuso");
296MODULE_ALIAS_NFT_EXPR("connlimit");
297MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CONNLIMIT);
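
[Editor note, not part of the diff] The heart of the new nft_connlimit expression is the decision in nft_connlimit_do_eval(): the rule stops matching (NFT_BREAK) when the connection count exceeds the configured limit, and the NFT_CONNLIMIT_F_INV flag inverts that comparison. A minimal sketch of just that predicate, with connlimit_matches() as an illustrative helper rather than kernel code:

    #include <stdbool.h>
    #include <stdio.h>

    /* Matches while the count stays within the limit; invert flips the test,
     * mirroring the (count > limit) ^ invert check above. */
    static bool connlimit_matches(unsigned int count, unsigned int limit,
                                  bool invert)
    {
        return !((count > limit) ^ invert);
    }

    int main(void)
    {
        printf("%d\n", connlimit_matches(3, 5, false));	/* 1: under the limit   */
        printf("%d\n", connlimit_matches(7, 5, false));	/* 0: over, rule breaks */
        printf("%d\n", connlimit_matches(7, 5, true));	/* 1: inverted match    */
        return 0;
    }
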
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index eefe3b409925..a61d7edfc290 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -96,7 +96,8 @@ static void nft_counter_do_destroy(struct nft_counter_percpu_priv *priv)
96 free_percpu(priv->counter); 96 free_percpu(priv->counter);
97} 97}
98 98
99static void nft_counter_obj_destroy(struct nft_object *obj) 99static void nft_counter_obj_destroy(const struct nft_ctx *ctx,
100 struct nft_object *obj)
100{ 101{
101 struct nft_counter_percpu_priv *priv = nft_obj_data(obj); 102 struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
102 103
@@ -257,6 +258,7 @@ static const struct nft_expr_ops nft_counter_ops = {
257 .eval = nft_counter_eval, 258 .eval = nft_counter_eval,
258 .init = nft_counter_init, 259 .init = nft_counter_init,
259 .destroy = nft_counter_destroy, 260 .destroy = nft_counter_destroy,
261 .destroy_clone = nft_counter_destroy,
260 .dump = nft_counter_dump, 262 .dump = nft_counter_dump,
261 .clone = nft_counter_clone, 263 .clone = nft_counter_clone,
262}; 264};
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 5c0de704bad5..1435ffc5f57e 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -826,7 +826,8 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
826 return 0; 826 return 0;
827} 827}
828 828
829static void nft_ct_helper_obj_destroy(struct nft_object *obj) 829static void nft_ct_helper_obj_destroy(const struct nft_ctx *ctx,
830 struct nft_object *obj)
830{ 831{
831 struct nft_ct_helper_obj *priv = nft_obj_data(obj); 832 struct nft_ct_helper_obj *priv = nft_obj_data(obj);
832 833
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 04863fad05dd..4d49529cff61 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -36,7 +36,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
36 u64 timeout; 36 u64 timeout;
37 void *elem; 37 void *elem;
38 38
39 if (set->size && !atomic_add_unless(&set->nelems, 1, set->size)) 39 if (!atomic_add_unless(&set->nelems, 1, set->size))
40 return NULL; 40 return NULL;
41 41
42 timeout = priv->timeout ? : set->timeout; 42 timeout = priv->timeout ? : set->timeout;
@@ -81,7 +81,7 @@ static void nft_dynset_eval(const struct nft_expr *expr,
81 if (priv->op == NFT_DYNSET_OP_UPDATE && 81 if (priv->op == NFT_DYNSET_OP_UPDATE &&
82 nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { 82 nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
83 timeout = priv->timeout ? : set->timeout; 83 timeout = priv->timeout ? : set->timeout;
84 *nft_set_ext_expiration(ext) = jiffies + timeout; 84 *nft_set_ext_expiration(ext) = get_jiffies_64() + timeout;
85 } 85 }
86 86
87 if (sexpr != NULL) 87 if (sexpr != NULL)
@@ -195,6 +195,15 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
195 err = -EOPNOTSUPP; 195 err = -EOPNOTSUPP;
196 if (!(priv->expr->ops->type->flags & NFT_EXPR_STATEFUL)) 196 if (!(priv->expr->ops->type->flags & NFT_EXPR_STATEFUL))
197 goto err1; 197 goto err1;
198
199 if (priv->expr->ops->type->flags & NFT_EXPR_GC) {
200 if (set->flags & NFT_SET_TIMEOUT)
201 goto err1;
202 if (!set->ops->gc_init)
203 goto err1;
204 set->ops->gc_init(set);
205 }
206
198 } else if (set->flags & NFT_SET_EVAL) 207 } else if (set->flags & NFT_SET_EVAL)
199 return -EINVAL; 208 return -EINVAL;
200 209
@@ -216,6 +225,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
216 if (err < 0) 225 if (err < 0)
217 goto err1; 226 goto err1;
218 227
228 if (set->size == 0)
229 set->size = 0xffff;
230
219 priv->set = set; 231 priv->set = set;
220 return 0; 232 return 0;
221 233
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 47ec1046ad11..a940c9fd9045 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -10,11 +10,10 @@
10 10
11#include <asm/unaligned.h> 11#include <asm/unaligned.h>
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/init.h>
14#include <linux/module.h>
15#include <linux/netlink.h> 13#include <linux/netlink.h>
16#include <linux/netfilter.h> 14#include <linux/netfilter.h>
17#include <linux/netfilter/nf_tables.h> 15#include <linux/netfilter/nf_tables.h>
16#include <net/netfilter/nf_tables_core.h>
18#include <net/netfilter/nf_tables.h> 17#include <net/netfilter/nf_tables.h>
19#include <net/tcp.h> 18#include <net/tcp.h>
20 19
@@ -353,7 +352,6 @@ static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr)
353 return nft_exthdr_dump_common(skb, priv); 352 return nft_exthdr_dump_common(skb, priv);
354} 353}
355 354
356static struct nft_expr_type nft_exthdr_type;
357static const struct nft_expr_ops nft_exthdr_ipv6_ops = { 355static const struct nft_expr_ops nft_exthdr_ipv6_ops = {
358 .type = &nft_exthdr_type, 356 .type = &nft_exthdr_type,
359 .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), 357 .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
@@ -407,27 +405,10 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx,
407 return ERR_PTR(-EOPNOTSUPP); 405 return ERR_PTR(-EOPNOTSUPP);
408} 406}
409 407
410static struct nft_expr_type nft_exthdr_type __read_mostly = { 408struct nft_expr_type nft_exthdr_type __read_mostly = {
411 .name = "exthdr", 409 .name = "exthdr",
412 .select_ops = nft_exthdr_select_ops, 410 .select_ops = nft_exthdr_select_ops,
413 .policy = nft_exthdr_policy, 411 .policy = nft_exthdr_policy,
414 .maxattr = NFTA_EXTHDR_MAX, 412 .maxattr = NFTA_EXTHDR_MAX,
415 .owner = THIS_MODULE, 413 .owner = THIS_MODULE,
416}; 414};
417
418static int __init nft_exthdr_module_init(void)
419{
420 return nft_register_expr(&nft_exthdr_type);
421}
422
423static void __exit nft_exthdr_module_exit(void)
424{
425 nft_unregister_expr(&nft_exthdr_type);
426}
427
428module_init(nft_exthdr_module_init);
429module_exit(nft_exthdr_module_exit);
430
431MODULE_LICENSE("GPL");
432MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
433MODULE_ALIAS_NFT_EXPR("exthdr");
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index b65829b2be22..d6bab8c3cbb0 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -142,9 +142,8 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx,
142 if (!tb[NFTA_FLOW_TABLE_NAME]) 142 if (!tb[NFTA_FLOW_TABLE_NAME])
143 return -EINVAL; 143 return -EINVAL;
144 144
145 flowtable = nf_tables_flowtable_lookup(ctx->table, 145 flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME],
146 tb[NFTA_FLOW_TABLE_NAME], 146 genmask);
147 genmask);
148 if (IS_ERR(flowtable)) 147 if (IS_ERR(flowtable))
149 return PTR_ERR(flowtable); 148 return PTR_ERR(flowtable);
150 149
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index ce13a50b9189..8abb9891cdf2 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -12,8 +12,12 @@
12#include <linux/netlink.h> 12#include <linux/netlink.h>
13#include <linux/netfilter.h> 13#include <linux/netfilter.h>
14#include <linux/netfilter/nf_tables.h> 14#include <linux/netfilter/nf_tables.h>
15#include <linux/ip.h>
16#include <linux/ipv6.h>
15#include <net/netfilter/nf_tables.h> 17#include <net/netfilter/nf_tables.h>
16#include <net/netfilter/nf_dup_netdev.h> 18#include <net/netfilter/nf_dup_netdev.h>
19#include <net/neighbour.h>
20#include <net/ip.h>
17 21
18struct nft_fwd_netdev { 22struct nft_fwd_netdev {
19 enum nft_registers sreg_dev:8; 23 enum nft_registers sreg_dev:8;
@@ -32,6 +36,8 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
32 36
33static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = { 37static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = {
34 [NFTA_FWD_SREG_DEV] = { .type = NLA_U32 }, 38 [NFTA_FWD_SREG_DEV] = { .type = NLA_U32 },
39 [NFTA_FWD_SREG_ADDR] = { .type = NLA_U32 },
40 [NFTA_FWD_NFPROTO] = { .type = NLA_U32 },
35}; 41};
36 42
37static int nft_fwd_netdev_init(const struct nft_ctx *ctx, 43static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
@@ -62,7 +68,133 @@ nla_put_failure:
62 return -1; 68 return -1;
63} 69}
64 70
71struct nft_fwd_neigh {
72 enum nft_registers sreg_dev:8;
73 enum nft_registers sreg_addr:8;
74 u8 nfproto;
75};
76
77static void nft_fwd_neigh_eval(const struct nft_expr *expr,
78 struct nft_regs *regs,
79 const struct nft_pktinfo *pkt)
80{
81 struct nft_fwd_neigh *priv = nft_expr_priv(expr);
82 void *addr = &regs->data[priv->sreg_addr];
83 int oif = regs->data[priv->sreg_dev];
84 unsigned int verdict = NF_STOLEN;
85 struct sk_buff *skb = pkt->skb;
86 struct net_device *dev;
87 int neigh_table;
88
89 switch (priv->nfproto) {
90 case NFPROTO_IPV4: {
91 struct iphdr *iph;
92
93 if (skb->protocol != htons(ETH_P_IP)) {
94 verdict = NFT_BREAK;
95 goto out;
96 }
97 if (skb_try_make_writable(skb, sizeof(*iph))) {
98 verdict = NF_DROP;
99 goto out;
100 }
101 iph = ip_hdr(skb);
102 ip_decrease_ttl(iph);
103 neigh_table = NEIGH_ARP_TABLE;
104 break;
105 }
106 case NFPROTO_IPV6: {
107 struct ipv6hdr *ip6h;
108
109 if (skb->protocol != htons(ETH_P_IPV6)) {
110 verdict = NFT_BREAK;
111 goto out;
112 }
113 if (skb_try_make_writable(skb, sizeof(*ip6h))) {
114 verdict = NF_DROP;
115 goto out;
116 }
117 ip6h = ipv6_hdr(skb);
118 ip6h->hop_limit--;
119 neigh_table = NEIGH_ND_TABLE;
120 break;
121 }
122 default:
123 verdict = NFT_BREAK;
124 goto out;
125 }
126
127 dev = dev_get_by_index_rcu(nft_net(pkt), oif);
128 if (dev == NULL)
129 return;
130
131 skb->dev = dev;
132 neigh_xmit(neigh_table, dev, addr, skb);
133out:
134 regs->verdict.code = verdict;
135}
136
137static int nft_fwd_neigh_init(const struct nft_ctx *ctx,
138 const struct nft_expr *expr,
139 const struct nlattr * const tb[])
140{
141 struct nft_fwd_neigh *priv = nft_expr_priv(expr);
142 unsigned int addr_len;
143 int err;
144
145 if (!tb[NFTA_FWD_SREG_DEV] ||
146 !tb[NFTA_FWD_SREG_ADDR] ||
147 !tb[NFTA_FWD_NFPROTO])
148 return -EINVAL;
149
150 priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]);
151 priv->sreg_addr = nft_parse_register(tb[NFTA_FWD_SREG_ADDR]);
152 priv->nfproto = ntohl(nla_get_be32(tb[NFTA_FWD_NFPROTO]));
153
154 switch (priv->nfproto) {
155 case NFPROTO_IPV4:
156 addr_len = sizeof(struct in_addr);
157 break;
158 case NFPROTO_IPV6:
159 addr_len = sizeof(struct in6_addr);
160 break;
161 default:
162 return -EOPNOTSUPP;
163 }
164
165 err = nft_validate_register_load(priv->sreg_dev, sizeof(int));
166 if (err < 0)
167 return err;
168
169 return nft_validate_register_load(priv->sreg_addr, addr_len);
170}
171
172static const struct nft_expr_ops nft_fwd_netdev_ingress_ops;
173
174static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr)
175{
176 struct nft_fwd_neigh *priv = nft_expr_priv(expr);
177
178 if (nft_dump_register(skb, NFTA_FWD_SREG_DEV, priv->sreg_dev) ||
179 nft_dump_register(skb, NFTA_FWD_SREG_ADDR, priv->sreg_addr) ||
180 nla_put_be32(skb, NFTA_FWD_NFPROTO, htonl(priv->nfproto)))
181 goto nla_put_failure;
182
183 return 0;
184
185nla_put_failure:
186 return -1;
187}
188
65static struct nft_expr_type nft_fwd_netdev_type; 189static struct nft_expr_type nft_fwd_netdev_type;
190static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = {
191 .type = &nft_fwd_netdev_type,
192 .size = NFT_EXPR_SIZE(sizeof(struct nft_fwd_neigh)),
193 .eval = nft_fwd_neigh_eval,
194 .init = nft_fwd_neigh_init,
195 .dump = nft_fwd_neigh_dump,
196};
197
66static const struct nft_expr_ops nft_fwd_netdev_ops = { 198static const struct nft_expr_ops nft_fwd_netdev_ops = {
67 .type = &nft_fwd_netdev_type, 199 .type = &nft_fwd_netdev_type,
68 .size = NFT_EXPR_SIZE(sizeof(struct nft_fwd_netdev)), 200 .size = NFT_EXPR_SIZE(sizeof(struct nft_fwd_netdev)),
@@ -71,10 +203,22 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = {
71 .dump = nft_fwd_netdev_dump, 203 .dump = nft_fwd_netdev_dump,
72}; 204};
73 205
206static const struct nft_expr_ops *
207nft_fwd_select_ops(const struct nft_ctx *ctx,
208 const struct nlattr * const tb[])
209{
210 if (tb[NFTA_FWD_SREG_ADDR])
211 return &nft_fwd_neigh_netdev_ops;
212 if (tb[NFTA_FWD_SREG_DEV])
213 return &nft_fwd_netdev_ops;
214
215 return ERR_PTR(-EOPNOTSUPP);
216}
217
74static struct nft_expr_type nft_fwd_netdev_type __read_mostly = { 218static struct nft_expr_type nft_fwd_netdev_type __read_mostly = {
75 .family = NFPROTO_NETDEV, 219 .family = NFPROTO_NETDEV,
76 .name = "fwd", 220 .name = "fwd",
77 .ops = &nft_fwd_netdev_ops, 221 .select_ops = nft_fwd_select_ops,
78 .policy = nft_fwd_netdev_policy, 222 .policy = nft_fwd_netdev_policy,
79 .maxattr = NFTA_FWD_MAX, 223 .maxattr = NFTA_FWD_MAX,
80 .owner = THIS_MODULE, 224 .owner = THIS_MODULE,
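
[Editor note, not part of the diff] With the neighbour-output variant added, nft_fwd_netdev now picks its ops from the attributes supplied: an address register selects the neigh path, a device register alone keeps the old duplicate-to-device path. A hedged sketch of that select_ops dispatch; the structs and names below are illustrative, not the kernel's nft_expr_ops:

    #include <stdio.h>
    #include <stddef.h>

    struct fwd_ops {
        const char *name;
    };

    static const struct fwd_ops fwd_neigh_ops = { "fwd via neighbour" };
    static const struct fwd_ops fwd_dev_ops   = { "fwd to device" };

    /* Presence of the address attribute wins; a missing device attribute is
     * an error (ERR_PTR(-EOPNOTSUPP) in the kernel, NULL here). */
    static const struct fwd_ops *fwd_select_ops(const void *sreg_addr,
                                                const void *sreg_dev)
    {
        if (sreg_addr)
            return &fwd_neigh_ops;
        if (sreg_dev)
            return &fwd_dev_ops;
        return NULL;
    }

    int main(void)
    {
        int addr = 1, dev = 2;

        printf("%s\n", fwd_select_ops(&addr, &dev)->name);	/* via neighbour */
        printf("%s\n", fwd_select_ops(NULL, &dev)->name);	/* to device     */
        return 0;
    }
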
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 24f2f7567ddb..c2d237144f74 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -25,6 +25,7 @@ struct nft_jhash {
25 u32 modulus; 25 u32 modulus;
26 u32 seed; 26 u32 seed;
27 u32 offset; 27 u32 offset;
28 struct nft_set *map;
28}; 29};
29 30
30static void nft_jhash_eval(const struct nft_expr *expr, 31static void nft_jhash_eval(const struct nft_expr *expr,
@@ -35,14 +36,39 @@ static void nft_jhash_eval(const struct nft_expr *expr,
35 const void *data = &regs->data[priv->sreg]; 36 const void *data = &regs->data[priv->sreg];
36 u32 h; 37 u32 h;
37 38
38 h = reciprocal_scale(jhash(data, priv->len, priv->seed), priv->modulus); 39 h = reciprocal_scale(jhash(data, priv->len, priv->seed),
40 priv->modulus);
41
39 regs->data[priv->dreg] = h + priv->offset; 42 regs->data[priv->dreg] = h + priv->offset;
40} 43}
41 44
45static void nft_jhash_map_eval(const struct nft_expr *expr,
46 struct nft_regs *regs,
47 const struct nft_pktinfo *pkt)
48{
49 struct nft_jhash *priv = nft_expr_priv(expr);
50 const void *data = &regs->data[priv->sreg];
51 const struct nft_set *map = priv->map;
52 const struct nft_set_ext *ext;
53 u32 result;
54 bool found;
55
56 result = reciprocal_scale(jhash(data, priv->len, priv->seed),
57 priv->modulus) + priv->offset;
58
59 found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
60 if (!found)
61 return;
62
63 nft_data_copy(&regs->data[priv->dreg],
64 nft_set_ext_data(ext), map->dlen);
65}
66
42struct nft_symhash { 67struct nft_symhash {
43 enum nft_registers dreg:8; 68 enum nft_registers dreg:8;
44 u32 modulus; 69 u32 modulus;
45 u32 offset; 70 u32 offset;
71 struct nft_set *map;
46}; 72};
47 73
48static void nft_symhash_eval(const struct nft_expr *expr, 74static void nft_symhash_eval(const struct nft_expr *expr,
@@ -58,6 +84,28 @@ static void nft_symhash_eval(const struct nft_expr *expr,
58 regs->data[priv->dreg] = h + priv->offset; 84 regs->data[priv->dreg] = h + priv->offset;
59} 85}
60 86
87static void nft_symhash_map_eval(const struct nft_expr *expr,
88 struct nft_regs *regs,
89 const struct nft_pktinfo *pkt)
90{
91 struct nft_symhash *priv = nft_expr_priv(expr);
92 struct sk_buff *skb = pkt->skb;
93 const struct nft_set *map = priv->map;
94 const struct nft_set_ext *ext;
95 u32 result;
96 bool found;
97
98 result = reciprocal_scale(__skb_get_hash_symmetric(skb),
99 priv->modulus) + priv->offset;
100
101 found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
102 if (!found)
103 return;
104
105 nft_data_copy(&regs->data[priv->dreg],
106 nft_set_ext_data(ext), map->dlen);
107}
108
61static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { 109static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
62 [NFTA_HASH_SREG] = { .type = NLA_U32 }, 110 [NFTA_HASH_SREG] = { .type = NLA_U32 },
63 [NFTA_HASH_DREG] = { .type = NLA_U32 }, 111 [NFTA_HASH_DREG] = { .type = NLA_U32 },
@@ -66,6 +114,9 @@ static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
66 [NFTA_HASH_SEED] = { .type = NLA_U32 }, 114 [NFTA_HASH_SEED] = { .type = NLA_U32 },
67 [NFTA_HASH_OFFSET] = { .type = NLA_U32 }, 115 [NFTA_HASH_OFFSET] = { .type = NLA_U32 },
68 [NFTA_HASH_TYPE] = { .type = NLA_U32 }, 116 [NFTA_HASH_TYPE] = { .type = NLA_U32 },
117 [NFTA_HASH_SET_NAME] = { .type = NLA_STRING,
118 .len = NFT_SET_MAXNAMELEN - 1 },
119 [NFTA_HASH_SET_ID] = { .type = NLA_U32 },
69}; 120};
70 121
71static int nft_jhash_init(const struct nft_ctx *ctx, 122static int nft_jhash_init(const struct nft_ctx *ctx,
@@ -97,7 +148,7 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
97 priv->len = len; 148 priv->len = len;
98 149
99 priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS])); 150 priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS]));
100 if (priv->modulus <= 1) 151 if (priv->modulus < 1)
101 return -ERANGE; 152 return -ERANGE;
102 153
103 if (priv->offset + priv->modulus - 1 < priv->offset) 154 if (priv->offset + priv->modulus - 1 < priv->offset)
@@ -115,6 +166,20 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
115 NFT_DATA_VALUE, sizeof(u32)); 166 NFT_DATA_VALUE, sizeof(u32));
116} 167}
117 168
169static int nft_jhash_map_init(const struct nft_ctx *ctx,
170 const struct nft_expr *expr,
171 const struct nlattr * const tb[])
172{
173 struct nft_jhash *priv = nft_expr_priv(expr);
174 u8 genmask = nft_genmask_next(ctx->net);
175
176 nft_jhash_init(ctx, expr, tb);
177 priv->map = nft_set_lookup_global(ctx->net, ctx->table,
178 tb[NFTA_HASH_SET_NAME],
179 tb[NFTA_HASH_SET_ID], genmask);
180 return PTR_ERR_OR_ZERO(priv->map);
181}
182
118static int nft_symhash_init(const struct nft_ctx *ctx, 183static int nft_symhash_init(const struct nft_ctx *ctx,
119 const struct nft_expr *expr, 184 const struct nft_expr *expr,
120 const struct nlattr * const tb[]) 185 const struct nlattr * const tb[])
@@ -141,6 +206,20 @@ static int nft_symhash_init(const struct nft_ctx *ctx,
141 NFT_DATA_VALUE, sizeof(u32)); 206 NFT_DATA_VALUE, sizeof(u32));
142} 207}
143 208
209static int nft_symhash_map_init(const struct nft_ctx *ctx,
210 const struct nft_expr *expr,
211 const struct nlattr * const tb[])
212{
213 struct nft_jhash *priv = nft_expr_priv(expr);
214 u8 genmask = nft_genmask_next(ctx->net);
215
216 nft_symhash_init(ctx, expr, tb);
217 priv->map = nft_set_lookup_global(ctx->net, ctx->table,
218 tb[NFTA_HASH_SET_NAME],
219 tb[NFTA_HASH_SET_ID], genmask);
220 return PTR_ERR_OR_ZERO(priv->map);
221}
222
144static int nft_jhash_dump(struct sk_buff *skb, 223static int nft_jhash_dump(struct sk_buff *skb,
145 const struct nft_expr *expr) 224 const struct nft_expr *expr)
146{ 225{
@@ -168,6 +247,18 @@ nla_put_failure:
168 return -1; 247 return -1;
169} 248}
170 249
250static int nft_jhash_map_dump(struct sk_buff *skb,
251 const struct nft_expr *expr)
252{
253 const struct nft_jhash *priv = nft_expr_priv(expr);
254
255 if (nft_jhash_dump(skb, expr) ||
256 nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name))
257 return -1;
258
259 return 0;
260}
261
171static int nft_symhash_dump(struct sk_buff *skb, 262static int nft_symhash_dump(struct sk_buff *skb,
172 const struct nft_expr *expr) 263 const struct nft_expr *expr)
173{ 264{
@@ -188,6 +279,18 @@ nla_put_failure:
188 return -1; 279 return -1;
189} 280}
190 281
282static int nft_symhash_map_dump(struct sk_buff *skb,
283 const struct nft_expr *expr)
284{
285 const struct nft_symhash *priv = nft_expr_priv(expr);
286
287 if (nft_symhash_dump(skb, expr) ||
288 nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name))
289 return -1;
290
291 return 0;
292}
293
191static struct nft_expr_type nft_hash_type; 294static struct nft_expr_type nft_hash_type;
192static const struct nft_expr_ops nft_jhash_ops = { 295static const struct nft_expr_ops nft_jhash_ops = {
193 .type = &nft_hash_type, 296 .type = &nft_hash_type,
@@ -197,6 +300,14 @@ static const struct nft_expr_ops nft_jhash_ops = {
197 .dump = nft_jhash_dump, 300 .dump = nft_jhash_dump,
198}; 301};
199 302
303static const struct nft_expr_ops nft_jhash_map_ops = {
304 .type = &nft_hash_type,
305 .size = NFT_EXPR_SIZE(sizeof(struct nft_jhash)),
306 .eval = nft_jhash_map_eval,
307 .init = nft_jhash_map_init,
308 .dump = nft_jhash_map_dump,
309};
310
200static const struct nft_expr_ops nft_symhash_ops = { 311static const struct nft_expr_ops nft_symhash_ops = {
201 .type = &nft_hash_type, 312 .type = &nft_hash_type,
202 .size = NFT_EXPR_SIZE(sizeof(struct nft_symhash)), 313 .size = NFT_EXPR_SIZE(sizeof(struct nft_symhash)),
@@ -205,6 +316,14 @@ static const struct nft_expr_ops nft_symhash_ops = {
205 .dump = nft_symhash_dump, 316 .dump = nft_symhash_dump,
206}; 317};
207 318
319static const struct nft_expr_ops nft_symhash_map_ops = {
320 .type = &nft_hash_type,
321 .size = NFT_EXPR_SIZE(sizeof(struct nft_symhash)),
322 .eval = nft_symhash_map_eval,
323 .init = nft_symhash_map_init,
324 .dump = nft_symhash_map_dump,
325};
326
208static const struct nft_expr_ops * 327static const struct nft_expr_ops *
209nft_hash_select_ops(const struct nft_ctx *ctx, 328nft_hash_select_ops(const struct nft_ctx *ctx,
210 const struct nlattr * const tb[]) 329 const struct nlattr * const tb[])
@@ -217,8 +336,12 @@ nft_hash_select_ops(const struct nft_ctx *ctx,
217 type = ntohl(nla_get_be32(tb[NFTA_HASH_TYPE])); 336 type = ntohl(nla_get_be32(tb[NFTA_HASH_TYPE]));
218 switch (type) { 337 switch (type) {
219 case NFT_HASH_SYM: 338 case NFT_HASH_SYM:
339 if (tb[NFTA_HASH_SET_NAME])
340 return &nft_symhash_map_ops;
220 return &nft_symhash_ops; 341 return &nft_symhash_ops;
221 case NFT_HASH_JENKINS: 342 case NFT_HASH_JENKINS:
343 if (tb[NFTA_HASH_SET_NAME])
344 return &nft_jhash_map_ops;
222 return &nft_jhash_ops; 345 return &nft_jhash_ops;
223 default: 346 default:
224 break; 347 break;
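
[Editor note, not part of the diff] The new map variants in nft_hash.c keep the existing hash step (hash the source register, scale it into [offset, offset + modulus)) and then use the result as a key into an nft set, copying the bound data into the destination register. A small user-space sketch of that flow, assuming a toy hash in place of jhash; scale() uses the same multiply-and-shift idea as reciprocal_scale(), and the final map lookup is only hinted at:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static uint32_t toy_hash(const void *data, size_t len, uint32_t seed)
    {
        const unsigned char *p = data;
        uint32_t h = seed;

        while (len--)
            h = h * 31 + *p++;
        return h;
    }

    /* Multiply-and-shift scaling into [offset, offset + modulus). */
    static uint32_t scale(uint32_t h, uint32_t modulus, uint32_t offset)
    {
        return (uint32_t)(((uint64_t)h * modulus) >> 32) + offset;
    }

    int main(void)
    {
        const char *key = "192.0.2.1";
        uint32_t bucket = scale(toy_hash(key, strlen(key), 0xdeadbeef), 4, 100);

        /* The kernel would now look 'bucket' up in the map and copy the
         * element data into the destination register. */
        printf("bucket %u\n", bucket);	/* one of 100..103 */
        return 0;
    }
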
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index aa87ff8beae8..15adf8ca82c3 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -17,12 +17,6 @@
17#include <net/netfilter/nf_tables_core.h> 17#include <net/netfilter/nf_tables_core.h>
18#include <net/netfilter/nf_tables.h> 18#include <net/netfilter/nf_tables.h>
19 19
20struct nft_immediate_expr {
21 struct nft_data data;
22 enum nft_registers dreg:8;
23 u8 dlen;
24};
25
26static void nft_immediate_eval(const struct nft_expr *expr, 20static void nft_immediate_eval(const struct nft_expr *expr,
27 struct nft_regs *regs, 21 struct nft_regs *regs,
28 const struct nft_pktinfo *pkt) 22 const struct nft_pktinfo *pkt)
@@ -101,12 +95,27 @@ nla_put_failure:
101 95
102static int nft_immediate_validate(const struct nft_ctx *ctx, 96static int nft_immediate_validate(const struct nft_ctx *ctx,
103 const struct nft_expr *expr, 97 const struct nft_expr *expr,
104 const struct nft_data **data) 98 const struct nft_data **d)
105{ 99{
106 const struct nft_immediate_expr *priv = nft_expr_priv(expr); 100 const struct nft_immediate_expr *priv = nft_expr_priv(expr);
101 const struct nft_data *data;
102 int err;
107 103
108 if (priv->dreg == NFT_REG_VERDICT) 104 if (priv->dreg != NFT_REG_VERDICT)
109 *data = &priv->data; 105 return 0;
106
107 data = &priv->data;
108
109 switch (data->verdict.code) {
110 case NFT_JUMP:
111 case NFT_GOTO:
112 err = nft_chain_validate(ctx, data->verdict.chain);
113 if (err < 0)
114 return err;
115 break;
116 default:
117 break;
118 }
110 119
111 return 0; 120 return 0;
112} 121}
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index a27be36dc0af..7eef1cffbf1b 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -9,12 +9,15 @@
9 * Development of this code funded by Astaro AG (http://www.astaro.com/) 9 * Development of this code funded by Astaro AG (http://www.astaro.com/)
10 */ 10 */
11 11
12#include <linux/audit.h>
12#include <linux/kernel.h> 13#include <linux/kernel.h>
13#include <linux/init.h> 14#include <linux/init.h>
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/netlink.h> 16#include <linux/netlink.h>
16#include <linux/netfilter.h> 17#include <linux/netfilter.h>
17#include <linux/netfilter/nf_tables.h> 18#include <linux/netfilter/nf_tables.h>
19#include <net/ipv6.h>
20#include <net/ip.h>
18#include <net/netfilter/nf_tables.h> 21#include <net/netfilter/nf_tables.h>
19#include <net/netfilter/nf_log.h> 22#include <net/netfilter/nf_log.h>
20#include <linux/netdevice.h> 23#include <linux/netdevice.h>
@@ -26,12 +29,93 @@ struct nft_log {
26 char *prefix; 29 char *prefix;
27}; 30};
28 31
32static bool audit_ip4(struct audit_buffer *ab, struct sk_buff *skb)
33{
34 struct iphdr _iph;
35 const struct iphdr *ih;
36
37 ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_iph), &_iph);
38 if (!ih)
39 return false;
40
41 audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu",
42 &ih->saddr, &ih->daddr, ih->protocol);
43
44 return true;
45}
46
47static bool audit_ip6(struct audit_buffer *ab, struct sk_buff *skb)
48{
49 struct ipv6hdr _ip6h;
50 const struct ipv6hdr *ih;
51 u8 nexthdr;
52 __be16 frag_off;
53
54 ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h);
55 if (!ih)
56 return false;
57
58 nexthdr = ih->nexthdr;
59 ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), &nexthdr, &frag_off);
60
61 audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu",
62 &ih->saddr, &ih->daddr, nexthdr);
63
64 return true;
65}
66
67static void nft_log_eval_audit(const struct nft_pktinfo *pkt)
68{
69 struct sk_buff *skb = pkt->skb;
70 struct audit_buffer *ab;
71 int fam = -1;
72
73 if (!audit_enabled)
74 return;
75
76 ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT);
77 if (!ab)
78 return;
79
80 audit_log_format(ab, "mark=%#x", skb->mark);
81
82 switch (nft_pf(pkt)) {
83 case NFPROTO_BRIDGE:
84 switch (eth_hdr(skb)->h_proto) {
85 case htons(ETH_P_IP):
86 fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1;
87 break;
88 case htons(ETH_P_IPV6):
89 fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1;
90 break;
91 }
92 break;
93 case NFPROTO_IPV4:
94 fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1;
95 break;
96 case NFPROTO_IPV6:
97 fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1;
98 break;
99 }
100
101 if (fam == -1)
102 audit_log_format(ab, " saddr=? daddr=? proto=-1");
103
104 audit_log_end(ab);
105}
106
29static void nft_log_eval(const struct nft_expr *expr, 107static void nft_log_eval(const struct nft_expr *expr,
30 struct nft_regs *regs, 108 struct nft_regs *regs,
31 const struct nft_pktinfo *pkt) 109 const struct nft_pktinfo *pkt)
32{ 110{
33 const struct nft_log *priv = nft_expr_priv(expr); 111 const struct nft_log *priv = nft_expr_priv(expr);
34 112
113 if (priv->loginfo.type == NF_LOG_TYPE_LOG &&
114 priv->loginfo.u.log.level == LOGLEVEL_AUDIT) {
115 nft_log_eval_audit(pkt);
116 return;
117 }
118
35 nf_log_packet(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb, 119 nf_log_packet(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb,
36 nft_in(pkt), nft_out(pkt), &priv->loginfo, "%s", 120 nft_in(pkt), nft_out(pkt), &priv->loginfo, "%s",
37 priv->prefix); 121 priv->prefix);
@@ -84,7 +168,7 @@ static int nft_log_init(const struct nft_ctx *ctx,
84 } else { 168 } else {
85 li->u.log.level = LOGLEVEL_WARNING; 169 li->u.log.level = LOGLEVEL_WARNING;
86 } 170 }
87 if (li->u.log.level > LOGLEVEL_DEBUG) { 171 if (li->u.log.level > LOGLEVEL_AUDIT) {
88 err = -EINVAL; 172 err = -EINVAL;
89 goto err1; 173 goto err1;
90 } 174 }
@@ -112,6 +196,9 @@ static int nft_log_init(const struct nft_ctx *ctx,
112 break; 196 break;
113 } 197 }
114 198
199 if (li->u.log.level == LOGLEVEL_AUDIT)
200 return 0;
201
115 err = nf_logger_find_get(ctx->family, li->type); 202 err = nf_logger_find_get(ctx->family, li->type);
116 if (err < 0) 203 if (err < 0)
117 goto err1; 204 goto err1;
@@ -133,6 +220,9 @@ static void nft_log_destroy(const struct nft_ctx *ctx,
133 if (priv->prefix != nft_log_null_prefix) 220 if (priv->prefix != nft_log_null_prefix)
134 kfree(priv->prefix); 221 kfree(priv->prefix);
135 222
223 if (li->u.log.level == LOGLEVEL_AUDIT)
224 return;
225
136 nf_logger_put(ctx->family, li->type); 226 nf_logger_put(ctx->family, li->type);
137} 227}
138 228
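
[Editor note, not part of the diff] When the log level is LOGLEVEL_AUDIT, nft_log now emits an audit record instead of calling a logger: it tries the per-family header formatter and falls back to placeholder fields if no usable network header is found. A hedged sketch of that dispatch only (the bridge case and the real audit buffer are omitted; all names are illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    enum { FAM_NONE = -1, FAM_IPV4 = 2, FAM_IPV6 = 10 };

    static bool audit_ip4(const char *pkt)
    {
        if (!pkt)
            return false;	/* IPv4 header could not be read */
        printf(" saddr=<v4> daddr=<v4> proto=<n>");
        return true;
    }

    static bool audit_ip6(const char *pkt)
    {
        if (!pkt)
            return false;	/* IPv6 header could not be read */
        printf(" saddr=<v6> daddr=<v6> proto=<n>");
        return true;
    }

    /* Pick the per-family formatter, fall back to placeholders otherwise. */
    static void audit_packet(int family, const char *pkt)
    {
        int fam = FAM_NONE;

        switch (family) {
        case FAM_IPV4:
            fam = audit_ip4(pkt) ? FAM_IPV4 : FAM_NONE;
            break;
        case FAM_IPV6:
            fam = audit_ip6(pkt) ? FAM_IPV6 : FAM_NONE;
            break;
        }

        if (fam == FAM_NONE)
            printf(" saddr=? daddr=? proto=-1");
        printf("\n");
    }

    int main(void)
    {
        audit_packet(FAM_IPV4, "v4 packet");
        audit_packet(FAM_IPV6, NULL);	/* falls back to placeholders */
        return 0;
    }
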
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index f52da5e2199f..42e6fadf1417 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -149,6 +149,52 @@ nla_put_failure:
149 return -1; 149 return -1;
150} 150}
151 151
152static int nft_lookup_validate_setelem(const struct nft_ctx *ctx,
153 struct nft_set *set,
154 const struct nft_set_iter *iter,
155 struct nft_set_elem *elem)
156{
157 const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
158 const struct nft_data *data;
159
160 if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
161 *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
162 return 0;
163
164 data = nft_set_ext_data(ext);
165 switch (data->verdict.code) {
166 case NFT_JUMP:
167 case NFT_GOTO:
168 return nft_chain_validate(ctx, data->verdict.chain);
169 default:
170 return 0;
171 }
172}
173
174static int nft_lookup_validate(const struct nft_ctx *ctx,
175 const struct nft_expr *expr,
176 const struct nft_data **d)
177{
178 const struct nft_lookup *priv = nft_expr_priv(expr);
179 struct nft_set_iter iter;
180
181 if (!(priv->set->flags & NFT_SET_MAP) ||
182 priv->set->dtype != NFT_DATA_VERDICT)
183 return 0;
184
185 iter.genmask = nft_genmask_next(ctx->net);
186 iter.skip = 0;
187 iter.count = 0;
188 iter.err = 0;
189 iter.fn = nft_lookup_validate_setelem;
190
191 priv->set->ops->walk(ctx, priv->set, &iter);
192 if (iter.err < 0)
193 return iter.err;
194
195 return 0;
196}
197
152static const struct nft_expr_ops nft_lookup_ops = { 198static const struct nft_expr_ops nft_lookup_ops = {
153 .type = &nft_lookup_type, 199 .type = &nft_lookup_type,
154 .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)), 200 .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
@@ -156,6 +202,7 @@ static const struct nft_expr_ops nft_lookup_ops = {
156 .init = nft_lookup_init, 202 .init = nft_lookup_init,
157 .destroy = nft_lookup_destroy, 203 .destroy = nft_lookup_destroy,
158 .dump = nft_lookup_dump, 204 .dump = nft_lookup_dump,
205 .validate = nft_lookup_validate,
159}; 206};
160 207
161struct nft_expr_type nft_lookup_type __read_mostly = { 208struct nft_expr_type nft_lookup_type __read_mostly = {
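
[Editor note, not part of the diff] nft_lookup_validate() above walks a verdict map and, for every element whose data is a jump or goto, validates the destination chain; non-verdict maps are skipped entirely. A minimal sketch of that walk under simplified types; chain_validate() and the element layout are illustrative stand-ins, not the kernel's set walker:

    #include <stdio.h>

    enum { CODE_ACCEPT, CODE_JUMP, CODE_GOTO };

    struct elem {
        int code;
        const char *chain;
    };

    static int chain_validate(const char *chain)
    {
        return chain ? 0 : -1;	/* stand-in for nft_chain_validate() */
    }

    /* Only jump/goto elements need their target chain validated. */
    static int map_validate(const struct elem *elems, int n)
    {
        for (int i = 0; i < n; i++) {
            if (elems[i].code != CODE_JUMP && elems[i].code != CODE_GOTO)
                continue;
            if (chain_validate(elems[i].chain) < 0)
                return -1;
        }
        return 0;
    }

    int main(void)
    {
        struct elem map[] = {
            { CODE_ACCEPT, NULL },
            { CODE_JUMP, "forward-chain" },
        };

        printf("%d\n", map_validate(map, 2));	/* 0: both elements pass */
        return 0;
    }
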
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 204af9899482..1105a23bda5e 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -1,5 +1,7 @@
1/* 1/*
2 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> 2 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
3 * Copyright (c) 2014 Intel Corporation
4 * Author: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>
3 * 5 *
4 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 7 * it under the terms of the GNU General Public License version 2 as
@@ -9,8 +11,6 @@
9 */ 11 */
10 12
11#include <linux/kernel.h> 13#include <linux/kernel.h>
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/netlink.h> 14#include <linux/netlink.h>
15#include <linux/netfilter.h> 15#include <linux/netfilter.h>
16#include <linux/netfilter/nf_tables.h> 16#include <linux/netfilter/nf_tables.h>
@@ -24,21 +24,35 @@
24#include <net/tcp_states.h> /* for TCP_TIME_WAIT */ 24#include <net/tcp_states.h> /* for TCP_TIME_WAIT */
25#include <net/netfilter/nf_tables.h> 25#include <net/netfilter/nf_tables.h>
26#include <net/netfilter/nf_tables_core.h> 26#include <net/netfilter/nf_tables_core.h>
27#include <net/netfilter/nft_meta.h>
28 27
29#include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */ 28#include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
30 29
30struct nft_meta {
31 enum nft_meta_keys key:8;
32 union {
33 enum nft_registers dreg:8;
34 enum nft_registers sreg:8;
35 };
36};
37
31static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); 38static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
32 39
33void nft_meta_get_eval(const struct nft_expr *expr, 40#ifdef CONFIG_NF_TABLES_BRIDGE
34 struct nft_regs *regs, 41#include "../bridge/br_private.h"
35 const struct nft_pktinfo *pkt) 42#endif
43
44static void nft_meta_get_eval(const struct nft_expr *expr,
45 struct nft_regs *regs,
46 const struct nft_pktinfo *pkt)
36{ 47{
37 const struct nft_meta *priv = nft_expr_priv(expr); 48 const struct nft_meta *priv = nft_expr_priv(expr);
38 const struct sk_buff *skb = pkt->skb; 49 const struct sk_buff *skb = pkt->skb;
39 const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); 50 const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
40 struct sock *sk; 51 struct sock *sk;
41 u32 *dest = &regs->data[priv->dreg]; 52 u32 *dest = &regs->data[priv->dreg];
53#ifdef CONFIG_NF_TABLES_BRIDGE
54 const struct net_bridge_port *p;
55#endif
42 56
43 switch (priv->key) { 57 switch (priv->key) {
44 case NFT_META_LEN: 58 case NFT_META_LEN:
@@ -215,6 +229,18 @@ void nft_meta_get_eval(const struct nft_expr *expr,
215 nft_reg_store8(dest, !!skb->sp); 229 nft_reg_store8(dest, !!skb->sp);
216 break; 230 break;
217#endif 231#endif
232#ifdef CONFIG_NF_TABLES_BRIDGE
233 case NFT_META_BRI_IIFNAME:
234 if (in == NULL || (p = br_port_get_rcu(in)) == NULL)
235 goto err;
236 strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
237 return;
238 case NFT_META_BRI_OIFNAME:
239 if (out == NULL || (p = br_port_get_rcu(out)) == NULL)
240 goto err;
241 strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
242 return;
243#endif
218 default: 244 default:
219 WARN_ON(1); 245 WARN_ON(1);
220 goto err; 246 goto err;
@@ -224,11 +250,10 @@ void nft_meta_get_eval(const struct nft_expr *expr,
224err: 250err:
225 regs->verdict.code = NFT_BREAK; 251 regs->verdict.code = NFT_BREAK;
226} 252}
227EXPORT_SYMBOL_GPL(nft_meta_get_eval);
228 253
229void nft_meta_set_eval(const struct nft_expr *expr, 254static void nft_meta_set_eval(const struct nft_expr *expr,
230 struct nft_regs *regs, 255 struct nft_regs *regs,
231 const struct nft_pktinfo *pkt) 256 const struct nft_pktinfo *pkt)
232{ 257{
233 const struct nft_meta *meta = nft_expr_priv(expr); 258 const struct nft_meta *meta = nft_expr_priv(expr);
234 struct sk_buff *skb = pkt->skb; 259 struct sk_buff *skb = pkt->skb;
@@ -260,18 +285,16 @@ void nft_meta_set_eval(const struct nft_expr *expr,
260 WARN_ON(1); 285 WARN_ON(1);
261 } 286 }
262} 287}
263EXPORT_SYMBOL_GPL(nft_meta_set_eval);
264 288
265const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { 289static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
266 [NFTA_META_DREG] = { .type = NLA_U32 }, 290 [NFTA_META_DREG] = { .type = NLA_U32 },
267 [NFTA_META_KEY] = { .type = NLA_U32 }, 291 [NFTA_META_KEY] = { .type = NLA_U32 },
268 [NFTA_META_SREG] = { .type = NLA_U32 }, 292 [NFTA_META_SREG] = { .type = NLA_U32 },
269}; 293};
270EXPORT_SYMBOL_GPL(nft_meta_policy);
271 294
272int nft_meta_get_init(const struct nft_ctx *ctx, 295static int nft_meta_get_init(const struct nft_ctx *ctx,
273 const struct nft_expr *expr, 296 const struct nft_expr *expr,
274 const struct nlattr * const tb[]) 297 const struct nlattr * const tb[])
275{ 298{
276 struct nft_meta *priv = nft_expr_priv(expr); 299 struct nft_meta *priv = nft_expr_priv(expr);
277 unsigned int len; 300 unsigned int len;
@@ -320,6 +343,14 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
320 len = sizeof(u8); 343 len = sizeof(u8);
321 break; 344 break;
322#endif 345#endif
346#ifdef CONFIG_NF_TABLES_BRIDGE
347 case NFT_META_BRI_IIFNAME:
348 case NFT_META_BRI_OIFNAME:
349 if (ctx->family != NFPROTO_BRIDGE)
350 return -EOPNOTSUPP;
351 len = IFNAMSIZ;
352 break;
353#endif
323 default: 354 default:
324 return -EOPNOTSUPP; 355 return -EOPNOTSUPP;
325 } 356 }
@@ -328,7 +359,6 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
328 return nft_validate_register_store(ctx, priv->dreg, NULL, 359 return nft_validate_register_store(ctx, priv->dreg, NULL,
329 NFT_DATA_VALUE, len); 360 NFT_DATA_VALUE, len);
330} 361}
331EXPORT_SYMBOL_GPL(nft_meta_get_init);
332 362
333static int nft_meta_get_validate(const struct nft_ctx *ctx, 363static int nft_meta_get_validate(const struct nft_ctx *ctx,
334 const struct nft_expr *expr, 364 const struct nft_expr *expr,
@@ -362,9 +392,9 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx,
362#endif 392#endif
363} 393}
364 394
365int nft_meta_set_validate(const struct nft_ctx *ctx, 395static int nft_meta_set_validate(const struct nft_ctx *ctx,
366 const struct nft_expr *expr, 396 const struct nft_expr *expr,
367 const struct nft_data **data) 397 const struct nft_data **data)
368{ 398{
369 struct nft_meta *priv = nft_expr_priv(expr); 399 struct nft_meta *priv = nft_expr_priv(expr);
370 unsigned int hooks; 400 unsigned int hooks;
@@ -390,11 +420,10 @@ int nft_meta_set_validate(const struct nft_ctx *ctx,
390 420
391 return nft_chain_validate_hooks(ctx->chain, hooks); 421 return nft_chain_validate_hooks(ctx->chain, hooks);
392} 422}
393EXPORT_SYMBOL_GPL(nft_meta_set_validate);
394 423
395int nft_meta_set_init(const struct nft_ctx *ctx, 424static int nft_meta_set_init(const struct nft_ctx *ctx,
396 const struct nft_expr *expr, 425 const struct nft_expr *expr,
397 const struct nlattr * const tb[]) 426 const struct nlattr * const tb[])
398{ 427{
399 struct nft_meta *priv = nft_expr_priv(expr); 428 struct nft_meta *priv = nft_expr_priv(expr);
400 unsigned int len; 429 unsigned int len;
@@ -426,10 +455,9 @@ int nft_meta_set_init(const struct nft_ctx *ctx,
426 455
427 return 0; 456 return 0;
428} 457}
429EXPORT_SYMBOL_GPL(nft_meta_set_init);
430 458
431int nft_meta_get_dump(struct sk_buff *skb, 459static int nft_meta_get_dump(struct sk_buff *skb,
432 const struct nft_expr *expr) 460 const struct nft_expr *expr)
433{ 461{
434 const struct nft_meta *priv = nft_expr_priv(expr); 462 const struct nft_meta *priv = nft_expr_priv(expr);
435 463
@@ -442,10 +470,8 @@ int nft_meta_get_dump(struct sk_buff *skb,
442nla_put_failure: 470nla_put_failure:
443 return -1; 471 return -1;
444} 472}
445EXPORT_SYMBOL_GPL(nft_meta_get_dump);
446 473
447int nft_meta_set_dump(struct sk_buff *skb, 474static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
448 const struct nft_expr *expr)
449{ 475{
450 const struct nft_meta *priv = nft_expr_priv(expr); 476 const struct nft_meta *priv = nft_expr_priv(expr);
451 477
@@ -459,19 +485,16 @@ int nft_meta_set_dump(struct sk_buff *skb,
459nla_put_failure: 485nla_put_failure:
460 return -1; 486 return -1;
461} 487}
462EXPORT_SYMBOL_GPL(nft_meta_set_dump);
463 488
464void nft_meta_set_destroy(const struct nft_ctx *ctx, 489static void nft_meta_set_destroy(const struct nft_ctx *ctx,
465 const struct nft_expr *expr) 490 const struct nft_expr *expr)
466{ 491{
467 const struct nft_meta *priv = nft_expr_priv(expr); 492 const struct nft_meta *priv = nft_expr_priv(expr);
468 493
469 if (priv->key == NFT_META_NFTRACE) 494 if (priv->key == NFT_META_NFTRACE)
470 static_branch_dec(&nft_trace_enabled); 495 static_branch_dec(&nft_trace_enabled);
471} 496}
472EXPORT_SYMBOL_GPL(nft_meta_set_destroy);
473 497
474static struct nft_expr_type nft_meta_type;
475static const struct nft_expr_ops nft_meta_get_ops = { 498static const struct nft_expr_ops nft_meta_get_ops = {
476 .type = &nft_meta_type, 499 .type = &nft_meta_type,
477 .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), 500 .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
@@ -510,27 +533,10 @@ nft_meta_select_ops(const struct nft_ctx *ctx,
510 return ERR_PTR(-EINVAL); 533 return ERR_PTR(-EINVAL);
511} 534}
512 535
513static struct nft_expr_type nft_meta_type __read_mostly = { 536struct nft_expr_type nft_meta_type __read_mostly = {
514 .name = "meta", 537 .name = "meta",
515 .select_ops = nft_meta_select_ops, 538 .select_ops = nft_meta_select_ops,
516 .policy = nft_meta_policy, 539 .policy = nft_meta_policy,
517 .maxattr = NFTA_META_MAX, 540 .maxattr = NFTA_META_MAX,
518 .owner = THIS_MODULE, 541 .owner = THIS_MODULE,
519}; 542};
520
521static int __init nft_meta_module_init(void)
522{
523 return nft_register_expr(&nft_meta_type);
524}
525
526static void __exit nft_meta_module_exit(void)
527{
528 nft_unregister_expr(&nft_meta_type);
529}
530
531module_init(nft_meta_module_init);
532module_exit(nft_meta_module_exit);
533
534MODULE_LICENSE("GPL");
535MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
536MODULE_ALIAS_NFT_EXPR("meta");
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index 1f36954c2ba9..c15807d10b91 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -43,7 +43,7 @@ static void nft_nat_eval(const struct nft_expr *expr,
43 const struct nft_nat *priv = nft_expr_priv(expr); 43 const struct nft_nat *priv = nft_expr_priv(expr);
44 enum ip_conntrack_info ctinfo; 44 enum ip_conntrack_info ctinfo;
45 struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo); 45 struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
46 struct nf_nat_range range; 46 struct nf_nat_range2 range;
47 47
48 memset(&range, 0, sizeof(range)); 48 memset(&range, 0, sizeof(range));
49 if (priv->sreg_addr_min) { 49 if (priv->sreg_addr_min) {
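
The only change needed here is the type: struct nf_nat_range2 extends the old range with a trailing base_proto member, and because nft_nat_eval() zeroes the whole structure before filling it, the new field is initialised for free. Sketch of the idiom (kernel types assumed, fragment only):

	struct nf_nat_range2 range;	/* was: struct nf_nat_range */

	memset(&range, 0, sizeof(range));	/* also clears the new base_proto */
	/* ... fill flags/min_addr/max_addr exactly as before ... */
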
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 5a3a52c71545..1f4d0854cf70 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -24,13 +24,11 @@ struct nft_ng_inc {
24 u32 modulus; 24 u32 modulus;
25 atomic_t counter; 25 atomic_t counter;
26 u32 offset; 26 u32 offset;
27 struct nft_set *map;
27}; 28};
28 29
29static void nft_ng_inc_eval(const struct nft_expr *expr, 30static u32 nft_ng_inc_gen(struct nft_ng_inc *priv)
30 struct nft_regs *regs,
31 const struct nft_pktinfo *pkt)
32{ 31{
33 struct nft_ng_inc *priv = nft_expr_priv(expr);
34 u32 nval, oval; 32 u32 nval, oval;
35 33
36 do { 34 do {
@@ -38,7 +36,36 @@ static void nft_ng_inc_eval(const struct nft_expr *expr,
38 nval = (oval + 1 < priv->modulus) ? oval + 1 : 0; 36 nval = (oval + 1 < priv->modulus) ? oval + 1 : 0;
39 } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval); 37 } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval);
40 38
41 regs->data[priv->dreg] = nval + priv->offset; 39 return nval + priv->offset;
40}
41
42static void nft_ng_inc_eval(const struct nft_expr *expr,
43 struct nft_regs *regs,
44 const struct nft_pktinfo *pkt)
45{
46 struct nft_ng_inc *priv = nft_expr_priv(expr);
47
48 regs->data[priv->dreg] = nft_ng_inc_gen(priv);
49}
50
51static void nft_ng_inc_map_eval(const struct nft_expr *expr,
52 struct nft_regs *regs,
53 const struct nft_pktinfo *pkt)
54{
55 struct nft_ng_inc *priv = nft_expr_priv(expr);
56 const struct nft_set *map = priv->map;
57 const struct nft_set_ext *ext;
58 u32 result;
59 bool found;
60
61 result = nft_ng_inc_gen(priv);
62 found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
63
64 if (!found)
65 return;
66
67 nft_data_copy(&regs->data[priv->dreg],
68 nft_set_ext_data(ext), map->dlen);
42} 69}
43 70
44static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = { 71static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
@@ -46,6 +73,9 @@ static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
46 [NFTA_NG_MODULUS] = { .type = NLA_U32 }, 73 [NFTA_NG_MODULUS] = { .type = NLA_U32 },
47 [NFTA_NG_TYPE] = { .type = NLA_U32 }, 74 [NFTA_NG_TYPE] = { .type = NLA_U32 },
48 [NFTA_NG_OFFSET] = { .type = NLA_U32 }, 75 [NFTA_NG_OFFSET] = { .type = NLA_U32 },
76 [NFTA_NG_SET_NAME] = { .type = NLA_STRING,
77 .len = NFT_SET_MAXNAMELEN - 1 },
78 [NFTA_NG_SET_ID] = { .type = NLA_U32 },
49}; 79};
50 80
51static int nft_ng_inc_init(const struct nft_ctx *ctx, 81static int nft_ng_inc_init(const struct nft_ctx *ctx,
@@ -71,6 +101,22 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
71 NFT_DATA_VALUE, sizeof(u32)); 101 NFT_DATA_VALUE, sizeof(u32));
72} 102}
73 103
104static int nft_ng_inc_map_init(const struct nft_ctx *ctx,
105 const struct nft_expr *expr,
106 const struct nlattr * const tb[])
107{
108 struct nft_ng_inc *priv = nft_expr_priv(expr);
109 u8 genmask = nft_genmask_next(ctx->net);
110
111 nft_ng_inc_init(ctx, expr, tb);
112
113 priv->map = nft_set_lookup_global(ctx->net, ctx->table,
114 tb[NFTA_NG_SET_NAME],
115 tb[NFTA_NG_SET_ID], genmask);
116
117 return PTR_ERR_OR_ZERO(priv->map);
118}
119
74static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, 120static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
75 u32 modulus, enum nft_ng_types type, u32 offset) 121 u32 modulus, enum nft_ng_types type, u32 offset)
76{ 122{
@@ -97,22 +143,63 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
97 priv->offset); 143 priv->offset);
98} 144}
99 145
146static int nft_ng_inc_map_dump(struct sk_buff *skb,
147 const struct nft_expr *expr)
148{
149 const struct nft_ng_inc *priv = nft_expr_priv(expr);
150
151 if (nft_ng_dump(skb, priv->dreg, priv->modulus,
152 NFT_NG_INCREMENTAL, priv->offset) ||
153 nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name))
154 goto nla_put_failure;
155
156 return 0;
157
158nla_put_failure:
159 return -1;
160}
161
100struct nft_ng_random { 162struct nft_ng_random {
101 enum nft_registers dreg:8; 163 enum nft_registers dreg:8;
102 u32 modulus; 164 u32 modulus;
103 u32 offset; 165 u32 offset;
166 struct nft_set *map;
104}; 167};
105 168
169static u32 nft_ng_random_gen(struct nft_ng_random *priv)
170{
171 struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
172
173 return reciprocal_scale(prandom_u32_state(state), priv->modulus) +
174 priv->offset;
175}
176
106static void nft_ng_random_eval(const struct nft_expr *expr, 177static void nft_ng_random_eval(const struct nft_expr *expr,
107 struct nft_regs *regs, 178 struct nft_regs *regs,
108 const struct nft_pktinfo *pkt) 179 const struct nft_pktinfo *pkt)
109{ 180{
110 struct nft_ng_random *priv = nft_expr_priv(expr); 181 struct nft_ng_random *priv = nft_expr_priv(expr);
111 struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
112 u32 val;
113 182
114 val = reciprocal_scale(prandom_u32_state(state), priv->modulus); 183 regs->data[priv->dreg] = nft_ng_random_gen(priv);
115 regs->data[priv->dreg] = val + priv->offset; 184}
185
186static void nft_ng_random_map_eval(const struct nft_expr *expr,
187 struct nft_regs *regs,
188 const struct nft_pktinfo *pkt)
189{
190 struct nft_ng_random *priv = nft_expr_priv(expr);
191 const struct nft_set *map = priv->map;
192 const struct nft_set_ext *ext;
193 u32 result;
194 bool found;
195
196 result = nft_ng_random_gen(priv);
197 found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
198 if (!found)
199 return;
200
201 nft_data_copy(&regs->data[priv->dreg],
202 nft_set_ext_data(ext), map->dlen);
116} 203}
117 204
118static int nft_ng_random_init(const struct nft_ctx *ctx, 205static int nft_ng_random_init(const struct nft_ctx *ctx,
@@ -139,6 +226,23 @@ static int nft_ng_random_init(const struct nft_ctx *ctx,
139 NFT_DATA_VALUE, sizeof(u32)); 226 NFT_DATA_VALUE, sizeof(u32));
140} 227}
141 228
229static int nft_ng_random_map_init(const struct nft_ctx *ctx,
230 const struct nft_expr *expr,
231 const struct nlattr * const tb[])
232{
233 struct nft_ng_random *priv = nft_expr_priv(expr);
234 u8 genmask = nft_genmask_next(ctx->net);
235
236 nft_ng_random_init(ctx, expr, tb);
237 priv->map = nft_set_lookup_global(ctx->net, ctx->table,
238 tb[NFTA_NG_SET_NAME],
239 tb[NFTA_NG_SET_ID], genmask);
240 if (IS_ERR(priv->map))
241 return PTR_ERR(priv->map);
242
243 return 0;
244}
245
142static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) 246static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
143{ 247{
144 const struct nft_ng_random *priv = nft_expr_priv(expr); 248 const struct nft_ng_random *priv = nft_expr_priv(expr);
@@ -147,6 +251,22 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
147 priv->offset); 251 priv->offset);
148} 252}
149 253
254static int nft_ng_random_map_dump(struct sk_buff *skb,
255 const struct nft_expr *expr)
256{
257 const struct nft_ng_random *priv = nft_expr_priv(expr);
258
259 if (nft_ng_dump(skb, priv->dreg, priv->modulus,
260 NFT_NG_RANDOM, priv->offset) ||
261 nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name))
262 goto nla_put_failure;
263
264 return 0;
265
266nla_put_failure:
267 return -1;
268}
269
150static struct nft_expr_type nft_ng_type; 270static struct nft_expr_type nft_ng_type;
151static const struct nft_expr_ops nft_ng_inc_ops = { 271static const struct nft_expr_ops nft_ng_inc_ops = {
152 .type = &nft_ng_type, 272 .type = &nft_ng_type,
@@ -156,6 +276,14 @@ static const struct nft_expr_ops nft_ng_inc_ops = {
156 .dump = nft_ng_inc_dump, 276 .dump = nft_ng_inc_dump,
157}; 277};
158 278
279static const struct nft_expr_ops nft_ng_inc_map_ops = {
280 .type = &nft_ng_type,
281 .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)),
282 .eval = nft_ng_inc_map_eval,
283 .init = nft_ng_inc_map_init,
284 .dump = nft_ng_inc_map_dump,
285};
286
159static const struct nft_expr_ops nft_ng_random_ops = { 287static const struct nft_expr_ops nft_ng_random_ops = {
160 .type = &nft_ng_type, 288 .type = &nft_ng_type,
161 .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)), 289 .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)),
@@ -164,6 +292,14 @@ static const struct nft_expr_ops nft_ng_random_ops = {
164 .dump = nft_ng_random_dump, 292 .dump = nft_ng_random_dump,
165}; 293};
166 294
295static const struct nft_expr_ops nft_ng_random_map_ops = {
296 .type = &nft_ng_type,
297 .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)),
298 .eval = nft_ng_random_map_eval,
299 .init = nft_ng_random_map_init,
300 .dump = nft_ng_random_map_dump,
301};
302
167static const struct nft_expr_ops * 303static const struct nft_expr_ops *
168nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) 304nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
169{ 305{
@@ -178,8 +314,12 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
178 314
179 switch (type) { 315 switch (type) {
180 case NFT_NG_INCREMENTAL: 316 case NFT_NG_INCREMENTAL:
317 if (tb[NFTA_NG_SET_NAME])
318 return &nft_ng_inc_map_ops;
181 return &nft_ng_inc_ops; 319 return &nft_ng_inc_ops;
182 case NFT_NG_RANDOM: 320 case NFT_NG_RANDOM:
321 if (tb[NFTA_NG_SET_NAME])
322 return &nft_ng_random_map_ops;
183 return &nft_ng_random_ops; 323 return &nft_ng_random_ops;
184 } 324 }
185 325
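
The refactor splits number generation from register storage so the new *_map_eval() paths can reuse it: generate a value, look it up in the bound map, and copy the mapped data into the destination register, leaving the register untouched when no mapping exists. A small userspace model of the wrapping counter used by the incremental generator, illustrative only (the kernel version uses atomic_cmpxchg()):

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int counter;

static unsigned int ng_inc_gen(unsigned int modulus, unsigned int offset)
{
	unsigned int oval, nval;

	do {
		oval = atomic_load(&counter);
		nval = (oval + 1 < modulus) ? oval + 1 : 0;	/* wrap at modulus */
	} while (!atomic_compare_exchange_weak(&counter, &oval, nval));

	return nval + offset;
}

int main(void)
{
	for (int i = 0; i < 6; i++)
		printf("%u\n", ng_inc_gen(3, 100));	/* 101 102 100 101 102 100 */
	return 0;
}
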
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index 0b02407773ad..cdf348f751ec 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -38,8 +38,8 @@ static int nft_objref_init(const struct nft_ctx *ctx,
38 return -EINVAL; 38 return -EINVAL;
39 39
40 objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE])); 40 objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE]));
41 obj = nf_tables_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype, 41 obj = nft_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype,
42 genmask); 42 genmask);
43 if (IS_ERR(obj)) 43 if (IS_ERR(obj))
44 return -ENOENT; 44 return -ENOENT;
45 45
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index 11a2071b6dd4..76dba9f6b6f6 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -7,8 +7,6 @@
7 */ 7 */
8 8
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/module.h>
12#include <linux/netlink.h> 10#include <linux/netlink.h>
13#include <linux/netfilter.h> 11#include <linux/netfilter.h>
14#include <linux/netfilter/nf_tables.h> 12#include <linux/netfilter/nf_tables.h>
@@ -179,7 +177,6 @@ static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *exp
179 return nft_chain_validate_hooks(ctx->chain, hooks); 177 return nft_chain_validate_hooks(ctx->chain, hooks);
180} 178}
181 179
182static struct nft_expr_type nft_rt_type;
183static const struct nft_expr_ops nft_rt_get_ops = { 180static const struct nft_expr_ops nft_rt_get_ops = {
184 .type = &nft_rt_type, 181 .type = &nft_rt_type,
185 .size = NFT_EXPR_SIZE(sizeof(struct nft_rt)), 182 .size = NFT_EXPR_SIZE(sizeof(struct nft_rt)),
@@ -189,27 +186,10 @@ static const struct nft_expr_ops nft_rt_get_ops = {
189 .validate = nft_rt_validate, 186 .validate = nft_rt_validate,
190}; 187};
191 188
192static struct nft_expr_type nft_rt_type __read_mostly = { 189struct nft_expr_type nft_rt_type __read_mostly = {
193 .name = "rt", 190 .name = "rt",
194 .ops = &nft_rt_get_ops, 191 .ops = &nft_rt_get_ops,
195 .policy = nft_rt_policy, 192 .policy = nft_rt_policy,
196 .maxattr = NFTA_RT_MAX, 193 .maxattr = NFTA_RT_MAX,
197 .owner = THIS_MODULE, 194 .owner = THIS_MODULE,
198}; 195};
199
200static int __init nft_rt_module_init(void)
201{
202 return nft_register_expr(&nft_rt_type);
203}
204
205static void __exit nft_rt_module_exit(void)
206{
207 nft_unregister_expr(&nft_rt_type);
208}
209
210module_init(nft_rt_module_init);
211module_exit(nft_rt_module_exit);
212
213MODULE_LICENSE("GPL");
214MODULE_AUTHOR("Anders K. Pedersen <akp@cohaesio.com>");
215MODULE_ALIAS_NFT_EXPR("rt");
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 45fb2752fb63..d6626e01c7ee 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -296,27 +296,23 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
296 return true; 296 return true;
297} 297}
298 298
299static struct nft_set_type nft_bitmap_type;
300static struct nft_set_ops nft_bitmap_ops __read_mostly = {
301 .type = &nft_bitmap_type,
302 .privsize = nft_bitmap_privsize,
303 .elemsize = offsetof(struct nft_bitmap_elem, ext),
304 .estimate = nft_bitmap_estimate,
305 .init = nft_bitmap_init,
306 .destroy = nft_bitmap_destroy,
307 .insert = nft_bitmap_insert,
308 .remove = nft_bitmap_remove,
309 .deactivate = nft_bitmap_deactivate,
310 .flush = nft_bitmap_flush,
311 .activate = nft_bitmap_activate,
312 .lookup = nft_bitmap_lookup,
313 .walk = nft_bitmap_walk,
314 .get = nft_bitmap_get,
315};
316
317static struct nft_set_type nft_bitmap_type __read_mostly = { 299static struct nft_set_type nft_bitmap_type __read_mostly = {
318 .ops = &nft_bitmap_ops,
319 .owner = THIS_MODULE, 300 .owner = THIS_MODULE,
301 .ops = {
302 .privsize = nft_bitmap_privsize,
303 .elemsize = offsetof(struct nft_bitmap_elem, ext),
304 .estimate = nft_bitmap_estimate,
305 .init = nft_bitmap_init,
306 .destroy = nft_bitmap_destroy,
307 .insert = nft_bitmap_insert,
308 .remove = nft_bitmap_remove,
309 .deactivate = nft_bitmap_deactivate,
310 .flush = nft_bitmap_flush,
311 .activate = nft_bitmap_activate,
312 .lookup = nft_bitmap_lookup,
313 .walk = nft_bitmap_walk,
314 .get = nft_bitmap_get,
315 },
320}; 316};
321 317
322static int __init nft_bitmap_module_init(void) 318static int __init nft_bitmap_module_init(void)
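
The separately registered struct nft_set_ops is folded into the set type: each backend now describes itself with a single object, and a set instance can refer to the ops as &type->ops rather than through a second registration. Reduced to its shape (field names simplified, not the real structures):

struct my_set_ops  { int (*insert)(void *set, const void *elem); };
struct my_set_type { unsigned int features; struct my_set_ops ops; };

static int bitmap_insert(void *set, const void *elem) { return 0; }

static struct my_set_type bitmap_type = {
	.features = 0,
	.ops	  = { .insert = bitmap_insert },
};
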
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index fc9c6d5d64cd..6f9a1365a09f 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -311,8 +311,16 @@ static void nft_rhash_gc(struct work_struct *work)
311 continue; 311 continue;
312 } 312 }
313 313
314 if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPR)) {
315 struct nft_expr *expr = nft_set_ext_expr(&he->ext);
316
317 if (expr->ops->gc &&
318 expr->ops->gc(read_pnet(&set->net), expr))
319 goto gc;
320 }
314 if (!nft_set_elem_expired(&he->ext)) 321 if (!nft_set_elem_expired(&he->ext))
315 continue; 322 continue;
323gc:
316 if (nft_set_elem_mark_busy(&he->ext)) 324 if (nft_set_elem_mark_busy(&he->ext))
317 continue; 325 continue;
318 326
@@ -339,6 +347,14 @@ static unsigned int nft_rhash_privsize(const struct nlattr * const nla[],
339 return sizeof(struct nft_rhash); 347 return sizeof(struct nft_rhash);
340} 348}
341 349
350static void nft_rhash_gc_init(const struct nft_set *set)
351{
352 struct nft_rhash *priv = nft_set_priv(set);
353
354 queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
355 nft_set_gc_interval(set));
356}
357
342static int nft_rhash_init(const struct nft_set *set, 358static int nft_rhash_init(const struct nft_set *set,
343 const struct nft_set_desc *desc, 359 const struct nft_set_desc *desc,
344 const struct nlattr * const tb[]) 360 const struct nlattr * const tb[])
@@ -356,8 +372,8 @@ static int nft_rhash_init(const struct nft_set *set,
356 372
357 INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc); 373 INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc);
358 if (set->flags & NFT_SET_TIMEOUT) 374 if (set->flags & NFT_SET_TIMEOUT)
359 queue_delayed_work(system_power_efficient_wq, &priv->gc_work, 375 nft_rhash_gc_init(set);
360 nft_set_gc_interval(set)); 376
361 return 0; 377 return 0;
362} 378}
363 379
@@ -605,6 +621,12 @@ static void nft_hash_destroy(const struct nft_set *set)
605static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, 621static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
606 struct nft_set_estimate *est) 622 struct nft_set_estimate *est)
607{ 623{
624 if (!desc->size)
625 return false;
626
627 if (desc->klen == 4)
628 return false;
629
608 est->size = sizeof(struct nft_hash) + 630 est->size = sizeof(struct nft_hash) +
609 nft_hash_buckets(desc->size) * sizeof(struct hlist_head) + 631 nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
610 desc->size * sizeof(struct nft_hash_elem); 632 desc->size * sizeof(struct nft_hash_elem);
@@ -614,91 +636,101 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
614 return true; 636 return true;
615} 637}
616 638
617static struct nft_set_type nft_hash_type; 639static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features,
618static struct nft_set_ops nft_rhash_ops __read_mostly = { 640 struct nft_set_estimate *est)
619 .type = &nft_hash_type, 641{
620 .privsize = nft_rhash_privsize, 642 if (!desc->size)
621 .elemsize = offsetof(struct nft_rhash_elem, ext), 643 return false;
622 .estimate = nft_rhash_estimate,
623 .init = nft_rhash_init,
624 .destroy = nft_rhash_destroy,
625 .insert = nft_rhash_insert,
626 .activate = nft_rhash_activate,
627 .deactivate = nft_rhash_deactivate,
628 .flush = nft_rhash_flush,
629 .remove = nft_rhash_remove,
630 .lookup = nft_rhash_lookup,
631 .update = nft_rhash_update,
632 .walk = nft_rhash_walk,
633 .get = nft_rhash_get,
634 .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
635};
636
637static struct nft_set_ops nft_hash_ops __read_mostly = {
638 .type = &nft_hash_type,
639 .privsize = nft_hash_privsize,
640 .elemsize = offsetof(struct nft_hash_elem, ext),
641 .estimate = nft_hash_estimate,
642 .init = nft_hash_init,
643 .destroy = nft_hash_destroy,
644 .insert = nft_hash_insert,
645 .activate = nft_hash_activate,
646 .deactivate = nft_hash_deactivate,
647 .flush = nft_hash_flush,
648 .remove = nft_hash_remove,
649 .lookup = nft_hash_lookup,
650 .walk = nft_hash_walk,
651 .get = nft_hash_get,
652 .features = NFT_SET_MAP | NFT_SET_OBJECT,
653};
654 644
655static struct nft_set_ops nft_hash_fast_ops __read_mostly = { 645 if (desc->klen != 4)
656 .type = &nft_hash_type, 646 return false;
657 .privsize = nft_hash_privsize,
658 .elemsize = offsetof(struct nft_hash_elem, ext),
659 .estimate = nft_hash_estimate,
660 .init = nft_hash_init,
661 .destroy = nft_hash_destroy,
662 .insert = nft_hash_insert,
663 .activate = nft_hash_activate,
664 .deactivate = nft_hash_deactivate,
665 .flush = nft_hash_flush,
666 .remove = nft_hash_remove,
667 .lookup = nft_hash_lookup_fast,
668 .walk = nft_hash_walk,
669 .get = nft_hash_get,
670 .features = NFT_SET_MAP | NFT_SET_OBJECT,
671};
672 647
673static const struct nft_set_ops * 648 est->size = sizeof(struct nft_hash) +
674nft_hash_select_ops(const struct nft_ctx *ctx, const struct nft_set_desc *desc, 649 nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
675 u32 flags) 650 desc->size * sizeof(struct nft_hash_elem);
676{ 651 est->lookup = NFT_SET_CLASS_O_1;
677 if (desc->size && !(flags & (NFT_SET_EVAL | NFT_SET_TIMEOUT))) { 652 est->space = NFT_SET_CLASS_O_N;
678 switch (desc->klen) {
679 case 4:
680 return &nft_hash_fast_ops;
681 default:
682 return &nft_hash_ops;
683 }
684 }
685 653
686 return &nft_rhash_ops; 654 return true;
687} 655}
688 656
657static struct nft_set_type nft_rhash_type __read_mostly = {
658 .owner = THIS_MODULE,
659 .features = NFT_SET_MAP | NFT_SET_OBJECT |
660 NFT_SET_TIMEOUT | NFT_SET_EVAL,
661 .ops = {
662 .privsize = nft_rhash_privsize,
663 .elemsize = offsetof(struct nft_rhash_elem, ext),
664 .estimate = nft_rhash_estimate,
665 .init = nft_rhash_init,
666 .gc_init = nft_rhash_gc_init,
667 .destroy = nft_rhash_destroy,
668 .insert = nft_rhash_insert,
669 .activate = nft_rhash_activate,
670 .deactivate = nft_rhash_deactivate,
671 .flush = nft_rhash_flush,
672 .remove = nft_rhash_remove,
673 .lookup = nft_rhash_lookup,
674 .update = nft_rhash_update,
675 .walk = nft_rhash_walk,
676 .get = nft_rhash_get,
677 },
678};
679
689static struct nft_set_type nft_hash_type __read_mostly = { 680static struct nft_set_type nft_hash_type __read_mostly = {
690 .select_ops = nft_hash_select_ops,
691 .owner = THIS_MODULE, 681 .owner = THIS_MODULE,
682 .features = NFT_SET_MAP | NFT_SET_OBJECT,
683 .ops = {
684 .privsize = nft_hash_privsize,
685 .elemsize = offsetof(struct nft_hash_elem, ext),
686 .estimate = nft_hash_estimate,
687 .init = nft_hash_init,
688 .destroy = nft_hash_destroy,
689 .insert = nft_hash_insert,
690 .activate = nft_hash_activate,
691 .deactivate = nft_hash_deactivate,
692 .flush = nft_hash_flush,
693 .remove = nft_hash_remove,
694 .lookup = nft_hash_lookup,
695 .walk = nft_hash_walk,
696 .get = nft_hash_get,
697 },
698};
699
700static struct nft_set_type nft_hash_fast_type __read_mostly = {
701 .owner = THIS_MODULE,
702 .features = NFT_SET_MAP | NFT_SET_OBJECT,
703 .ops = {
704 .privsize = nft_hash_privsize,
705 .elemsize = offsetof(struct nft_hash_elem, ext),
706 .estimate = nft_hash_fast_estimate,
707 .init = nft_hash_init,
708 .destroy = nft_hash_destroy,
709 .insert = nft_hash_insert,
710 .activate = nft_hash_activate,
711 .deactivate = nft_hash_deactivate,
712 .flush = nft_hash_flush,
713 .remove = nft_hash_remove,
714 .lookup = nft_hash_lookup_fast,
715 .walk = nft_hash_walk,
716 .get = nft_hash_get,
717 },
692}; 718};
693 719
694static int __init nft_hash_module_init(void) 720static int __init nft_hash_module_init(void)
695{ 721{
696 return nft_register_set(&nft_hash_type); 722 if (nft_register_set(&nft_hash_fast_type) ||
723 nft_register_set(&nft_hash_type) ||
724 nft_register_set(&nft_rhash_type))
725 return 1;
726 return 0;
697} 727}
698 728
699static void __exit nft_hash_module_exit(void) 729static void __exit nft_hash_module_exit(void)
700{ 730{
731 nft_unregister_set(&nft_rhash_type);
701 nft_unregister_set(&nft_hash_type); 732 nft_unregister_set(&nft_hash_type);
733 nft_unregister_set(&nft_hash_fast_type);
702} 734}
703 735
704module_init(nft_hash_module_init); 736module_init(nft_hash_module_init);
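
Backend choice also moves out of the per-type select_ops() callback and into the estimate functions: nft_hash_fast_estimate() only accepts fixed 4-byte keys, nft_hash_estimate() rejects them, and rhash remains the fallback for unsized or timeout/eval sets since only its type advertises those features. A simplified model of estimate-driven selection (types and cost figures are placeholders, not the real core logic):

struct set_desc { unsigned int klen, size; };
struct set_est  { unsigned int size; };

static int hash_fast_estimate(const struct set_desc *d, struct set_est *e)
{
	if (!d->size || d->klen != 4)
		return 0;			/* only fixed 4-byte keys */
	e->size = d->size * 16;			/* placeholder cost */
	return 1;
}

static int hash_estimate(const struct set_desc *d, struct set_est *e)
{
	if (!d->size || d->klen == 4)
		return 0;			/* 4-byte keys go to the fast variant */
	e->size = d->size * 24;			/* placeholder cost */
	return 1;
}
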
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index e6f08bc5f359..d260ce2d6671 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -22,6 +22,7 @@ struct nft_rbtree {
22 struct rb_root root; 22 struct rb_root root;
23 rwlock_t lock; 23 rwlock_t lock;
24 seqcount_t count; 24 seqcount_t count;
25 struct delayed_work gc_work;
25}; 26};
26 27
27struct nft_rbtree_elem { 28struct nft_rbtree_elem {
@@ -265,6 +266,7 @@ static void nft_rbtree_activate(const struct net *net,
265 struct nft_rbtree_elem *rbe = elem->priv; 266 struct nft_rbtree_elem *rbe = elem->priv;
266 267
267 nft_set_elem_change_active(net, set, &rbe->ext); 268 nft_set_elem_change_active(net, set, &rbe->ext);
269 nft_set_elem_clear_busy(&rbe->ext);
268} 270}
269 271
270static bool nft_rbtree_flush(const struct net *net, 272static bool nft_rbtree_flush(const struct net *net,
@@ -272,8 +274,12 @@ static bool nft_rbtree_flush(const struct net *net,
272{ 274{
273 struct nft_rbtree_elem *rbe = priv; 275 struct nft_rbtree_elem *rbe = priv;
274 276
275 nft_set_elem_change_active(net, set, &rbe->ext); 277 if (!nft_set_elem_mark_busy(&rbe->ext) ||
276 return true; 278 !nft_is_active(net, &rbe->ext)) {
279 nft_set_elem_change_active(net, set, &rbe->ext);
280 return true;
281 }
282 return false;
277} 283}
278 284
279static void *nft_rbtree_deactivate(const struct net *net, 285static void *nft_rbtree_deactivate(const struct net *net,
@@ -347,6 +353,62 @@ cont:
347 read_unlock_bh(&priv->lock); 353 read_unlock_bh(&priv->lock);
348} 354}
349 355
356static void nft_rbtree_gc(struct work_struct *work)
357{
358 struct nft_set_gc_batch *gcb = NULL;
359 struct rb_node *node, *prev = NULL;
360 struct nft_rbtree_elem *rbe;
361 struct nft_rbtree *priv;
362 struct nft_set *set;
363 int i;
364
365 priv = container_of(work, struct nft_rbtree, gc_work.work);
366 set = nft_set_container_of(priv);
367
368 write_lock_bh(&priv->lock);
369 write_seqcount_begin(&priv->count);
370 for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
371 rbe = rb_entry(node, struct nft_rbtree_elem, node);
372
373 if (nft_rbtree_interval_end(rbe)) {
374 prev = node;
375 continue;
376 }
377 if (!nft_set_elem_expired(&rbe->ext))
378 continue;
379 if (nft_set_elem_mark_busy(&rbe->ext))
380 continue;
381
382 gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
383 if (!gcb)
384 goto out;
385
386 atomic_dec(&set->nelems);
387 nft_set_gc_batch_add(gcb, rbe);
388
389 if (prev) {
390 rbe = rb_entry(prev, struct nft_rbtree_elem, node);
391 atomic_dec(&set->nelems);
392 nft_set_gc_batch_add(gcb, rbe);
393 }
394 node = rb_next(node);
395 }
396out:
397 if (gcb) {
398 for (i = 0; i < gcb->head.cnt; i++) {
399 rbe = gcb->elems[i];
400 rb_erase(&rbe->node, &priv->root);
401 }
402 }
403 write_seqcount_end(&priv->count);
404 write_unlock_bh(&priv->lock);
405
406 nft_set_gc_batch_complete(gcb);
407
408 queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
409 nft_set_gc_interval(set));
410}
411
350static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[], 412static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[],
351 const struct nft_set_desc *desc) 413 const struct nft_set_desc *desc)
352{ 414{
@@ -362,6 +424,12 @@ static int nft_rbtree_init(const struct nft_set *set,
362 rwlock_init(&priv->lock); 424 rwlock_init(&priv->lock);
363 seqcount_init(&priv->count); 425 seqcount_init(&priv->count);
364 priv->root = RB_ROOT; 426 priv->root = RB_ROOT;
427
428 INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
429 if (set->flags & NFT_SET_TIMEOUT)
430 queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
431 nft_set_gc_interval(set));
432
365 return 0; 433 return 0;
366} 434}
367 435
@@ -371,6 +439,7 @@ static void nft_rbtree_destroy(const struct nft_set *set)
371 struct nft_rbtree_elem *rbe; 439 struct nft_rbtree_elem *rbe;
372 struct rb_node *node; 440 struct rb_node *node;
373 441
442 cancel_delayed_work_sync(&priv->gc_work);
374 while ((node = priv->root.rb_node) != NULL) { 443 while ((node = priv->root.rb_node) != NULL) {
375 rb_erase(node, &priv->root); 444 rb_erase(node, &priv->root);
376 rbe = rb_entry(node, struct nft_rbtree_elem, node); 445 rbe = rb_entry(node, struct nft_rbtree_elem, node);
@@ -393,28 +462,24 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
393 return true; 462 return true;
394} 463}
395 464
396static struct nft_set_type nft_rbtree_type;
397static struct nft_set_ops nft_rbtree_ops __read_mostly = {
398 .type = &nft_rbtree_type,
399 .privsize = nft_rbtree_privsize,
400 .elemsize = offsetof(struct nft_rbtree_elem, ext),
401 .estimate = nft_rbtree_estimate,
402 .init = nft_rbtree_init,
403 .destroy = nft_rbtree_destroy,
404 .insert = nft_rbtree_insert,
405 .remove = nft_rbtree_remove,
406 .deactivate = nft_rbtree_deactivate,
407 .flush = nft_rbtree_flush,
408 .activate = nft_rbtree_activate,
409 .lookup = nft_rbtree_lookup,
410 .walk = nft_rbtree_walk,
411 .get = nft_rbtree_get,
412 .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT,
413};
414
415static struct nft_set_type nft_rbtree_type __read_mostly = { 465static struct nft_set_type nft_rbtree_type __read_mostly = {
416 .ops = &nft_rbtree_ops,
417 .owner = THIS_MODULE, 466 .owner = THIS_MODULE,
467 .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
468 .ops = {
469 .privsize = nft_rbtree_privsize,
470 .elemsize = offsetof(struct nft_rbtree_elem, ext),
471 .estimate = nft_rbtree_estimate,
472 .init = nft_rbtree_init,
473 .destroy = nft_rbtree_destroy,
474 .insert = nft_rbtree_insert,
475 .remove = nft_rbtree_remove,
476 .deactivate = nft_rbtree_deactivate,
477 .flush = nft_rbtree_flush,
478 .activate = nft_rbtree_activate,
479 .lookup = nft_rbtree_lookup,
480 .walk = nft_rbtree_walk,
481 .get = nft_rbtree_get,
482 },
418}; 483};
419 484
420static int __init nft_rbtree_module_init(void) 485static int __init nft_rbtree_module_init(void)
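
This gives rbtree sets the same timeout handling the rhash backend already has: a deferrable delayed work is armed at init when NFT_SET_TIMEOUT is set, each pass batches expired, non-busy elements for removal, and destroy cancels the work synchronously. A condensed skeleton of the pattern, with struct my_set standing in for the real private data and all locking and batching elided:

static void my_set_gc(struct work_struct *work)
{
	struct my_set *priv = container_of(work, struct my_set, gc_work.work);
	struct nft_set *set = nft_set_container_of(priv);

	/* walk the structure, queue expired non-busy elements in an
	 * nft_set_gc_batch, erase them, complete the batch ... */

	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
			   nft_set_gc_interval(set));	/* re-arm */
}
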
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
new file mode 100644
index 000000000000..f28a0b944087
--- /dev/null
+++ b/net/netfilter/nft_socket.c
@@ -0,0 +1,144 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#include <linux/module.h>
3#include <linux/netfilter/nf_tables.h>
4#include <net/netfilter/nf_tables.h>
5#include <net/netfilter/nf_tables_core.h>
6#include <net/netfilter/nf_socket.h>
7#include <net/inet_sock.h>
8#include <net/tcp.h>
9
10struct nft_socket {
11 enum nft_socket_keys key:8;
12 union {
13 enum nft_registers dreg:8;
14 };
15};
16
17static void nft_socket_eval(const struct nft_expr *expr,
18 struct nft_regs *regs,
19 const struct nft_pktinfo *pkt)
20{
21 const struct nft_socket *priv = nft_expr_priv(expr);
22 struct sk_buff *skb = pkt->skb;
23 struct sock *sk = skb->sk;
24 u32 *dest = &regs->data[priv->dreg];
25
26 if (!sk)
27 switch(nft_pf(pkt)) {
28 case NFPROTO_IPV4:
29 sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt));
30 break;
31#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
32 case NFPROTO_IPV6:
33 sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt));
34 break;
35#endif
36 default:
37 WARN_ON_ONCE(1);
38 regs->verdict.code = NFT_BREAK;
39 return;
40 }
41
42 if(!sk) {
43 nft_reg_store8(dest, 0);
44 return;
45 }
46
 47	/* Cache the socket so that subsequent socket matches need no further lookup. */
48 skb->sk = sk;
49
50 switch(priv->key) {
51 case NFT_SOCKET_TRANSPARENT:
52 nft_reg_store8(dest, inet_sk_transparent(sk));
53 break;
54 default:
55 WARN_ON(1);
56 regs->verdict.code = NFT_BREAK;
57 }
58}
59
60static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = {
61 [NFTA_SOCKET_KEY] = { .type = NLA_U32 },
62 [NFTA_SOCKET_DREG] = { .type = NLA_U32 },
63};
64
65static int nft_socket_init(const struct nft_ctx *ctx,
66 const struct nft_expr *expr,
67 const struct nlattr * const tb[])
68{
69 struct nft_socket *priv = nft_expr_priv(expr);
70 unsigned int len;
71
72 if (!tb[NFTA_SOCKET_DREG] || !tb[NFTA_SOCKET_KEY])
73 return -EINVAL;
74
75 switch(ctx->family) {
76 case NFPROTO_IPV4:
77#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
78 case NFPROTO_IPV6:
79#endif
80 case NFPROTO_INET:
81 break;
82 default:
83 return -EOPNOTSUPP;
84 }
85
86 priv->key = ntohl(nla_get_u32(tb[NFTA_SOCKET_KEY]));
87 switch(priv->key) {
88 case NFT_SOCKET_TRANSPARENT:
89 len = sizeof(u8);
90 break;
91 default:
92 return -EOPNOTSUPP;
93 }
94
95 priv->dreg = nft_parse_register(tb[NFTA_SOCKET_DREG]);
96 return nft_validate_register_store(ctx, priv->dreg, NULL,
97 NFT_DATA_VALUE, len);
98}
99
100static int nft_socket_dump(struct sk_buff *skb,
101 const struct nft_expr *expr)
102{
103 const struct nft_socket *priv = nft_expr_priv(expr);
104
105 if (nla_put_u32(skb, NFTA_SOCKET_KEY, htonl(priv->key)))
106 return -1;
107 if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg))
108 return -1;
109 return 0;
110}
111
112static struct nft_expr_type nft_socket_type;
113static const struct nft_expr_ops nft_socket_ops = {
114 .type = &nft_socket_type,
115 .size = NFT_EXPR_SIZE(sizeof(struct nft_socket)),
116 .eval = nft_socket_eval,
117 .init = nft_socket_init,
118 .dump = nft_socket_dump,
119};
120
121static struct nft_expr_type nft_socket_type __read_mostly = {
122 .name = "socket",
123 .ops = &nft_socket_ops,
124 .policy = nft_socket_policy,
125 .maxattr = NFTA_SOCKET_MAX,
126 .owner = THIS_MODULE,
127};
128
129static int __init nft_socket_module_init(void)
130{
131 return nft_register_expr(&nft_socket_type);
132}
133
134static void __exit nft_socket_module_exit(void)
135{
136 nft_unregister_expr(&nft_socket_type);
137}
138
139module_init(nft_socket_module_init);
140module_exit(nft_socket_module_exit);
141
142MODULE_LICENSE("GPL");
143MODULE_AUTHOR("Máté Eckl");
144MODULE_DESCRIPTION("nf_tables socket match module");
diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c
index 58aa9dd3c5b7..1d437875e15a 100644
--- a/net/netfilter/xt_NETMAP.c
+++ b/net/netfilter/xt_NETMAP.c
@@ -21,8 +21,8 @@
21static unsigned int 21static unsigned int
22netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par) 22netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
23{ 23{
24 const struct nf_nat_range *range = par->targinfo; 24 const struct nf_nat_range2 *range = par->targinfo;
25 struct nf_nat_range newrange; 25 struct nf_nat_range2 newrange;
26 struct nf_conn *ct; 26 struct nf_conn *ct;
27 enum ip_conntrack_info ctinfo; 27 enum ip_conntrack_info ctinfo;
28 union nf_inet_addr new_addr, netmask; 28 union nf_inet_addr new_addr, netmask;
@@ -56,7 +56,7 @@ netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
56 56
57static int netmap_tg6_checkentry(const struct xt_tgchk_param *par) 57static int netmap_tg6_checkentry(const struct xt_tgchk_param *par)
58{ 58{
59 const struct nf_nat_range *range = par->targinfo; 59 const struct nf_nat_range2 *range = par->targinfo;
60 60
61 if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) 61 if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
62 return -EINVAL; 62 return -EINVAL;
@@ -75,7 +75,7 @@ netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par)
75 enum ip_conntrack_info ctinfo; 75 enum ip_conntrack_info ctinfo;
76 __be32 new_ip, netmask; 76 __be32 new_ip, netmask;
77 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; 77 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
78 struct nf_nat_range newrange; 78 struct nf_nat_range2 newrange;
79 79
80 WARN_ON(xt_hooknum(par) != NF_INET_PRE_ROUTING && 80 WARN_ON(xt_hooknum(par) != NF_INET_PRE_ROUTING &&
81 xt_hooknum(par) != NF_INET_POST_ROUTING && 81 xt_hooknum(par) != NF_INET_POST_ROUTING &&
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index c7f8958cea4a..1ed0cac585c4 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -13,7 +13,6 @@
13#include <linux/netfilter/x_tables.h> 13#include <linux/netfilter/x_tables.h>
14#include <linux/netfilter/xt_NFLOG.h> 14#include <linux/netfilter/xt_NFLOG.h>
15#include <net/netfilter/nf_log.h> 15#include <net/netfilter/nf_log.h>
16#include <net/netfilter/nfnetlink_log.h>
17 16
18MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); 17MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
19MODULE_DESCRIPTION("Xtables: packet logging to netlink using NFLOG"); 18MODULE_DESCRIPTION("Xtables: packet logging to netlink using NFLOG");
@@ -37,8 +36,9 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
37 if (info->flags & XT_NFLOG_F_COPY_LEN) 36 if (info->flags & XT_NFLOG_F_COPY_LEN)
38 li.u.ulog.flags |= NF_LOG_F_COPY_LEN; 37 li.u.ulog.flags |= NF_LOG_F_COPY_LEN;
39 38
40 nfulnl_log_packet(net, xt_family(par), xt_hooknum(par), skb, 39 nf_log_packet(net, xt_family(par), xt_hooknum(par), skb, xt_in(par),
41 xt_in(par), xt_out(par), &li, info->prefix); 40 xt_out(par), &li, "%s", info->prefix);
41
42 return XT_CONTINUE; 42 return XT_CONTINUE;
43} 43}
44 44
@@ -50,7 +50,13 @@ static int nflog_tg_check(const struct xt_tgchk_param *par)
50 return -EINVAL; 50 return -EINVAL;
51 if (info->prefix[sizeof(info->prefix) - 1] != '\0') 51 if (info->prefix[sizeof(info->prefix) - 1] != '\0')
52 return -EINVAL; 52 return -EINVAL;
53 return 0; 53
54 return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG);
55}
56
57static void nflog_tg_destroy(const struct xt_tgdtor_param *par)
58{
59 nf_logger_put(par->family, NF_LOG_TYPE_ULOG);
54} 60}
55 61
56static struct xt_target nflog_tg_reg __read_mostly = { 62static struct xt_target nflog_tg_reg __read_mostly = {
@@ -58,6 +64,7 @@ static struct xt_target nflog_tg_reg __read_mostly = {
58 .revision = 0, 64 .revision = 0,
59 .family = NFPROTO_UNSPEC, 65 .family = NFPROTO_UNSPEC,
60 .checkentry = nflog_tg_check, 66 .checkentry = nflog_tg_check,
67 .destroy = nflog_tg_destroy,
61 .target = nflog_tg, 68 .target = nflog_tg,
62 .targetsize = sizeof(struct xt_nflog_info), 69 .targetsize = sizeof(struct xt_nflog_info),
63 .me = THIS_MODULE, 70 .me = THIS_MODULE,
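
Two things change for NFLOG: logging now goes through the generic nf_log_packet(), which takes a printf-style format, so the user-supplied prefix is passed behind an explicit "%s" and can never act as a format string; and since the target no longer calls nfulnl_log_packet() directly, checkentry pins the nfnetlink_log backend and destroy releases it, keeping the logger available while rules reference it. The pairing in isolation:

static int my_check(const struct xt_tgchk_param *par)
{
	return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG);
}

static void my_destroy(const struct xt_tgdtor_param *par)
{
	nf_logger_put(par->family, NF_LOG_TYPE_ULOG);
}
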
diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c
index 98a4c6d4f1cb..5ce9461e979c 100644
--- a/net/netfilter/xt_REDIRECT.c
+++ b/net/netfilter/xt_REDIRECT.c
@@ -36,7 +36,7 @@ redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par)
36 36
37static int redirect_tg6_checkentry(const struct xt_tgchk_param *par) 37static int redirect_tg6_checkentry(const struct xt_tgchk_param *par)
38{ 38{
39 const struct nf_nat_range *range = par->targinfo; 39 const struct nf_nat_range2 *range = par->targinfo;
40 40
41 if (range->flags & NF_NAT_RANGE_MAP_IPS) 41 if (range->flags & NF_NAT_RANGE_MAP_IPS)
42 return -EINVAL; 42 return -EINVAL;
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 8c89323c06af..58fce4e749a9 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -33,264 +33,9 @@
33#include <net/netfilter/ipv6/nf_defrag_ipv6.h> 33#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
34#endif 34#endif
35 35
36#include <net/netfilter/nf_tproxy.h>
36#include <linux/netfilter/xt_TPROXY.h> 37#include <linux/netfilter/xt_TPROXY.h>
37 38
38enum nf_tproxy_lookup_t {
39 NFT_LOOKUP_LISTENER,
40 NFT_LOOKUP_ESTABLISHED,
41};
42
43static bool tproxy_sk_is_transparent(struct sock *sk)
44{
45 switch (sk->sk_state) {
46 case TCP_TIME_WAIT:
47 if (inet_twsk(sk)->tw_transparent)
48 return true;
49 break;
50 case TCP_NEW_SYN_RECV:
51 if (inet_rsk(inet_reqsk(sk))->no_srccheck)
52 return true;
53 break;
54 default:
55 if (inet_sk(sk)->transparent)
56 return true;
57 }
58
59 sock_gen_put(sk);
60 return false;
61}
62
63static inline __be32
64tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
65{
66 struct in_device *indev;
67 __be32 laddr;
68
69 if (user_laddr)
70 return user_laddr;
71
72 laddr = 0;
73 indev = __in_dev_get_rcu(skb->dev);
74 for_primary_ifa(indev) {
75 laddr = ifa->ifa_local;
76 break;
77 } endfor_ifa(indev);
78
79 return laddr ? laddr : daddr;
80}
81
82/*
83 * This is used when the user wants to intercept a connection matching
84 * an explicit iptables rule. In this case the sockets are assumed
85 * matching in preference order:
86 *
87 * - match: if there's a fully established connection matching the
88 * _packet_ tuple, it is returned, assuming the redirection
89 * already took place and we process a packet belonging to an
90 * established connection
91 *
92 * - match: if there's a listening socket matching the redirection
93 * (e.g. on-port & on-ip of the connection), it is returned,
94 * regardless if it was bound to 0.0.0.0 or an explicit
95 * address. The reasoning is that if there's an explicit rule, it
96 * does not really matter if the listener is bound to an interface
97 * or to 0. The user already stated that he wants redirection
98 * (since he added the rule).
99 *
100 * Please note that there's an overlap between what a TPROXY target
101 * and a socket match will match. Normally if you have both rules the
102 * "socket" match will be the first one, effectively all packets
103 * belonging to established connections going through that one.
104 */
105static inline struct sock *
106nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
107 const u8 protocol,
108 const __be32 saddr, const __be32 daddr,
109 const __be16 sport, const __be16 dport,
110 const struct net_device *in,
111 const enum nf_tproxy_lookup_t lookup_type)
112{
113 struct sock *sk;
114 struct tcphdr *tcph;
115
116 switch (protocol) {
117 case IPPROTO_TCP:
118 switch (lookup_type) {
119 case NFT_LOOKUP_LISTENER:
120 tcph = hp;
121 sk = inet_lookup_listener(net, &tcp_hashinfo, skb,
122 ip_hdrlen(skb) +
123 __tcp_hdrlen(tcph),
124 saddr, sport,
125 daddr, dport,
126 in->ifindex, 0);
127
128 if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
129 sk = NULL;
130 /* NOTE: we return listeners even if bound to
131 * 0.0.0.0, those are filtered out in
132 * xt_socket, since xt_TPROXY needs 0 bound
133 * listeners too
134 */
135 break;
136 case NFT_LOOKUP_ESTABLISHED:
137 sk = inet_lookup_established(net, &tcp_hashinfo,
138 saddr, sport, daddr, dport,
139 in->ifindex);
140 break;
141 default:
142 BUG();
143 }
144 break;
145 case IPPROTO_UDP:
146 sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
147 in->ifindex);
148 if (sk) {
149 int connected = (sk->sk_state == TCP_ESTABLISHED);
150 int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);
151
152 /* NOTE: we return listeners even if bound to
153 * 0.0.0.0, those are filtered out in
154 * xt_socket, since xt_TPROXY needs 0 bound
155 * listeners too
156 */
157 if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
158 (lookup_type == NFT_LOOKUP_LISTENER && connected)) {
159 sock_put(sk);
160 sk = NULL;
161 }
162 }
163 break;
164 default:
165 WARN_ON(1);
166 sk = NULL;
167 }
168
169 pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n",
170 protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk);
171
172 return sk;
173}
174
175#ifdef XT_TPROXY_HAVE_IPV6
176static inline struct sock *
177nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
178 const u8 protocol,
179 const struct in6_addr *saddr, const struct in6_addr *daddr,
180 const __be16 sport, const __be16 dport,
181 const struct net_device *in,
182 const enum nf_tproxy_lookup_t lookup_type)
183{
184 struct sock *sk;
185 struct tcphdr *tcph;
186
187 switch (protocol) {
188 case IPPROTO_TCP:
189 switch (lookup_type) {
190 case NFT_LOOKUP_LISTENER:
191 tcph = hp;
192 sk = inet6_lookup_listener(net, &tcp_hashinfo, skb,
193 thoff + __tcp_hdrlen(tcph),
194 saddr, sport,
195 daddr, ntohs(dport),
196 in->ifindex, 0);
197
198 if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
199 sk = NULL;
200 /* NOTE: we return listeners even if bound to
201 * 0.0.0.0, those are filtered out in
202 * xt_socket, since xt_TPROXY needs 0 bound
203 * listeners too
204 */
205 break;
206 case NFT_LOOKUP_ESTABLISHED:
207 sk = __inet6_lookup_established(net, &tcp_hashinfo,
208 saddr, sport, daddr, ntohs(dport),
209 in->ifindex, 0);
210 break;
211 default:
212 BUG();
213 }
214 break;
215 case IPPROTO_UDP:
216 sk = udp6_lib_lookup(net, saddr, sport, daddr, dport,
217 in->ifindex);
218 if (sk) {
219 int connected = (sk->sk_state == TCP_ESTABLISHED);
220 int wildcard = ipv6_addr_any(&sk->sk_v6_rcv_saddr);
221
222 /* NOTE: we return listeners even if bound to
223 * 0.0.0.0, those are filtered out in
224 * xt_socket, since xt_TPROXY needs 0 bound
225 * listeners too
226 */
227 if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
228 (lookup_type == NFT_LOOKUP_LISTENER && connected)) {
229 sock_put(sk);
230 sk = NULL;
231 }
232 }
233 break;
234 default:
235 WARN_ON(1);
236 sk = NULL;
237 }
238
239 pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n",
240 protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk);
241
242 return sk;
243}
244#endif
245
246/**
247 * tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections
248 * @skb: The skb being processed.
249 * @laddr: IPv4 address to redirect to or zero.
250 * @lport: TCP port to redirect to or zero.
251 * @sk: The TIME_WAIT TCP socket found by the lookup.
252 *
253 * We have to handle SYN packets arriving to TIME_WAIT sockets
254 * differently: instead of reopening the connection we should rather
255 * redirect the new connection to the proxy if there's a listener
256 * socket present.
257 *
258 * tproxy_handle_time_wait4() consumes the socket reference passed in.
259 *
260 * Returns the listener socket if there's one, the TIME_WAIT socket if
261 * no such listener is found, or NULL if the TCP header is incomplete.
262 */
263static struct sock *
264tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
265 __be32 laddr, __be16 lport, struct sock *sk)
266{
267 const struct iphdr *iph = ip_hdr(skb);
268 struct tcphdr _hdr, *hp;
269
270 hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
271 if (hp == NULL) {
272 inet_twsk_put(inet_twsk(sk));
273 return NULL;
274 }
275
276 if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
277 /* SYN to a TIME_WAIT socket, we'd rather redirect it
278 * to a listener socket if there's one */
279 struct sock *sk2;
280
281 sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
282 iph->saddr, laddr ? laddr : iph->daddr,
283 hp->source, lport ? lport : hp->dest,
284 skb->dev, NFT_LOOKUP_LISTENER);
285 if (sk2) {
286 inet_twsk_deschedule_put(inet_twsk(sk));
287 sk = sk2;
288 }
289 }
290
291 return sk;
292}
293
294/* assign a socket to the skb -- consumes sk */ 39/* assign a socket to the skb -- consumes sk */
295static void 40static void
296nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) 41nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
@@ -319,26 +64,26 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
319 sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, 64 sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
320 iph->saddr, iph->daddr, 65 iph->saddr, iph->daddr,
321 hp->source, hp->dest, 66 hp->source, hp->dest,
322 skb->dev, NFT_LOOKUP_ESTABLISHED); 67 skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
323 68
324 laddr = tproxy_laddr4(skb, laddr, iph->daddr); 69 laddr = nf_tproxy_laddr4(skb, laddr, iph->daddr);
325 if (!lport) 70 if (!lport)
326 lport = hp->dest; 71 lport = hp->dest;
327 72
328 /* UDP has no TCP_TIME_WAIT state, so we never enter here */ 73 /* UDP has no TCP_TIME_WAIT state, so we never enter here */
329 if (sk && sk->sk_state == TCP_TIME_WAIT) 74 if (sk && sk->sk_state == TCP_TIME_WAIT)
330 /* reopening a TIME_WAIT connection needs special handling */ 75 /* reopening a TIME_WAIT connection needs special handling */
331 sk = tproxy_handle_time_wait4(net, skb, laddr, lport, sk); 76 sk = nf_tproxy_handle_time_wait4(net, skb, laddr, lport, sk);
332 else if (!sk) 77 else if (!sk)
333 /* no, there's no established connection, check if 78 /* no, there's no established connection, check if
334 * there's a listener on the redirected addr/port */ 79 * there's a listener on the redirected addr/port */
335 sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, 80 sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
336 iph->saddr, laddr, 81 iph->saddr, laddr,
337 hp->source, lport, 82 hp->source, lport,
338 skb->dev, NFT_LOOKUP_LISTENER); 83 skb->dev, NF_TPROXY_LOOKUP_LISTENER);
339 84
340 /* NOTE: assign_sock consumes our sk reference */ 85 /* NOTE: assign_sock consumes our sk reference */
341 if (sk && tproxy_sk_is_transparent(sk)) { 86 if (sk && nf_tproxy_sk_is_transparent(sk)) {
342 /* This should be in a separate target, but we don't do multiple 87 /* This should be in a separate target, but we don't do multiple
343 targets on the same rule yet */ 88 targets on the same rule yet */
344 skb->mark = (skb->mark & ~mark_mask) ^ mark_value; 89 skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
@@ -377,87 +122,6 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
377 122
378#ifdef XT_TPROXY_HAVE_IPV6 123#ifdef XT_TPROXY_HAVE_IPV6
379 124
380static inline const struct in6_addr *
381tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
382 const struct in6_addr *daddr)
383{
384 struct inet6_dev *indev;
385 struct inet6_ifaddr *ifa;
386 struct in6_addr *laddr;
387
388 if (!ipv6_addr_any(user_laddr))
389 return user_laddr;
390 laddr = NULL;
391
392 indev = __in6_dev_get(skb->dev);
393 if (indev) {
394 read_lock_bh(&indev->lock);
395 list_for_each_entry(ifa, &indev->addr_list, if_list) {
396 if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
397 continue;
398
399 laddr = &ifa->addr;
400 break;
401 }
402 read_unlock_bh(&indev->lock);
403 }
404
405 return laddr ? laddr : daddr;
406}
407
408/**
409 * tproxy_handle_time_wait6 - handle IPv6 TCP TIME_WAIT reopen redirections
410 * @skb: The skb being processed.
411 * @tproto: Transport protocol.
412 * @thoff: Transport protocol header offset.
413 * @par: Iptables target parameters.
414 * @sk: The TIME_WAIT TCP socket found by the lookup.
415 *
416 * We have to handle SYN packets arriving to TIME_WAIT sockets
417 * differently: instead of reopening the connection we should rather
418 * redirect the new connection to the proxy if there's a listener
419 * socket present.
420 *
421 * tproxy_handle_time_wait6() consumes the socket reference passed in.
422 *
423 * Returns the listener socket if there's one, the TIME_WAIT socket if
424 * no such listener is found, or NULL if the TCP header is incomplete.
425 */
426static struct sock *
427tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
428 const struct xt_action_param *par,
429 struct sock *sk)
430{
431 const struct ipv6hdr *iph = ipv6_hdr(skb);
432 struct tcphdr _hdr, *hp;
433 const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
434
435 hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
436 if (hp == NULL) {
437 inet_twsk_put(inet_twsk(sk));
438 return NULL;
439 }
440
441 if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
442 /* SYN to a TIME_WAIT socket, we'd rather redirect it
443 * to a listener socket if there's one */
444 struct sock *sk2;
445
446 sk2 = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
447 &iph->saddr,
448 tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
449 hp->source,
450 tgi->lport ? tgi->lport : hp->dest,
451 skb->dev, NFT_LOOKUP_LISTENER);
452 if (sk2) {
453 inet_twsk_deschedule_put(inet_twsk(sk));
454 sk = sk2;
455 }
456 }
457
458 return sk;
459}
460
461static unsigned int 125static unsigned int
462tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) 126tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
463{ 127{
@@ -489,25 +153,31 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
489 sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto, 153 sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
490 &iph->saddr, &iph->daddr, 154 &iph->saddr, &iph->daddr,
491 hp->source, hp->dest, 155 hp->source, hp->dest,
492 xt_in(par), NFT_LOOKUP_ESTABLISHED); 156 xt_in(par), NF_TPROXY_LOOKUP_ESTABLISHED);
493 157
494 laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr); 158 laddr = nf_tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
495 lport = tgi->lport ? tgi->lport : hp->dest; 159 lport = tgi->lport ? tgi->lport : hp->dest;
496 160
497 /* UDP has no TCP_TIME_WAIT state, so we never enter here */ 161 /* UDP has no TCP_TIME_WAIT state, so we never enter here */
498 if (sk && sk->sk_state == TCP_TIME_WAIT) 162 if (sk && sk->sk_state == TCP_TIME_WAIT) {
163 const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
499 /* reopening a TIME_WAIT connection needs special handling */ 164 /* reopening a TIME_WAIT connection needs special handling */
500 sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk); 165 sk = nf_tproxy_handle_time_wait6(skb, tproto, thoff,
166 xt_net(par),
167 &tgi->laddr.in6,
168 tgi->lport,
169 sk);
170 }
501 else if (!sk) 171 else if (!sk)
502 /* no there's no established connection, check if 172 /* no there's no established connection, check if
503 * there's a listener on the redirected addr/port */ 173 * there's a listener on the redirected addr/port */
504 sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, 174 sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp,
505 tproto, &iph->saddr, laddr, 175 tproto, &iph->saddr, laddr,
506 hp->source, lport, 176 hp->source, lport,
507 xt_in(par), NFT_LOOKUP_LISTENER); 177 xt_in(par), NF_TPROXY_LOOKUP_LISTENER);
508 178
509 /* NOTE: assign_sock consumes our sk reference */ 179 /* NOTE: assign_sock consumes our sk reference */
510 if (sk && tproxy_sk_is_transparent(sk)) { 180 if (sk && nf_tproxy_sk_is_transparent(sk)) {
511 /* This should be in a separate target, but we don't do multiple 181 /* This should be in a separate target, but we don't do multiple
512 targets on the same rule yet */ 182 targets on the same rule yet */
513 skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value; 183 skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
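
The large block deleted above is the transparent-proxy socket lookup machinery; it now sits behind <net/netfilter/nf_tproxy.h> (with the NFT_LOOKUP_* enum renamed to NF_TPROXY_LOOKUP_*) so it can be shared with other in-kernel users instead of living inside one xtables target. The two-step lookup a caller performs is unchanged: try an established connection first, then a listener on the redirected address/port (IPv4 path, as in the updated target):

	sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
				   iph->saddr, iph->daddr, hp->source, hp->dest,
				   skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
	if (!sk)
		sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
					   iph->saddr, laddr, hp->source, lport,
					   skb->dev, NF_TPROXY_LOOKUP_LISTENER);
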
diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c
index bdb689cdc829..8af9707f8789 100644
--- a/net/netfilter/xt_nat.c
+++ b/net/netfilter/xt_nat.c
@@ -37,11 +37,12 @@ static void xt_nat_destroy(const struct xt_tgdtor_param *par)
37 nf_ct_netns_put(par->net, par->family); 37 nf_ct_netns_put(par->net, par->family);
38} 38}
39 39
40static void xt_nat_convert_range(struct nf_nat_range *dst, 40static void xt_nat_convert_range(struct nf_nat_range2 *dst,
41 const struct nf_nat_ipv4_range *src) 41 const struct nf_nat_ipv4_range *src)
42{ 42{
43 memset(&dst->min_addr, 0, sizeof(dst->min_addr)); 43 memset(&dst->min_addr, 0, sizeof(dst->min_addr));
44 memset(&dst->max_addr, 0, sizeof(dst->max_addr)); 44 memset(&dst->max_addr, 0, sizeof(dst->max_addr));
45 memset(&dst->base_proto, 0, sizeof(dst->base_proto));
45 46
46 dst->flags = src->flags; 47 dst->flags = src->flags;
47 dst->min_addr.ip = src->min_ip; 48 dst->min_addr.ip = src->min_ip;
@@ -54,7 +55,7 @@ static unsigned int
54xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) 55xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
55{ 56{
56 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; 57 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
57 struct nf_nat_range range; 58 struct nf_nat_range2 range;
58 enum ip_conntrack_info ctinfo; 59 enum ip_conntrack_info ctinfo;
59 struct nf_conn *ct; 60 struct nf_conn *ct;
60 61
@@ -71,7 +72,7 @@ static unsigned int
71xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) 72xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
72{ 73{
73 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; 74 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
74 struct nf_nat_range range; 75 struct nf_nat_range2 range;
75 enum ip_conntrack_info ctinfo; 76 enum ip_conntrack_info ctinfo;
76 struct nf_conn *ct; 77 struct nf_conn *ct;
77 78
@@ -86,7 +87,8 @@ xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
86static unsigned int 87static unsigned int
87xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) 88xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
88{ 89{
89 const struct nf_nat_range *range = par->targinfo; 90 const struct nf_nat_range *range_v1 = par->targinfo;
91 struct nf_nat_range2 range;
90 enum ip_conntrack_info ctinfo; 92 enum ip_conntrack_info ctinfo;
91 struct nf_conn *ct; 93 struct nf_conn *ct;
92 94
@@ -95,13 +97,49 @@ xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
95 (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || 97 (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
96 ctinfo == IP_CT_RELATED_REPLY))); 98 ctinfo == IP_CT_RELATED_REPLY)));
97 99
98 return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC); 100 memcpy(&range, range_v1, sizeof(*range_v1));
101 memset(&range.base_proto, 0, sizeof(range.base_proto));
102
103 return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
99} 104}
100 105
101static unsigned int 106static unsigned int
102xt_dnat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) 107xt_dnat_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
103{ 108{
104 const struct nf_nat_range *range = par->targinfo; 109 const struct nf_nat_range *range_v1 = par->targinfo;
110 struct nf_nat_range2 range;
111 enum ip_conntrack_info ctinfo;
112 struct nf_conn *ct;
113
114 ct = nf_ct_get(skb, &ctinfo);
115 WARN_ON(!(ct != NULL &&
116 (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)));
117
118 memcpy(&range, range_v1, sizeof(*range_v1));
119 memset(&range.base_proto, 0, sizeof(range.base_proto));
120
121 return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
122}
123
124static unsigned int
125xt_snat_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
126{
127 const struct nf_nat_range2 *range = par->targinfo;
128 enum ip_conntrack_info ctinfo;
129 struct nf_conn *ct;
130
131 ct = nf_ct_get(skb, &ctinfo);
132 WARN_ON(!(ct != NULL &&
133 (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
134 ctinfo == IP_CT_RELATED_REPLY)));
135
136 return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC);
137}
138
139static unsigned int
140xt_dnat_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
141{
142 const struct nf_nat_range2 *range = par->targinfo;
105 enum ip_conntrack_info ctinfo; 143 enum ip_conntrack_info ctinfo;
106 struct nf_conn *ct; 144 struct nf_conn *ct;
107 145
@@ -163,6 +201,28 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
163 (1 << NF_INET_LOCAL_OUT), 201 (1 << NF_INET_LOCAL_OUT),
164 .me = THIS_MODULE, 202 .me = THIS_MODULE,
165 }, 203 },
204 {
205 .name = "SNAT",
206 .revision = 2,
207 .checkentry = xt_nat_checkentry,
208 .destroy = xt_nat_destroy,
209 .target = xt_snat_target_v2,
210 .targetsize = sizeof(struct nf_nat_range2),
211 .table = "nat",
212 .hooks = (1 << NF_INET_POST_ROUTING) |
213 (1 << NF_INET_LOCAL_IN),
214 .me = THIS_MODULE,
215 },
216 {
217 .name = "DNAT",
218 .revision = 2,
219 .target = xt_dnat_target_v2,
220 .targetsize = sizeof(struct nf_nat_range2),
221 .table = "nat",
222 .hooks = (1 << NF_INET_PRE_ROUTING) |
223 (1 << NF_INET_LOCAL_OUT),
224 .me = THIS_MODULE,
225 },
166}; 226};
167 227
168static int __init xt_nat_init(void) 228static int __init xt_nat_init(void)
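The v1 compat shims above widen the old nf_nat_range handed in by existing iptables rules into the new nf_nat_range2 expected by nf_nat_setup_info(): the common prefix is memcpy()'d and the added base_proto tail is zeroed. A minimal stand-alone sketch of that prefix-copy-and-zero pattern, using simplified stand-in structs rather than the real kernel definitions:

/*
 * Sketch only: old_range/new_range stand in for nf_nat_range and
 * nf_nat_range2, where the newer struct is the older one plus a
 * trailing member that must start out zeroed.
 */
#include <stdio.h>
#include <string.h>

struct old_range {                 /* stand-in for struct nf_nat_range */
	unsigned int flags;
	unsigned int min_ip, max_ip;
};

struct new_range {                 /* stand-in for struct nf_nat_range2 */
	unsigned int flags;
	unsigned int min_ip, max_ip;
	unsigned int base_proto;   /* new trailing field */
};

static void widen(const struct old_range *v1, struct new_range *v2)
{
	memcpy(v2, v1, sizeof(*v1));                         /* shared prefix */
	memset(&v2->base_proto, 0, sizeof(v2->base_proto));  /* new tail */
}

int main(void)
{
	struct old_range v1 = { .flags = 1, .min_ip = 0x0a000001, .max_ip = 0x0a0000ff };
	struct new_range v2;

	widen(&v1, &v2);
	printf("flags=%u base_proto=%u\n", v2.flags, v2.base_proto);
	return 0;
}

Keeping revisions 0 and 1 as wrappers lets old userspace keep working, while the new revision 2 targets take a struct nf_nat_range2 directly.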
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index a34f314a8c23..9cfef73b4107 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -37,21 +37,6 @@
37#include <net/netfilter/nf_log.h> 37#include <net/netfilter/nf_log.h>
38#include <linux/netfilter/xt_osf.h> 38#include <linux/netfilter/xt_osf.h>
39 39
40struct xt_osf_finger {
41 struct rcu_head rcu_head;
42 struct list_head finger_entry;
43 struct xt_osf_user_finger finger;
44};
45
46enum osf_fmatch_states {
47 /* Packet does not match the fingerprint */
48 FMATCH_WRONG = 0,
49 /* Packet matches the fingerprint */
50 FMATCH_OK,
51 /* Options do not match the fingerprint, but header does */
52 FMATCH_OPT_WRONG,
53};
54
55/* 40/*
56 * Indexed by dont-fragment bit. 41 * Indexed by dont-fragment bit.
57 * It is the only constant value in the fingerprint. 42 * It is the only constant value in the fingerprint.
@@ -164,200 +149,17 @@ static const struct nfnetlink_subsystem xt_osf_nfnetlink = {
164 .cb = xt_osf_nfnetlink_callbacks, 149 .cb = xt_osf_nfnetlink_callbacks,
165}; 150};
166 151
167static inline int xt_osf_ttl(const struct sk_buff *skb, const struct xt_osf_info *info,
168 unsigned char f_ttl)
169{
170 const struct iphdr *ip = ip_hdr(skb);
171
172 if (info->flags & XT_OSF_TTL) {
173 if (info->ttl == XT_OSF_TTL_TRUE)
174 return ip->ttl == f_ttl;
175 if (info->ttl == XT_OSF_TTL_NOCHECK)
176 return 1;
177 else if (ip->ttl <= f_ttl)
178 return 1;
179 else {
180 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
181 int ret = 0;
182
183 for_ifa(in_dev) {
184 if (inet_ifa_match(ip->saddr, ifa)) {
185 ret = (ip->ttl == f_ttl);
186 break;
187 }
188 }
189 endfor_ifa(in_dev);
190
191 return ret;
192 }
193 }
194
195 return ip->ttl == f_ttl;
196}
197
198static bool 152static bool
199xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) 153xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
200{ 154{
201 const struct xt_osf_info *info = p->matchinfo; 155 const struct xt_osf_info *info = p->matchinfo;
202 const struct iphdr *ip = ip_hdr(skb);
203 const struct tcphdr *tcp;
204 struct tcphdr _tcph;
205 int fmatch = FMATCH_WRONG, fcount = 0;
206 unsigned int optsize = 0, check_WSS = 0;
207 u16 window, totlen, mss = 0;
208 bool df;
209 const unsigned char *optp = NULL, *_optp = NULL;
210 unsigned char opts[MAX_IPOPTLEN];
211 const struct xt_osf_finger *kf;
212 const struct xt_osf_user_finger *f;
213 struct net *net = xt_net(p); 156 struct net *net = xt_net(p);
214 157
215 if (!info) 158 if (!info)
216 return false; 159 return false;
217 160
218 tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); 161 return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p),
219 if (!tcp) 162 xt_out(p), info, net, xt_osf_fingers);
220 return false;
221
222 if (!tcp->syn)
223 return false;
224
225 totlen = ntohs(ip->tot_len);
226 df = ntohs(ip->frag_off) & IP_DF;
227 window = ntohs(tcp->window);
228
229 if (tcp->doff * 4 > sizeof(struct tcphdr)) {
230 optsize = tcp->doff * 4 - sizeof(struct tcphdr);
231
232 _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) +
233 sizeof(struct tcphdr), optsize, opts);
234 }
235
236 list_for_each_entry_rcu(kf, &xt_osf_fingers[df], finger_entry) {
237 int foptsize, optnum;
238
239 f = &kf->finger;
240
241 if (!(info->flags & XT_OSF_LOG) && strcmp(info->genre, f->genre))
242 continue;
243
244 optp = _optp;
245 fmatch = FMATCH_WRONG;
246
247 if (totlen != f->ss || !xt_osf_ttl(skb, info, f->ttl))
248 continue;
249
250 /*
251 * Should not happen if userspace parser was written correctly.
252 */
253 if (f->wss.wc >= OSF_WSS_MAX)
254 continue;
255
256 /* Check options */
257
258 foptsize = 0;
259 for (optnum = 0; optnum < f->opt_num; ++optnum)
260 foptsize += f->opt[optnum].length;
261
262 if (foptsize > MAX_IPOPTLEN ||
263 optsize > MAX_IPOPTLEN ||
264 optsize != foptsize)
265 continue;
266
267 check_WSS = f->wss.wc;
268
269 for (optnum = 0; optnum < f->opt_num; ++optnum) {
270 if (f->opt[optnum].kind == (*optp)) {
271 __u32 len = f->opt[optnum].length;
272 const __u8 *optend = optp + len;
273
274 fmatch = FMATCH_OK;
275
276 switch (*optp) {
277 case OSFOPT_MSS:
278 mss = optp[3];
279 mss <<= 8;
280 mss |= optp[2];
281
282 mss = ntohs((__force __be16)mss);
283 break;
284 case OSFOPT_TS:
285 break;
286 }
287
288 optp = optend;
289 } else
290 fmatch = FMATCH_OPT_WRONG;
291
292 if (fmatch != FMATCH_OK)
293 break;
294 }
295
296 if (fmatch != FMATCH_OPT_WRONG) {
297 fmatch = FMATCH_WRONG;
298
299 switch (check_WSS) {
300 case OSF_WSS_PLAIN:
301 if (f->wss.val == 0 || window == f->wss.val)
302 fmatch = FMATCH_OK;
303 break;
304 case OSF_WSS_MSS:
305 /*
306 * Some smart modems decrease mangle MSS to
307 * SMART_MSS_2, so we check standard, decreased
308 * and the one provided in the fingerprint MSS
309 * values.
310 */
311#define SMART_MSS_1 1460
312#define SMART_MSS_2 1448
313 if (window == f->wss.val * mss ||
314 window == f->wss.val * SMART_MSS_1 ||
315 window == f->wss.val * SMART_MSS_2)
316 fmatch = FMATCH_OK;
317 break;
318 case OSF_WSS_MTU:
319 if (window == f->wss.val * (mss + 40) ||
320 window == f->wss.val * (SMART_MSS_1 + 40) ||
321 window == f->wss.val * (SMART_MSS_2 + 40))
322 fmatch = FMATCH_OK;
323 break;
324 case OSF_WSS_MODULO:
325 if ((window % f->wss.val) == 0)
326 fmatch = FMATCH_OK;
327 break;
328 }
329 }
330
331 if (fmatch != FMATCH_OK)
332 continue;
333
334 fcount++;
335
336 if (info->flags & XT_OSF_LOG)
337 nf_log_packet(net, xt_family(p), xt_hooknum(p), skb,
338 xt_in(p), xt_out(p), NULL,
339 "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
340 f->genre, f->version, f->subtype,
341 &ip->saddr, ntohs(tcp->source),
342 &ip->daddr, ntohs(tcp->dest),
343 f->ttl - ip->ttl);
344
345 if ((info->flags & XT_OSF_LOG) &&
346 info->loglevel == XT_OSF_LOGLEVEL_FIRST)
347 break;
348 }
349
350 if (!fcount && (info->flags & XT_OSF_LOG))
351 nf_log_packet(net, xt_family(p), xt_hooknum(p), skb, xt_in(p),
352 xt_out(p), NULL,
353 "Remote OS is not known: %pI4:%u -> %pI4:%u\n",
354 &ip->saddr, ntohs(tcp->source),
355 &ip->daddr, ntohs(tcp->dest));
356
357 if (fcount)
358 fmatch = FMATCH_OK;
359
360 return fmatch == FMATCH_OK;
361} 163}
362 164
363static struct xt_match xt_osf_match = { 165static struct xt_match xt_osf_match = {
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 2ac7f674d19b..5c0779c4fa3c 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -73,7 +73,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
73 * if XT_SOCKET_TRANSPARENT is used 73 * if XT_SOCKET_TRANSPARENT is used
74 */ 74 */
75 if (info->flags & XT_SOCKET_TRANSPARENT) 75 if (info->flags & XT_SOCKET_TRANSPARENT)
76 transparent = nf_sk_is_transparent(sk); 76 transparent = inet_sk_transparent(sk);
77 77
78 if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && 78 if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
79 transparent && sk_fullsock(sk)) 79 transparent && sk_fullsock(sk))
@@ -130,7 +130,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
130 * if XT_SOCKET_TRANSPARENT is used 130 * if XT_SOCKET_TRANSPARENT is used
131 */ 131 */
132 if (info->flags & XT_SOCKET_TRANSPARENT) 132 if (info->flags & XT_SOCKET_TRANSPARENT)
133 transparent = nf_sk_is_transparent(sk); 133 transparent = inet_sk_transparent(sk);
134 134
135 if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && 135 if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
136 transparent && sk_fullsock(sk)) 136 transparent && sk_fullsock(sk))
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index f018eafc2a0d..376181cc1def 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -206,7 +206,6 @@ int nfc_genl_targets_found(struct nfc_dev *dev)
206 return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); 206 return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC);
207 207
208nla_put_failure: 208nla_put_failure:
209 genlmsg_cancel(msg, hdr);
210free_msg: 209free_msg:
211 nlmsg_free(msg); 210 nlmsg_free(msg);
212 return -EMSGSIZE; 211 return -EMSGSIZE;
@@ -237,7 +236,6 @@ int nfc_genl_target_lost(struct nfc_dev *dev, u32 target_idx)
237 return 0; 236 return 0;
238 237
239nla_put_failure: 238nla_put_failure:
240 genlmsg_cancel(msg, hdr);
241free_msg: 239free_msg:
242 nlmsg_free(msg); 240 nlmsg_free(msg);
243 return -EMSGSIZE; 241 return -EMSGSIZE;
@@ -269,7 +267,6 @@ int nfc_genl_tm_activated(struct nfc_dev *dev, u32 protocol)
269 return 0; 267 return 0;
270 268
271nla_put_failure: 269nla_put_failure:
272 genlmsg_cancel(msg, hdr);
273free_msg: 270free_msg:
274 nlmsg_free(msg); 271 nlmsg_free(msg);
275 return -EMSGSIZE; 272 return -EMSGSIZE;
@@ -299,7 +296,6 @@ int nfc_genl_tm_deactivated(struct nfc_dev *dev)
299 return 0; 296 return 0;
300 297
301nla_put_failure: 298nla_put_failure:
302 genlmsg_cancel(msg, hdr);
303free_msg: 299free_msg:
304 nlmsg_free(msg); 300 nlmsg_free(msg);
305 return -EMSGSIZE; 301 return -EMSGSIZE;
@@ -340,7 +336,6 @@ int nfc_genl_device_added(struct nfc_dev *dev)
340 return 0; 336 return 0;
341 337
342nla_put_failure: 338nla_put_failure:
343 genlmsg_cancel(msg, hdr);
344free_msg: 339free_msg:
345 nlmsg_free(msg); 340 nlmsg_free(msg);
346 return -EMSGSIZE; 341 return -EMSGSIZE;
@@ -370,7 +365,6 @@ int nfc_genl_device_removed(struct nfc_dev *dev)
370 return 0; 365 return 0;
371 366
372nla_put_failure: 367nla_put_failure:
373 genlmsg_cancel(msg, hdr);
374free_msg: 368free_msg:
375 nlmsg_free(msg); 369 nlmsg_free(msg);
376 return -EMSGSIZE; 370 return -EMSGSIZE;
@@ -434,8 +428,6 @@ int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list)
434 return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); 428 return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC);
435 429
436nla_put_failure: 430nla_put_failure:
437 genlmsg_cancel(msg, hdr);
438
439free_msg: 431free_msg:
440 nlmsg_free(msg); 432 nlmsg_free(msg);
441 433
@@ -470,7 +462,6 @@ int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type)
470 return 0; 462 return 0;
471 463
472nla_put_failure: 464nla_put_failure:
473 genlmsg_cancel(msg, hdr);
474free_msg: 465free_msg:
475 nlmsg_free(msg); 466 nlmsg_free(msg);
476 return -EMSGSIZE; 467 return -EMSGSIZE;
@@ -501,7 +492,6 @@ int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx)
501 return 0; 492 return 0;
502 493
503nla_put_failure: 494nla_put_failure:
504 genlmsg_cancel(msg, hdr);
505free_msg: 495free_msg:
506 nlmsg_free(msg); 496 nlmsg_free(msg);
507 return -EMSGSIZE; 497 return -EMSGSIZE;
@@ -546,7 +536,6 @@ int nfc_genl_se_transaction(struct nfc_dev *dev, u8 se_idx,
546 return 0; 536 return 0;
547 537
548nla_put_failure: 538nla_put_failure:
549 genlmsg_cancel(msg, hdr);
550free_msg: 539free_msg:
551 /* evt_transaction is no more used */ 540 /* evt_transaction is no more used */
552 devm_kfree(&dev->dev, evt_transaction); 541 devm_kfree(&dev->dev, evt_transaction);
@@ -585,7 +574,6 @@ int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx)
585 return 0; 574 return 0;
586 575
587nla_put_failure: 576nla_put_failure:
588 genlmsg_cancel(msg, hdr);
589free_msg: 577free_msg:
590 nlmsg_free(msg); 578 nlmsg_free(msg);
591 return -EMSGSIZE; 579 return -EMSGSIZE;
@@ -703,7 +691,6 @@ int nfc_genl_dep_link_up_event(struct nfc_dev *dev, u32 target_idx,
703 return 0; 691 return 0;
704 692
705nla_put_failure: 693nla_put_failure:
706 genlmsg_cancel(msg, hdr);
707free_msg: 694free_msg:
708 nlmsg_free(msg); 695 nlmsg_free(msg);
709 return -EMSGSIZE; 696 return -EMSGSIZE;
@@ -735,7 +722,6 @@ int nfc_genl_dep_link_down_event(struct nfc_dev *dev)
735 return 0; 722 return 0;
736 723
737nla_put_failure: 724nla_put_failure:
738 genlmsg_cancel(msg, hdr);
739free_msg: 725free_msg:
740 nlmsg_free(msg); 726 nlmsg_free(msg);
741 return -EMSGSIZE; 727 return -EMSGSIZE;
@@ -1030,7 +1016,6 @@ static int nfc_genl_send_params(struct sk_buff *msg,
1030 return 0; 1016 return 0;
1031 1017
1032nla_put_failure: 1018nla_put_failure:
1033
1034 genlmsg_cancel(msg, hdr); 1019 genlmsg_cancel(msg, hdr);
1035 return -EMSGSIZE; 1020 return -EMSGSIZE;
1036} 1021}
@@ -1290,7 +1275,6 @@ int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
1290 return 0; 1275 return 0;
1291 1276
1292nla_put_failure: 1277nla_put_failure:
1293 genlmsg_cancel(msg, hdr);
1294free_msg: 1278free_msg:
1295 nlmsg_free(msg); 1279 nlmsg_free(msg);
1296 return -EMSGSIZE; 1280 return -EMSGSIZE;
@@ -1507,7 +1491,6 @@ static void se_io_cb(void *context, u8 *apdu, size_t apdu_len, int err)
1507 return; 1491 return;
1508 1492
1509nla_put_failure: 1493nla_put_failure:
1510 genlmsg_cancel(msg, hdr);
1511free_msg: 1494free_msg:
1512 nlmsg_free(msg); 1495 nlmsg_free(msg);
1513 kfree(ctx); 1496 kfree(ctx);
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 2650205cdaf9..89da9512ec1e 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -9,7 +9,8 @@ config OPENVSWITCH
9 (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \ 9 (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
10 (!NF_NAT || NF_NAT) && \ 10 (!NF_NAT || NF_NAT) && \
11 (!NF_NAT_IPV4 || NF_NAT_IPV4) && \ 11 (!NF_NAT_IPV4 || NF_NAT_IPV4) && \
12 (!NF_NAT_IPV6 || NF_NAT_IPV6))) 12 (!NF_NAT_IPV6 || NF_NAT_IPV6) && \
13 (!NETFILTER_CONNCOUNT || NETFILTER_CONNCOUNT)))
13 select LIBCRC32C 14 select LIBCRC32C
14 select MPLS 15 select MPLS
15 select NET_MPLS_GSO 16 select NET_MPLS_GSO
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index c5904f629091..284aca2a252d 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -16,8 +16,11 @@
16#include <linux/tcp.h> 16#include <linux/tcp.h>
17#include <linux/udp.h> 17#include <linux/udp.h>
18#include <linux/sctp.h> 18#include <linux/sctp.h>
19#include <linux/static_key.h>
19#include <net/ip.h> 20#include <net/ip.h>
21#include <net/genetlink.h>
20#include <net/netfilter/nf_conntrack_core.h> 22#include <net/netfilter/nf_conntrack_core.h>
23#include <net/netfilter/nf_conntrack_count.h>
21#include <net/netfilter/nf_conntrack_helper.h> 24#include <net/netfilter/nf_conntrack_helper.h>
22#include <net/netfilter/nf_conntrack_labels.h> 25#include <net/netfilter/nf_conntrack_labels.h>
23#include <net/netfilter/nf_conntrack_seqadj.h> 26#include <net/netfilter/nf_conntrack_seqadj.h>
@@ -72,10 +75,35 @@ struct ovs_conntrack_info {
72 struct md_mark mark; 75 struct md_mark mark;
73 struct md_labels labels; 76 struct md_labels labels;
74#ifdef CONFIG_NF_NAT_NEEDED 77#ifdef CONFIG_NF_NAT_NEEDED
75 struct nf_nat_range range; /* Only present for SRC NAT and DST NAT. */ 78 struct nf_nat_range2 range; /* Only present for SRC NAT and DST NAT. */
76#endif 79#endif
77}; 80};
78 81
82#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
83#define OVS_CT_LIMIT_UNLIMITED 0
84#define OVS_CT_LIMIT_DEFAULT OVS_CT_LIMIT_UNLIMITED
85#define CT_LIMIT_HASH_BUCKETS 512
86static DEFINE_STATIC_KEY_FALSE(ovs_ct_limit_enabled);
87
88struct ovs_ct_limit {
89 /* Elements in ovs_ct_limit_info->limits hash table */
90 struct hlist_node hlist_node;
91 struct rcu_head rcu;
92 u16 zone;
93 u32 limit;
94};
95
96struct ovs_ct_limit_info {
97 u32 default_limit;
98 struct hlist_head *limits;
99 struct nf_conncount_data *data;
100};
101
102static const struct nla_policy ct_limit_policy[OVS_CT_LIMIT_ATTR_MAX + 1] = {
103 [OVS_CT_LIMIT_ATTR_ZONE_LIMIT] = { .type = NLA_NESTED, },
104};
105#endif
106
79static bool labels_nonzero(const struct ovs_key_ct_labels *labels); 107static bool labels_nonzero(const struct ovs_key_ct_labels *labels);
80 108
81static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); 109static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
@@ -710,7 +738,7 @@ static bool skb_nfct_cached(struct net *net,
710 */ 738 */
711static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, 739static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
712 enum ip_conntrack_info ctinfo, 740 enum ip_conntrack_info ctinfo,
713 const struct nf_nat_range *range, 741 const struct nf_nat_range2 *range,
714 enum nf_nat_manip_type maniptype) 742 enum nf_nat_manip_type maniptype)
715{ 743{
716 int hooknum, nh_off, err = NF_ACCEPT; 744 int hooknum, nh_off, err = NF_ACCEPT;
@@ -1036,6 +1064,89 @@ static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
1036 return false; 1064 return false;
1037} 1065}
1038 1066
1067#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
1068static struct hlist_head *ct_limit_hash_bucket(
1069 const struct ovs_ct_limit_info *info, u16 zone)
1070{
1071 return &info->limits[zone & (CT_LIMIT_HASH_BUCKETS - 1)];
1072}
1073
1074/* Call with ovs_mutex */
1075static void ct_limit_set(const struct ovs_ct_limit_info *info,
1076 struct ovs_ct_limit *new_ct_limit)
1077{
1078 struct ovs_ct_limit *ct_limit;
1079 struct hlist_head *head;
1080
1081 head = ct_limit_hash_bucket(info, new_ct_limit->zone);
1082 hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
1083 if (ct_limit->zone == new_ct_limit->zone) {
1084 hlist_replace_rcu(&ct_limit->hlist_node,
1085 &new_ct_limit->hlist_node);
1086 kfree_rcu(ct_limit, rcu);
1087 return;
1088 }
1089 }
1090
1091 hlist_add_head_rcu(&new_ct_limit->hlist_node, head);
1092}
1093
1094/* Call with ovs_mutex */
1095static void ct_limit_del(const struct ovs_ct_limit_info *info, u16 zone)
1096{
1097 struct ovs_ct_limit *ct_limit;
1098 struct hlist_head *head;
1099 struct hlist_node *n;
1100
1101 head = ct_limit_hash_bucket(info, zone);
1102 hlist_for_each_entry_safe(ct_limit, n, head, hlist_node) {
1103 if (ct_limit->zone == zone) {
1104 hlist_del_rcu(&ct_limit->hlist_node);
1105 kfree_rcu(ct_limit, rcu);
1106 return;
1107 }
1108 }
1109}
1110
1111/* Call with RCU read lock */
1112static u32 ct_limit_get(const struct ovs_ct_limit_info *info, u16 zone)
1113{
1114 struct ovs_ct_limit *ct_limit;
1115 struct hlist_head *head;
1116
1117 head = ct_limit_hash_bucket(info, zone);
1118 hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
1119 if (ct_limit->zone == zone)
1120 return ct_limit->limit;
1121 }
1122
1123 return info->default_limit;
1124}
1125
1126static int ovs_ct_check_limit(struct net *net,
1127 const struct ovs_conntrack_info *info,
1128 const struct nf_conntrack_tuple *tuple)
1129{
1130 struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
1131 const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
1132 u32 per_zone_limit, connections;
1133 u32 conncount_key;
1134
1135 conncount_key = info->zone.id;
1136
1137 per_zone_limit = ct_limit_get(ct_limit_info, info->zone.id);
1138 if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED)
1139 return 0;
1140
1141 connections = nf_conncount_count(net, ct_limit_info->data,
1142 &conncount_key, tuple, &info->zone);
1143 if (connections > per_zone_limit)
1144 return -ENOMEM;
1145
1146 return 0;
1147}
1148#endif
1149
1039/* Lookup connection and confirm if unconfirmed. */ 1150/* Lookup connection and confirm if unconfirmed. */
1040static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, 1151static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
1041 const struct ovs_conntrack_info *info, 1152 const struct ovs_conntrack_info *info,
@@ -1054,6 +1165,21 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
1054 if (!ct) 1165 if (!ct)
1055 return 0; 1166 return 0;
1056 1167
1168#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
1169 if (static_branch_unlikely(&ovs_ct_limit_enabled)) {
1170 if (!nf_ct_is_confirmed(ct)) {
1171 err = ovs_ct_check_limit(net, info,
1172 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
1173 if (err) {
1174 net_warn_ratelimited("openvswitch: zone: %u "
 1175 "exceeds conntrack limit\n",
1176 info->zone.id);
1177 return err;
1178 }
1179 }
1180 }
1181#endif
1182
1057 /* Set the conntrack event mask if given. NEW and DELETE events have 1183 /* Set the conntrack event mask if given. NEW and DELETE events have
1058 * their own groups, but the NFNLGRP_CONNTRACK_UPDATE group listener 1184 * their own groups, but the NFNLGRP_CONNTRACK_UPDATE group listener
1059 * typically would receive many kinds of updates. Setting the event 1185 * typically would receive many kinds of updates. Setting the event
@@ -1655,7 +1781,420 @@ static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
1655 nf_ct_tmpl_free(ct_info->ct); 1781 nf_ct_tmpl_free(ct_info->ct);
1656} 1782}
1657 1783
1658void ovs_ct_init(struct net *net) 1784#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
1785static int ovs_ct_limit_init(struct net *net, struct ovs_net *ovs_net)
1786{
1787 int i, err;
1788
1789 ovs_net->ct_limit_info = kmalloc(sizeof(*ovs_net->ct_limit_info),
1790 GFP_KERNEL);
1791 if (!ovs_net->ct_limit_info)
1792 return -ENOMEM;
1793
1794 ovs_net->ct_limit_info->default_limit = OVS_CT_LIMIT_DEFAULT;
1795 ovs_net->ct_limit_info->limits =
1796 kmalloc_array(CT_LIMIT_HASH_BUCKETS, sizeof(struct hlist_head),
1797 GFP_KERNEL);
1798 if (!ovs_net->ct_limit_info->limits) {
1799 kfree(ovs_net->ct_limit_info);
1800 return -ENOMEM;
1801 }
1802
1803 for (i = 0; i < CT_LIMIT_HASH_BUCKETS; i++)
1804 INIT_HLIST_HEAD(&ovs_net->ct_limit_info->limits[i]);
1805
1806 ovs_net->ct_limit_info->data =
1807 nf_conncount_init(net, NFPROTO_INET, sizeof(u32));
1808
1809 if (IS_ERR(ovs_net->ct_limit_info->data)) {
1810 err = PTR_ERR(ovs_net->ct_limit_info->data);
1811 kfree(ovs_net->ct_limit_info->limits);
1812 kfree(ovs_net->ct_limit_info);
1813 pr_err("openvswitch: failed to init nf_conncount %d\n", err);
1814 return err;
1815 }
1816 return 0;
1817}
1818
1819static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net)
1820{
1821 const struct ovs_ct_limit_info *info = ovs_net->ct_limit_info;
1822 int i;
1823
1824 nf_conncount_destroy(net, NFPROTO_INET, info->data);
1825 for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) {
1826 struct hlist_head *head = &info->limits[i];
1827 struct ovs_ct_limit *ct_limit;
1828
1829 hlist_for_each_entry_rcu(ct_limit, head, hlist_node)
1830 kfree_rcu(ct_limit, rcu);
1831 }
1832 kfree(ovs_net->ct_limit_info->limits);
1833 kfree(ovs_net->ct_limit_info);
1834}
1835
1836static struct sk_buff *
1837ovs_ct_limit_cmd_reply_start(struct genl_info *info, u8 cmd,
1838 struct ovs_header **ovs_reply_header)
1839{
1840 struct ovs_header *ovs_header = info->userhdr;
1841 struct sk_buff *skb;
1842
1843 skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
1844 if (!skb)
1845 return ERR_PTR(-ENOMEM);
1846
1847 *ovs_reply_header = genlmsg_put(skb, info->snd_portid,
1848 info->snd_seq,
1849 &dp_ct_limit_genl_family, 0, cmd);
1850
1851 if (!*ovs_reply_header) {
1852 nlmsg_free(skb);
1853 return ERR_PTR(-EMSGSIZE);
1854 }
1855 (*ovs_reply_header)->dp_ifindex = ovs_header->dp_ifindex;
1856
1857 return skb;
1858}
1859
1860static bool check_zone_id(int zone_id, u16 *pzone)
1861{
1862 if (zone_id >= 0 && zone_id <= 65535) {
1863 *pzone = (u16)zone_id;
1864 return true;
1865 }
1866 return false;
1867}
1868
1869static int ovs_ct_limit_set_zone_limit(struct nlattr *nla_zone_limit,
1870 struct ovs_ct_limit_info *info)
1871{
1872 struct ovs_zone_limit *zone_limit;
1873 int rem;
1874 u16 zone;
1875
1876 rem = NLA_ALIGN(nla_len(nla_zone_limit));
1877 zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);
1878
1879 while (rem >= sizeof(*zone_limit)) {
1880 if (unlikely(zone_limit->zone_id ==
1881 OVS_ZONE_LIMIT_DEFAULT_ZONE)) {
1882 ovs_lock();
1883 info->default_limit = zone_limit->limit;
1884 ovs_unlock();
1885 } else if (unlikely(!check_zone_id(
1886 zone_limit->zone_id, &zone))) {
1887 OVS_NLERR(true, "zone id is out of range");
1888 } else {
1889 struct ovs_ct_limit *ct_limit;
1890
1891 ct_limit = kmalloc(sizeof(*ct_limit), GFP_KERNEL);
1892 if (!ct_limit)
1893 return -ENOMEM;
1894
1895 ct_limit->zone = zone;
1896 ct_limit->limit = zone_limit->limit;
1897
1898 ovs_lock();
1899 ct_limit_set(info, ct_limit);
1900 ovs_unlock();
1901 }
1902 rem -= NLA_ALIGN(sizeof(*zone_limit));
1903 zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
1904 NLA_ALIGN(sizeof(*zone_limit)));
1905 }
1906
1907 if (rem)
1908 OVS_NLERR(true, "set zone limit has %d unknown bytes", rem);
1909
1910 return 0;
1911}
1912
1913static int ovs_ct_limit_del_zone_limit(struct nlattr *nla_zone_limit,
1914 struct ovs_ct_limit_info *info)
1915{
1916 struct ovs_zone_limit *zone_limit;
1917 int rem;
1918 u16 zone;
1919
1920 rem = NLA_ALIGN(nla_len(nla_zone_limit));
1921 zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);
1922
1923 while (rem >= sizeof(*zone_limit)) {
1924 if (unlikely(zone_limit->zone_id ==
1925 OVS_ZONE_LIMIT_DEFAULT_ZONE)) {
1926 ovs_lock();
1927 info->default_limit = OVS_CT_LIMIT_DEFAULT;
1928 ovs_unlock();
1929 } else if (unlikely(!check_zone_id(
1930 zone_limit->zone_id, &zone))) {
1931 OVS_NLERR(true, "zone id is out of range");
1932 } else {
1933 ovs_lock();
1934 ct_limit_del(info, zone);
1935 ovs_unlock();
1936 }
1937 rem -= NLA_ALIGN(sizeof(*zone_limit));
1938 zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
1939 NLA_ALIGN(sizeof(*zone_limit)));
1940 }
1941
1942 if (rem)
1943 OVS_NLERR(true, "del zone limit has %d unknown bytes", rem);
1944
1945 return 0;
1946}
1947
1948static int ovs_ct_limit_get_default_limit(struct ovs_ct_limit_info *info,
1949 struct sk_buff *reply)
1950{
1951 struct ovs_zone_limit zone_limit;
1952 int err;
1953
1954 zone_limit.zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE;
1955 zone_limit.limit = info->default_limit;
1956 err = nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit);
1957 if (err)
1958 return err;
1959
1960 return 0;
1961}
1962
1963static int __ovs_ct_limit_get_zone_limit(struct net *net,
1964 struct nf_conncount_data *data,
1965 u16 zone_id, u32 limit,
1966 struct sk_buff *reply)
1967{
1968 struct nf_conntrack_zone ct_zone;
1969 struct ovs_zone_limit zone_limit;
1970 u32 conncount_key = zone_id;
1971
1972 zone_limit.zone_id = zone_id;
1973 zone_limit.limit = limit;
1974 nf_ct_zone_init(&ct_zone, zone_id, NF_CT_DEFAULT_ZONE_DIR, 0);
1975
1976 zone_limit.count = nf_conncount_count(net, data, &conncount_key, NULL,
1977 &ct_zone);
1978 return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit);
1979}
1980
1981static int ovs_ct_limit_get_zone_limit(struct net *net,
1982 struct nlattr *nla_zone_limit,
1983 struct ovs_ct_limit_info *info,
1984 struct sk_buff *reply)
1985{
1986 struct ovs_zone_limit *zone_limit;
1987 int rem, err;
1988 u32 limit;
1989 u16 zone;
1990
1991 rem = NLA_ALIGN(nla_len(nla_zone_limit));
1992 zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);
1993
1994 while (rem >= sizeof(*zone_limit)) {
1995 if (unlikely(zone_limit->zone_id ==
1996 OVS_ZONE_LIMIT_DEFAULT_ZONE)) {
1997 err = ovs_ct_limit_get_default_limit(info, reply);
1998 if (err)
1999 return err;
2000 } else if (unlikely(!check_zone_id(zone_limit->zone_id,
2001 &zone))) {
2002 OVS_NLERR(true, "zone id is out of range");
2003 } else {
2004 rcu_read_lock();
2005 limit = ct_limit_get(info, zone);
2006 rcu_read_unlock();
2007
2008 err = __ovs_ct_limit_get_zone_limit(
2009 net, info->data, zone, limit, reply);
2010 if (err)
2011 return err;
2012 }
2013 rem -= NLA_ALIGN(sizeof(*zone_limit));
2014 zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
2015 NLA_ALIGN(sizeof(*zone_limit)));
2016 }
2017
2018 if (rem)
2019 OVS_NLERR(true, "get zone limit has %d unknown bytes", rem);
2020
2021 return 0;
2022}
2023
2024static int ovs_ct_limit_get_all_zone_limit(struct net *net,
2025 struct ovs_ct_limit_info *info,
2026 struct sk_buff *reply)
2027{
2028 struct ovs_ct_limit *ct_limit;
2029 struct hlist_head *head;
2030 int i, err = 0;
2031
2032 err = ovs_ct_limit_get_default_limit(info, reply);
2033 if (err)
2034 return err;
2035
2036 rcu_read_lock();
2037 for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) {
2038 head = &info->limits[i];
2039 hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
2040 err = __ovs_ct_limit_get_zone_limit(net, info->data,
2041 ct_limit->zone, ct_limit->limit, reply);
2042 if (err)
2043 goto exit_err;
2044 }
2045 }
2046
2047exit_err:
2048 rcu_read_unlock();
2049 return err;
2050}
2051
2052static int ovs_ct_limit_cmd_set(struct sk_buff *skb, struct genl_info *info)
2053{
2054 struct nlattr **a = info->attrs;
2055 struct sk_buff *reply;
2056 struct ovs_header *ovs_reply_header;
2057 struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
2058 struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
2059 int err;
2060
2061 reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_SET,
2062 &ovs_reply_header);
2063 if (IS_ERR(reply))
2064 return PTR_ERR(reply);
2065
2066 if (!a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) {
2067 err = -EINVAL;
2068 goto exit_err;
2069 }
2070
2071 err = ovs_ct_limit_set_zone_limit(a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT],
2072 ct_limit_info);
2073 if (err)
2074 goto exit_err;
2075
2076 static_branch_enable(&ovs_ct_limit_enabled);
2077
2078 genlmsg_end(reply, ovs_reply_header);
2079 return genlmsg_reply(reply, info);
2080
2081exit_err:
2082 nlmsg_free(reply);
2083 return err;
2084}
2085
2086static int ovs_ct_limit_cmd_del(struct sk_buff *skb, struct genl_info *info)
2087{
2088 struct nlattr **a = info->attrs;
2089 struct sk_buff *reply;
2090 struct ovs_header *ovs_reply_header;
2091 struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
2092 struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
2093 int err;
2094
2095 reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_DEL,
2096 &ovs_reply_header);
2097 if (IS_ERR(reply))
2098 return PTR_ERR(reply);
2099
2100 if (!a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) {
2101 err = -EINVAL;
2102 goto exit_err;
2103 }
2104
2105 err = ovs_ct_limit_del_zone_limit(a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT],
2106 ct_limit_info);
2107 if (err)
2108 goto exit_err;
2109
2110 genlmsg_end(reply, ovs_reply_header);
2111 return genlmsg_reply(reply, info);
2112
2113exit_err:
2114 nlmsg_free(reply);
2115 return err;
2116}
2117
2118static int ovs_ct_limit_cmd_get(struct sk_buff *skb, struct genl_info *info)
2119{
2120 struct nlattr **a = info->attrs;
2121 struct nlattr *nla_reply;
2122 struct sk_buff *reply;
2123 struct ovs_header *ovs_reply_header;
2124 struct net *net = sock_net(skb->sk);
2125 struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
2126 struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
2127 int err;
2128
2129 reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_GET,
2130 &ovs_reply_header);
2131 if (IS_ERR(reply))
2132 return PTR_ERR(reply);
2133
2134 nla_reply = nla_nest_start(reply, OVS_CT_LIMIT_ATTR_ZONE_LIMIT);
2135
2136 if (a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) {
2137 err = ovs_ct_limit_get_zone_limit(
2138 net, a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT], ct_limit_info,
2139 reply);
2140 if (err)
2141 goto exit_err;
2142 } else {
2143 err = ovs_ct_limit_get_all_zone_limit(net, ct_limit_info,
2144 reply);
2145 if (err)
2146 goto exit_err;
2147 }
2148
2149 nla_nest_end(reply, nla_reply);
2150 genlmsg_end(reply, ovs_reply_header);
2151 return genlmsg_reply(reply, info);
2152
2153exit_err:
2154 nlmsg_free(reply);
2155 return err;
2156}
2157
2158static struct genl_ops ct_limit_genl_ops[] = {
2159 { .cmd = OVS_CT_LIMIT_CMD_SET,
2160 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
2161 * privilege. */
2162 .policy = ct_limit_policy,
2163 .doit = ovs_ct_limit_cmd_set,
2164 },
2165 { .cmd = OVS_CT_LIMIT_CMD_DEL,
2166 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
2167 * privilege. */
2168 .policy = ct_limit_policy,
2169 .doit = ovs_ct_limit_cmd_del,
2170 },
2171 { .cmd = OVS_CT_LIMIT_CMD_GET,
2172 .flags = 0, /* OK for unprivileged users. */
2173 .policy = ct_limit_policy,
2174 .doit = ovs_ct_limit_cmd_get,
2175 },
2176};
2177
2178static const struct genl_multicast_group ovs_ct_limit_multicast_group = {
2179 .name = OVS_CT_LIMIT_MCGROUP,
2180};
2181
2182struct genl_family dp_ct_limit_genl_family __ro_after_init = {
2183 .hdrsize = sizeof(struct ovs_header),
2184 .name = OVS_CT_LIMIT_FAMILY,
2185 .version = OVS_CT_LIMIT_VERSION,
2186 .maxattr = OVS_CT_LIMIT_ATTR_MAX,
2187 .netnsok = true,
2188 .parallel_ops = true,
2189 .ops = ct_limit_genl_ops,
2190 .n_ops = ARRAY_SIZE(ct_limit_genl_ops),
2191 .mcgrps = &ovs_ct_limit_multicast_group,
2192 .n_mcgrps = 1,
2193 .module = THIS_MODULE,
2194};
2195#endif
2196
2197int ovs_ct_init(struct net *net)
1659{ 2198{
1660 unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE; 2199 unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE;
1661 struct ovs_net *ovs_net = net_generic(net, ovs_net_id); 2200 struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
@@ -1666,12 +2205,22 @@ void ovs_ct_init(struct net *net)
1666 } else { 2205 } else {
1667 ovs_net->xt_label = true; 2206 ovs_net->xt_label = true;
1668 } 2207 }
2208
2209#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
2210 return ovs_ct_limit_init(net, ovs_net);
2211#else
2212 return 0;
2213#endif
1669} 2214}
1670 2215
1671void ovs_ct_exit(struct net *net) 2216void ovs_ct_exit(struct net *net)
1672{ 2217{
1673 struct ovs_net *ovs_net = net_generic(net, ovs_net_id); 2218 struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
1674 2219
2220#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
2221 ovs_ct_limit_exit(net, ovs_net);
2222#endif
2223
1675 if (ovs_net->xt_label) 2224 if (ovs_net->xt_label)
1676 nf_connlabels_put(net); 2225 nf_connlabels_put(net);
1677} 2226}
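In the conntrack changes above, per-zone limits live in a 512-bucket table indexed by the 16-bit zone id masked with CT_LIMIT_HASH_BUCKETS - 1, and ct_limit_get() falls back to the default limit (0, i.e. unlimited) on a miss. A toy user-space sketch of that lookup, assuming a single entry per bucket instead of the kernel's RCU hlists:

#include <stdio.h>

#define CT_LIMIT_HASH_BUCKETS 512   /* power of two, as in the hunk above */

/* One slot per bucket; the kernel walks an hlist here instead. */
struct zone_limit {
	unsigned short zone;
	unsigned int limit;
	int used;
};

static struct zone_limit buckets[CT_LIMIT_HASH_BUCKETS];
static unsigned int default_limit;  /* 0 == unlimited */

static unsigned int bucket_of(unsigned short zone)
{
	return zone & (CT_LIMIT_HASH_BUCKETS - 1);
}

static unsigned int limit_get(unsigned short zone)
{
	const struct zone_limit *b = &buckets[bucket_of(zone)];

	return (b->used && b->zone == zone) ? b->limit : default_limit;
}

int main(void)
{
	buckets[bucket_of(5)] = (struct zone_limit){ .zone = 5, .limit = 100, .used = 1 };
	printf("zone 5 limit=%u, zone 6 limit=%u\n", limit_get(5), limit_get(6));
	return 0;
}

ovs_ct_check_limit() then only pays for nf_conncount_count() on zones that actually have a finite limit configured.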
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 399dfdd2c4f9..900dadd70974 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -17,10 +17,11 @@
17#include "flow.h" 17#include "flow.h"
18 18
19struct ovs_conntrack_info; 19struct ovs_conntrack_info;
20struct ovs_ct_limit_info;
20enum ovs_key_attr; 21enum ovs_key_attr;
21 22
22#if IS_ENABLED(CONFIG_NF_CONNTRACK) 23#if IS_ENABLED(CONFIG_NF_CONNTRACK)
23void ovs_ct_init(struct net *); 24int ovs_ct_init(struct net *);
24void ovs_ct_exit(struct net *); 25void ovs_ct_exit(struct net *);
25bool ovs_ct_verify(struct net *, enum ovs_key_attr attr); 26bool ovs_ct_verify(struct net *, enum ovs_key_attr attr);
26int ovs_ct_copy_action(struct net *, const struct nlattr *, 27int ovs_ct_copy_action(struct net *, const struct nlattr *,
@@ -44,7 +45,7 @@ void ovs_ct_free_action(const struct nlattr *a);
44#else 45#else
45#include <linux/errno.h> 46#include <linux/errno.h>
46 47
47static inline void ovs_ct_init(struct net *net) { } 48static inline int ovs_ct_init(struct net *net) { return 0; }
48 49
49static inline void ovs_ct_exit(struct net *net) { } 50static inline void ovs_ct_exit(struct net *net) { }
50 51
@@ -104,4 +105,8 @@ static inline void ovs_ct_free_action(const struct nlattr *a) { }
104 105
105#define CT_SUPPORTED_MASK 0 106#define CT_SUPPORTED_MASK 0
106#endif /* CONFIG_NF_CONNTRACK */ 107#endif /* CONFIG_NF_CONNTRACK */
108
109#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
110extern struct genl_family dp_ct_limit_genl_family;
111#endif
107#endif /* ovs_conntrack.h */ 112#endif /* ovs_conntrack.h */
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 015e24e08909..a61818e94396 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2288,6 +2288,9 @@ static struct genl_family * const dp_genl_families[] = {
2288 &dp_flow_genl_family, 2288 &dp_flow_genl_family,
2289 &dp_packet_genl_family, 2289 &dp_packet_genl_family,
2290 &dp_meter_genl_family, 2290 &dp_meter_genl_family,
2291#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
2292 &dp_ct_limit_genl_family,
2293#endif
2291}; 2294};
2292 2295
2293static void dp_unregister_genl(int n_families) 2296static void dp_unregister_genl(int n_families)
@@ -2323,8 +2326,7 @@ static int __net_init ovs_init_net(struct net *net)
2323 2326
2324 INIT_LIST_HEAD(&ovs_net->dps); 2327 INIT_LIST_HEAD(&ovs_net->dps);
2325 INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); 2328 INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
2326 ovs_ct_init(net); 2329 return ovs_ct_init(net);
2327 return 0;
2328} 2330}
2329 2331
2330static void __net_exit list_vports_from_net(struct net *net, struct net *dnet, 2332static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
@@ -2469,3 +2471,4 @@ MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY);
2469MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY); 2471MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY);
2470MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY); 2472MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY);
2471MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY); 2473MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY);
2474MODULE_ALIAS_GENL_FAMILY(OVS_CT_LIMIT_FAMILY);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 523d65526766..c9eb267c6f7e 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -144,6 +144,9 @@ struct dp_upcall_info {
144struct ovs_net { 144struct ovs_net {
145 struct list_head dps; 145 struct list_head dps;
146 struct work_struct dp_notify_work; 146 struct work_struct dp_notify_work;
147#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
148 struct ovs_ct_limit_info *ct_limit_info;
149#endif
147 150
148 /* Module reference for configuring conntrack. */ 151 /* Module reference for configuring conntrack. */
149 bool xt_label; 152 bool xt_label;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 674390b1f084..54ce66f68482 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -209,7 +209,7 @@ static void prb_clear_rxhash(struct tpacket_kbdq_core *,
209static void prb_fill_vlan_info(struct tpacket_kbdq_core *, 209static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
210 struct tpacket3_hdr *); 210 struct tpacket3_hdr *);
211static void packet_flush_mclist(struct sock *sk); 211static void packet_flush_mclist(struct sock *sk);
212static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb); 212static u16 packet_pick_tx_queue(struct sk_buff *skb);
213 213
214struct packet_skb_cb { 214struct packet_skb_cb {
215 union { 215 union {
@@ -243,40 +243,7 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po);
243 243
244static int packet_direct_xmit(struct sk_buff *skb) 244static int packet_direct_xmit(struct sk_buff *skb)
245{ 245{
246 struct net_device *dev = skb->dev; 246 return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
247 struct sk_buff *orig_skb = skb;
248 struct netdev_queue *txq;
249 int ret = NETDEV_TX_BUSY;
250 bool again = false;
251
252 if (unlikely(!netif_running(dev) ||
253 !netif_carrier_ok(dev)))
254 goto drop;
255
256 skb = validate_xmit_skb_list(skb, dev, &again);
257 if (skb != orig_skb)
258 goto drop;
259
260 packet_pick_tx_queue(dev, skb);
261 txq = skb_get_tx_queue(dev, skb);
262
263 local_bh_disable();
264
265 HARD_TX_LOCK(dev, txq, smp_processor_id());
266 if (!netif_xmit_frozen_or_drv_stopped(txq))
267 ret = netdev_start_xmit(skb, dev, txq, false);
268 HARD_TX_UNLOCK(dev, txq);
269
270 local_bh_enable();
271
272 if (!dev_xmit_complete(ret))
273 kfree_skb(skb);
274
275 return ret;
276drop:
277 atomic_long_inc(&dev->tx_dropped);
278 kfree_skb_list(skb);
279 return NET_XMIT_DROP;
280} 247}
281 248
282static struct net_device *packet_cached_dev_get(struct packet_sock *po) 249static struct net_device *packet_cached_dev_get(struct packet_sock *po)
@@ -313,8 +280,9 @@ static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
313 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues; 280 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
314} 281}
315 282
316static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) 283static u16 packet_pick_tx_queue(struct sk_buff *skb)
317{ 284{
285 struct net_device *dev = skb->dev;
318 const struct net_device_ops *ops = dev->netdev_ops; 286 const struct net_device_ops *ops = dev->netdev_ops;
319 u16 queue_index; 287 u16 queue_index;
320 288
@@ -326,7 +294,7 @@ static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
326 queue_index = __packet_pick_tx_queue(dev, skb); 294 queue_index = __packet_pick_tx_queue(dev, skb);
327 } 295 }
328 296
329 skb_set_queue_mapping(skb, queue_index); 297 return queue_index;
330} 298}
331 299
332/* __register_prot_hook must be invoked through register_prot_hook 300/* __register_prot_hook must be invoked through register_prot_hook
@@ -4281,7 +4249,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4281 goto out; 4249 goto out;
4282 if (po->tp_version >= TPACKET_V3 && 4250 if (po->tp_version >= TPACKET_V3 &&
4283 req->tp_block_size <= 4251 req->tp_block_size <=
4284 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv)) 4252 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + sizeof(struct tpacket3_hdr))
4285 goto out; 4253 goto out;
4286 if (unlikely(req->tp_frame_size < po->tp_hdrlen + 4254 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
4287 po->tp_reserve)) 4255 po->tp_reserve))
diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig
index 326fd97444f5..1944834d225c 100644
--- a/net/qrtr/Kconfig
+++ b/net/qrtr/Kconfig
@@ -21,4 +21,11 @@ config QRTR_SMD
21 Say Y here to support SMD based ipcrouter channels. SMD is the 21 Say Y here to support SMD based ipcrouter channels. SMD is the
22 most common transport for IPC Router. 22 most common transport for IPC Router.
23 23
24config QRTR_TUN
25 tristate "TUN device for Qualcomm IPC Router"
26 ---help---
27 Say Y here to expose a character device that allows user space to
28 implement endpoints of QRTR, for the purpose of tunneling data to
29 other hosts, or for testing.
30
24endif # QRTR 31endif # QRTR
diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile
index ab09e40f7c74..be012bfd3e52 100644
--- a/net/qrtr/Makefile
+++ b/net/qrtr/Makefile
@@ -2,3 +2,5 @@ obj-$(CONFIG_QRTR) := qrtr.o
2 2
3obj-$(CONFIG_QRTR_SMD) += qrtr-smd.o 3obj-$(CONFIG_QRTR_SMD) += qrtr-smd.o
4qrtr-smd-y := smd.o 4qrtr-smd-y := smd.o
5obj-$(CONFIG_QRTR_TUN) += qrtr-tun.o
6qrtr-tun-y := tun.o
diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c
new file mode 100644
index 000000000000..ccff1e544c21
--- /dev/null
+++ b/net/qrtr/tun.c
@@ -0,0 +1,161 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright (c) 2018, Linaro Ltd */
3
4#include <linux/miscdevice.h>
5#include <linux/module.h>
6#include <linux/poll.h>
7#include <linux/skbuff.h>
8#include <linux/uaccess.h>
9
10#include "qrtr.h"
11
12struct qrtr_tun {
13 struct qrtr_endpoint ep;
14
15 struct sk_buff_head queue;
16 wait_queue_head_t readq;
17};
18
19static int qrtr_tun_send(struct qrtr_endpoint *ep, struct sk_buff *skb)
20{
21 struct qrtr_tun *tun = container_of(ep, struct qrtr_tun, ep);
22
23 skb_queue_tail(&tun->queue, skb);
24
25 /* wake up any blocking processes, waiting for new data */
26 wake_up_interruptible(&tun->readq);
27
28 return 0;
29}
30
31static int qrtr_tun_open(struct inode *inode, struct file *filp)
32{
33 struct qrtr_tun *tun;
34
35 tun = kzalloc(sizeof(*tun), GFP_KERNEL);
36 if (!tun)
37 return -ENOMEM;
38
39 skb_queue_head_init(&tun->queue);
40 init_waitqueue_head(&tun->readq);
41
42 tun->ep.xmit = qrtr_tun_send;
43
44 filp->private_data = tun;
45
46 return qrtr_endpoint_register(&tun->ep, QRTR_EP_NID_AUTO);
47}
48
49static ssize_t qrtr_tun_read_iter(struct kiocb *iocb, struct iov_iter *to)
50{
51 struct file *filp = iocb->ki_filp;
52 struct qrtr_tun *tun = filp->private_data;
53 struct sk_buff *skb;
54 int count;
55
56 while (!(skb = skb_dequeue(&tun->queue))) {
57 if (filp->f_flags & O_NONBLOCK)
58 return -EAGAIN;
59
60 /* Wait until we get data or the endpoint goes away */
61 if (wait_event_interruptible(tun->readq,
62 !skb_queue_empty(&tun->queue)))
63 return -ERESTARTSYS;
64 }
65
66 count = min_t(size_t, iov_iter_count(to), skb->len);
67 if (copy_to_iter(skb->data, count, to) != count)
68 count = -EFAULT;
69
70 kfree_skb(skb);
71
72 return count;
73}
74
75static ssize_t qrtr_tun_write_iter(struct kiocb *iocb, struct iov_iter *from)
76{
77 struct file *filp = iocb->ki_filp;
78 struct qrtr_tun *tun = filp->private_data;
79 size_t len = iov_iter_count(from);
80 ssize_t ret;
81 void *kbuf;
82
83 kbuf = kzalloc(len, GFP_KERNEL);
84 if (!kbuf)
85 return -ENOMEM;
86
87 if (!copy_from_iter_full(kbuf, len, from))
88 return -EFAULT;
89
90 ret = qrtr_endpoint_post(&tun->ep, kbuf, len);
91
92 return ret < 0 ? ret : len;
93}
94
95static __poll_t qrtr_tun_poll(struct file *filp, poll_table *wait)
96{
97 struct qrtr_tun *tun = filp->private_data;
98 __poll_t mask = 0;
99
100 poll_wait(filp, &tun->readq, wait);
101
102 if (!skb_queue_empty(&tun->queue))
103 mask |= EPOLLIN | EPOLLRDNORM;
104
105 return mask;
106}
107
108static int qrtr_tun_release(struct inode *inode, struct file *filp)
109{
110 struct qrtr_tun *tun = filp->private_data;
111 struct sk_buff *skb;
112
113 qrtr_endpoint_unregister(&tun->ep);
114
115 /* Discard all SKBs */
116 while (!skb_queue_empty(&tun->queue)) {
117 skb = skb_dequeue(&tun->queue);
118 kfree_skb(skb);
119 }
120
121 kfree(tun);
122
123 return 0;
124}
125
126static const struct file_operations qrtr_tun_ops = {
127 .owner = THIS_MODULE,
128 .open = qrtr_tun_open,
129 .poll = qrtr_tun_poll,
130 .read_iter = qrtr_tun_read_iter,
131 .write_iter = qrtr_tun_write_iter,
132 .release = qrtr_tun_release,
133};
134
135static struct miscdevice qrtr_tun_miscdev = {
136 MISC_DYNAMIC_MINOR,
137 "qrtr-tun",
138 &qrtr_tun_ops,
139};
140
141static int __init qrtr_tun_init(void)
142{
143 int ret;
144
145 ret = misc_register(&qrtr_tun_miscdev);
146 if (ret)
147 pr_err("failed to register Qualcomm IPC Router tun device\n");
148
149 return ret;
150}
151
152static void __exit qrtr_tun_exit(void)
153{
154 misc_deregister(&qrtr_tun_miscdev);
155}
156
157module_init(qrtr_tun_init);
158module_exit(qrtr_tun_exit);
159
160MODULE_DESCRIPTION("Qualcomm IPC Router TUN device");
161MODULE_LICENSE("GPL v2");
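The new qrtr-tun driver queues incoming QRTR frames for read() and pushes write() payloads back into the router via qrtr_endpoint_post(). A hypothetical user-space peer, assuming the miscdevice shows up under its default name as /dev/qrtr-tun, could look like this:

/* Sketch of a user-space endpoint: each read() returns one queued
 * QRTR frame, each write() would inject one.  The device path is an
 * assumption based on the miscdevice name registered above. */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned char frame[65536];
	struct pollfd pfd;
	ssize_t n;
	int fd;

	fd = open("/dev/qrtr-tun", O_RDWR);
	if (fd < 0) {
		perror("open /dev/qrtr-tun");
		return 1;
	}

	pfd.fd = fd;
	pfd.events = POLLIN;
	if (poll(&pfd, 1, 5000) == 1) {        /* wait up to 5s for a frame */
		n = read(fd, frame, sizeof(frame));
		if (n > 0)
			printf("received %zd byte QRTR frame\n", n);
	}

	close(fd);
	return 0;
}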
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 59d0eb960275..a7a4e6ff9be2 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -178,9 +178,10 @@ static void rfkill_led_trigger_unregister(struct rfkill *rfkill)
178} 178}
179 179
180static struct led_trigger rfkill_any_led_trigger; 180static struct led_trigger rfkill_any_led_trigger;
181static struct work_struct rfkill_any_work; 181static struct led_trigger rfkill_none_led_trigger;
182static struct work_struct rfkill_global_led_trigger_work;
182 183
183static void rfkill_any_led_trigger_worker(struct work_struct *work) 184static void rfkill_global_led_trigger_worker(struct work_struct *work)
184{ 185{
185 enum led_brightness brightness = LED_OFF; 186 enum led_brightness brightness = LED_OFF;
186 struct rfkill *rfkill; 187 struct rfkill *rfkill;
@@ -195,30 +196,43 @@ static void rfkill_any_led_trigger_worker(struct work_struct *work)
195 mutex_unlock(&rfkill_global_mutex); 196 mutex_unlock(&rfkill_global_mutex);
196 197
197 led_trigger_event(&rfkill_any_led_trigger, brightness); 198 led_trigger_event(&rfkill_any_led_trigger, brightness);
199 led_trigger_event(&rfkill_none_led_trigger,
200 brightness == LED_OFF ? LED_FULL : LED_OFF);
198} 201}
199 202
200static void rfkill_any_led_trigger_event(void) 203static void rfkill_global_led_trigger_event(void)
201{ 204{
202 schedule_work(&rfkill_any_work); 205 schedule_work(&rfkill_global_led_trigger_work);
203} 206}
204 207
205static void rfkill_any_led_trigger_activate(struct led_classdev *led_cdev) 208static int rfkill_global_led_trigger_register(void)
206{ 209{
207 rfkill_any_led_trigger_event(); 210 int ret;
208} 211
212 INIT_WORK(&rfkill_global_led_trigger_work,
213 rfkill_global_led_trigger_worker);
209 214
210static int rfkill_any_led_trigger_register(void)
211{
212 INIT_WORK(&rfkill_any_work, rfkill_any_led_trigger_worker);
213 rfkill_any_led_trigger.name = "rfkill-any"; 215 rfkill_any_led_trigger.name = "rfkill-any";
214 rfkill_any_led_trigger.activate = rfkill_any_led_trigger_activate; 216 ret = led_trigger_register(&rfkill_any_led_trigger);
215 return led_trigger_register(&rfkill_any_led_trigger); 217 if (ret)
218 return ret;
219
220 rfkill_none_led_trigger.name = "rfkill-none";
221 ret = led_trigger_register(&rfkill_none_led_trigger);
222 if (ret)
223 led_trigger_unregister(&rfkill_any_led_trigger);
224 else
225 /* Delay activation until all global triggers are registered */
226 rfkill_global_led_trigger_event();
227
228 return ret;
216} 229}
217 230
218static void rfkill_any_led_trigger_unregister(void) 231static void rfkill_global_led_trigger_unregister(void)
219{ 232{
233 led_trigger_unregister(&rfkill_none_led_trigger);
220 led_trigger_unregister(&rfkill_any_led_trigger); 234 led_trigger_unregister(&rfkill_any_led_trigger);
221 cancel_work_sync(&rfkill_any_work); 235 cancel_work_sync(&rfkill_global_led_trigger_work);
222} 236}
223#else 237#else
224static void rfkill_led_trigger_event(struct rfkill *rfkill) 238static void rfkill_led_trigger_event(struct rfkill *rfkill)
@@ -234,16 +248,16 @@ static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill)
234{ 248{
235} 249}
236 250
237static void rfkill_any_led_trigger_event(void) 251static void rfkill_global_led_trigger_event(void)
238{ 252{
239} 253}
240 254
241static int rfkill_any_led_trigger_register(void) 255static int rfkill_global_led_trigger_register(void)
242{ 256{
243 return 0; 257 return 0;
244} 258}
245 259
246static void rfkill_any_led_trigger_unregister(void) 260static void rfkill_global_led_trigger_unregister(void)
247{ 261{
248} 262}
249#endif /* CONFIG_RFKILL_LEDS */ 263#endif /* CONFIG_RFKILL_LEDS */
@@ -354,7 +368,7 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
354 spin_unlock_irqrestore(&rfkill->lock, flags); 368 spin_unlock_irqrestore(&rfkill->lock, flags);
355 369
356 rfkill_led_trigger_event(rfkill); 370 rfkill_led_trigger_event(rfkill);
357 rfkill_any_led_trigger_event(); 371 rfkill_global_led_trigger_event();
358 372
359 if (prev != curr) 373 if (prev != curr)
360 rfkill_event(rfkill); 374 rfkill_event(rfkill);
@@ -535,7 +549,7 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
535 spin_unlock_irqrestore(&rfkill->lock, flags); 549 spin_unlock_irqrestore(&rfkill->lock, flags);
536 550
537 rfkill_led_trigger_event(rfkill); 551 rfkill_led_trigger_event(rfkill);
538 rfkill_any_led_trigger_event(); 552 rfkill_global_led_trigger_event();
539 553
540 if (rfkill->registered && prev != blocked) 554 if (rfkill->registered && prev != blocked)
541 schedule_work(&rfkill->uevent_work); 555 schedule_work(&rfkill->uevent_work);
@@ -579,7 +593,7 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
579 schedule_work(&rfkill->uevent_work); 593 schedule_work(&rfkill->uevent_work);
580 594
581 rfkill_led_trigger_event(rfkill); 595 rfkill_led_trigger_event(rfkill);
582 rfkill_any_led_trigger_event(); 596 rfkill_global_led_trigger_event();
583 597
584 return blocked; 598 return blocked;
585} 599}
@@ -629,7 +643,7 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
629 schedule_work(&rfkill->uevent_work); 643 schedule_work(&rfkill->uevent_work);
630 644
631 rfkill_led_trigger_event(rfkill); 645 rfkill_led_trigger_event(rfkill);
632 rfkill_any_led_trigger_event(); 646 rfkill_global_led_trigger_event();
633 } 647 }
634} 648}
635EXPORT_SYMBOL(rfkill_set_states); 649EXPORT_SYMBOL(rfkill_set_states);
@@ -1046,7 +1060,7 @@ int __must_check rfkill_register(struct rfkill *rfkill)
1046#endif 1060#endif
1047 } 1061 }
1048 1062
1049 rfkill_any_led_trigger_event(); 1063 rfkill_global_led_trigger_event();
1050 rfkill_send_events(rfkill, RFKILL_OP_ADD); 1064 rfkill_send_events(rfkill, RFKILL_OP_ADD);
1051 1065
1052 mutex_unlock(&rfkill_global_mutex); 1066 mutex_unlock(&rfkill_global_mutex);
@@ -1079,7 +1093,7 @@ void rfkill_unregister(struct rfkill *rfkill)
1079 mutex_lock(&rfkill_global_mutex); 1093 mutex_lock(&rfkill_global_mutex);
1080 rfkill_send_events(rfkill, RFKILL_OP_DEL); 1094 rfkill_send_events(rfkill, RFKILL_OP_DEL);
1081 list_del_init(&rfkill->node); 1095 list_del_init(&rfkill->node);
1082 rfkill_any_led_trigger_event(); 1096 rfkill_global_led_trigger_event();
1083 mutex_unlock(&rfkill_global_mutex); 1097 mutex_unlock(&rfkill_global_mutex);
1084 1098
1085 rfkill_led_trigger_unregister(rfkill); 1099 rfkill_led_trigger_unregister(rfkill);
@@ -1332,7 +1346,7 @@ static int __init rfkill_init(void)
1332 if (error) 1346 if (error)
1333 goto error_misc; 1347 goto error_misc;
1334 1348
1335 error = rfkill_any_led_trigger_register(); 1349 error = rfkill_global_led_trigger_register();
1336 if (error) 1350 if (error)
1337 goto error_led_trigger; 1351 goto error_led_trigger;
1338 1352
@@ -1346,7 +1360,7 @@ static int __init rfkill_init(void)
1346 1360
1347#ifdef CONFIG_RFKILL_INPUT 1361#ifdef CONFIG_RFKILL_INPUT
1348error_input: 1362error_input:
1349 rfkill_any_led_trigger_unregister(); 1363 rfkill_global_led_trigger_unregister();
1350#endif 1364#endif
1351error_led_trigger: 1365error_led_trigger:
1352 misc_deregister(&rfkill_miscdev); 1366 misc_deregister(&rfkill_miscdev);
@@ -1362,7 +1376,7 @@ static void __exit rfkill_exit(void)
1362#ifdef CONFIG_RFKILL_INPUT 1376#ifdef CONFIG_RFKILL_INPUT
1363 rfkill_handler_exit(); 1377 rfkill_handler_exit();
1364#endif 1378#endif
1365 rfkill_any_led_trigger_unregister(); 1379 rfkill_global_led_trigger_unregister();
1366 misc_deregister(&rfkill_miscdev); 1380 misc_deregister(&rfkill_miscdev);
1367 class_unregister(&rfkill_class); 1381 class_unregister(&rfkill_class);
1368} 1382}
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 29923ec2189c..5fb7d3254d9e 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -477,6 +477,7 @@ enum rxrpc_call_flag {
477 RXRPC_CALL_PINGING, /* Ping in process */ 477 RXRPC_CALL_PINGING, /* Ping in process */
478 RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ 478 RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */
479 RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */ 479 RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */
480 RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */
480}; 481};
481 482
482/* 483/*
@@ -624,6 +625,7 @@ struct rxrpc_call {
624 */ 625 */
625 rxrpc_seq_t rx_top; /* Highest Rx slot allocated. */ 626 rxrpc_seq_t rx_top; /* Highest Rx slot allocated. */
626 rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */ 627 rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */
628 rxrpc_serial_t rx_serial; /* Highest serial received for this call */
627 u8 rx_winsize; /* Size of Rx window */ 629 u8 rx_winsize; /* Size of Rx window */
628 u8 tx_winsize; /* Maximum size of Tx window */ 630 u8 tx_winsize; /* Maximum size of Tx window */
629 bool tx_phase; /* T if transmission phase, F if receive phase */ 631 bool tx_phase; /* T if transmission phase, F if receive phase */
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 6e0d788b4dc4..20210418904b 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -392,7 +392,13 @@ recheck_state:
392 392
393 /* Process events */ 393 /* Process events */
394 if (test_and_clear_bit(RXRPC_CALL_EV_EXPIRED, &call->events)) { 394 if (test_and_clear_bit(RXRPC_CALL_EV_EXPIRED, &call->events)) {
395 rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ETIME); 395 if (test_bit(RXRPC_CALL_RX_HEARD, &call->flags) &&
396 (int)call->conn->hi_serial - (int)call->rx_serial > 0) {
397 trace_rxrpc_call_reset(call);
398 rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ECONNRESET);
399 } else {
400 rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ETIME);
401 }
396 set_bit(RXRPC_CALL_EV_ABORT, &call->events); 402 set_bit(RXRPC_CALL_EV_ABORT, &call->events);
397 goto recheck_state; 403 goto recheck_state;
398 } 404 }
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 1350f1be8037..8229a52c2acd 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -70,7 +70,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
70 iov[2].iov_len = sizeof(ack_info); 70 iov[2].iov_len = sizeof(ack_info);
71 71
72 pkt.whdr.epoch = htonl(conn->proto.epoch); 72 pkt.whdr.epoch = htonl(conn->proto.epoch);
73 pkt.whdr.cid = htonl(conn->proto.cid); 73 pkt.whdr.cid = htonl(conn->proto.cid | channel);
74 pkt.whdr.callNumber = htonl(call_id); 74 pkt.whdr.callNumber = htonl(call_id);
75 pkt.whdr.seq = 0; 75 pkt.whdr.seq = 0;
76 pkt.whdr.type = chan->last_type; 76 pkt.whdr.type = chan->last_type;
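
The cid fix above matters because an rxrpc connection multiplexes several calls and the low bits of the on-wire connection ID select the channel; retransmitting a call's terminal packet with the bare conn->proto.cid would address channel zero regardless of which call is being answered. A minimal sketch of the assumed encoding, using hypothetical constants rather than the kernel's own:

#include <linux/types.h>

/* Sketch only: the constants below are assumptions about the rxrpc cid
 * layout (four calls per connection, channel in the low bits) and are
 * not copied from the patch.
 */
#define SKETCH_RXRPC_MAXCALLS		4
#define SKETCH_RXRPC_CHANNELMASK	(SKETCH_RXRPC_MAXCALLS - 1)

static inline u32 sketch_wire_cid(u32 conn_cid, u8 channel)
{
	return (conn_cid & ~SKETCH_RXRPC_CHANNELMASK) |
	       (channel & SKETCH_RXRPC_CHANNELMASK);
}
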
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index b5fd6381313d..608d078a4981 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -1278,8 +1278,14 @@ void rxrpc_data_ready(struct sock *udp_sk)
1278 call = NULL; 1278 call = NULL;
1279 } 1279 }
1280 1280
1281 if (call && sp->hdr.serviceId != call->service_id) 1281 if (call) {
1282 call->service_id = sp->hdr.serviceId; 1282 if (sp->hdr.serviceId != call->service_id)
1283 call->service_id = sp->hdr.serviceId;
1284 if ((int)sp->hdr.serial - (int)call->rx_serial > 0)
1285 call->rx_serial = sp->hdr.serial;
1286 if (!test_bit(RXRPC_CALL_RX_HEARD, &call->flags))
1287 set_bit(RXRPC_CALL_RX_HEARD, &call->flags);
1288 }
1283 } else { 1289 } else {
1284 skew = 0; 1290 skew = 0;
1285 call = NULL; 1291 call = NULL;
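
Taken together, the input.c hunk records the highest serial number received on the call (rx_serial, guarded by the new RXRPC_CALL_RX_HEARD flag) and the call_event.c hunk compares it with the connection-wide hi_serial when the call expires: if the connection has seen newer traffic than this call, the peer is still transmitting but has apparently dropped the call, so the expiry is reported as -ECONNRESET instead of -ETIME. Both comparisons are written to survive 32-bit serial wraparound; a standalone sketch of that idiom:

#include <linux/types.h>

/* Sketch of the wraparound-safe "a is newer than b" test that the hunks
 * above open-code as (int)a - (int)b > 0 for 32-bit rxrpc serials.
 */
static inline bool sketch_serial_after(u32 a, u32 b)
{
	return (s32)(a - b) > 0;
}
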
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 72251241665a..3f4cf930f809 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -77,9 +77,9 @@ static void free_tcf(struct tc_action *p)
77 77
78static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p) 78static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p)
79{ 79{
80 spin_lock_bh(&idrinfo->lock); 80 spin_lock(&idrinfo->lock);
81 idr_remove(&idrinfo->action_idr, p->tcfa_index); 81 idr_remove(&idrinfo->action_idr, p->tcfa_index);
82 spin_unlock_bh(&idrinfo->lock); 82 spin_unlock(&idrinfo->lock);
83 gen_kill_estimator(&p->tcfa_rate_est); 83 gen_kill_estimator(&p->tcfa_rate_est);
84 free_tcf(p); 84 free_tcf(p);
85} 85}
@@ -156,7 +156,7 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
156 struct tc_action *p; 156 struct tc_action *p;
157 unsigned long id = 1; 157 unsigned long id = 1;
158 158
159 spin_lock_bh(&idrinfo->lock); 159 spin_lock(&idrinfo->lock);
160 160
161 s_i = cb->args[0]; 161 s_i = cb->args[0];
162 162
@@ -191,7 +191,7 @@ done:
191 if (index >= 0) 191 if (index >= 0)
192 cb->args[0] = index + 1; 192 cb->args[0] = index + 1;
193 193
194 spin_unlock_bh(&idrinfo->lock); 194 spin_unlock(&idrinfo->lock);
195 if (n_i) { 195 if (n_i) {
196 if (act_flags & TCA_FLAG_LARGE_DUMP_ON) 196 if (act_flags & TCA_FLAG_LARGE_DUMP_ON)
197 cb->args[1] = n_i; 197 cb->args[1] = n_i;
@@ -261,9 +261,9 @@ static struct tc_action *tcf_idr_lookup(u32 index, struct tcf_idrinfo *idrinfo)
261{ 261{
262 struct tc_action *p = NULL; 262 struct tc_action *p = NULL;
263 263
264 spin_lock_bh(&idrinfo->lock); 264 spin_lock(&idrinfo->lock);
265 p = idr_find(&idrinfo->action_idr, index); 265 p = idr_find(&idrinfo->action_idr, index);
266 spin_unlock_bh(&idrinfo->lock); 266 spin_unlock(&idrinfo->lock);
267 267
268 return p; 268 return p;
269} 269}
@@ -323,7 +323,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
323 } 323 }
324 spin_lock_init(&p->tcfa_lock); 324 spin_lock_init(&p->tcfa_lock);
325 idr_preload(GFP_KERNEL); 325 idr_preload(GFP_KERNEL);
326 spin_lock_bh(&idrinfo->lock); 326 spin_lock(&idrinfo->lock);
327 /* user doesn't specify an index */ 327 /* user doesn't specify an index */
328 if (!index) { 328 if (!index) {
329 index = 1; 329 index = 1;
@@ -331,7 +331,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
331 } else { 331 } else {
332 err = idr_alloc_u32(idr, NULL, &index, index, GFP_ATOMIC); 332 err = idr_alloc_u32(idr, NULL, &index, index, GFP_ATOMIC);
333 } 333 }
334 spin_unlock_bh(&idrinfo->lock); 334 spin_unlock(&idrinfo->lock);
335 idr_preload_end(); 335 idr_preload_end();
336 if (err) 336 if (err)
337 goto err3; 337 goto err3;
@@ -369,9 +369,9 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a)
369{ 369{
370 struct tcf_idrinfo *idrinfo = tn->idrinfo; 370 struct tcf_idrinfo *idrinfo = tn->idrinfo;
371 371
372 spin_lock_bh(&idrinfo->lock); 372 spin_lock(&idrinfo->lock);
373 idr_replace(&idrinfo->action_idr, a, a->tcfa_index); 373 idr_replace(&idrinfo->action_idr, a, a->tcfa_index);
374 spin_unlock_bh(&idrinfo->lock); 374 spin_unlock(&idrinfo->lock);
375} 375}
376EXPORT_SYMBOL(tcf_idr_insert); 376EXPORT_SYMBOL(tcf_idr_insert);
377 377
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 7e28b2ce1437..526a8e491626 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -648,6 +648,11 @@ static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index,
648 return tcf_idr_search(tn, a, index); 648 return tcf_idr_search(tn, a, index);
649} 649}
650 650
651static size_t tcf_csum_get_fill_size(const struct tc_action *act)
652{
653 return nla_total_size(sizeof(struct tc_csum));
654}
655
651static struct tc_action_ops act_csum_ops = { 656static struct tc_action_ops act_csum_ops = {
652 .kind = "csum", 657 .kind = "csum",
653 .type = TCA_ACT_CSUM, 658 .type = TCA_ACT_CSUM,
@@ -658,6 +663,7 @@ static struct tc_action_ops act_csum_ops = {
658 .cleanup = tcf_csum_cleanup, 663 .cleanup = tcf_csum_cleanup,
659 .walk = tcf_csum_walker, 664 .walk = tcf_csum_walker,
660 .lookup = tcf_csum_search, 665 .lookup = tcf_csum_search,
666 .get_fill_size = tcf_csum_get_fill_size,
661 .size = sizeof(struct tcf_csum), 667 .size = sizeof(struct tcf_csum),
662}; 668};
663 669
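
The new .get_fill_size hook presumably lets the action core compute an upper bound on the netlink attribute space this action's dump will occupy, so notification skbs can be sized up front instead of with a fixed guess. A hedged sketch of the equivalent callback for a hypothetical action with a single fixed-size parameter block:

#include <net/act_api.h>
#include <net/netlink.h>

/* Sketch only: "my_act" and struct tc_my_act are hypothetical. */
struct tc_my_act {
	__u32	index;
	int	action;
};

static size_t tcf_my_act_get_fill_size(const struct tc_action *act)
{
	/* one TCA_*_PARMS attribute; nla_total_size() adds header + padding */
	return nla_total_size(sizeof(struct tc_my_act));
}
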
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index a57e112d9b3e..cdc3c87c53e6 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -103,9 +103,10 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
103} 103}
104EXPORT_SYMBOL(unregister_tcf_proto_ops); 104EXPORT_SYMBOL(unregister_tcf_proto_ops);
105 105
106bool tcf_queue_work(struct work_struct *work) 106bool tcf_queue_work(struct rcu_work *rwork, work_func_t func)
107{ 107{
108 return queue_work(tc_filter_wq, work); 108 INIT_RCU_WORK(rwork, func);
109 return queue_rcu_work(tc_filter_wq, rwork);
109} 110}
110EXPORT_SYMBOL(tcf_queue_work); 111EXPORT_SYMBOL(tcf_queue_work);
111 112
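
With the new signature, tcf_queue_work() hides the INIT_RCU_WORK()/queue_rcu_work() pairing: the work item runs on tc_filter_wq only after an RCU grace period has elapsed, which is what the classifiers below used to open-code with a union of rcu_head and work_struct plus a call_rcu() trampoline. A minimal sketch of the calling convention those later hunks adopt, using a hypothetical struct my_filter:

#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <net/pkt_cls.h>

/* Sketch only: struct my_filter and my_filter_free_work() are hypothetical;
 * tcf_queue_work() and to_rcu_work() are the helpers shown above.
 */
struct my_filter {
	struct tcf_exts exts;
	struct rcu_work rwork;
};

static void my_filter_free_work(struct work_struct *work)
{
	struct my_filter *f = container_of(to_rcu_work(work),
					   struct my_filter, rwork);

	rtnl_lock();
	tcf_exts_destroy(&f->exts);	/* sleepable teardown under RTNL */
	rtnl_unlock();
	kfree(f);
}

/* Deletion path: defer freeing until RCU readers are done with the list. */
static void my_filter_delete(struct my_filter *f)
{
	tcf_queue_work(&f->rwork, my_filter_free_work);
}
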
@@ -436,6 +437,78 @@ static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index)
436 return idr_find(&tn->idr, block_index); 437 return idr_find(&tn->idr, block_index);
437} 438}
438 439
440/* Find tcf block.
441 * Set q, parent, cl when appropriate.
442 */
443
444static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q,
445 u32 *parent, unsigned long *cl,
446 int ifindex, u32 block_index,
447 struct netlink_ext_ack *extack)
448{
449 struct tcf_block *block;
450
451 if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
452 block = tcf_block_lookup(net, block_index);
453 if (!block) {
454 NL_SET_ERR_MSG(extack, "Block of given index was not found");
455 return ERR_PTR(-EINVAL);
456 }
457 } else {
458 const struct Qdisc_class_ops *cops;
459 struct net_device *dev;
460
461 /* Find link */
462 dev = __dev_get_by_index(net, ifindex);
463 if (!dev)
464 return ERR_PTR(-ENODEV);
465
466 /* Find qdisc */
467 if (!*parent) {
468 *q = dev->qdisc;
469 *parent = (*q)->handle;
470 } else {
471 *q = qdisc_lookup(dev, TC_H_MAJ(*parent));
472 if (!*q) {
473 NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
474 return ERR_PTR(-EINVAL);
475 }
476 }
477
478 /* Is it classful? */
479 cops = (*q)->ops->cl_ops;
480 if (!cops) {
481 NL_SET_ERR_MSG(extack, "Qdisc not classful");
482 return ERR_PTR(-EINVAL);
483 }
484
485 if (!cops->tcf_block) {
486 NL_SET_ERR_MSG(extack, "Class doesn't support blocks");
487 return ERR_PTR(-EOPNOTSUPP);
488 }
489
490 /* Do we search for filter, attached to class? */
491 if (TC_H_MIN(*parent)) {
492 *cl = cops->find(*q, *parent);
493 if (*cl == 0) {
494 NL_SET_ERR_MSG(extack, "Specified class doesn't exist");
495 return ERR_PTR(-ENOENT);
496 }
497 }
498
499 /* And the last stroke */
500 block = cops->tcf_block(*q, *cl, extack);
501 if (!block)
502 return ERR_PTR(-EINVAL);
503 if (tcf_block_shared(block)) {
504 NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters");
505 return ERR_PTR(-EOPNOTSUPP);
506 }
507 }
508
509 return block;
510}
511
439static struct tcf_chain *tcf_block_chain_zero(struct tcf_block *block) 512static struct tcf_chain *tcf_block_chain_zero(struct tcf_block *block)
440{ 513{
441 return list_first_entry(&block->chain_list, struct tcf_chain, list); 514 return list_first_entry(&block->chain_list, struct tcf_chain, list);
@@ -983,9 +1056,7 @@ static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
983 q, parent, 0, event, false); 1056 q, parent, 0, event, false);
984} 1057}
985 1058
986/* Add/change/delete/get a filter node */ 1059static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
987
988static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
989 struct netlink_ext_ack *extack) 1060 struct netlink_ext_ack *extack)
990{ 1061{
991 struct net *net = sock_net(skb->sk); 1062 struct net *net = sock_net(skb->sk);
@@ -1006,8 +1077,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
1006 int err; 1077 int err;
1007 int tp_created; 1078 int tp_created;
1008 1079
1009 if ((n->nlmsg_type != RTM_GETTFILTER) && 1080 if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1010 !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1011 return -EPERM; 1081 return -EPERM;
1012 1082
1013replay: 1083replay:
@@ -1025,24 +1095,13 @@ replay:
1025 cl = 0; 1095 cl = 0;
1026 1096
1027 if (prio == 0) { 1097 if (prio == 0) {
1028 switch (n->nlmsg_type) { 1098 /* If no priority is provided by the user,
1029 case RTM_DELTFILTER: 1099 * we allocate one.
1030 if (protocol || t->tcm_handle || tca[TCA_KIND]) { 1100 */
1031 NL_SET_ERR_MSG(extack, "Cannot flush filters with protocol, handle or kind set"); 1101 if (n->nlmsg_flags & NLM_F_CREATE) {
1032 return -ENOENT; 1102 prio = TC_H_MAKE(0x80000000U, 0U);
1033 } 1103 prio_allocate = true;
1034 break; 1104 } else {
1035 case RTM_NEWTFILTER:
1036 /* If no priority is provided by the user,
1037 * we allocate one.
1038 */
1039 if (n->nlmsg_flags & NLM_F_CREATE) {
1040 prio = TC_H_MAKE(0x80000000U, 0U);
1041 prio_allocate = true;
1042 break;
1043 }
1044 /* fall-through */
1045 default:
1046 NL_SET_ERR_MSG(extack, "Invalid filter command with priority of zero"); 1105 NL_SET_ERR_MSG(extack, "Invalid filter command with priority of zero");
1047 return -ENOENT; 1106 return -ENOENT;
1048 } 1107 }
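
For filters, priority and protocol travel packed into tcm_info as the major and minor 16-bit halves, which is why the handlers in this file split it with TC_H_MAJ()/TC_H_MIN(); when no priority is given on a create, the code above seeds the value with TC_H_MAKE(0x80000000U, 0U) and sets prio_allocate so a concrete slot can be picked later relative to the existing filters. The packing itself is plain 16/16-bit masking, as defined in include/uapi/linux/pkt_sched.h:

#define TC_H_MAJ_MASK (0xFFFF0000U)
#define TC_H_MIN_MASK (0x0000FFFFU)
#define TC_H_MAJ(h) ((h) & TC_H_MAJ_MASK)
#define TC_H_MIN(h) ((h) & TC_H_MIN_MASK)
#define TC_H_MAKE(maj, min) (((maj) & TC_H_MAJ_MASK) | ((min) & TC_H_MIN_MASK))
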
@@ -1050,66 +1109,11 @@ replay:
1050 1109
1051 /* Find head of filter chain. */ 1110 /* Find head of filter chain. */
1052 1111
1053 if (t->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { 1112 block = tcf_block_find(net, &q, &parent, &cl,
1054 block = tcf_block_lookup(net, t->tcm_block_index); 1113 t->tcm_ifindex, t->tcm_block_index, extack);
1055 if (!block) { 1114 if (IS_ERR(block)) {
1056 NL_SET_ERR_MSG(extack, "Block of given index was not found"); 1115 err = PTR_ERR(block);
1057 err = -EINVAL; 1116 goto errout;
1058 goto errout;
1059 }
1060 } else {
1061 const struct Qdisc_class_ops *cops;
1062 struct net_device *dev;
1063
1064 /* Find link */
1065 dev = __dev_get_by_index(net, t->tcm_ifindex);
1066 if (!dev)
1067 return -ENODEV;
1068
1069 /* Find qdisc */
1070 if (!parent) {
1071 q = dev->qdisc;
1072 parent = q->handle;
1073 } else {
1074 q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent));
1075 if (!q) {
1076 NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
1077 return -EINVAL;
1078 }
1079 }
1080
1081 /* Is it classful? */
1082 cops = q->ops->cl_ops;
1083 if (!cops) {
1084 NL_SET_ERR_MSG(extack, "Qdisc not classful");
1085 return -EINVAL;
1086 }
1087
1088 if (!cops->tcf_block) {
1089 NL_SET_ERR_MSG(extack, "Class doesn't support blocks");
1090 return -EOPNOTSUPP;
1091 }
1092
1093 /* Do we search for filter, attached to class? */
1094 if (TC_H_MIN(parent)) {
1095 cl = cops->find(q, parent);
1096 if (cl == 0) {
1097 NL_SET_ERR_MSG(extack, "Specified class doesn't exist");
1098 return -ENOENT;
1099 }
1100 }
1101
1102 /* And the last stroke */
1103 block = cops->tcf_block(q, cl, extack);
1104 if (!block) {
1105 err = -EINVAL;
1106 goto errout;
1107 }
1108 if (tcf_block_shared(block)) {
1109 NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters");
1110 err = -EOPNOTSUPP;
1111 goto errout;
1112 }
1113 } 1117 }
1114 1118
1115 chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; 1119 chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
@@ -1118,19 +1122,10 @@ replay:
1118 err = -EINVAL; 1122 err = -EINVAL;
1119 goto errout; 1123 goto errout;
1120 } 1124 }
1121 chain = tcf_chain_get(block, chain_index, 1125 chain = tcf_chain_get(block, chain_index, true);
1122 n->nlmsg_type == RTM_NEWTFILTER);
1123 if (!chain) { 1126 if (!chain) {
1124 NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); 1127 NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
1125 err = n->nlmsg_type == RTM_NEWTFILTER ? -ENOMEM : -EINVAL; 1128 err = -ENOMEM;
1126 goto errout;
1127 }
1128
1129 if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) {
1130 tfilter_notify_chain(net, skb, block, q, parent, n,
1131 chain, RTM_DELTFILTER);
1132 tcf_chain_flush(chain);
1133 err = 0;
1134 goto errout; 1129 goto errout;
1135 } 1130 }
1136 1131
@@ -1151,8 +1146,7 @@ replay:
1151 goto errout; 1146 goto errout;
1152 } 1147 }
1153 1148
1154 if (n->nlmsg_type != RTM_NEWTFILTER || 1149 if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1155 !(n->nlmsg_flags & NLM_F_CREATE)) {
1156 NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter"); 1150 NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter");
1157 err = -ENOENT; 1151 err = -ENOENT;
1158 goto errout; 1152 goto errout;
@@ -1177,56 +1171,15 @@ replay:
1177 fh = tp->ops->get(tp, t->tcm_handle); 1171 fh = tp->ops->get(tp, t->tcm_handle);
1178 1172
1179 if (!fh) { 1173 if (!fh) {
1180 if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { 1174 if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1181 tcf_chain_tp_remove(chain, &chain_info, tp);
1182 tfilter_notify(net, skb, n, tp, block, q, parent, fh,
1183 RTM_DELTFILTER, false);
1184 tcf_proto_destroy(tp, extack);
1185 err = 0;
1186 goto errout;
1187 }
1188
1189 if (n->nlmsg_type != RTM_NEWTFILTER ||
1190 !(n->nlmsg_flags & NLM_F_CREATE)) {
1191 NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter"); 1175 NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter");
1192 err = -ENOENT; 1176 err = -ENOENT;
1193 goto errout; 1177 goto errout;
1194 } 1178 }
1195 } else { 1179 } else if (n->nlmsg_flags & NLM_F_EXCL) {
1196 bool last; 1180 NL_SET_ERR_MSG(extack, "Filter already exists");
1197 1181 err = -EEXIST;
1198 switch (n->nlmsg_type) { 1182 goto errout;
1199 case RTM_NEWTFILTER:
1200 if (n->nlmsg_flags & NLM_F_EXCL) {
1201 if (tp_created)
1202 tcf_proto_destroy(tp, NULL);
1203 NL_SET_ERR_MSG(extack, "Filter already exists");
1204 err = -EEXIST;
1205 goto errout;
1206 }
1207 break;
1208 case RTM_DELTFILTER:
1209 err = tfilter_del_notify(net, skb, n, tp, block,
1210 q, parent, fh, false, &last,
1211 extack);
1212 if (err)
1213 goto errout;
1214 if (last) {
1215 tcf_chain_tp_remove(chain, &chain_info, tp);
1216 tcf_proto_destroy(tp, extack);
1217 }
1218 goto errout;
1219 case RTM_GETTFILTER:
1220 err = tfilter_notify(net, skb, n, tp, block, q, parent,
1221 fh, RTM_NEWTFILTER, true);
1222 if (err < 0)
1223 NL_SET_ERR_MSG(extack, "Failed to send filter notify message");
1224 goto errout;
1225 default:
1226 NL_SET_ERR_MSG(extack, "Invalid netlink message type");
1227 err = -EINVAL;
1228 goto errout;
1229 }
1230 } 1183 }
1231 1184
1232 err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, 1185 err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
@@ -1251,6 +1204,202 @@ errout:
1251 return err; 1204 return err;
1252} 1205}
1253 1206
1207static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
1208 struct netlink_ext_ack *extack)
1209{
1210 struct net *net = sock_net(skb->sk);
1211 struct nlattr *tca[TCA_MAX + 1];
1212 struct tcmsg *t;
1213 u32 protocol;
1214 u32 prio;
1215 u32 parent;
1216 u32 chain_index;
1217 struct Qdisc *q = NULL;
1218 struct tcf_chain_info chain_info;
1219 struct tcf_chain *chain = NULL;
1220 struct tcf_block *block;
1221 struct tcf_proto *tp = NULL;
1222 unsigned long cl = 0;
1223 void *fh = NULL;
1224 int err;
1225
1226 if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1227 return -EPERM;
1228
1229 err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack);
1230 if (err < 0)
1231 return err;
1232
1233 t = nlmsg_data(n);
1234 protocol = TC_H_MIN(t->tcm_info);
1235 prio = TC_H_MAJ(t->tcm_info);
1236 parent = t->tcm_parent;
1237
1238 if (prio == 0 && (protocol || t->tcm_handle || tca[TCA_KIND])) {
1239 NL_SET_ERR_MSG(extack, "Cannot flush filters with protocol, handle or kind set");
1240 return -ENOENT;
1241 }
1242
1243 /* Find head of filter chain. */
1244
1245 block = tcf_block_find(net, &q, &parent, &cl,
1246 t->tcm_ifindex, t->tcm_block_index, extack);
1247 if (IS_ERR(block)) {
1248 err = PTR_ERR(block);
1249 goto errout;
1250 }
1251
1252 chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
1253 if (chain_index > TC_ACT_EXT_VAL_MASK) {
1254 NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
1255 err = -EINVAL;
1256 goto errout;
1257 }
1258 chain = tcf_chain_get(block, chain_index, false);
1259 if (!chain) {
1260 NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
1261 err = -EINVAL;
1262 goto errout;
1263 }
1264
1265 if (prio == 0) {
1266 tfilter_notify_chain(net, skb, block, q, parent, n,
1267 chain, RTM_DELTFILTER);
1268 tcf_chain_flush(chain);
1269 err = 0;
1270 goto errout;
1271 }
1272
1273 tp = tcf_chain_tp_find(chain, &chain_info, protocol,
1274 prio, false);
1275 if (!tp || IS_ERR(tp)) {
1276 NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
1277 err = tp ? PTR_ERR(tp) : -ENOENT;
1278 goto errout;
1279 } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
1280 NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one");
1281 err = -EINVAL;
1282 goto errout;
1283 }
1284
1285 fh = tp->ops->get(tp, t->tcm_handle);
1286
1287 if (!fh) {
1288 if (t->tcm_handle == 0) {
1289 tcf_chain_tp_remove(chain, &chain_info, tp);
1290 tfilter_notify(net, skb, n, tp, block, q, parent, fh,
1291 RTM_DELTFILTER, false);
1292 tcf_proto_destroy(tp, extack);
1293 err = 0;
1294 } else {
1295 NL_SET_ERR_MSG(extack, "Specified filter handle not found");
1296 err = -ENOENT;
1297 }
1298 } else {
1299 bool last;
1300
1301 err = tfilter_del_notify(net, skb, n, tp, block,
1302 q, parent, fh, false, &last,
1303 extack);
1304 if (err)
1305 goto errout;
1306 if (last) {
1307 tcf_chain_tp_remove(chain, &chain_info, tp);
1308 tcf_proto_destroy(tp, extack);
1309 }
1310 }
1311
1312errout:
1313 if (chain)
1314 tcf_chain_put(chain);
1315 return err;
1316}
1317
1318static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
1319 struct netlink_ext_ack *extack)
1320{
1321 struct net *net = sock_net(skb->sk);
1322 struct nlattr *tca[TCA_MAX + 1];
1323 struct tcmsg *t;
1324 u32 protocol;
1325 u32 prio;
1326 u32 parent;
1327 u32 chain_index;
1328 struct Qdisc *q = NULL;
1329 struct tcf_chain_info chain_info;
1330 struct tcf_chain *chain = NULL;
1331 struct tcf_block *block;
1332 struct tcf_proto *tp = NULL;
1333 unsigned long cl = 0;
1334 void *fh = NULL;
1335 int err;
1336
1337 err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack);
1338 if (err < 0)
1339 return err;
1340
1341 t = nlmsg_data(n);
1342 protocol = TC_H_MIN(t->tcm_info);
1343 prio = TC_H_MAJ(t->tcm_info);
1344 parent = t->tcm_parent;
1345
1346 if (prio == 0) {
1347 NL_SET_ERR_MSG(extack, "Invalid filter command with priority of zero");
1348 return -ENOENT;
1349 }
1350
1351 /* Find head of filter chain. */
1352
1353 block = tcf_block_find(net, &q, &parent, &cl,
1354 t->tcm_ifindex, t->tcm_block_index, extack);
1355 if (IS_ERR(block)) {
1356 err = PTR_ERR(block);
1357 goto errout;
1358 }
1359
1360 chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
1361 if (chain_index > TC_ACT_EXT_VAL_MASK) {
1362 NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
1363 err = -EINVAL;
1364 goto errout;
1365 }
1366 chain = tcf_chain_get(block, chain_index, false);
1367 if (!chain) {
1368 NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
1369 err = -EINVAL;
1370 goto errout;
1371 }
1372
1373 tp = tcf_chain_tp_find(chain, &chain_info, protocol,
1374 prio, false);
1375 if (!tp || IS_ERR(tp)) {
1376 NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
1377 err = tp ? PTR_ERR(tp) : -ENOENT;
1378 goto errout;
1379 } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
1380 NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one");
1381 err = -EINVAL;
1382 goto errout;
1383 }
1384
1385 fh = tp->ops->get(tp, t->tcm_handle);
1386
1387 if (!fh) {
1388 NL_SET_ERR_MSG(extack, "Specified filter handle not found");
1389 err = -ENOENT;
1390 } else {
1391 err = tfilter_notify(net, skb, n, tp, block, q, parent,
1392 fh, RTM_NEWTFILTER, true);
1393 if (err < 0)
1394 NL_SET_ERR_MSG(extack, "Failed to send filter notify message");
1395 }
1396
1397errout:
1398 if (chain)
1399 tcf_chain_put(chain);
1400 return err;
1401}
1402
1254struct tcf_dump_args { 1403struct tcf_dump_args {
1255 struct tcf_walker w; 1404 struct tcf_walker w;
1256 struct sk_buff *skb; 1405 struct sk_buff *skb;
@@ -1633,9 +1782,9 @@ static int __init tc_filter_init(void)
1633 if (err) 1782 if (err)
1634 goto err_register_pernet_subsys; 1783 goto err_register_pernet_subsys;
1635 1784
1636 rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0); 1785 rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, 0);
1637 rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, 0); 1786 rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0);
1638 rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter, 1787 rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter,
1639 tc_dump_tfilter, 0); 1788 tc_dump_tfilter, 0);
1640 1789
1641 return 0; 1790 return 0;
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 6b7ab3512f5b..95367f37098d 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -35,10 +35,7 @@ struct basic_filter {
35 struct tcf_result res; 35 struct tcf_result res;
36 struct tcf_proto *tp; 36 struct tcf_proto *tp;
37 struct list_head link; 37 struct list_head link;
38 union { 38 struct rcu_work rwork;
39 struct work_struct work;
40 struct rcu_head rcu;
41 };
42}; 39};
43 40
44static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, 41static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -97,21 +94,14 @@ static void __basic_delete_filter(struct basic_filter *f)
97 94
98static void basic_delete_filter_work(struct work_struct *work) 95static void basic_delete_filter_work(struct work_struct *work)
99{ 96{
100 struct basic_filter *f = container_of(work, struct basic_filter, work); 97 struct basic_filter *f = container_of(to_rcu_work(work),
101 98 struct basic_filter,
99 rwork);
102 rtnl_lock(); 100 rtnl_lock();
103 __basic_delete_filter(f); 101 __basic_delete_filter(f);
104 rtnl_unlock(); 102 rtnl_unlock();
105} 103}
106 104
107static void basic_delete_filter(struct rcu_head *head)
108{
109 struct basic_filter *f = container_of(head, struct basic_filter, rcu);
110
111 INIT_WORK(&f->work, basic_delete_filter_work);
112 tcf_queue_work(&f->work);
113}
114
115static void basic_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) 105static void basic_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
116{ 106{
117 struct basic_head *head = rtnl_dereference(tp->root); 107 struct basic_head *head = rtnl_dereference(tp->root);
@@ -122,7 +112,7 @@ static void basic_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
122 tcf_unbind_filter(tp, &f->res); 112 tcf_unbind_filter(tp, &f->res);
123 idr_remove(&head->handle_idr, f->handle); 113 idr_remove(&head->handle_idr, f->handle);
124 if (tcf_exts_get_net(&f->exts)) 114 if (tcf_exts_get_net(&f->exts))
125 call_rcu(&f->rcu, basic_delete_filter); 115 tcf_queue_work(&f->rwork, basic_delete_filter_work);
126 else 116 else
127 __basic_delete_filter(f); 117 __basic_delete_filter(f);
128 } 118 }
@@ -140,7 +130,7 @@ static int basic_delete(struct tcf_proto *tp, void *arg, bool *last,
140 tcf_unbind_filter(tp, &f->res); 130 tcf_unbind_filter(tp, &f->res);
141 idr_remove(&head->handle_idr, f->handle); 131 idr_remove(&head->handle_idr, f->handle);
142 tcf_exts_get_net(&f->exts); 132 tcf_exts_get_net(&f->exts);
143 call_rcu(&f->rcu, basic_delete_filter); 133 tcf_queue_work(&f->rwork, basic_delete_filter_work);
144 *last = list_empty(&head->flist); 134 *last = list_empty(&head->flist);
145 return 0; 135 return 0;
146} 136}
@@ -234,7 +224,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
234 list_replace_rcu(&fold->link, &fnew->link); 224 list_replace_rcu(&fold->link, &fnew->link);
235 tcf_unbind_filter(tp, &fold->res); 225 tcf_unbind_filter(tp, &fold->res);
236 tcf_exts_get_net(&fold->exts); 226 tcf_exts_get_net(&fold->exts);
237 call_rcu(&fold->rcu, basic_delete_filter); 227 tcf_queue_work(&fold->rwork, basic_delete_filter_work);
238 } else { 228 } else {
239 list_add_rcu(&fnew->link, &head->flist); 229 list_add_rcu(&fnew->link, &head->flist);
240 } 230 }
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index b07c1fa8bc0d..1aa7f6511065 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -49,10 +49,7 @@ struct cls_bpf_prog {
49 struct sock_filter *bpf_ops; 49 struct sock_filter *bpf_ops;
50 const char *bpf_name; 50 const char *bpf_name;
51 struct tcf_proto *tp; 51 struct tcf_proto *tp;
52 union { 52 struct rcu_work rwork;
53 struct work_struct work;
54 struct rcu_head rcu;
55 };
56}; 53};
57 54
58static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { 55static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
@@ -275,21 +272,14 @@ static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog)
275 272
276static void cls_bpf_delete_prog_work(struct work_struct *work) 273static void cls_bpf_delete_prog_work(struct work_struct *work)
277{ 274{
278 struct cls_bpf_prog *prog = container_of(work, struct cls_bpf_prog, work); 275 struct cls_bpf_prog *prog = container_of(to_rcu_work(work),
279 276 struct cls_bpf_prog,
277 rwork);
280 rtnl_lock(); 278 rtnl_lock();
281 __cls_bpf_delete_prog(prog); 279 __cls_bpf_delete_prog(prog);
282 rtnl_unlock(); 280 rtnl_unlock();
283} 281}
284 282
285static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu)
286{
287 struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu);
288
289 INIT_WORK(&prog->work, cls_bpf_delete_prog_work);
290 tcf_queue_work(&prog->work);
291}
292
293static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog, 283static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog,
294 struct netlink_ext_ack *extack) 284 struct netlink_ext_ack *extack)
295{ 285{
@@ -300,7 +290,7 @@ static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog,
300 list_del_rcu(&prog->link); 290 list_del_rcu(&prog->link);
301 tcf_unbind_filter(tp, &prog->res); 291 tcf_unbind_filter(tp, &prog->res);
302 if (tcf_exts_get_net(&prog->exts)) 292 if (tcf_exts_get_net(&prog->exts))
303 call_rcu(&prog->rcu, cls_bpf_delete_prog_rcu); 293 tcf_queue_work(&prog->rwork, cls_bpf_delete_prog_work);
304 else 294 else
305 __cls_bpf_delete_prog(prog); 295 __cls_bpf_delete_prog(prog);
306} 296}
@@ -526,7 +516,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
526 list_replace_rcu(&oldprog->link, &prog->link); 516 list_replace_rcu(&oldprog->link, &prog->link);
527 tcf_unbind_filter(tp, &oldprog->res); 517 tcf_unbind_filter(tp, &oldprog->res);
528 tcf_exts_get_net(&oldprog->exts); 518 tcf_exts_get_net(&oldprog->exts);
529 call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu); 519 tcf_queue_work(&oldprog->rwork, cls_bpf_delete_prog_work);
530 } else { 520 } else {
531 list_add_rcu(&prog->link, &head->plist); 521 list_add_rcu(&prog->link, &head->plist);
532 } 522 }
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 762da5c0cf5e..3bc01bdde165 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -23,10 +23,7 @@ struct cls_cgroup_head {
23 struct tcf_exts exts; 23 struct tcf_exts exts;
24 struct tcf_ematch_tree ematches; 24 struct tcf_ematch_tree ematches;
25 struct tcf_proto *tp; 25 struct tcf_proto *tp;
26 union { 26 struct rcu_work rwork;
27 struct work_struct work;
28 struct rcu_head rcu;
29 };
30}; 27};
31 28
32static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp, 29static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -70,24 +67,14 @@ static void __cls_cgroup_destroy(struct cls_cgroup_head *head)
70 67
71static void cls_cgroup_destroy_work(struct work_struct *work) 68static void cls_cgroup_destroy_work(struct work_struct *work)
72{ 69{
73 struct cls_cgroup_head *head = container_of(work, 70 struct cls_cgroup_head *head = container_of(to_rcu_work(work),
74 struct cls_cgroup_head, 71 struct cls_cgroup_head,
75 work); 72 rwork);
76 rtnl_lock(); 73 rtnl_lock();
77 __cls_cgroup_destroy(head); 74 __cls_cgroup_destroy(head);
78 rtnl_unlock(); 75 rtnl_unlock();
79} 76}
80 77
81static void cls_cgroup_destroy_rcu(struct rcu_head *root)
82{
83 struct cls_cgroup_head *head = container_of(root,
84 struct cls_cgroup_head,
85 rcu);
86
87 INIT_WORK(&head->work, cls_cgroup_destroy_work);
88 tcf_queue_work(&head->work);
89}
90
91static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, 78static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
92 struct tcf_proto *tp, unsigned long base, 79 struct tcf_proto *tp, unsigned long base,
93 u32 handle, struct nlattr **tca, 80 u32 handle, struct nlattr **tca,
@@ -134,7 +121,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
134 rcu_assign_pointer(tp->root, new); 121 rcu_assign_pointer(tp->root, new);
135 if (head) { 122 if (head) {
136 tcf_exts_get_net(&head->exts); 123 tcf_exts_get_net(&head->exts);
137 call_rcu(&head->rcu, cls_cgroup_destroy_rcu); 124 tcf_queue_work(&head->rwork, cls_cgroup_destroy_work);
138 } 125 }
139 return 0; 126 return 0;
140errout: 127errout:
@@ -151,7 +138,7 @@ static void cls_cgroup_destroy(struct tcf_proto *tp,
151 /* Head can still be NULL due to cls_cgroup_init(). */ 138 /* Head can still be NULL due to cls_cgroup_init(). */
152 if (head) { 139 if (head) {
153 if (tcf_exts_get_net(&head->exts)) 140 if (tcf_exts_get_net(&head->exts))
154 call_rcu(&head->rcu, cls_cgroup_destroy_rcu); 141 tcf_queue_work(&head->rwork, cls_cgroup_destroy_work);
155 else 142 else
156 __cls_cgroup_destroy(head); 143 __cls_cgroup_destroy(head);
157 } 144 }
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index cd5fe383afdd..2bb043cd436b 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -57,10 +57,7 @@ struct flow_filter {
57 u32 divisor; 57 u32 divisor;
58 u32 baseclass; 58 u32 baseclass;
59 u32 hashrnd; 59 u32 hashrnd;
60 union { 60 struct rcu_work rwork;
61 struct work_struct work;
62 struct rcu_head rcu;
63 };
64}; 61};
65 62
66static inline u32 addr_fold(void *addr) 63static inline u32 addr_fold(void *addr)
@@ -383,21 +380,14 @@ static void __flow_destroy_filter(struct flow_filter *f)
383 380
384static void flow_destroy_filter_work(struct work_struct *work) 381static void flow_destroy_filter_work(struct work_struct *work)
385{ 382{
386 struct flow_filter *f = container_of(work, struct flow_filter, work); 383 struct flow_filter *f = container_of(to_rcu_work(work),
387 384 struct flow_filter,
385 rwork);
388 rtnl_lock(); 386 rtnl_lock();
389 __flow_destroy_filter(f); 387 __flow_destroy_filter(f);
390 rtnl_unlock(); 388 rtnl_unlock();
391} 389}
392 390
393static void flow_destroy_filter(struct rcu_head *head)
394{
395 struct flow_filter *f = container_of(head, struct flow_filter, rcu);
396
397 INIT_WORK(&f->work, flow_destroy_filter_work);
398 tcf_queue_work(&f->work);
399}
400
401static int flow_change(struct net *net, struct sk_buff *in_skb, 391static int flow_change(struct net *net, struct sk_buff *in_skb,
402 struct tcf_proto *tp, unsigned long base, 392 struct tcf_proto *tp, unsigned long base,
403 u32 handle, struct nlattr **tca, 393 u32 handle, struct nlattr **tca,
@@ -563,7 +553,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
563 553
564 if (fold) { 554 if (fold) {
565 tcf_exts_get_net(&fold->exts); 555 tcf_exts_get_net(&fold->exts);
566 call_rcu(&fold->rcu, flow_destroy_filter); 556 tcf_queue_work(&fold->rwork, flow_destroy_filter_work);
567 } 557 }
568 return 0; 558 return 0;
569 559
@@ -583,7 +573,7 @@ static int flow_delete(struct tcf_proto *tp, void *arg, bool *last,
583 573
584 list_del_rcu(&f->list); 574 list_del_rcu(&f->list);
585 tcf_exts_get_net(&f->exts); 575 tcf_exts_get_net(&f->exts);
586 call_rcu(&f->rcu, flow_destroy_filter); 576 tcf_queue_work(&f->rwork, flow_destroy_filter_work);
587 *last = list_empty(&head->filters); 577 *last = list_empty(&head->filters);
588 return 0; 578 return 0;
589} 579}
@@ -608,7 +598,7 @@ static void flow_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
608 list_for_each_entry_safe(f, next, &head->filters, list) { 598 list_for_each_entry_safe(f, next, &head->filters, list) {
609 list_del_rcu(&f->list); 599 list_del_rcu(&f->list);
610 if (tcf_exts_get_net(&f->exts)) 600 if (tcf_exts_get_net(&f->exts))
611 call_rcu(&f->rcu, flow_destroy_filter); 601 tcf_queue_work(&f->rwork, flow_destroy_filter_work);
612 else 602 else
613 __flow_destroy_filter(f); 603 __flow_destroy_filter(f);
614 } 604 }
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index c79f6e71512e..2b5be42a9f1c 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -61,24 +61,24 @@ struct fl_flow_mask_range {
61struct fl_flow_mask { 61struct fl_flow_mask {
62 struct fl_flow_key key; 62 struct fl_flow_key key;
63 struct fl_flow_mask_range range; 63 struct fl_flow_mask_range range;
64 struct rcu_head rcu; 64 struct rhash_head ht_node;
65 struct rhashtable ht;
66 struct rhashtable_params filter_ht_params;
67 struct flow_dissector dissector;
68 struct list_head filters;
69 struct rcu_head rcu;
70 struct list_head list;
65}; 71};
66 72
67struct cls_fl_head { 73struct cls_fl_head {
68 struct rhashtable ht; 74 struct rhashtable ht;
69 struct fl_flow_mask mask; 75 struct list_head masks;
70 struct flow_dissector dissector; 76 struct rcu_work rwork;
71 bool mask_assigned;
72 struct list_head filters;
73 struct rhashtable_params ht_params;
74 union {
75 struct work_struct work;
76 struct rcu_head rcu;
77 };
78 struct idr handle_idr; 77 struct idr handle_idr;
79}; 78};
80 79
81struct cls_fl_filter { 80struct cls_fl_filter {
81 struct fl_flow_mask *mask;
82 struct rhash_head ht_node; 82 struct rhash_head ht_node;
83 struct fl_flow_key mkey; 83 struct fl_flow_key mkey;
84 struct tcf_exts exts; 84 struct tcf_exts exts;
@@ -87,13 +87,17 @@ struct cls_fl_filter {
87 struct list_head list; 87 struct list_head list;
88 u32 handle; 88 u32 handle;
89 u32 flags; 89 u32 flags;
90 union { 90 struct rcu_work rwork;
91 struct work_struct work;
92 struct rcu_head rcu;
93 };
94 struct net_device *hw_dev; 91 struct net_device *hw_dev;
95}; 92};
96 93
94static const struct rhashtable_params mask_ht_params = {
95 .key_offset = offsetof(struct fl_flow_mask, key),
96 .key_len = sizeof(struct fl_flow_key),
97 .head_offset = offsetof(struct fl_flow_mask, ht_node),
98 .automatic_shrinking = true,
99};
100
97static unsigned short int fl_mask_range(const struct fl_flow_mask *mask) 101static unsigned short int fl_mask_range(const struct fl_flow_mask *mask)
98{ 102{
99 return mask->range.end - mask->range.start; 103 return mask->range.end - mask->range.start;
@@ -103,13 +107,19 @@ static void fl_mask_update_range(struct fl_flow_mask *mask)
103{ 107{
104 const u8 *bytes = (const u8 *) &mask->key; 108 const u8 *bytes = (const u8 *) &mask->key;
105 size_t size = sizeof(mask->key); 109 size_t size = sizeof(mask->key);
106 size_t i, first = 0, last = size - 1; 110 size_t i, first = 0, last;
107 111
108 for (i = 0; i < sizeof(mask->key); i++) { 112 for (i = 0; i < size; i++) {
113 if (bytes[i]) {
114 first = i;
115 break;
116 }
117 }
118 last = first;
119 for (i = size - 1; i != first; i--) {
109 if (bytes[i]) { 120 if (bytes[i]) {
110 if (!first && i)
111 first = i;
112 last = i; 121 last = i;
122 break;
113 } 123 }
114 } 124 }
115 mask->range.start = rounddown(first, sizeof(long)); 125 mask->range.start = rounddown(first, sizeof(long));
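
The rewritten scan finds the first non-zero mask byte with a forward early-exit loop and the last one with a backward loop, instead of a single pass over every byte; the result is then rounded to long boundaries so that fl_init_mask_hashtable() further down hashes only the window of bytes the mask actually covers (key_offset shifted by range.start, key_len set to the range length). A self-contained sketch of the same scan:

#include <linux/types.h>

/* Sketch of the first/last non-zero byte scan used above; the callers
 * later round both ends to sizeof(long) boundaries to form the hashed
 * key window.
 */
static void sketch_mask_used_range(const u8 *bytes, size_t size,
				   size_t *first, size_t *last)
{
	size_t i;

	*first = 0;
	for (i = 0; i < size; i++) {
		if (bytes[i]) {
			*first = i;
			break;
		}
	}
	*last = *first;
	for (i = size - 1; i != *first; i--) {
		if (bytes[i]) {
			*last = i;
			break;
		}
	}
}
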
@@ -140,12 +150,11 @@ static void fl_clear_masked_range(struct fl_flow_key *key,
140 memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask)); 150 memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
141} 151}
142 152
143static struct cls_fl_filter *fl_lookup(struct cls_fl_head *head, 153static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
144 struct fl_flow_key *mkey) 154 struct fl_flow_key *mkey)
145{ 155{
146 return rhashtable_lookup_fast(&head->ht, 156 return rhashtable_lookup_fast(&mask->ht, fl_key_get_start(mkey, mask),
147 fl_key_get_start(mkey, &head->mask), 157 mask->filter_ht_params);
148 head->ht_params);
149} 158}
150 159
151static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, 160static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -153,28 +162,28 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
153{ 162{
154 struct cls_fl_head *head = rcu_dereference_bh(tp->root); 163 struct cls_fl_head *head = rcu_dereference_bh(tp->root);
155 struct cls_fl_filter *f; 164 struct cls_fl_filter *f;
165 struct fl_flow_mask *mask;
156 struct fl_flow_key skb_key; 166 struct fl_flow_key skb_key;
157 struct fl_flow_key skb_mkey; 167 struct fl_flow_key skb_mkey;
158 168
159 if (!atomic_read(&head->ht.nelems)) 169 list_for_each_entry_rcu(mask, &head->masks, list) {
160 return -1; 170 fl_clear_masked_range(&skb_key, mask);
161
162 fl_clear_masked_range(&skb_key, &head->mask);
163 171
164 skb_key.indev_ifindex = skb->skb_iif; 172 skb_key.indev_ifindex = skb->skb_iif;
165 /* skb_flow_dissect() does not set n_proto in case an unknown protocol, 173 /* skb_flow_dissect() does not set n_proto in case an unknown
166 * so do it rather here. 174 * protocol, so do it rather here.
167 */ 175 */
168 skb_key.basic.n_proto = skb->protocol; 176 skb_key.basic.n_proto = skb->protocol;
169 skb_flow_dissect_tunnel_info(skb, &head->dissector, &skb_key); 177 skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key);
170 skb_flow_dissect(skb, &head->dissector, &skb_key, 0); 178 skb_flow_dissect(skb, &mask->dissector, &skb_key, 0);
171 179
172 fl_set_masked_key(&skb_mkey, &skb_key, &head->mask); 180 fl_set_masked_key(&skb_mkey, &skb_key, mask);
173 181
174 f = fl_lookup(head, &skb_mkey); 182 f = fl_lookup(mask, &skb_mkey);
175 if (f && !tc_skip_sw(f->flags)) { 183 if (f && !tc_skip_sw(f->flags)) {
176 *res = f->res; 184 *res = f->res;
177 return tcf_exts_exec(skb, &f->exts, res); 185 return tcf_exts_exec(skb, &f->exts, res);
186 }
178 } 187 }
179 return -1; 188 return -1;
180} 189}
@@ -187,11 +196,28 @@ static int fl_init(struct tcf_proto *tp)
187 if (!head) 196 if (!head)
188 return -ENOBUFS; 197 return -ENOBUFS;
189 198
190 INIT_LIST_HEAD_RCU(&head->filters); 199 INIT_LIST_HEAD_RCU(&head->masks);
191 rcu_assign_pointer(tp->root, head); 200 rcu_assign_pointer(tp->root, head);
192 idr_init(&head->handle_idr); 201 idr_init(&head->handle_idr);
193 202
194 return 0; 203 return rhashtable_init(&head->ht, &mask_ht_params);
204}
205
206static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask,
207 bool async)
208{
209 if (!list_empty(&mask->filters))
210 return false;
211
212 rhashtable_remove_fast(&head->ht, &mask->ht_node, mask_ht_params);
213 rhashtable_destroy(&mask->ht);
214 list_del_rcu(&mask->list);
215 if (async)
216 kfree_rcu(mask, rcu);
217 else
218 kfree(mask);
219
220 return true;
195} 221}
196 222
197static void __fl_destroy_filter(struct cls_fl_filter *f) 223static void __fl_destroy_filter(struct cls_fl_filter *f)
@@ -203,21 +229,14 @@ static void __fl_destroy_filter(struct cls_fl_filter *f)
203 229
204static void fl_destroy_filter_work(struct work_struct *work) 230static void fl_destroy_filter_work(struct work_struct *work)
205{ 231{
206 struct cls_fl_filter *f = container_of(work, struct cls_fl_filter, work); 232 struct cls_fl_filter *f = container_of(to_rcu_work(work),
233 struct cls_fl_filter, rwork);
207 234
208 rtnl_lock(); 235 rtnl_lock();
209 __fl_destroy_filter(f); 236 __fl_destroy_filter(f);
210 rtnl_unlock(); 237 rtnl_unlock();
211} 238}
212 239
213static void fl_destroy_filter(struct rcu_head *head)
214{
215 struct cls_fl_filter *f = container_of(head, struct cls_fl_filter, rcu);
216
217 INIT_WORK(&f->work, fl_destroy_filter_work);
218 tcf_queue_work(&f->work);
219}
220
221static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f, 240static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
222 struct netlink_ext_ack *extack) 241 struct netlink_ext_ack *extack)
223{ 242{
@@ -234,8 +253,6 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
234} 253}
235 254
236static int fl_hw_replace_filter(struct tcf_proto *tp, 255static int fl_hw_replace_filter(struct tcf_proto *tp,
237 struct flow_dissector *dissector,
238 struct fl_flow_key *mask,
239 struct cls_fl_filter *f, 256 struct cls_fl_filter *f,
240 struct netlink_ext_ack *extack) 257 struct netlink_ext_ack *extack)
241{ 258{
@@ -247,8 +264,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
247 tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack); 264 tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
248 cls_flower.command = TC_CLSFLOWER_REPLACE; 265 cls_flower.command = TC_CLSFLOWER_REPLACE;
249 cls_flower.cookie = (unsigned long) f; 266 cls_flower.cookie = (unsigned long) f;
250 cls_flower.dissector = dissector; 267 cls_flower.dissector = &f->mask->dissector;
251 cls_flower.mask = mask; 268 cls_flower.mask = &f->mask->key;
252 cls_flower.key = &f->mkey; 269 cls_flower.key = &f->mkey;
253 cls_flower.exts = &f->exts; 270 cls_flower.exts = &f->exts;
254 cls_flower.classid = f->res.classid; 271 cls_flower.classid = f->res.classid;
@@ -283,51 +300,54 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
283 &cls_flower, false); 300 &cls_flower, false);
284} 301}
285 302
286static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, 303static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
287 struct netlink_ext_ack *extack) 304 struct netlink_ext_ack *extack)
288{ 305{
289 struct cls_fl_head *head = rtnl_dereference(tp->root); 306 struct cls_fl_head *head = rtnl_dereference(tp->root);
307 bool async = tcf_exts_get_net(&f->exts);
308 bool last;
290 309
291 idr_remove(&head->handle_idr, f->handle); 310 idr_remove(&head->handle_idr, f->handle);
292 list_del_rcu(&f->list); 311 list_del_rcu(&f->list);
312 last = fl_mask_put(head, f->mask, async);
293 if (!tc_skip_hw(f->flags)) 313 if (!tc_skip_hw(f->flags))
294 fl_hw_destroy_filter(tp, f, extack); 314 fl_hw_destroy_filter(tp, f, extack);
295 tcf_unbind_filter(tp, &f->res); 315 tcf_unbind_filter(tp, &f->res);
296 if (tcf_exts_get_net(&f->exts)) 316 if (async)
297 call_rcu(&f->rcu, fl_destroy_filter); 317 tcf_queue_work(&f->rwork, fl_destroy_filter_work);
298 else 318 else
299 __fl_destroy_filter(f); 319 __fl_destroy_filter(f);
320
321 return last;
300} 322}
301 323
302static void fl_destroy_sleepable(struct work_struct *work) 324static void fl_destroy_sleepable(struct work_struct *work)
303{ 325{
304 struct cls_fl_head *head = container_of(work, struct cls_fl_head, 326 struct cls_fl_head *head = container_of(to_rcu_work(work),
305 work); 327 struct cls_fl_head,
306 if (head->mask_assigned) 328 rwork);
307 rhashtable_destroy(&head->ht); 329
330 rhashtable_destroy(&head->ht);
308 kfree(head); 331 kfree(head);
309 module_put(THIS_MODULE); 332 module_put(THIS_MODULE);
310} 333}
311 334
312static void fl_destroy_rcu(struct rcu_head *rcu)
313{
314 struct cls_fl_head *head = container_of(rcu, struct cls_fl_head, rcu);
315
316 INIT_WORK(&head->work, fl_destroy_sleepable);
317 schedule_work(&head->work);
318}
319
320static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) 335static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
321{ 336{
322 struct cls_fl_head *head = rtnl_dereference(tp->root); 337 struct cls_fl_head *head = rtnl_dereference(tp->root);
338 struct fl_flow_mask *mask, *next_mask;
323 struct cls_fl_filter *f, *next; 339 struct cls_fl_filter *f, *next;
324 340
325 list_for_each_entry_safe(f, next, &head->filters, list) 341 list_for_each_entry_safe(mask, next_mask, &head->masks, list) {
326 __fl_delete(tp, f, extack); 342 list_for_each_entry_safe(f, next, &mask->filters, list) {
343 if (__fl_delete(tp, f, extack))
344 break;
345 }
346 }
327 idr_destroy(&head->handle_idr); 347 idr_destroy(&head->handle_idr);
328 348
329 __module_get(THIS_MODULE); 349 __module_get(THIS_MODULE);
330 call_rcu(&head->rcu, fl_destroy_rcu); 350 tcf_queue_work(&head->rwork, fl_destroy_sleepable);
331} 351}
332 352
333static void *fl_get(struct tcf_proto *tp, u32 handle) 353static void *fl_get(struct tcf_proto *tp, u32 handle)
@@ -715,14 +735,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
715 return ret; 735 return ret;
716} 736}
717 737
718static bool fl_mask_eq(struct fl_flow_mask *mask1, 738static void fl_mask_copy(struct fl_flow_mask *dst,
719 struct fl_flow_mask *mask2) 739 struct fl_flow_mask *src)
720{ 740{
721 const long *lmask1 = fl_key_get_start(&mask1->key, mask1); 741 const void *psrc = fl_key_get_start(&src->key, src);
722 const long *lmask2 = fl_key_get_start(&mask2->key, mask2); 742 void *pdst = fl_key_get_start(&dst->key, src);
723 743
724 return !memcmp(&mask1->range, &mask2->range, sizeof(mask1->range)) && 744 memcpy(pdst, psrc, fl_mask_range(src));
725 !memcmp(lmask1, lmask2, fl_mask_range(mask1)); 745 dst->range = src->range;
726} 746}
727 747
728static const struct rhashtable_params fl_ht_params = { 748static const struct rhashtable_params fl_ht_params = {
@@ -731,14 +751,13 @@ static const struct rhashtable_params fl_ht_params = {
731 .automatic_shrinking = true, 751 .automatic_shrinking = true,
732}; 752};
733 753
734static int fl_init_hashtable(struct cls_fl_head *head, 754static int fl_init_mask_hashtable(struct fl_flow_mask *mask)
735 struct fl_flow_mask *mask)
736{ 755{
737 head->ht_params = fl_ht_params; 756 mask->filter_ht_params = fl_ht_params;
738 head->ht_params.key_len = fl_mask_range(mask); 757 mask->filter_ht_params.key_len = fl_mask_range(mask);
739 head->ht_params.key_offset += mask->range.start; 758 mask->filter_ht_params.key_offset += mask->range.start;
740 759
741 return rhashtable_init(&head->ht, &head->ht_params); 760 return rhashtable_init(&mask->ht, &mask->filter_ht_params);
742} 761}
743 762
744#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member) 763#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member)
@@ -761,8 +780,7 @@ static int fl_init_hashtable(struct cls_fl_head *head,
761 FL_KEY_SET(keys, cnt, id, member); \ 780 FL_KEY_SET(keys, cnt, id, member); \
762 } while(0); 781 } while(0);
763 782
764static void fl_init_dissector(struct cls_fl_head *head, 783static void fl_init_dissector(struct fl_flow_mask *mask)
765 struct fl_flow_mask *mask)
766{ 784{
767 struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX]; 785 struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX];
768 size_t cnt = 0; 786 size_t cnt = 0;
@@ -802,31 +820,66 @@ static void fl_init_dissector(struct cls_fl_head *head,
802 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 820 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
803 FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp); 821 FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp);
804 822
805 skb_flow_dissector_init(&head->dissector, keys, cnt); 823 skb_flow_dissector_init(&mask->dissector, keys, cnt);
824}
825
826static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
827 struct fl_flow_mask *mask)
828{
829 struct fl_flow_mask *newmask;
830 int err;
831
832 newmask = kzalloc(sizeof(*newmask), GFP_KERNEL);
833 if (!newmask)
834 return ERR_PTR(-ENOMEM);
835
836 fl_mask_copy(newmask, mask);
837
838 err = fl_init_mask_hashtable(newmask);
839 if (err)
840 goto errout_free;
841
842 fl_init_dissector(newmask);
843
844 INIT_LIST_HEAD_RCU(&newmask->filters);
845
846 err = rhashtable_insert_fast(&head->ht, &newmask->ht_node,
847 mask_ht_params);
848 if (err)
849 goto errout_destroy;
850
851 list_add_tail_rcu(&newmask->list, &head->masks);
852
853 return newmask;
854
855errout_destroy:
856 rhashtable_destroy(&newmask->ht);
857errout_free:
858 kfree(newmask);
859
860 return ERR_PTR(err);
806} 861}
807 862
808static int fl_check_assign_mask(struct cls_fl_head *head, 863static int fl_check_assign_mask(struct cls_fl_head *head,
864 struct cls_fl_filter *fnew,
865 struct cls_fl_filter *fold,
809 struct fl_flow_mask *mask) 866 struct fl_flow_mask *mask)
810{ 867{
811 int err; 868 struct fl_flow_mask *newmask;
812 869
813 if (head->mask_assigned) { 870 fnew->mask = rhashtable_lookup_fast(&head->ht, mask, mask_ht_params);
814 if (!fl_mask_eq(&head->mask, mask)) 871 if (!fnew->mask) {
872 if (fold)
815 return -EINVAL; 873 return -EINVAL;
816 else
817 return 0;
818 }
819 874
820 /* Mask is not assigned yet. So assign it and init hashtable 875 newmask = fl_create_new_mask(head, mask);
821 * according to that. 876 if (IS_ERR(newmask))
822 */ 877 return PTR_ERR(newmask);
823 err = fl_init_hashtable(head, mask);
824 if (err)
825 return err;
826 memcpy(&head->mask, mask, sizeof(head->mask));
827 head->mask_assigned = true;
828 878
829 fl_init_dissector(head, mask); 879 fnew->mask = newmask;
880 } else if (fold && fold->mask != fnew->mask) {
881 return -EINVAL;
882 }
830 883
831 return 0; 884 return 0;
832} 885}
@@ -924,30 +977,26 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
924 if (err) 977 if (err)
925 goto errout_idr; 978 goto errout_idr;
926 979
927 err = fl_check_assign_mask(head, &mask); 980 err = fl_check_assign_mask(head, fnew, fold, &mask);
928 if (err) 981 if (err)
929 goto errout_idr; 982 goto errout_idr;
930 983
931 if (!tc_skip_sw(fnew->flags)) { 984 if (!tc_skip_sw(fnew->flags)) {
932 if (!fold && fl_lookup(head, &fnew->mkey)) { 985 if (!fold && fl_lookup(fnew->mask, &fnew->mkey)) {
933 err = -EEXIST; 986 err = -EEXIST;
934 goto errout_idr; 987 goto errout_mask;
935 } 988 }
936 989
937 err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, 990 err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node,
938 head->ht_params); 991 fnew->mask->filter_ht_params);
939 if (err) 992 if (err)
940 goto errout_idr; 993 goto errout_mask;
941 } 994 }
942 995
943 if (!tc_skip_hw(fnew->flags)) { 996 if (!tc_skip_hw(fnew->flags)) {
944 err = fl_hw_replace_filter(tp, 997 err = fl_hw_replace_filter(tp, fnew, extack);
945 &head->dissector,
946 &mask.key,
947 fnew,
948 extack);
949 if (err) 998 if (err)
950 goto errout_idr; 999 goto errout_mask;
951 } 1000 }
952 1001
953 if (!tc_in_hw(fnew->flags)) 1002 if (!tc_in_hw(fnew->flags))
@@ -955,8 +1004,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
955 1004
956 if (fold) { 1005 if (fold) {
957 if (!tc_skip_sw(fold->flags)) 1006 if (!tc_skip_sw(fold->flags))
958 rhashtable_remove_fast(&head->ht, &fold->ht_node, 1007 rhashtable_remove_fast(&fold->mask->ht,
959 head->ht_params); 1008 &fold->ht_node,
1009 fold->mask->filter_ht_params);
960 if (!tc_skip_hw(fold->flags)) 1010 if (!tc_skip_hw(fold->flags))
961 fl_hw_destroy_filter(tp, fold, NULL); 1011 fl_hw_destroy_filter(tp, fold, NULL);
962 } 1012 }
@@ -968,14 +1018,17 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
968 list_replace_rcu(&fold->list, &fnew->list); 1018 list_replace_rcu(&fold->list, &fnew->list);
969 tcf_unbind_filter(tp, &fold->res); 1019 tcf_unbind_filter(tp, &fold->res);
970 tcf_exts_get_net(&fold->exts); 1020 tcf_exts_get_net(&fold->exts);
971 call_rcu(&fold->rcu, fl_destroy_filter); 1021 tcf_queue_work(&fold->rwork, fl_destroy_filter_work);
972 } else { 1022 } else {
973 list_add_tail_rcu(&fnew->list, &head->filters); 1023 list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
974 } 1024 }
975 1025
976 kfree(tb); 1026 kfree(tb);
977 return 0; 1027 return 0;
978 1028
1029errout_mask:
1030 fl_mask_put(head, fnew->mask, false);
1031
979errout_idr: 1032errout_idr:
980 if (!fold) 1033 if (!fold)
981 idr_remove(&head->handle_idr, fnew->handle); 1034 idr_remove(&head->handle_idr, fnew->handle);
@@ -994,10 +1047,10 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last,
994 struct cls_fl_filter *f = arg; 1047 struct cls_fl_filter *f = arg;
995 1048
996 if (!tc_skip_sw(f->flags)) 1049 if (!tc_skip_sw(f->flags))
997 rhashtable_remove_fast(&head->ht, &f->ht_node, 1050 rhashtable_remove_fast(&f->mask->ht, &f->ht_node,
998 head->ht_params); 1051 f->mask->filter_ht_params);
999 __fl_delete(tp, f, extack); 1052 __fl_delete(tp, f, extack);
1000 *last = list_empty(&head->filters); 1053 *last = list_empty(&head->masks);
1001 return 0; 1054 return 0;
1002} 1055}
1003 1056
@@ -1005,16 +1058,19 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg)
1005{ 1058{
1006 struct cls_fl_head *head = rtnl_dereference(tp->root); 1059 struct cls_fl_head *head = rtnl_dereference(tp->root);
1007 struct cls_fl_filter *f; 1060 struct cls_fl_filter *f;
1008 1061 struct fl_flow_mask *mask;
1009 list_for_each_entry_rcu(f, &head->filters, list) { 1062
1010 if (arg->count < arg->skip) 1063 list_for_each_entry_rcu(mask, &head->masks, list) {
1011 goto skip; 1064 list_for_each_entry_rcu(f, &mask->filters, list) {
1012 if (arg->fn(tp, f, arg) < 0) { 1065 if (arg->count < arg->skip)
1013 arg->stop = 1; 1066 goto skip;
1014 break; 1067 if (arg->fn(tp, f, arg) < 0) {
1015 } 1068 arg->stop = 1;
1069 break;
1070 }
1016skip: 1071skip:
1017 arg->count++; 1072 arg->count++;
1073 }
1018 } 1074 }
1019} 1075}
1020 1076
@@ -1150,7 +1206,6 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask)
1150static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, 1206static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
1151 struct sk_buff *skb, struct tcmsg *t) 1207 struct sk_buff *skb, struct tcmsg *t)
1152{ 1208{
1153 struct cls_fl_head *head = rtnl_dereference(tp->root);
1154 struct cls_fl_filter *f = fh; 1209 struct cls_fl_filter *f = fh;
1155 struct nlattr *nest; 1210 struct nlattr *nest;
1156 struct fl_flow_key *key, *mask; 1211 struct fl_flow_key *key, *mask;
@@ -1169,7 +1224,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
1169 goto nla_put_failure; 1224 goto nla_put_failure;
1170 1225
1171 key = &f->key; 1226 key = &f->key;
1172 mask = &head->mask.key; 1227 mask = &f->mask->key;
1173 1228
1174 if (mask->indev_ifindex) { 1229 if (mask->indev_ifindex) {
1175 struct net_device *dev; 1230 struct net_device *dev;
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 8b207723fbc2..29eeeaf3ea44 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -47,10 +47,7 @@ struct fw_filter {
47#endif /* CONFIG_NET_CLS_IND */ 47#endif /* CONFIG_NET_CLS_IND */
48 struct tcf_exts exts; 48 struct tcf_exts exts;
49 struct tcf_proto *tp; 49 struct tcf_proto *tp;
50 union { 50 struct rcu_work rwork;
51 struct work_struct work;
52 struct rcu_head rcu;
53 };
54}; 51};
55 52
56static u32 fw_hash(u32 handle) 53static u32 fw_hash(u32 handle)
@@ -134,21 +131,14 @@ static void __fw_delete_filter(struct fw_filter *f)
134 131
135static void fw_delete_filter_work(struct work_struct *work) 132static void fw_delete_filter_work(struct work_struct *work)
136{ 133{
137 struct fw_filter *f = container_of(work, struct fw_filter, work); 134 struct fw_filter *f = container_of(to_rcu_work(work),
138 135 struct fw_filter,
136 rwork);
139 rtnl_lock(); 137 rtnl_lock();
140 __fw_delete_filter(f); 138 __fw_delete_filter(f);
141 rtnl_unlock(); 139 rtnl_unlock();
142} 140}
143 141
144static void fw_delete_filter(struct rcu_head *head)
145{
146 struct fw_filter *f = container_of(head, struct fw_filter, rcu);
147
148 INIT_WORK(&f->work, fw_delete_filter_work);
149 tcf_queue_work(&f->work);
150}
151
152static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) 142static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
153{ 143{
154 struct fw_head *head = rtnl_dereference(tp->root); 144 struct fw_head *head = rtnl_dereference(tp->root);
@@ -164,7 +154,7 @@ static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
164 rtnl_dereference(f->next)); 154 rtnl_dereference(f->next));
165 tcf_unbind_filter(tp, &f->res); 155 tcf_unbind_filter(tp, &f->res);
166 if (tcf_exts_get_net(&f->exts)) 156 if (tcf_exts_get_net(&f->exts))
167 call_rcu(&f->rcu, fw_delete_filter); 157 tcf_queue_work(&f->rwork, fw_delete_filter_work);
168 else 158 else
169 __fw_delete_filter(f); 159 __fw_delete_filter(f);
170 } 160 }
@@ -193,7 +183,7 @@ static int fw_delete(struct tcf_proto *tp, void *arg, bool *last,
193 RCU_INIT_POINTER(*fp, rtnl_dereference(f->next)); 183 RCU_INIT_POINTER(*fp, rtnl_dereference(f->next));
194 tcf_unbind_filter(tp, &f->res); 184 tcf_unbind_filter(tp, &f->res);
195 tcf_exts_get_net(&f->exts); 185 tcf_exts_get_net(&f->exts);
196 call_rcu(&f->rcu, fw_delete_filter); 186 tcf_queue_work(&f->rwork, fw_delete_filter_work);
197 ret = 0; 187 ret = 0;
198 break; 188 break;
199 } 189 }
@@ -316,7 +306,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
316 rcu_assign_pointer(*fp, fnew); 306 rcu_assign_pointer(*fp, fnew);
317 tcf_unbind_filter(tp, &f->res); 307 tcf_unbind_filter(tp, &f->res);
318 tcf_exts_get_net(&f->exts); 308 tcf_exts_get_net(&f->exts);
319 call_rcu(&f->rcu, fw_delete_filter); 309 tcf_queue_work(&f->rwork, fw_delete_filter_work);
320 310
321 *arg = fnew; 311 *arg = fnew;
322 return err; 312 return err;
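
This cls_fw conversion, repeated below for matchall, route, rsvp, tcindex and u32, collapses the old work_struct/rcu_head union into a single rcu_work and drops the intermediate call_rcu callback: destruction is queued directly with tcf_queue_work(&f->rwork, <work fn>), and the work handler recovers its filter with container_of(to_rcu_work(work), ...). A compilable userspace sketch of that recovery pattern (the work_struct/rcu_work definitions here are simplified stand-ins for the kernel's, kept only so the example builds):

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work_struct { int pending; };
struct rcu_work { struct work_struct work; };

#define to_rcu_work(w) container_of((w), struct rcu_work, work)

struct fw_filter {
	unsigned int id;
	struct rcu_work rwork;	/* replaces the old work_struct/rcu_head union */
};

static void fw_delete_filter_work(struct work_struct *work)
{
	/* Map work -> embedded rcu_work -> containing filter. */
	struct fw_filter *f = container_of(to_rcu_work(work),
					   struct fw_filter, rwork);

	printf("freeing filter %u\n", f->id);
}

int main(void)
{
	struct fw_filter f = { .id = 7 };

	/* A real workqueue would invoke this after an RCU grace period. */
	fw_delete_filter_work(&f.rwork.work);
	return 0;
}
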
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 2ba721a590a7..47b207ef7762 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -21,10 +21,7 @@ struct cls_mall_head {
21 struct tcf_result res; 21 struct tcf_result res;
22 u32 handle; 22 u32 handle;
23 u32 flags; 23 u32 flags;
24 union { 24 struct rcu_work rwork;
25 struct work_struct work;
26 struct rcu_head rcu;
27 };
28}; 25};
29 26
30static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, 27static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -53,22 +50,14 @@ static void __mall_destroy(struct cls_mall_head *head)
53 50
54static void mall_destroy_work(struct work_struct *work) 51static void mall_destroy_work(struct work_struct *work)
55{ 52{
56 struct cls_mall_head *head = container_of(work, struct cls_mall_head, 53 struct cls_mall_head *head = container_of(to_rcu_work(work),
57 work); 54 struct cls_mall_head,
55 rwork);
58 rtnl_lock(); 56 rtnl_lock();
59 __mall_destroy(head); 57 __mall_destroy(head);
60 rtnl_unlock(); 58 rtnl_unlock();
61} 59}
62 60
63static void mall_destroy_rcu(struct rcu_head *rcu)
64{
65 struct cls_mall_head *head = container_of(rcu, struct cls_mall_head,
66 rcu);
67
68 INIT_WORK(&head->work, mall_destroy_work);
69 tcf_queue_work(&head->work);
70}
71
72static void mall_destroy_hw_filter(struct tcf_proto *tp, 61static void mall_destroy_hw_filter(struct tcf_proto *tp,
73 struct cls_mall_head *head, 62 struct cls_mall_head *head,
74 unsigned long cookie, 63 unsigned long cookie,
@@ -126,7 +115,7 @@ static void mall_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
126 mall_destroy_hw_filter(tp, head, (unsigned long) head, extack); 115 mall_destroy_hw_filter(tp, head, (unsigned long) head, extack);
127 116
128 if (tcf_exts_get_net(&head->exts)) 117 if (tcf_exts_get_net(&head->exts))
129 call_rcu(&head->rcu, mall_destroy_rcu); 118 tcf_queue_work(&head->rwork, mall_destroy_work);
130 else 119 else
131 __mall_destroy(head); 120 __mall_destroy(head);
132} 121}
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 21a03a8ee029..0404aa5fa7cb 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -57,10 +57,7 @@ struct route4_filter {
57 u32 handle; 57 u32 handle;
58 struct route4_bucket *bkt; 58 struct route4_bucket *bkt;
59 struct tcf_proto *tp; 59 struct tcf_proto *tp;
60 union { 60 struct rcu_work rwork;
61 struct work_struct work;
62 struct rcu_head rcu;
63 };
64}; 61};
65 62
66#define ROUTE4_FAILURE ((struct route4_filter *)(-1L)) 63#define ROUTE4_FAILURE ((struct route4_filter *)(-1L))
@@ -266,19 +263,17 @@ static void __route4_delete_filter(struct route4_filter *f)
266 263
267static void route4_delete_filter_work(struct work_struct *work) 264static void route4_delete_filter_work(struct work_struct *work)
268{ 265{
269 struct route4_filter *f = container_of(work, struct route4_filter, work); 266 struct route4_filter *f = container_of(to_rcu_work(work),
270 267 struct route4_filter,
268 rwork);
271 rtnl_lock(); 269 rtnl_lock();
272 __route4_delete_filter(f); 270 __route4_delete_filter(f);
273 rtnl_unlock(); 271 rtnl_unlock();
274} 272}
275 273
276static void route4_delete_filter(struct rcu_head *head) 274static void route4_queue_work(struct route4_filter *f)
277{ 275{
278 struct route4_filter *f = container_of(head, struct route4_filter, rcu); 276 tcf_queue_work(&f->rwork, route4_delete_filter_work);
279
280 INIT_WORK(&f->work, route4_delete_filter_work);
281 tcf_queue_work(&f->work);
282} 277}
283 278
284static void route4_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) 279static void route4_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
@@ -304,7 +299,7 @@ static void route4_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
304 RCU_INIT_POINTER(b->ht[h2], next); 299 RCU_INIT_POINTER(b->ht[h2], next);
305 tcf_unbind_filter(tp, &f->res); 300 tcf_unbind_filter(tp, &f->res);
306 if (tcf_exts_get_net(&f->exts)) 301 if (tcf_exts_get_net(&f->exts))
307 call_rcu(&f->rcu, route4_delete_filter); 302 route4_queue_work(f);
308 else 303 else
309 __route4_delete_filter(f); 304 __route4_delete_filter(f);
310 } 305 }
@@ -349,7 +344,7 @@ static int route4_delete(struct tcf_proto *tp, void *arg, bool *last,
349 /* Delete it */ 344 /* Delete it */
350 tcf_unbind_filter(tp, &f->res); 345 tcf_unbind_filter(tp, &f->res);
351 tcf_exts_get_net(&f->exts); 346 tcf_exts_get_net(&f->exts);
352 call_rcu(&f->rcu, route4_delete_filter); 347 tcf_queue_work(&f->rwork, route4_delete_filter_work);
353 348
354 /* Strip RTNL protected tree */ 349 /* Strip RTNL protected tree */
355 for (i = 0; i <= 32; i++) { 350 for (i = 0; i <= 32; i++) {
@@ -554,7 +549,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
554 if (fold) { 549 if (fold) {
555 tcf_unbind_filter(tp, &fold->res); 550 tcf_unbind_filter(tp, &fold->res);
556 tcf_exts_get_net(&fold->exts); 551 tcf_exts_get_net(&fold->exts);
557 call_rcu(&fold->rcu, route4_delete_filter); 552 tcf_queue_work(&fold->rwork, route4_delete_filter_work);
558 } 553 }
559 return 0; 554 return 0;
560 555
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 4f1297657c27..e9ccf7daea7d 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -97,10 +97,7 @@ struct rsvp_filter {
97 97
98 u32 handle; 98 u32 handle;
99 struct rsvp_session *sess; 99 struct rsvp_session *sess;
100 union { 100 struct rcu_work rwork;
101 struct work_struct work;
102 struct rcu_head rcu;
103 };
104}; 101};
105 102
106static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid) 103static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
@@ -294,21 +291,14 @@ static void __rsvp_delete_filter(struct rsvp_filter *f)
294 291
295static void rsvp_delete_filter_work(struct work_struct *work) 292static void rsvp_delete_filter_work(struct work_struct *work)
296{ 293{
297 struct rsvp_filter *f = container_of(work, struct rsvp_filter, work); 294 struct rsvp_filter *f = container_of(to_rcu_work(work),
298 295 struct rsvp_filter,
296 rwork);
299 rtnl_lock(); 297 rtnl_lock();
300 __rsvp_delete_filter(f); 298 __rsvp_delete_filter(f);
301 rtnl_unlock(); 299 rtnl_unlock();
302} 300}
303 301
304static void rsvp_delete_filter_rcu(struct rcu_head *head)
305{
306 struct rsvp_filter *f = container_of(head, struct rsvp_filter, rcu);
307
308 INIT_WORK(&f->work, rsvp_delete_filter_work);
309 tcf_queue_work(&f->work);
310}
311
312static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) 302static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
313{ 303{
314 tcf_unbind_filter(tp, &f->res); 304 tcf_unbind_filter(tp, &f->res);
@@ -317,7 +307,7 @@ static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
317 * in cleanup() callback 307 * in cleanup() callback
318 */ 308 */
319 if (tcf_exts_get_net(&f->exts)) 309 if (tcf_exts_get_net(&f->exts))
320 call_rcu(&f->rcu, rsvp_delete_filter_rcu); 310 tcf_queue_work(&f->rwork, rsvp_delete_filter_work);
321 else 311 else
322 __rsvp_delete_filter(f); 312 __rsvp_delete_filter(f);
323} 313}
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index b49cc990a000..32f4bbd82f35 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -28,20 +28,14 @@
28struct tcindex_filter_result { 28struct tcindex_filter_result {
29 struct tcf_exts exts; 29 struct tcf_exts exts;
30 struct tcf_result res; 30 struct tcf_result res;
31 union { 31 struct rcu_work rwork;
32 struct work_struct work;
33 struct rcu_head rcu;
34 };
35}; 32};
36 33
37struct tcindex_filter { 34struct tcindex_filter {
38 u16 key; 35 u16 key;
39 struct tcindex_filter_result result; 36 struct tcindex_filter_result result;
40 struct tcindex_filter __rcu *next; 37 struct tcindex_filter __rcu *next;
41 union { 38 struct rcu_work rwork;
42 struct work_struct work;
43 struct rcu_head rcu;
44 };
45}; 39};
46 40
47 41
@@ -152,21 +146,14 @@ static void tcindex_destroy_rexts_work(struct work_struct *work)
152{ 146{
153 struct tcindex_filter_result *r; 147 struct tcindex_filter_result *r;
154 148
155 r = container_of(work, struct tcindex_filter_result, work); 149 r = container_of(to_rcu_work(work),
150 struct tcindex_filter_result,
151 rwork);
156 rtnl_lock(); 152 rtnl_lock();
157 __tcindex_destroy_rexts(r); 153 __tcindex_destroy_rexts(r);
158 rtnl_unlock(); 154 rtnl_unlock();
159} 155}
160 156
161static void tcindex_destroy_rexts(struct rcu_head *head)
162{
163 struct tcindex_filter_result *r;
164
165 r = container_of(head, struct tcindex_filter_result, rcu);
166 INIT_WORK(&r->work, tcindex_destroy_rexts_work);
167 tcf_queue_work(&r->work);
168}
169
170static void __tcindex_destroy_fexts(struct tcindex_filter *f) 157static void __tcindex_destroy_fexts(struct tcindex_filter *f)
171{ 158{
172 tcf_exts_destroy(&f->result.exts); 159 tcf_exts_destroy(&f->result.exts);
@@ -176,23 +163,15 @@ static void __tcindex_destroy_fexts(struct tcindex_filter *f)
176 163
177static void tcindex_destroy_fexts_work(struct work_struct *work) 164static void tcindex_destroy_fexts_work(struct work_struct *work)
178{ 165{
179 struct tcindex_filter *f = container_of(work, struct tcindex_filter, 166 struct tcindex_filter *f = container_of(to_rcu_work(work),
180 work); 167 struct tcindex_filter,
168 rwork);
181 169
182 rtnl_lock(); 170 rtnl_lock();
183 __tcindex_destroy_fexts(f); 171 __tcindex_destroy_fexts(f);
184 rtnl_unlock(); 172 rtnl_unlock();
185} 173}
186 174
187static void tcindex_destroy_fexts(struct rcu_head *head)
188{
189 struct tcindex_filter *f = container_of(head, struct tcindex_filter,
190 rcu);
191
192 INIT_WORK(&f->work, tcindex_destroy_fexts_work);
193 tcf_queue_work(&f->work);
194}
195
196static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last, 175static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last,
197 struct netlink_ext_ack *extack) 176 struct netlink_ext_ack *extack)
198{ 177{
@@ -228,12 +207,12 @@ found:
228 */ 207 */
229 if (f) { 208 if (f) {
230 if (tcf_exts_get_net(&f->result.exts)) 209 if (tcf_exts_get_net(&f->result.exts))
231 call_rcu(&f->rcu, tcindex_destroy_fexts); 210 tcf_queue_work(&f->rwork, tcindex_destroy_fexts_work);
232 else 211 else
233 __tcindex_destroy_fexts(f); 212 __tcindex_destroy_fexts(f);
234 } else { 213 } else {
235 if (tcf_exts_get_net(&r->exts)) 214 if (tcf_exts_get_net(&r->exts))
236 call_rcu(&r->rcu, tcindex_destroy_rexts); 215 tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
237 else 216 else
238 __tcindex_destroy_rexts(r); 217 __tcindex_destroy_rexts(r);
239 } 218 }
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index bac47b5d18fd..fb861f90fde6 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -68,10 +68,7 @@ struct tc_u_knode {
68 u32 __percpu *pcpu_success; 68 u32 __percpu *pcpu_success;
69#endif 69#endif
70 struct tcf_proto *tp; 70 struct tcf_proto *tp;
71 union { 71 struct rcu_work rwork;
72 struct work_struct work;
73 struct rcu_head rcu;
74 };
75 /* The 'sel' field MUST be the last field in structure to allow for 72 /* The 'sel' field MUST be the last field in structure to allow for
76 * tc_u32_keys allocated at end of structure. 73 * tc_u32_keys allocated at end of structure.
77 */ 74 */
@@ -436,21 +433,14 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n,
436 */ 433 */
437static void u32_delete_key_work(struct work_struct *work) 434static void u32_delete_key_work(struct work_struct *work)
438{ 435{
439 struct tc_u_knode *key = container_of(work, struct tc_u_knode, work); 436 struct tc_u_knode *key = container_of(to_rcu_work(work),
440 437 struct tc_u_knode,
438 rwork);
441 rtnl_lock(); 439 rtnl_lock();
442 u32_destroy_key(key->tp, key, false); 440 u32_destroy_key(key->tp, key, false);
443 rtnl_unlock(); 441 rtnl_unlock();
444} 442}
445 443
446static void u32_delete_key_rcu(struct rcu_head *rcu)
447{
448 struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);
449
450 INIT_WORK(&key->work, u32_delete_key_work);
451 tcf_queue_work(&key->work);
452}
453
454/* u32_delete_key_freepf_rcu is the rcu callback variant 444/* u32_delete_key_freepf_rcu is the rcu callback variant
455 * that free's the entire structure including the statistics 445 * that free's the entire structure including the statistics
456 * percpu variables. Only use this if the key is not a copy 446 * percpu variables. Only use this if the key is not a copy
@@ -460,21 +450,14 @@ static void u32_delete_key_rcu(struct rcu_head *rcu)
460 */ 450 */
461static void u32_delete_key_freepf_work(struct work_struct *work) 451static void u32_delete_key_freepf_work(struct work_struct *work)
462{ 452{
463 struct tc_u_knode *key = container_of(work, struct tc_u_knode, work); 453 struct tc_u_knode *key = container_of(to_rcu_work(work),
464 454 struct tc_u_knode,
455 rwork);
465 rtnl_lock(); 456 rtnl_lock();
466 u32_destroy_key(key->tp, key, true); 457 u32_destroy_key(key->tp, key, true);
467 rtnl_unlock(); 458 rtnl_unlock();
468} 459}
469 460
470static void u32_delete_key_freepf_rcu(struct rcu_head *rcu)
471{
472 struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);
473
474 INIT_WORK(&key->work, u32_delete_key_freepf_work);
475 tcf_queue_work(&key->work);
476}
477
478static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) 461static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
479{ 462{
480 struct tc_u_knode __rcu **kp; 463 struct tc_u_knode __rcu **kp;
@@ -491,7 +474,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
491 tcf_unbind_filter(tp, &key->res); 474 tcf_unbind_filter(tp, &key->res);
492 idr_remove(&ht->handle_idr, key->handle); 475 idr_remove(&ht->handle_idr, key->handle);
493 tcf_exts_get_net(&key->exts); 476 tcf_exts_get_net(&key->exts);
494 call_rcu(&key->rcu, u32_delete_key_freepf_rcu); 477 tcf_queue_work(&key->rwork, u32_delete_key_freepf_work);
495 return 0; 478 return 0;
496 } 479 }
497 } 480 }
@@ -611,7 +594,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
611 u32_remove_hw_knode(tp, n, extack); 594 u32_remove_hw_knode(tp, n, extack);
612 idr_remove(&ht->handle_idr, n->handle); 595 idr_remove(&ht->handle_idr, n->handle);
613 if (tcf_exts_get_net(&n->exts)) 596 if (tcf_exts_get_net(&n->exts))
614 call_rcu(&n->rcu, u32_delete_key_freepf_rcu); 597 tcf_queue_work(&n->rwork, u32_delete_key_freepf_work);
615 else 598 else
616 u32_destroy_key(n->tp, n, true); 599 u32_destroy_key(n->tp, n, true);
617 } 600 }
@@ -995,7 +978,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
995 u32_replace_knode(tp, tp_c, new); 978 u32_replace_knode(tp, tp_c, new);
996 tcf_unbind_filter(tp, &n->res); 979 tcf_unbind_filter(tp, &n->res);
997 tcf_exts_get_net(&n->exts); 980 tcf_exts_get_net(&n->exts);
998 call_rcu(&n->rcu, u32_delete_key_rcu); 981 tcf_queue_work(&n->rwork, u32_delete_key_work);
999 return 0; 982 return 0;
1000 } 983 }
1001 984
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 39c144b6ff98..69078c82963e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -346,9 +346,6 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
346 return false; 346 return false;
347 } 347 }
348 348
349 if (ret && netif_xmit_frozen_or_stopped(txq))
350 return false;
351
352 return true; 349 return true;
353} 350}
354 351
@@ -373,33 +370,24 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
373 */ 370 */
374static inline bool qdisc_restart(struct Qdisc *q, int *packets) 371static inline bool qdisc_restart(struct Qdisc *q, int *packets)
375{ 372{
376 bool more, validate, nolock = q->flags & TCQ_F_NOLOCK;
377 spinlock_t *root_lock = NULL; 373 spinlock_t *root_lock = NULL;
378 struct netdev_queue *txq; 374 struct netdev_queue *txq;
379 struct net_device *dev; 375 struct net_device *dev;
380 struct sk_buff *skb; 376 struct sk_buff *skb;
377 bool validate;
381 378
382 /* Dequeue packet */ 379 /* Dequeue packet */
383 if (nolock && test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
384 return false;
385
386 skb = dequeue_skb(q, &validate, packets); 380 skb = dequeue_skb(q, &validate, packets);
387 if (unlikely(!skb)) { 381 if (unlikely(!skb))
388 if (nolock)
389 clear_bit(__QDISC_STATE_RUNNING, &q->state);
390 return false; 382 return false;
391 }
392 383
393 if (!nolock) 384 if (!(q->flags & TCQ_F_NOLOCK))
394 root_lock = qdisc_lock(q); 385 root_lock = qdisc_lock(q);
395 386
396 dev = qdisc_dev(q); 387 dev = qdisc_dev(q);
397 txq = skb_get_tx_queue(dev, skb); 388 txq = skb_get_tx_queue(dev, skb);
398 389
399 more = sch_direct_xmit(skb, q, dev, txq, root_lock, validate); 390 return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
400 if (nolock)
401 clear_bit(__QDISC_STATE_RUNNING, &q->state);
402 return more;
403} 391}
404 392
405void __qdisc_run(struct Qdisc *q) 393void __qdisc_run(struct Qdisc *q)
@@ -665,7 +653,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
665 if (__skb_array_empty(q)) 653 if (__skb_array_empty(q))
666 continue; 654 continue;
667 655
668 skb = skb_array_consume_bh(q); 656 skb = __skb_array_consume(q);
669 } 657 }
670 if (likely(skb)) { 658 if (likely(skb)) {
671 qdisc_qstats_cpu_backlog_dec(qdisc, skb); 659 qdisc_qstats_cpu_backlog_dec(qdisc, skb);
@@ -706,7 +694,7 @@ static void pfifo_fast_reset(struct Qdisc *qdisc)
706 if (!q->ring.queue) 694 if (!q->ring.queue)
707 continue; 695 continue;
708 696
709 while ((skb = skb_array_consume_bh(q)) != NULL) 697 while ((skb = __skb_array_consume(q)) != NULL)
710 kfree_skb(skb); 698 kfree_skb(skb);
711 } 699 }
712 700
@@ -867,6 +855,11 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
867 lockdep_set_class(&sch->busylock, 855 lockdep_set_class(&sch->busylock,
868 dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); 856 dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
869 857
858 /* seqlock has the same scope of busylock, for NOLOCK qdisc */
859 spin_lock_init(&sch->seqlock);
860 lockdep_set_class(&sch->busylock,
861 dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
862
870 seqcount_init(&sch->running); 863 seqcount_init(&sch->running);
871 lockdep_set_class(&sch->running, 864 lockdep_set_class(&sch->running,
872 dev->qdisc_running_key ?: &qdisc_running_key); 865 dev->qdisc_running_key ?: &qdisc_running_key);
@@ -1106,6 +1099,10 @@ static void dev_deactivate_queue(struct net_device *dev,
1106 1099
1107 qdisc = rtnl_dereference(dev_queue->qdisc); 1100 qdisc = rtnl_dereference(dev_queue->qdisc);
1108 if (qdisc) { 1101 if (qdisc) {
1102 bool nolock = qdisc->flags & TCQ_F_NOLOCK;
1103
1104 if (nolock)
1105 spin_lock_bh(&qdisc->seqlock);
1109 spin_lock_bh(qdisc_lock(qdisc)); 1106 spin_lock_bh(qdisc_lock(qdisc));
1110 1107
1111 if (!(qdisc->flags & TCQ_F_BUILTIN)) 1108 if (!(qdisc->flags & TCQ_F_BUILTIN))
@@ -1115,6 +1112,8 @@ static void dev_deactivate_queue(struct net_device *dev,
1115 qdisc_reset(qdisc); 1112 qdisc_reset(qdisc);
1116 1113
1117 spin_unlock_bh(qdisc_lock(qdisc)); 1114 spin_unlock_bh(qdisc_lock(qdisc));
1115 if (nolock)
1116 spin_unlock_bh(&qdisc->seqlock);
1118 } 1117 }
1119} 1118}
1120 1119
@@ -1131,17 +1130,13 @@ static bool some_qdisc_is_busy(struct net_device *dev)
1131 dev_queue = netdev_get_tx_queue(dev, i); 1130 dev_queue = netdev_get_tx_queue(dev, i);
1132 q = dev_queue->qdisc_sleeping; 1131 q = dev_queue->qdisc_sleeping;
1133 1132
1134 if (q->flags & TCQ_F_NOLOCK) { 1133 root_lock = qdisc_lock(q);
1135 val = test_bit(__QDISC_STATE_SCHED, &q->state); 1134 spin_lock_bh(root_lock);
1136 } else {
1137 root_lock = qdisc_lock(q);
1138 spin_lock_bh(root_lock);
1139 1135
1140 val = (qdisc_is_running(q) || 1136 val = (qdisc_is_running(q) ||
1141 test_bit(__QDISC_STATE_SCHED, &q->state)); 1137 test_bit(__QDISC_STATE_SCHED, &q->state));
1142 1138
1143 spin_unlock_bh(root_lock); 1139 spin_unlock_bh(root_lock);
1144 }
1145 1140
1146 if (val) 1141 if (val)
1147 return true; 1142 return true;
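
The sch_generic hunks above move NOLOCK serialization out of qdisc_restart(): instead of the dequeue path taking and clearing the __QDISC_STATE_RUNNING bit itself, a per-qdisc seqlock (a spinlock scoped like busylock, initialized in qdisc_alloc()) is taken by callers such as dev_deactivate_queue(), so the fast path loses its test_and_set/clear_bit branches. A userspace analogy of the before/after shape, using a C11 atomic_flag for the old running bit (an illustration only, not the kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_flag running = ATOMIC_FLAG_INIT;

/* Old shape: every restart try-sets and clears the running bit itself. */
static bool restart_old(void)
{
	if (atomic_flag_test_and_set(&running))
		return false;		/* someone else is running this qdisc */
	/* ... dequeue and transmit one packet ... */
	atomic_flag_clear(&running);
	return true;
}

/* New shape: the caller already serializes the run (root lock or seqlock). */
static bool restart_new(void)
{
	/* ... dequeue and transmit one packet ... */
	return true;
}

int main(void)
{
	printf("old: %d, new: %d\n", restart_old(), restart_new());
	return 0;
}
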
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index f062a18e9162..d6b8ae4ed7a3 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -16,6 +16,7 @@
16#include <linux/errno.h> 16#include <linux/errno.h>
17#include <linux/skbuff.h> 17#include <linux/skbuff.h>
18#include <net/netlink.h> 18#include <net/netlink.h>
19#include <net/pkt_cls.h>
19#include <net/pkt_sched.h> 20#include <net/pkt_sched.h>
20#include <net/sch_generic.h> 21#include <net/sch_generic.h>
21 22
@@ -23,12 +24,44 @@ struct mq_sched {
23 struct Qdisc **qdiscs; 24 struct Qdisc **qdiscs;
24}; 25};
25 26
27static int mq_offload(struct Qdisc *sch, enum tc_mq_command cmd)
28{
29 struct net_device *dev = qdisc_dev(sch);
30 struct tc_mq_qopt_offload opt = {
31 .command = cmd,
32 .handle = sch->handle,
33 };
34
35 if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
36 return -EOPNOTSUPP;
37
38 return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt);
39}
40
41static void mq_offload_stats(struct Qdisc *sch)
42{
43 struct net_device *dev = qdisc_dev(sch);
44 struct tc_mq_qopt_offload opt = {
45 .command = TC_MQ_STATS,
46 .handle = sch->handle,
47 .stats = {
48 .bstats = &sch->bstats,
49 .qstats = &sch->qstats,
50 },
51 };
52
53 if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc)
54 dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt);
55}
56
26static void mq_destroy(struct Qdisc *sch) 57static void mq_destroy(struct Qdisc *sch)
27{ 58{
28 struct net_device *dev = qdisc_dev(sch); 59 struct net_device *dev = qdisc_dev(sch);
29 struct mq_sched *priv = qdisc_priv(sch); 60 struct mq_sched *priv = qdisc_priv(sch);
30 unsigned int ntx; 61 unsigned int ntx;
31 62
63 mq_offload(sch, TC_MQ_DESTROY);
64
32 if (!priv->qdiscs) 65 if (!priv->qdiscs)
33 return; 66 return;
34 for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++) 67 for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
@@ -70,6 +103,8 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt,
70 } 103 }
71 104
72 sch->flags |= TCQ_F_MQROOT; 105 sch->flags |= TCQ_F_MQROOT;
106
107 mq_offload(sch, TC_MQ_CREATE);
73 return 0; 108 return 0;
74} 109}
75 110
@@ -127,6 +162,7 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
127 sch->q.qlen += qdisc->q.qlen; 162 sch->q.qlen += qdisc->q.qlen;
128 sch->bstats.bytes += qdisc->bstats.bytes; 163 sch->bstats.bytes += qdisc->bstats.bytes;
129 sch->bstats.packets += qdisc->bstats.packets; 164 sch->bstats.packets += qdisc->bstats.packets;
165 sch->qstats.qlen += qdisc->qstats.qlen;
130 sch->qstats.backlog += qdisc->qstats.backlog; 166 sch->qstats.backlog += qdisc->qstats.backlog;
131 sch->qstats.drops += qdisc->qstats.drops; 167 sch->qstats.drops += qdisc->qstats.drops;
132 sch->qstats.requeues += qdisc->qstats.requeues; 168 sch->qstats.requeues += qdisc->qstats.requeues;
@@ -135,6 +171,7 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
135 171
136 spin_unlock_bh(qdisc_lock(qdisc)); 172 spin_unlock_bh(qdisc_lock(qdisc));
137 } 173 }
174 mq_offload_stats(sch);
138 175
139 return 0; 176 return 0;
140} 177}
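
The mq changes add offload hooks: mq_offload() passes TC_MQ_CREATE/TC_MQ_DESTROY to the driver through ndo_setup_tc(TC_SETUP_QDISC_MQ, ...), and mq_dump() now also aggregates qstats.qlen from the per-queue children before asking the driver for hardware counters via TC_MQ_STATS. A small runnable sketch of that per-queue aggregation (struct names are simplified stand-ins):

#include <stdio.h>

struct qstats { unsigned int qlen, backlog, drops; };
struct bstats { unsigned long long bytes, packets; };
struct qdisc  { struct bstats b; struct qstats q; };

/* Sum each child queue's counters into the root, as mq_dump() does. */
static void mq_sum(struct qdisc *root, struct qdisc *child, int nchild)
{
	for (int i = 0; i < nchild; i++) {
		root->b.bytes   += child[i].b.bytes;
		root->b.packets += child[i].b.packets;
		root->q.qlen    += child[i].q.qlen;	/* newly aggregated above */
		root->q.backlog += child[i].q.backlog;
		root->q.drops   += child[i].q.drops;
	}
}

int main(void)
{
	struct qdisc root = { 0 };
	struct qdisc kids[2] = {
		{ .b = { 1000, 10 }, .q = { 2, 300, 1 } },
		{ .b = { 2000, 20 }, .q = { 3, 400, 0 } },
	};

	mq_sum(&root, kids, 2);
	printf("bytes=%llu packets=%llu qlen=%u\n",
	       root.b.bytes, root.b.packets, root.q.qlen);
	return 0;
}
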
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index a47179da24e6..5d5a16204d50 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -652,33 +652,20 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
652 */ 652 */
653 peer->param_flags = asoc->param_flags; 653 peer->param_flags = asoc->param_flags;
654 654
655 sctp_transport_route(peer, NULL, sp);
656
657 /* Initialize the pmtu of the transport. */ 655 /* Initialize the pmtu of the transport. */
658 if (peer->param_flags & SPP_PMTUD_DISABLE) { 656 sctp_transport_route(peer, NULL, sp);
659 if (asoc->pathmtu)
660 peer->pathmtu = asoc->pathmtu;
661 else
662 peer->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
663 }
664 657
665 /* If this is the first transport addr on this association, 658 /* If this is the first transport addr on this association,
666 * initialize the association PMTU to the peer's PMTU. 659 * initialize the association PMTU to the peer's PMTU.
667 * If not and the current association PMTU is higher than the new 660 * If not and the current association PMTU is higher than the new
668 * peer's PMTU, reset the association PMTU to the new peer's PMTU. 661 * peer's PMTU, reset the association PMTU to the new peer's PMTU.
669 */ 662 */
670 if (asoc->pathmtu) 663 sctp_assoc_set_pmtu(asoc, asoc->pathmtu ?
671 asoc->pathmtu = min_t(int, peer->pathmtu, asoc->pathmtu); 664 min_t(int, peer->pathmtu, asoc->pathmtu) :
672 else 665 peer->pathmtu);
673 asoc->pathmtu = peer->pathmtu;
674
675 pr_debug("%s: association:%p PMTU set to %d\n", __func__, asoc,
676 asoc->pathmtu);
677 666
678 peer->pmtu_pending = 0; 667 peer->pmtu_pending = 0;
679 668
680 asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu);
681
682 /* The asoc->peer.port might not be meaningful yet, but 669 /* The asoc->peer.port might not be meaningful yet, but
683 * initialize the packet structure anyway. 670 * initialize the packet structure anyway.
684 */ 671 */
@@ -988,31 +975,6 @@ out:
988 return match; 975 return match;
989} 976}
990 977
991/* Is this the association we are looking for? */
992struct sctp_transport *sctp_assoc_is_match(struct sctp_association *asoc,
993 struct net *net,
994 const union sctp_addr *laddr,
995 const union sctp_addr *paddr)
996{
997 struct sctp_transport *transport;
998
999 if ((htons(asoc->base.bind_addr.port) == laddr->v4.sin_port) &&
1000 (htons(asoc->peer.port) == paddr->v4.sin_port) &&
1001 net_eq(sock_net(asoc->base.sk), net)) {
1002 transport = sctp_assoc_lookup_paddr(asoc, paddr);
1003 if (!transport)
1004 goto out;
1005
1006 if (sctp_bind_addr_match(&asoc->base.bind_addr, laddr,
1007 sctp_sk(asoc->base.sk)))
1008 goto out;
1009 }
1010 transport = NULL;
1011
1012out:
1013 return transport;
1014}
1015
1016/* Do delayed input processing. This is scheduled by sctp_rcv(). */ 978/* Do delayed input processing. This is scheduled by sctp_rcv(). */
1017static void sctp_assoc_bh_rcv(struct work_struct *work) 979static void sctp_assoc_bh_rcv(struct work_struct *work)
1018{ 980{
@@ -1434,6 +1396,31 @@ sctp_assoc_choose_alter_transport(struct sctp_association *asoc,
1434 } 1396 }
1435} 1397}
1436 1398
1399void sctp_assoc_update_frag_point(struct sctp_association *asoc)
1400{
1401 int frag = sctp_mtu_payload(sctp_sk(asoc->base.sk), asoc->pathmtu,
1402 sctp_datachk_len(&asoc->stream));
1403
1404 if (asoc->user_frag)
1405 frag = min_t(int, frag, asoc->user_frag);
1406
1407 frag = min_t(int, frag, SCTP_MAX_CHUNK_LEN -
1408 sctp_datachk_len(&asoc->stream));
1409
1410 asoc->frag_point = SCTP_TRUNC4(frag);
1411}
1412
1413void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu)
1414{
1415 if (asoc->pathmtu != pmtu) {
1416 asoc->pathmtu = pmtu;
1417 sctp_assoc_update_frag_point(asoc);
1418 }
1419
1420 pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc,
1421 asoc->pathmtu, asoc->frag_point);
1422}
1423
1437/* Update the association's pmtu and frag_point by going through all the 1424/* Update the association's pmtu and frag_point by going through all the
1438 * transports. This routine is called when a transport's PMTU has changed. 1425 * transports. This routine is called when a transport's PMTU has changed.
1439 */ 1426 */
@@ -1446,24 +1433,16 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
1446 return; 1433 return;
1447 1434
1448 /* Get the lowest pmtu of all the transports. */ 1435 /* Get the lowest pmtu of all the transports. */
1449 list_for_each_entry(t, &asoc->peer.transport_addr_list, 1436 list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) {
1450 transports) {
1451 if (t->pmtu_pending && t->dst) { 1437 if (t->pmtu_pending && t->dst) {
1452 sctp_transport_update_pmtu( 1438 sctp_transport_update_pmtu(t, sctp_dst_mtu(t->dst));
1453 t, SCTP_TRUNC4(dst_mtu(t->dst)));
1454 t->pmtu_pending = 0; 1439 t->pmtu_pending = 0;
1455 } 1440 }
1456 if (!pmtu || (t->pathmtu < pmtu)) 1441 if (!pmtu || (t->pathmtu < pmtu))
1457 pmtu = t->pathmtu; 1442 pmtu = t->pathmtu;
1458 } 1443 }
1459 1444
1460 if (pmtu) { 1445 sctp_assoc_set_pmtu(asoc, pmtu);
1461 asoc->pathmtu = pmtu;
1462 asoc->frag_point = sctp_frag_point(asoc, pmtu);
1463 }
1464
1465 pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc,
1466 asoc->pathmtu, asoc->frag_point);
1467} 1446}
1468 1447
1469/* Should we send a SACK to update our peer? */ 1448/* Should we send a SACK to update our peer? */
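
On the SCTP side, the pathmtu update is centralized in sctp_assoc_set_pmtu(), which recomputes asoc->frag_point via sctp_assoc_update_frag_point() whenever the PMTU actually changes; sctp_assoc_sync_pmtu() now only picks the minimum transport PMTU and calls it. A runnable sketch of the frag-point arithmetic, mirroring the per-chunk computation removed from sctp_datamsg_from_user() in the next file (header sizes here are illustrative IPv4 values, and the kernel's additional SCTP_MAX_CHUNK_LEN cap is omitted):

#include <stdio.h>

#define TRUNC4(x)	((x) & ~3U)	/* round down to a 4-byte boundary */

static unsigned int frag_point(unsigned int pathmtu, unsigned int ip_hdr,
			       unsigned int ip_opts, unsigned int datachk_hdr,
			       unsigned int user_frag)
{
	unsigned int sctp_hdr = 12;	/* sizeof(struct sctphdr) */
	unsigned int frag = pathmtu - ip_hdr - ip_opts - sctp_hdr - datachk_hdr;

	/* Cap by the user-requested fragment size, if any. */
	if (user_frag && user_frag < frag)
		frag = user_frag;

	return TRUNC4(frag);
}

int main(void)
{
	/* 1500-byte MTU, 20-byte IPv4 header, no options, 16-byte DATA header */
	printf("frag_point = %u\n", frag_point(1500, 20, 0, 16, 0));
	return 0;
}
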
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index be296d633e95..79daa98208c3 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -172,8 +172,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
172 struct list_head *pos, *temp; 172 struct list_head *pos, *temp;
173 struct sctp_chunk *chunk; 173 struct sctp_chunk *chunk;
174 struct sctp_datamsg *msg; 174 struct sctp_datamsg *msg;
175 struct sctp_sock *sp;
176 struct sctp_af *af;
177 int err; 175 int err;
178 176
179 msg = sctp_datamsg_new(GFP_KERNEL); 177 msg = sctp_datamsg_new(GFP_KERNEL);
@@ -192,12 +190,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
192 /* This is the biggest possible DATA chunk that can fit into 190 /* This is the biggest possible DATA chunk that can fit into
193 * the packet 191 * the packet
194 */ 192 */
195 sp = sctp_sk(asoc->base.sk); 193 max_data = asoc->frag_point;
196 af = sp->pf->af;
197 max_data = asoc->pathmtu - af->net_header_len -
198 sizeof(struct sctphdr) - sctp_datachk_len(&asoc->stream) -
199 af->ip_options_len(asoc->base.sk);
200 max_data = SCTP_TRUNC4(max_data);
201 194
202 /* If the peer requested that we authenticate DATA chunks 195 /* If the peer requested that we authenticate DATA chunks
203 * we need to account for bundling of the AUTH chunks along with 196 * we need to account for bundling of the AUTH chunks along with
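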
@@ -222,9 +215,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
222 } 215 }
223 } 216 }
224 217
225 /* Check what's our max considering the above */
226 max_data = min_t(size_t, max_data, asoc->frag_point);
227
228 /* Set first_len and then account for possible bundles on first frag */ 218 /* Set first_len and then account for possible bundles on first frag */
229 first_len = max_data; 219 first_len = max_data;
230 220
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 690d8557bb7b..e672dee302c7 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -90,8 +90,8 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
90{ 90{
91 struct sctp_transport *tp = packet->transport; 91 struct sctp_transport *tp = packet->transport;
92 struct sctp_association *asoc = tp->asoc; 92 struct sctp_association *asoc = tp->asoc;
93 struct sctp_sock *sp = NULL;
93 struct sock *sk; 94 struct sock *sk;
94 size_t overhead = sizeof(struct ipv6hdr) + sizeof(struct sctphdr);
95 95
96 pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag); 96 pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag);
97 packet->vtag = vtag; 97 packet->vtag = vtag;
@@ -102,28 +102,20 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
102 102
103 /* set packet max_size with pathmtu, then calculate overhead */ 103 /* set packet max_size with pathmtu, then calculate overhead */
104 packet->max_size = tp->pathmtu; 104 packet->max_size = tp->pathmtu;
105
105 if (asoc) { 106 if (asoc) {
106 struct sctp_sock *sp = sctp_sk(asoc->base.sk); 107 sk = asoc->base.sk;
107 struct sctp_af *af = sp->pf->af; 108 sp = sctp_sk(sk);
108
109 overhead = af->net_header_len +
110 af->ip_options_len(asoc->base.sk);
111 overhead += sizeof(struct sctphdr);
112 packet->overhead = overhead;
113 packet->size = overhead;
114 } else {
115 packet->overhead = overhead;
116 packet->size = overhead;
117 return;
118 } 109 }
110 packet->overhead = sctp_mtu_payload(sp, 0, 0);
111 packet->size = packet->overhead;
112
113 if (!asoc)
114 return;
119 115
120 /* update dst or transport pathmtu if in need */ 116 /* update dst or transport pathmtu if in need */
121 sk = asoc->base.sk;
122 if (!sctp_transport_dst_check(tp)) { 117 if (!sctp_transport_dst_check(tp)) {
123 sctp_transport_route(tp, NULL, sctp_sk(sk)); 118 sctp_transport_route(tp, NULL, sp);
124 if (asoc->param_flags & SPP_PMTUD_ENABLE)
125 sctp_assoc_sync_pmtu(asoc);
126 } else if (!sctp_transport_pmtu_check(tp)) {
127 if (asoc->param_flags & SPP_PMTUD_ENABLE) 119 if (asoc->param_flags & SPP_PMTUD_ENABLE)
128 sctp_assoc_sync_pmtu(asoc); 120 sctp_assoc_sync_pmtu(asoc);
129 } 121 }
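
sctp_packet_config() above now derives the packet overhead from sctp_mtu_payload(sp, 0, 0). The helper is not part of this hunk; judging only from this call site, passing mtu == 0 appears to yield the header overhead itself, while a real MTU would yield the remaining payload room. A userspace sketch of that assumed dual-use shape (header sizes made up, not taken from the kernel):

#include <stdio.h>

static unsigned int mtu_payload(unsigned int mtu, unsigned int net_hdr,
				unsigned int extra)
{
	unsigned int overhead = 12 /* sctphdr */ + net_hdr + extra;

	/* mtu == 0 asks for the overhead; otherwise return the payload room. */
	return mtu ? mtu - overhead : overhead;
}

int main(void)
{
	printf("overhead=%u payload=%u\n",
	       mtu_payload(0, 20, 0), mtu_payload(1500, 20, 0));
	return 0;
}
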
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index f211b3db6a35..d68aa33485a9 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -601,14 +601,14 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
601 601
602/* 602/*
603 * Transmit DATA chunks on the retransmit queue. Upon return from 603 * Transmit DATA chunks on the retransmit queue. Upon return from
604 * sctp_outq_flush_rtx() the packet 'pkt' may contain chunks which 604 * __sctp_outq_flush_rtx() the packet 'pkt' may contain chunks which
605 * need to be transmitted by the caller. 605 * need to be transmitted by the caller.
606 * We assume that pkt->transport has already been set. 606 * We assume that pkt->transport has already been set.
607 * 607 *
608 * The return value is a normal kernel error return value. 608 * The return value is a normal kernel error return value.
609 */ 609 */
610static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, 610static int __sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
611 int rtx_timeout, int *start_timer) 611 int rtx_timeout, int *start_timer, gfp_t gfp)
612{ 612{
613 struct sctp_transport *transport = pkt->transport; 613 struct sctp_transport *transport = pkt->transport;
614 struct sctp_chunk *chunk, *chunk1; 614 struct sctp_chunk *chunk, *chunk1;
@@ -684,12 +684,12 @@ redo:
684 * control chunks are already freed so there 684 * control chunks are already freed so there
685 * is nothing we can do. 685 * is nothing we can do.
686 */ 686 */
687 sctp_packet_transmit(pkt, GFP_ATOMIC); 687 sctp_packet_transmit(pkt, gfp);
688 goto redo; 688 goto redo;
689 } 689 }
690 690
691 /* Send this packet. */ 691 /* Send this packet. */
692 error = sctp_packet_transmit(pkt, GFP_ATOMIC); 692 error = sctp_packet_transmit(pkt, gfp);
693 693
694 /* If we are retransmitting, we should only 694 /* If we are retransmitting, we should only
695 * send a single packet. 695 * send a single packet.
@@ -705,7 +705,7 @@ redo:
705 705
706 case SCTP_XMIT_RWND_FULL: 706 case SCTP_XMIT_RWND_FULL:
707 /* Send this packet. */ 707 /* Send this packet. */
708 error = sctp_packet_transmit(pkt, GFP_ATOMIC); 708 error = sctp_packet_transmit(pkt, gfp);
709 709
710 /* Stop sending DATA as there is no more room 710 /* Stop sending DATA as there is no more room
711 * at the receiver. 711 * at the receiver.
@@ -715,7 +715,7 @@ redo:
715 715
716 case SCTP_XMIT_DELAY: 716 case SCTP_XMIT_DELAY:
717 /* Send this packet. */ 717 /* Send this packet. */
718 error = sctp_packet_transmit(pkt, GFP_ATOMIC); 718 error = sctp_packet_transmit(pkt, gfp);
719 719
720 /* Stop sending DATA because of nagle delay. */ 720 /* Stop sending DATA because of nagle delay. */
721 done = 1; 721 done = 1;
@@ -776,68 +776,43 @@ void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
776 sctp_outq_flush(q, 0, gfp); 776 sctp_outq_flush(q, 0, gfp);
777} 777}
778 778
779 779static int sctp_packet_singleton(struct sctp_transport *transport,
780/* 780 struct sctp_chunk *chunk, gfp_t gfp)
781 * Try to flush an outqueue.
782 *
783 * Description: Send everything in q which we legally can, subject to
784 * congestion limitations.
785 * * Note: This function can be called from multiple contexts so appropriate
786 * locking concerns must be made. Today we use the sock lock to protect
787 * this function.
788 */
789static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
790{ 781{
791 struct sctp_packet *packet; 782 const struct sctp_association *asoc = transport->asoc;
783 const __u16 sport = asoc->base.bind_addr.port;
784 const __u16 dport = asoc->peer.port;
785 const __u32 vtag = asoc->peer.i.init_tag;
792 struct sctp_packet singleton; 786 struct sctp_packet singleton;
793 struct sctp_association *asoc = q->asoc;
794 __u16 sport = asoc->base.bind_addr.port;
795 __u16 dport = asoc->peer.port;
796 __u32 vtag = asoc->peer.i.init_tag;
797 struct sctp_transport *transport = NULL;
798 struct sctp_transport *new_transport;
799 struct sctp_chunk *chunk, *tmp;
800 enum sctp_xmit status;
801 int error = 0;
802 int start_timer = 0;
803 int one_packet = 0;
804 787
788 sctp_packet_init(&singleton, transport, sport, dport);
789 sctp_packet_config(&singleton, vtag, 0);
790 sctp_packet_append_chunk(&singleton, chunk);
791 return sctp_packet_transmit(&singleton, gfp);
792}
793
794/* Struct to hold the context during sctp outq flush */
795struct sctp_flush_ctx {
796 struct sctp_outq *q;
797 /* Current transport being used. It's NOT the same as curr active one */
798 struct sctp_transport *transport;
805 /* These transports have chunks to send. */ 799 /* These transports have chunks to send. */
806 struct list_head transport_list; 800 struct list_head transport_list;
807 struct list_head *ltransport; 801 struct sctp_association *asoc;
808 802 /* Packet on the current transport above */
809 INIT_LIST_HEAD(&transport_list); 803 struct sctp_packet *packet;
810 packet = NULL; 804 gfp_t gfp;
811 805};
812 /*
813 * 6.10 Bundling
814 * ...
815 * When bundling control chunks with DATA chunks, an
816 * endpoint MUST place control chunks first in the outbound
817 * SCTP packet. The transmitter MUST transmit DATA chunks
818 * within a SCTP packet in increasing order of TSN.
819 * ...
820 */
821
822 list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
823 /* RFC 5061, 5.3
824 * F1) This means that until such time as the ASCONF
825 * containing the add is acknowledged, the sender MUST
826 * NOT use the new IP address as a source for ANY SCTP
827 * packet except on carrying an ASCONF Chunk.
828 */
829 if (asoc->src_out_of_asoc_ok &&
830 chunk->chunk_hdr->type != SCTP_CID_ASCONF)
831 continue;
832
833 list_del_init(&chunk->list);
834 806
835 /* Pick the right transport to use. */ 807/* transport: current transport */
836 new_transport = chunk->transport; 808static void sctp_outq_select_transport(struct sctp_flush_ctx *ctx,
809 struct sctp_chunk *chunk)
810{
811 struct sctp_transport *new_transport = chunk->transport;
837 812
838 if (!new_transport) { 813 if (!new_transport) {
839 /* 814 if (!sctp_chunk_is_data(chunk)) {
840 * If we have a prior transport pointer, see if 815 /* If we have a prior transport pointer, see if
841 * the destination address of the chunk 816 * the destination address of the chunk
842 * matches the destination address of the 817 * matches the destination address of the
843 * current transport. If not a match, then 818 * current transport. If not a match, then
@@ -846,22 +821,26 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
846 * after processing ASCONFs, we may have new 821 * after processing ASCONFs, we may have new
847 * transports created. 822 * transports created.
848 */ 823 */
849 if (transport && 824 if (ctx->transport && sctp_cmp_addr_exact(&chunk->dest,
850 sctp_cmp_addr_exact(&chunk->dest, 825 &ctx->transport->ipaddr))
851 &transport->ipaddr)) 826 new_transport = ctx->transport;
852 new_transport = transport;
853 else 827 else
854 new_transport = sctp_assoc_lookup_paddr(asoc, 828 new_transport = sctp_assoc_lookup_paddr(ctx->asoc,
855 &chunk->dest); 829 &chunk->dest);
830 }
856 831
857 /* if we still don't have a new transport, then 832 /* if we still don't have a new transport, then
858 * use the current active path. 833 * use the current active path.
859 */ 834 */
860 if (!new_transport) 835 if (!new_transport)
861 new_transport = asoc->peer.active_path; 836 new_transport = ctx->asoc->peer.active_path;
862 } else if ((new_transport->state == SCTP_INACTIVE) || 837 } else {
863 (new_transport->state == SCTP_UNCONFIRMED) || 838 __u8 type;
864 (new_transport->state == SCTP_PF)) { 839
840 switch (new_transport->state) {
841 case SCTP_INACTIVE:
842 case SCTP_UNCONFIRMED:
843 case SCTP_PF:
865 /* If the chunk is Heartbeat or Heartbeat Ack, 844 /* If the chunk is Heartbeat or Heartbeat Ack,
866 * send it to chunk->transport, even if it's 845 * send it to chunk->transport, even if it's
867 * inactive. 846 * inactive.
@@ -875,29 +854,64 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
875 * 854 *
876 * ASCONF_ACKs also must be sent to the source. 855 * ASCONF_ACKs also must be sent to the source.
877 */ 856 */
878 if (chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT && 857 type = chunk->chunk_hdr->type;
879 chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT_ACK && 858 if (type != SCTP_CID_HEARTBEAT &&
880 chunk->chunk_hdr->type != SCTP_CID_ASCONF_ACK) 859 type != SCTP_CID_HEARTBEAT_ACK &&
881 new_transport = asoc->peer.active_path; 860 type != SCTP_CID_ASCONF_ACK)
861 new_transport = ctx->asoc->peer.active_path;
862 break;
863 default:
864 break;
882 } 865 }
866 }
867
868 /* Are we switching transports? Take care of transport locks. */
869 if (new_transport != ctx->transport) {
870 ctx->transport = new_transport;
871 ctx->packet = &ctx->transport->packet;
883 872
884 /* Are we switching transports? 873 if (list_empty(&ctx->transport->send_ready))
885 * Take care of transport locks. 874 list_add_tail(&ctx->transport->send_ready,
875 &ctx->transport_list);
876
877 sctp_packet_config(ctx->packet,
878 ctx->asoc->peer.i.init_tag,
879 ctx->asoc->peer.ecn_capable);
880 /* We've switched transports, so apply the
881 * Burst limit to the new transport.
886 */ 882 */
887 if (new_transport != transport) { 883 sctp_transport_burst_limited(ctx->transport);
888 transport = new_transport; 884 }
889 if (list_empty(&transport->send_ready)) { 885}
890 list_add_tail(&transport->send_ready, 886
891 &transport_list); 887static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx)
892 } 888{
893 packet = &transport->packet; 889 struct sctp_chunk *chunk, *tmp;
894 sctp_packet_config(packet, vtag, 890 enum sctp_xmit status;
895 asoc->peer.ecn_capable); 891 int one_packet, error;
896 } 892
893 list_for_each_entry_safe(chunk, tmp, &ctx->q->control_chunk_list, list) {
894 one_packet = 0;
895
896 /* RFC 5061, 5.3
897 * F1) This means that until such time as the ASCONF
898 * containing the add is acknowledged, the sender MUST
899 * NOT use the new IP address as a source for ANY SCTP
900 * packet except on carrying an ASCONF Chunk.
901 */
902 if (ctx->asoc->src_out_of_asoc_ok &&
903 chunk->chunk_hdr->type != SCTP_CID_ASCONF)
904 continue;
905
906 list_del_init(&chunk->list);
907
908 /* Pick the right transport to use. Should always be true for
909 * the first chunk as we don't have a transport by then.
910 */
911 sctp_outq_select_transport(ctx, chunk);
897 912
898 switch (chunk->chunk_hdr->type) { 913 switch (chunk->chunk_hdr->type) {
899 /* 914 /* 6.10 Bundling
900 * 6.10 Bundling
901 * ... 915 * ...
902 * An endpoint MUST NOT bundle INIT, INIT ACK or SHUTDOWN 916 * An endpoint MUST NOT bundle INIT, INIT ACK or SHUTDOWN
903 * COMPLETE with any other chunks. [Send them immediately.] 917 * COMPLETE with any other chunks. [Send them immediately.]
@@ -905,20 +919,19 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
905 case SCTP_CID_INIT: 919 case SCTP_CID_INIT:
906 case SCTP_CID_INIT_ACK: 920 case SCTP_CID_INIT_ACK:
907 case SCTP_CID_SHUTDOWN_COMPLETE: 921 case SCTP_CID_SHUTDOWN_COMPLETE:
908 sctp_packet_init(&singleton, transport, sport, dport); 922 error = sctp_packet_singleton(ctx->transport, chunk,
909 sctp_packet_config(&singleton, vtag, 0); 923 ctx->gfp);
910 sctp_packet_append_chunk(&singleton, chunk);
911 error = sctp_packet_transmit(&singleton, gfp);
912 if (error < 0) { 924 if (error < 0) {
913 asoc->base.sk->sk_err = -error; 925 ctx->asoc->base.sk->sk_err = -error;
914 return; 926 return;
915 } 927 }
916 break; 928 break;
917 929
918 case SCTP_CID_ABORT: 930 case SCTP_CID_ABORT:
919 if (sctp_test_T_bit(chunk)) 931 if (sctp_test_T_bit(chunk))
920 packet->vtag = asoc->c.my_vtag; 932 ctx->packet->vtag = ctx->asoc->c.my_vtag;
921 /* fallthru */ 933 /* fallthru */
934
922 /* The following chunks are "response" chunks, i.e. 935 /* The following chunks are "response" chunks, i.e.
923 * they are generated in response to something we 936 * they are generated in response to something we
924 * received. If we are sending these, then we can 937 * received. If we are sending these, then we can
@@ -942,27 +955,27 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
942 case SCTP_CID_FWD_TSN: 955 case SCTP_CID_FWD_TSN:
943 case SCTP_CID_I_FWD_TSN: 956 case SCTP_CID_I_FWD_TSN:
944 case SCTP_CID_RECONF: 957 case SCTP_CID_RECONF:
945 status = sctp_packet_transmit_chunk(packet, chunk, 958 status = sctp_packet_transmit_chunk(ctx->packet, chunk,
946 one_packet, gfp); 959 one_packet, ctx->gfp);
947 if (status != SCTP_XMIT_OK) { 960 if (status != SCTP_XMIT_OK) {
948 /* put the chunk back */ 961 /* put the chunk back */
949 list_add(&chunk->list, &q->control_chunk_list); 962 list_add(&chunk->list, &ctx->q->control_chunk_list);
950 break; 963 break;
951 } 964 }
952 965
953 asoc->stats.octrlchunks++; 966 ctx->asoc->stats.octrlchunks++;
954 /* PR-SCTP C5) If a FORWARD TSN is sent, the 967 /* PR-SCTP C5) If a FORWARD TSN is sent, the
955 * sender MUST assure that at least one T3-rtx 968 * sender MUST assure that at least one T3-rtx
956 * timer is running. 969 * timer is running.
957 */ 970 */
958 if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN || 971 if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN ||
959 chunk->chunk_hdr->type == SCTP_CID_I_FWD_TSN) { 972 chunk->chunk_hdr->type == SCTP_CID_I_FWD_TSN) {
960 sctp_transport_reset_t3_rtx(transport); 973 sctp_transport_reset_t3_rtx(ctx->transport);
961 transport->last_time_sent = jiffies; 974 ctx->transport->last_time_sent = jiffies;
962 } 975 }
963 976
964 if (chunk == asoc->strreset_chunk) 977 if (chunk == ctx->asoc->strreset_chunk)
965 sctp_transport_reset_reconf_timer(transport); 978 sctp_transport_reset_reconf_timer(ctx->transport);
966 979
967 break; 980 break;
968 981
@@ -971,232 +984,186 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
971 BUG(); 984 BUG();
972 } 985 }
973 } 986 }
987}
974 988
975 if (q->asoc->src_out_of_asoc_ok) 989/* Returns false if new data shouldn't be sent */
976 goto sctp_flush_out; 990static bool sctp_outq_flush_rtx(struct sctp_flush_ctx *ctx,
991 int rtx_timeout)
992{
993 int error, start_timer = 0;
994
995 if (ctx->asoc->peer.retran_path->state == SCTP_UNCONFIRMED)
996 return false;
997
998 if (ctx->transport != ctx->asoc->peer.retran_path) {
999 /* Switch transports & prepare the packet. */
1000 ctx->transport = ctx->asoc->peer.retran_path;
1001 ctx->packet = &ctx->transport->packet;
1002
1003 if (list_empty(&ctx->transport->send_ready))
1004 list_add_tail(&ctx->transport->send_ready,
1005 &ctx->transport_list);
1006
1007 sctp_packet_config(ctx->packet, ctx->asoc->peer.i.init_tag,
1008 ctx->asoc->peer.ecn_capable);
1009 }
1010
1011 error = __sctp_outq_flush_rtx(ctx->q, ctx->packet, rtx_timeout,
1012 &start_timer, ctx->gfp);
1013 if (error < 0)
1014 ctx->asoc->base.sk->sk_err = -error;
1015
1016 if (start_timer) {
1017 sctp_transport_reset_t3_rtx(ctx->transport);
1018 ctx->transport->last_time_sent = jiffies;
1019 }
1020
1021 /* This can happen on COOKIE-ECHO resend. Only
1022 * one chunk can get bundled with a COOKIE-ECHO.
1023 */
1024 if (ctx->packet->has_cookie_echo)
1025 return false;
1026
1027 /* Don't send new data if there is still data
1028 * waiting to retransmit.
1029 */
1030 if (!list_empty(&ctx->q->retransmit))
1031 return false;
1032
1033 return true;
1034}
1035
1036static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx,
1037 int rtx_timeout)
1038{
1039 struct sctp_chunk *chunk;
1040 enum sctp_xmit status;
977 1041
978 /* Is it OK to send data chunks? */ 1042 /* Is it OK to send data chunks? */
979 switch (asoc->state) { 1043 switch (ctx->asoc->state) {
980 case SCTP_STATE_COOKIE_ECHOED: 1044 case SCTP_STATE_COOKIE_ECHOED:
981 /* Only allow bundling when this packet has a COOKIE-ECHO 1045 /* Only allow bundling when this packet has a COOKIE-ECHO
982 * chunk. 1046 * chunk.
983 */ 1047 */
984 if (!packet || !packet->has_cookie_echo) 1048 if (!ctx->packet || !ctx->packet->has_cookie_echo)
985 break; 1049 return;
986 1050
987 /* fallthru */ 1051 /* fallthru */
988 case SCTP_STATE_ESTABLISHED: 1052 case SCTP_STATE_ESTABLISHED:
989 case SCTP_STATE_SHUTDOWN_PENDING: 1053 case SCTP_STATE_SHUTDOWN_PENDING:
990 case SCTP_STATE_SHUTDOWN_RECEIVED: 1054 case SCTP_STATE_SHUTDOWN_RECEIVED:
991 /* 1055 break;
992 * RFC 2960 6.1 Transmission of DATA Chunks
993 *
994 * C) When the time comes for the sender to transmit,
995 * before sending new DATA chunks, the sender MUST
996 * first transmit any outstanding DATA chunks which
997 * are marked for retransmission (limited by the
998 * current cwnd).
999 */
1000 if (!list_empty(&q->retransmit)) {
1001 if (asoc->peer.retran_path->state == SCTP_UNCONFIRMED)
1002 goto sctp_flush_out;
1003 if (transport == asoc->peer.retran_path)
1004 goto retran;
1005
1006 /* Switch transports & prepare the packet. */
1007
1008 transport = asoc->peer.retran_path;
1009 1056
1010 if (list_empty(&transport->send_ready)) { 1057 default:
1011 list_add_tail(&transport->send_ready, 1058 /* Do nothing. */
1012 &transport_list); 1059 return;
1013 } 1060 }
1014 1061
1015 packet = &transport->packet; 1062 /* RFC 2960 6.1 Transmission of DATA Chunks
1016 sctp_packet_config(packet, vtag, 1063 *
1017 asoc->peer.ecn_capable); 1064 * C) When the time comes for the sender to transmit,
1018 retran: 1065 * before sending new DATA chunks, the sender MUST
1019 error = sctp_outq_flush_rtx(q, packet, 1066 * first transmit any outstanding DATA chunks which
1020 rtx_timeout, &start_timer); 1067 * are marked for retransmission (limited by the
1021 if (error < 0) 1068 * current cwnd).
1022 asoc->base.sk->sk_err = -error; 1069 */
1070 if (!list_empty(&ctx->q->retransmit) &&
1071 !sctp_outq_flush_rtx(ctx, rtx_timeout))
1072 return;
1023 1073
1024 if (start_timer) { 1074 /* Apply Max.Burst limitation to the current transport in
1025 sctp_transport_reset_t3_rtx(transport); 1075 * case it will be used for new data. We are going to
1026 transport->last_time_sent = jiffies; 1076 * reset it before we return, but we want to apply the limit
1027 } 1077 * to the currently queued data.
1078 */
1079 if (ctx->transport)
1080 sctp_transport_burst_limited(ctx->transport);
1028 1081
1029 /* This can happen on COOKIE-ECHO resend. Only 1082 /* Finally, transmit new packets. */
1030 * one chunk can get bundled with a COOKIE-ECHO. 1083 while ((chunk = sctp_outq_dequeue_data(ctx->q)) != NULL) {
1031 */ 1084 __u32 sid = ntohs(chunk->subh.data_hdr->stream);
1032 if (packet->has_cookie_echo)
1033 goto sctp_flush_out;
1034 1085
1035 /* Don't send new data if there is still data 1086 /* Has this chunk expired? */
1036 * waiting to retransmit. 1087 if (sctp_chunk_abandoned(chunk)) {
1037 */ 1088 sctp_sched_dequeue_done(ctx->q, chunk);
1038 if (!list_empty(&q->retransmit)) 1089 sctp_chunk_fail(chunk, 0);
1039 goto sctp_flush_out; 1090 sctp_chunk_free(chunk);
1091 continue;
1040 } 1092 }
1041 1093
1042 /* Apply Max.Burst limitation to the current transport in 1094 if (ctx->asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) {
1043 * case it will be used for new data. We are going to 1095 sctp_outq_head_data(ctx->q, chunk);
1044 * reset it before we return, but we want to apply the limit 1096 break;
1045 * to the currently queued data. 1097 }
1046 */
1047 if (transport)
1048 sctp_transport_burst_limited(transport);
1049
1050 /* Finally, transmit new packets. */
1051 while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
1052 __u32 sid = ntohs(chunk->subh.data_hdr->stream);
1053
1054 /* Has this chunk expired? */
1055 if (sctp_chunk_abandoned(chunk)) {
1056 sctp_sched_dequeue_done(q, chunk);
1057 sctp_chunk_fail(chunk, 0);
1058 sctp_chunk_free(chunk);
1059 continue;
1060 }
1061 1098
1062 if (asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) { 1099 sctp_outq_select_transport(ctx, chunk);
1063 sctp_outq_head_data(q, chunk);
1064 goto sctp_flush_out;
1065 }
1066 1100
1067 /* If there is a specified transport, use it. 1101 pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p skb->users:%d\n",
1068 * Otherwise, we want to use the active path. 1102 __func__, ctx->q, chunk, chunk && chunk->chunk_hdr ?
1103 sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
1104 "illegal chunk", ntohl(chunk->subh.data_hdr->tsn),
1105 chunk->skb ? chunk->skb->head : NULL, chunk->skb ?
1106 refcount_read(&chunk->skb->users) : -1);
1107
1108 /* Add the chunk to the packet. */
1109 status = sctp_packet_transmit_chunk(ctx->packet, chunk, 0,
1110 ctx->gfp);
1111 if (status != SCTP_XMIT_OK) {
1112 /* We could not append this chunk, so put
1113 * the chunk back on the output queue.
1069 */ 1114 */
1070 new_transport = chunk->transport; 1115 pr_debug("%s: could not transmit tsn:0x%x, status:%d\n",
1071 if (!new_transport || 1116 __func__, ntohl(chunk->subh.data_hdr->tsn),
1072 ((new_transport->state == SCTP_INACTIVE) || 1117 status);
1073 (new_transport->state == SCTP_UNCONFIRMED) ||
1074 (new_transport->state == SCTP_PF)))
1075 new_transport = asoc->peer.active_path;
1076 if (new_transport->state == SCTP_UNCONFIRMED) {
1077 WARN_ONCE(1, "Attempt to send packet on unconfirmed path.");
1078 sctp_sched_dequeue_done(q, chunk);
1079 sctp_chunk_fail(chunk, 0);
1080 sctp_chunk_free(chunk);
1081 continue;
1082 }
1083
1084 /* Change packets if necessary. */
1085 if (new_transport != transport) {
1086 transport = new_transport;
1087 1118
1088 /* Schedule to have this transport's 1119 sctp_outq_head_data(ctx->q, chunk);
1089 * packet flushed. 1120 break;
1090 */ 1121 }
1091 if (list_empty(&transport->send_ready)) {
1092 list_add_tail(&transport->send_ready,
1093 &transport_list);
1094 }
1095
1096 packet = &transport->packet;
1097 sctp_packet_config(packet, vtag,
1098 asoc->peer.ecn_capable);
1099 /* We've switched transports, so apply the
1100 * Burst limit to the new transport.
1101 */
1102 sctp_transport_burst_limited(transport);
1103 }
1104
1105 pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p "
1106 "skb->users:%d\n",
1107 __func__, q, chunk, chunk && chunk->chunk_hdr ?
1108 sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
1109 "illegal chunk", ntohl(chunk->subh.data_hdr->tsn),
1110 chunk->skb ? chunk->skb->head : NULL, chunk->skb ?
1111 refcount_read(&chunk->skb->users) : -1);
1112
1113 /* Add the chunk to the packet. */
1114 status = sctp_packet_transmit_chunk(packet, chunk, 0, gfp);
1115
1116 switch (status) {
1117 case SCTP_XMIT_PMTU_FULL:
1118 case SCTP_XMIT_RWND_FULL:
1119 case SCTP_XMIT_DELAY:
1120 /* We could not append this chunk, so put
1121 * the chunk back on the output queue.
1122 */
1123 pr_debug("%s: could not transmit tsn:0x%x, status:%d\n",
1124 __func__, ntohl(chunk->subh.data_hdr->tsn),
1125 status);
1126
1127 sctp_outq_head_data(q, chunk);
1128 goto sctp_flush_out;
1129
1130 case SCTP_XMIT_OK:
1131 /* The sender is in the SHUTDOWN-PENDING state,
1132 * The sender MAY set the I-bit in the DATA
1133 * chunk header.
1134 */
1135 if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING)
1136 chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM;
1137 if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
1138 asoc->stats.ouodchunks++;
1139 else
1140 asoc->stats.oodchunks++;
1141
1142 /* Only now it's safe to consider this
1143 * chunk as sent, sched-wise.
1144 */
1145 sctp_sched_dequeue_done(q, chunk);
1146
1147 break;
1148
1149 default:
1150 BUG();
1151 }
1122
1123 /* The sender is in the SHUTDOWN-PENDING state,
1124 * The sender MAY set the I-bit in the DATA
1125 * chunk header.
1126 */
1127 if (ctx->asoc->state == SCTP_STATE_SHUTDOWN_PENDING)
1128 chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM;
1129 if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
1130 ctx->asoc->stats.ouodchunks++;
1131 else
1132 ctx->asoc->stats.oodchunks++;
1133
1134 /* Only now it's safe to consider this
1135 * chunk as sent, sched-wise.
1136 */
1137 sctp_sched_dequeue_done(ctx->q, chunk);
1152
1153 /* BUG: We assume that the sctp_packet_transmit()
1154 * call below will succeed all the time and add the
1155 * chunk to the transmitted list and restart the
1156 * timers.
1157 * It is possible that the call can fail under OOM
1158 * conditions.
1159 *
1160 * Is this really a problem? Won't this behave
1161 * like a lost TSN?
1162 */
1163 list_add_tail(&chunk->transmitted_list,
1164 &transport->transmitted);
1138
1139 list_add_tail(&chunk->transmitted_list,
1140 &ctx->transport->transmitted);
1141
1142 sctp_transport_reset_t3_rtx(ctx->transport);
1143 ctx->transport->last_time_sent = jiffies;
1165
1166 sctp_transport_reset_t3_rtx(transport);
1167 transport->last_time_sent = jiffies;
1168
1169 /* Only let one DATA chunk get bundled with a
1170 * COOKIE-ECHO chunk.
1171 */
1172 if (packet->has_cookie_echo)
1173 goto sctp_flush_out;
1174 }
1175 break;
1176
1177 default:
1178 /* Do nothing. */
1179 break;
1144
1145 /* Only let one DATA chunk get bundled with a
1146 * COOKIE-ECHO chunk.
1147 */
1148 if (ctx->packet->has_cookie_echo)
1149 break;
1180 }
1181
1182 sctp_flush_out:
1150 }
1151 }
1152
1153 static void sctp_outq_flush_transports(struct sctp_flush_ctx *ctx)
1154{
1155 struct list_head *ltransport;
1156 struct sctp_packet *packet;
1157 struct sctp_transport *t;
1158 int error = 0;
1159
1160 while ((ltransport = sctp_list_dequeue(&ctx->transport_list)) != NULL) {
1161 t = list_entry(ltransport, struct sctp_transport, send_ready);
1183
1184 /* Before returning, examine all the transports touched in
1185 * this call. Right now, we bluntly force clear all the
1186 * transports. Things might change after we implement Nagle.
1187 * But such an examination is still required.
1188 *
1189 * --xguo
1190 */
1191 while ((ltransport = sctp_list_dequeue(&transport_list)) != NULL) {
1192 struct sctp_transport *t = list_entry(ltransport,
1193 struct sctp_transport,
1194 send_ready);
1195 packet = &t->packet;
1196 if (!sctp_packet_empty(packet)) {
1197 error = sctp_packet_transmit(packet, gfp);
1198 if (error < 0)
1199 asoc->base.sk->sk_err = -error;
1200 }
1201
1202 /* Clear the burst limited state, if any */
1162 packet = &t->packet;
1163 if (!sctp_packet_empty(packet)) {
1164 error = sctp_packet_transmit(packet, ctx->gfp);
1165 if (error < 0)
1166 ctx->q->asoc->base.sk->sk_err = -error;
1167 }
1168
1169 /* Clear the burst limited state, if any */
@@ -1204,6 +1171,47 @@ sctp_flush_out:
1204 } 1171 }
1205} 1172}
1206 1173
1174/* Try to flush an outqueue.
1175 *
1176 * Description: Send everything in q which we legally can, subject to
1177 * congestion limitations.
1178 * * Note: This function can be called from multiple contexts so appropriate
1179 * locking concerns must be made. Today we use the sock lock to protect
1180 * this function.
1181 */
1182
1183static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
1184{
1185 struct sctp_flush_ctx ctx = {
1186 .q = q,
1187 .transport = NULL,
1188 .transport_list = LIST_HEAD_INIT(ctx.transport_list),
1189 .asoc = q->asoc,
1190 .packet = NULL,
1191 .gfp = gfp,
1192 };
1193
1194 /* 6.10 Bundling
1195 * ...
1196 * When bundling control chunks with DATA chunks, an
1197 * endpoint MUST place control chunks first in the outbound
1198 * SCTP packet. The transmitter MUST transmit DATA chunks
1199 * within a SCTP packet in increasing order of TSN.
1200 * ...
1201 */
1202
1203 sctp_outq_flush_ctrl(&ctx);
1204
1205 if (q->asoc->src_out_of_asoc_ok)
1206 goto sctp_flush_out;
1207
1208 sctp_outq_flush_data(&ctx, rtx_timeout);
1209
1210sctp_flush_out:
1211
1212 sctp_outq_flush_transports(&ctx);
1213}
1214
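The refactor above splits the old monolithic sctp_outq_flush() into helpers that share their state through a single struct sctp_flush_ctx instead of a long argument list. As a rough, stand-alone illustration of that pattern (plain userspace C; the queue, field names and budget here are invented for the sketch, not the kernel API):

#include <stdio.h>

/* Hypothetical per-flush context: every helper takes one pointer,
 * mirroring the role of struct sctp_flush_ctx above. */
struct flush_ctx {
	const int *queue;	/* items still to send */
	int count;		/* how many remain */
	int sent;		/* accumulated result */
	int budget;		/* per-flush limit, e.g. a burst cap */
};

static void flush_data(struct flush_ctx *ctx)
{
	while (ctx->count > 0 && ctx->budget > 0) {
		ctx->sent += *ctx->queue++;
		ctx->count--;
		ctx->budget--;
	}
}

static void flush_report(struct flush_ctx *ctx)
{
	printf("sent=%d, left=%d\n", ctx->sent, ctx->count);
}

int main(void)
{
	int q[] = { 3, 1, 4, 1, 5 };
	struct flush_ctx ctx = {
		.queue = q,
		.count = 5,
		.budget = 3,	/* only three items per flush */
	};

	flush_data(&ctx);	/* analogous to sctp_outq_flush_data() */
	flush_report(&ctx);	/* analogous to sctp_outq_flush_transports() */
	return 0;
}

The payoff, as in the patch, is that each stage can be read and changed on its own while the shared bookkeeping stays in one place.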
1207/* Update unack_data based on the incoming SACK chunk */ 1215/* Update unack_data based on the incoming SACK chunk */
1208static void sctp_sack_update_unack_data(struct sctp_association *assoc, 1216static void sctp_sack_update_unack_data(struct sctp_association *assoc,
1209 struct sctp_sackhdr *sack) 1217 struct sctp_sackhdr *sack)
@@ -1457,7 +1465,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
1457 * the outstanding bytes for this chunk, so only 1465 * the outstanding bytes for this chunk, so only
1458 * count bytes associated with a transport. 1466 * count bytes associated with a transport.
1459 */ 1467 */
1460 if (transport) { 1468 if (transport && !tchunk->tsn_gap_acked) {
1461 /* If this chunk is being used for RTT 1469 /* If this chunk is being used for RTT
1462 * measurement, calculate the RTT and update 1470 * measurement, calculate the RTT and update
1463 * the RTO using this value. 1471 * the RTO using this value.
@@ -1469,14 +1477,34 @@ static void sctp_check_transmitted(struct sctp_outq *q,
1469 * first instance of the packet or a later 1477 * first instance of the packet or a later
1470 * instance). 1478 * instance).
1471 */ 1479 */
1472 if (!tchunk->tsn_gap_acked && 1480 if (!sctp_chunk_retransmitted(tchunk) &&
1473 !sctp_chunk_retransmitted(tchunk) &&
1474 tchunk->rtt_in_progress) { 1481 tchunk->rtt_in_progress) {
1475 tchunk->rtt_in_progress = 0; 1482 tchunk->rtt_in_progress = 0;
1476 rtt = jiffies - tchunk->sent_at; 1483 rtt = jiffies - tchunk->sent_at;
1477 sctp_transport_update_rto(transport, 1484 sctp_transport_update_rto(transport,
1478 rtt); 1485 rtt);
1479 } 1486 }
1487
1488 if (TSN_lte(tsn, sack_ctsn)) {
1489 /*
1490 * SFR-CACC algorithm:
1491 * 2) If the SACK contains gap acks
1492 * and the flag CHANGEOVER_ACTIVE is
1493 * set the receiver of the SACK MUST
1494 * take the following action:
1495 *
1496 * B) For each TSN t being acked that
1497 * has not been acked in any SACK so
1498 * far, set cacc_saw_newack to 1 for
1499 * the destination that the TSN was
1500 * sent to.
1501 */
1502 if (sack->num_gap_ack_blocks &&
1503 q->asoc->peer.primary_path->cacc.
1504 changeover_active)
1505 transport->cacc.cacc_saw_newack
1506 = 1;
1507 }
1480 } 1508 }
1481 1509
1482 /* If the chunk hasn't been marked as ACKED, 1510 /* If the chunk hasn't been marked as ACKED,
@@ -1508,28 +1536,6 @@ static void sctp_check_transmitted(struct sctp_outq *q,
1508 restart_timer = 1; 1536 restart_timer = 1;
1509 forward_progress = true; 1537 forward_progress = true;
1510 1538
1511 if (!tchunk->tsn_gap_acked) {
1512 /*
1513 * SFR-CACC algorithm:
1514 * 2) If the SACK contains gap acks
1515 * and the flag CHANGEOVER_ACTIVE is
1516 * set the receiver of the SACK MUST
1517 * take the following action:
1518 *
1519 * B) For each TSN t being acked that
1520 * has not been acked in any SACK so
1521 * far, set cacc_saw_newack to 1 for
1522 * the destination that the TSN was
1523 * sent to.
1524 */
1525 if (transport &&
1526 sack->num_gap_ack_blocks &&
1527 q->asoc->peer.primary_path->cacc.
1528 changeover_active)
1529 transport->cacc.cacc_saw_newack
1530 = 1;
1531 }
1532
1533 list_add_tail(&tchunk->transmitted_list, 1539 list_add_tail(&tchunk->transmitted_list,
1534 &q->sacked); 1540 &q->sacked);
1535 } else { 1541 } else {
@@ -1756,7 +1762,7 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn)
1756 if (TSN_lte(tsn, ctsn)) 1762 if (TSN_lte(tsn, ctsn))
1757 goto pass; 1763 goto pass;
1758 1764
1759 /* 3.3.4 Selective Acknowledgement (SACK) (3): 1765 /* 3.3.4 Selective Acknowledgment (SACK) (3):
1760 * 1766 *
1761 * Gap Ack Blocks: 1767 * Gap Ack Blocks:
1762 * These fields contain the Gap Ack Blocks. They are repeated 1768 * These fields contain the Gap Ack Blocks. They are repeated
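The SACK-processing hunks above hinge on two things: TSN_lte(), a serial-number comparison that tolerates 32-bit wraparound, and Gap Ack Blocks, which are ranges expressed as offsets from the cumulative TSN ack. A minimal userspace sketch of that acceptance test (the compare and the block layout are written out by hand here as assumptions about the on-wire format, not the kernel helpers):

#include <stdint.h>
#include <stdio.h>

/* Serial-number "less than or equal" over a wrapping 32-bit space,
 * in the spirit of TSN_lte(). */
static int tsn_lte(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) <= 0;
}

struct gap_block {		/* offsets relative to the cumulative TSN ack */
	uint16_t start;
	uint16_t end;
};

static int tsn_acked(uint32_t tsn, uint32_t ctsn,
		     const struct gap_block *gabs, int ngabs)
{
	int i;

	if (tsn_lte(tsn, ctsn))		/* covered by the cumulative ack */
		return 1;

	for (i = 0; i < ngabs; i++)	/* otherwise look for a covering block */
		if (tsn_lte(ctsn + gabs[i].start, tsn) &&
		    tsn_lte(tsn, ctsn + gabs[i].end))
			return 1;

	return 0;
}

int main(void)
{
	struct gap_block gabs[] = { { 2, 4 } };	/* acks ctsn+2 .. ctsn+4 */
	uint32_t ctsn = 0xfffffffe;		/* near the wrap point on purpose */

	printf("%d %d %d\n",
	       tsn_acked(0xfffffffd, ctsn, gabs, 1),	/* 1: below ctsn */
	       tsn_acked(ctsn + 1, ctsn, gabs, 1),	/* 0: in the gap */
	       tsn_acked(ctsn + 3, ctsn, gabs, 1));	/* 1: inside the block */
	return 0;
}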
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index e62addb60434..4a4fd1971255 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -81,8 +81,6 @@ static int sctp_process_param(struct sctp_association *asoc,
81 gfp_t gfp); 81 gfp_t gfp);
82static void *sctp_addto_param(struct sctp_chunk *chunk, int len, 82static void *sctp_addto_param(struct sctp_chunk *chunk, int len,
83 const void *data); 83 const void *data);
84static void *sctp_addto_chunk_fixed(struct sctp_chunk *, int len,
85 const void *data);
86 84
87/* Control chunk destructor */ 85/* Control chunk destructor */
88static void sctp_control_release_owner(struct sk_buff *skb) 86static void sctp_control_release_owner(struct sk_buff *skb)
@@ -154,12 +152,11 @@ static const struct sctp_paramhdr prsctp_param = {
154 cpu_to_be16(sizeof(struct sctp_paramhdr)), 152 cpu_to_be16(sizeof(struct sctp_paramhdr)),
155}; 153};
156 154
157/* A helper to initialize an op error inside a 155/* A helper to initialize an op error inside a provided chunk, as most
158 * provided chunk, as most cause codes will be embedded inside an 156 * cause codes will be embedded inside an abort chunk.
159 * abort chunk.
160 */ 157 */
161void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code, 158int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
162 size_t paylen) 159 size_t paylen)
163{ 160{
164 struct sctp_errhdr err; 161 struct sctp_errhdr err;
165 __u16 len; 162 __u16 len;
@@ -167,33 +164,16 @@ void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
167 /* Cause code constants are now defined in network order. */ 164 /* Cause code constants are now defined in network order. */
168 err.cause = cause_code; 165 err.cause = cause_code;
169 len = sizeof(err) + paylen; 166 len = sizeof(err) + paylen;
170 err.length = htons(len); 167 err.length = htons(len);
171 chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(err), &err);
172}
173
174/* A helper to initialize an op error inside a
175 * provided chunk, as most cause codes will be embedded inside an
176 * abort chunk. Differs from sctp_init_cause in that it won't oops
177 * if there isn't enough space in the op error chunk
178 */
179static int sctp_init_cause_fixed(struct sctp_chunk *chunk, __be16 cause_code,
180 size_t paylen)
181{
182 struct sctp_errhdr err;
183 __u16 len;
184
185 /* Cause code constants are now defined in network order. */
186 err.cause = cause_code;
187 len = sizeof(err) + paylen;
188 err.length = htons(len);
189 168
190 if (skb_tailroom(chunk->skb) < len) 169 if (skb_tailroom(chunk->skb) < len)
191 return -ENOSPC; 170 return -ENOSPC;
192 171
193 chunk->subh.err_hdr = sctp_addto_chunk_fixed(chunk, sizeof(err), &err); 172 chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(err), &err);
194 173
195 return 0; 174 return 0;
196} 175}
176
197/* 3.3.2 Initiation (INIT) (1) 177/* 3.3.2 Initiation (INIT) (1)
198 * 178 *
199 * This chunk is used to initiate a SCTP association between two 179 * This chunk is used to initiate a SCTP association between two
@@ -779,10 +759,9 @@ struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc,
779 * association. This reports on which TSN's we've seen to date, 759 * association. This reports on which TSN's we've seen to date,
780 * including duplicates and gaps. 760 * including duplicates and gaps.
781 */ 761 */
782struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc) 762struct sctp_chunk *sctp_make_sack(struct sctp_association *asoc)
783{ 763{
784 struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map; 764 struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map;
785 struct sctp_association *aptr = (struct sctp_association *)asoc;
786 struct sctp_gap_ack_block gabs[SCTP_MAX_GABS]; 765 struct sctp_gap_ack_block gabs[SCTP_MAX_GABS];
787 __u16 num_gabs, num_dup_tsns; 766 __u16 num_gabs, num_dup_tsns;
788 struct sctp_transport *trans; 767 struct sctp_transport *trans;
@@ -857,7 +836,7 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
857 836
858 /* Add the duplicate TSN information. */ 837 /* Add the duplicate TSN information. */
859 if (num_dup_tsns) { 838 if (num_dup_tsns) {
860 aptr->stats.idupchunks += num_dup_tsns; 839 asoc->stats.idupchunks += num_dup_tsns;
861 sctp_addto_chunk(retval, sizeof(__u32) * num_dup_tsns, 840 sctp_addto_chunk(retval, sizeof(__u32) * num_dup_tsns,
862 sctp_tsnmap_get_dups(map)); 841 sctp_tsnmap_get_dups(map));
863 } 842 }
@@ -869,11 +848,11 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
869 * association so no transport will match after a wrap event like this, 848 * association so no transport will match after a wrap event like this,
870 * Until the next sack 849 * Until the next sack
871 */ 850 */
872 if (++aptr->peer.sack_generation == 0) { 851 if (++asoc->peer.sack_generation == 0) {
873 list_for_each_entry(trans, &asoc->peer.transport_addr_list, 852 list_for_each_entry(trans, &asoc->peer.transport_addr_list,
874 transports) 853 transports)
875 trans->sack_generation = 0; 854 trans->sack_generation = 0;
876 aptr->peer.sack_generation = 1; 855 asoc->peer.sack_generation = 1;
877 } 856 }
878nodata: 857nodata:
879 return retval; 858 return retval;
@@ -1258,20 +1237,26 @@ nodata:
1258 return retval; 1237 return retval;
1259} 1238}
1260 1239
1261 /* Create an Operation Error chunk of a fixed size,
1262 * specifically, max(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT)
1263 * This is a helper function to allocate an error chunk for
1264 * for those invalid parameter codes in which we may not want
1265 * to report all the errors, if the incoming chunk is large
1266 */
1267 static inline struct sctp_chunk *sctp_make_op_error_fixed(
1268 const struct sctp_association *asoc,
1269 const struct sctp_chunk *chunk)
1270 {
1271 size_t size = asoc ? asoc->pathmtu : 0;
1272
1273 if (!size)
1274 size = SCTP_DEFAULT_MAXSEGMENT;
1275
1276 return sctp_make_op_error_space(asoc, chunk, size);
1277 }
1240 /* Create an Operation Error chunk of a fixed size, specifically,
1241 * min(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) - overheads.
1242 * This is a helper function to allocate an error chunk for for those
1243 * invalid parameter codes in which we may not want to report all the
1244 * errors, if the incoming chunk is large. If it can't fit in a single
1245 * packet, we ignore it.
1246 */
1247 static inline struct sctp_chunk *sctp_make_op_error_limited(
1248 const struct sctp_association *asoc,
1249 const struct sctp_chunk *chunk)
1250 {
1251 size_t size = SCTP_DEFAULT_MAXSEGMENT;
1252 struct sctp_sock *sp = NULL;
1253
1254 if (asoc) {
1255 size = min_t(size_t, size, asoc->pathmtu);
1256 sp = sctp_sk(asoc->base.sk);
1257 }
1258
1259 size = sctp_mtu_payload(sp, size, sizeof(struct sctp_errhdr));
1260
1261 return sctp_make_op_error_space(asoc, chunk, size);
1262 }
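The renamed sctp_make_op_error_limited() above caps the error chunk at min(pathmtu, SCTP_DEFAULT_MAXSEGMENT) and then lets sctp_mtu_payload() strip the transport overheads. A back-of-the-envelope userspace version of that size calculation (the header sizes below are illustrative constants, not the values the kernel uses for every address family):

#include <stddef.h>
#include <stdio.h>

#define DEFAULT_MAXSEGMENT	1500	/* assumed analog of SCTP_DEFAULT_MAXSEGMENT */
#define IP_HDR_LEN		20	/* IPv4, no options, for the sake of the sketch */
#define SCTP_HDR_LEN		12
#define CHUNK_HDR_LEN		4	/* generic chunk header */
#define ERR_HDR_LEN		4	/* error cause header */

static size_t min_size(size_t a, size_t b)
{
	return a < b ? a : b;
}

/* Roughly what "min(pathmtu, SCTP_DEFAULT_MAXSEGMENT) - overheads" means:
 * how much error-cause payload fits into one packet on this path. */
static size_t op_error_limit(size_t pathmtu)
{
	size_t size = DEFAULT_MAXSEGMENT;

	if (pathmtu)
		size = min_size(size, pathmtu);

	return size - IP_HDR_LEN - SCTP_HDR_LEN - CHUNK_HDR_LEN - ERR_HDR_LEN;
}

int main(void)
{
	printf("limit at mtu 1500: %zu\n", op_error_limit(1500));
	printf("limit at mtu 9000: %zu\n", op_error_limit(9000));	/* still capped at 1500 */
	return 0;
}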
@@ -1523,18 +1508,6 @@ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data)
1523 return target; 1508 return target;
1524} 1509}
1525 1510
1526/* Append bytes to the end of a chunk. Returns NULL if there isn't sufficient
1527 * space in the chunk
1528 */
1529static void *sctp_addto_chunk_fixed(struct sctp_chunk *chunk,
1530 int len, const void *data)
1531{
1532 if (skb_tailroom(chunk->skb) >= len)
1533 return sctp_addto_chunk(chunk, len, data);
1534 else
1535 return NULL;
1536}
1537
1538/* Append bytes from user space to the end of a chunk. Will panic if 1511/* Append bytes from user space to the end of a chunk. Will panic if
1539 * chunk is not big enough. 1512 * chunk is not big enough.
1540 * Returns a kernel err value. 1513 * Returns a kernel err value.
@@ -1829,6 +1802,9 @@ no_hmac:
1829 kt = ktime_get_real(); 1802 kt = ktime_get_real();
1830 1803
1831 if (!asoc && ktime_before(bear_cookie->expiration, kt)) { 1804 if (!asoc && ktime_before(bear_cookie->expiration, kt)) {
1805 suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration));
1806 __be32 n = htonl(usecs);
1807
1832 /* 1808 /*
1833 * Section 3.3.10.3 Stale Cookie Error (3) 1809 * Section 3.3.10.3 Stale Cookie Error (3)
1834 * 1810 *
@@ -1837,17 +1813,12 @@ no_hmac:
1837 * Stale Cookie Error: Indicates the receipt of a valid State 1813 * Stale Cookie Error: Indicates the receipt of a valid State
1838 * Cookie that has expired. 1814 * Cookie that has expired.
1839 */ 1815 */
1840 len = ntohs(chunk->chunk_hdr->length); 1816 *errp = sctp_make_op_error(asoc, chunk,
1841 *errp = sctp_make_op_error_space(asoc, chunk, len); 1817 SCTP_ERROR_STALE_COOKIE, &n,
1842 if (*errp) { 1818 sizeof(n), 0);
1843 suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration)); 1819 if (*errp)
1844 __be32 n = htonl(usecs);
1845
1846 sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE,
1847 sizeof(n));
1848 sctp_addto_chunk(*errp, sizeof(n), &n);
1849 *error = -SCTP_IERROR_STALE_COOKIE; 1820 *error = -SCTP_IERROR_STALE_COOKIE;
1850 } else 1821 else
1851 *error = -SCTP_IERROR_NOMEM; 1822 *error = -SCTP_IERROR_NOMEM;
1852 1823
1853 goto fail; 1824 goto fail;
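The reworked stale-cookie path above computes the staleness (how many microseconds past its expiration the cookie arrived), stores it in network byte order and hands it directly to sctp_make_op_error(). A small sketch of the measure itself (plain userspace C using htonl(); the timestamps are made up):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* Staleness as carried in the Stale Cookie Error cause: the number of
 * microseconds by which the cookie missed its expiration, big-endian. */
static uint32_t cookie_staleness_be(int64_t now_us, int64_t expires_us)
{
	int64_t stale = now_us - expires_us;

	if (stale < 0)
		stale = 0;		/* not stale at all */

	return htonl((uint32_t)stale);
}

int main(void)
{
	uint32_t n = cookie_staleness_be(5000000, 4250000);	/* 750 ms late */

	printf("wire value: 0x%08x (host %u us)\n",
	       (unsigned)n, (unsigned)ntohl(n));
	return 0;
}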
@@ -1998,12 +1969,8 @@ static int sctp_process_hn_param(const struct sctp_association *asoc,
1998 if (*errp) 1969 if (*errp)
1999 sctp_chunk_free(*errp); 1970 sctp_chunk_free(*errp);
2000 1971
2001 *errp = sctp_make_op_error_space(asoc, chunk, len); 1972 *errp = sctp_make_op_error(asoc, chunk, SCTP_ERROR_DNS_FAILED,
2002 1973 param.v, len, 0);
2003 if (*errp) {
2004 sctp_init_cause(*errp, SCTP_ERROR_DNS_FAILED, len);
2005 sctp_addto_chunk(*errp, len, param.v);
2006 }
2007 1974
2008 /* Stop processing this chunk. */ 1975 /* Stop processing this chunk. */
2009 return 0; 1976 return 0;
@@ -2128,23 +2095,23 @@ static enum sctp_ierror sctp_process_unk_param(
2128 /* Make an ERROR chunk, preparing enough room for 2095 /* Make an ERROR chunk, preparing enough room for
2129 * returning multiple unknown parameters. 2096 * returning multiple unknown parameters.
2130 */ 2097 */
2131 if (NULL == *errp) 2098 if (!*errp) {
2132 *errp = sctp_make_op_error_fixed(asoc, chunk); 2099 *errp = sctp_make_op_error_limited(asoc, chunk);
2133 2100 if (!*errp) {
2134 if (*errp) { 2101 /* If there is no memory for generating the
2135 if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM, 2102 * ERROR report as specified, an ABORT will be
2136 SCTP_PAD4(ntohs(param.p->length)))) 2103 * triggered to the peer and the association
2137 sctp_addto_chunk_fixed(*errp, 2104 * won't be established.
2138 SCTP_PAD4(ntohs(param.p->length)), 2105 */
2139 param.v); 2106 retval = SCTP_IERROR_NOMEM;
2140 } else { 2107 break;
2141 /* If there is no memory for generating the ERROR 2108 }
2142 * report as specified, an ABORT will be triggered
2143 * to the peer and the association won't be
2144 * established.
2145 */
2146 retval = SCTP_IERROR_NOMEM;
2147 } 2109 }
2110
2111 if (!sctp_init_cause(*errp, SCTP_ERROR_UNKNOWN_PARAM,
2112 ntohs(param.p->length)))
2113 sctp_addto_chunk(*errp, ntohs(param.p->length),
2114 param.v);
2148 break; 2115 break;
2149 default: 2116 default:
2150 break; 2117 break;
@@ -2220,10 +2187,10 @@ static enum sctp_ierror sctp_verify_param(struct net *net,
2220 * MUST be aborted. The ABORT chunk SHOULD contain the error 2187 * MUST be aborted. The ABORT chunk SHOULD contain the error
2221 * cause 'Protocol Violation'. 2188 * cause 'Protocol Violation'.
2222 */ 2189 */
2223 if (SCTP_AUTH_RANDOM_LENGTH != 2190 if (SCTP_AUTH_RANDOM_LENGTH != ntohs(param.p->length) -
2224 ntohs(param.p->length) - sizeof(struct sctp_paramhdr)) { 2191 sizeof(struct sctp_paramhdr)) {
2225 sctp_process_inv_paramlength(asoc, param.p, 2192 sctp_process_inv_paramlength(asoc, param.p,
2226 chunk, err_chunk); 2193 chunk, err_chunk);
2227 retval = SCTP_IERROR_ABORT; 2194 retval = SCTP_IERROR_ABORT;
2228 } 2195 }
2229 break; 2196 break;
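With the change above, a single sctp_init_cause() both checks the available room and appends the cause (returning -ENOSPC when the chunk is full), so the unknown-parameter loop can keep adding causes until the size-limited ERROR chunk runs out of space. A stand-alone sketch of that "append TLVs until -ENOSPC" pattern (hypothetical flat buffer, not the skb code):

#include <arpa/inet.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct err_buf {
	uint8_t data[32];	/* deliberately tiny so the sketch overflows */
	size_t used;
};

/* Append one error cause: 16-bit code, 16-bit length, then the value. */
static int add_cause(struct err_buf *b, uint16_t code,
		     const void *val, uint16_t vlen)
{
	uint16_t len = 4 + vlen;

	if (sizeof(b->data) - b->used < len)
		return -ENOSPC;		/* mirrors the new tailroom check */

	code = htons(code);
	memcpy(b->data + b->used, &code, 2);
	len = htons(len);
	memcpy(b->data + b->used + 2, &len, 2);
	memcpy(b->data + b->used + 4, val, vlen);
	b->used += 4 + vlen;
	return 0;
}

int main(void)
{
	struct err_buf b = { .used = 0 };
	uint8_t param[12] = { 0 };
	int i;

	for (i = 0; i < 4; i++)
		printf("cause %d -> %d\n", i, add_cause(&b, 8, param, sizeof(param)));
	return 0;
}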
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index bf747094d26b..d20f7addee19 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -644,16 +644,15 @@ static int sctp_send_asconf_add_ip(struct sock *sk,
644 644
645 list_for_each_entry(trans, 645 list_for_each_entry(trans,
646 &asoc->peer.transport_addr_list, transports) { 646 &asoc->peer.transport_addr_list, transports) {
647 /* Clear the source and route cache */
648 sctp_transport_dst_release(trans);
649 trans->cwnd = min(4*asoc->pathmtu, max_t(__u32, 647 trans->cwnd = min(4*asoc->pathmtu, max_t(__u32,
650 2*asoc->pathmtu, 4380)); 648 2*asoc->pathmtu, 4380));
651 trans->ssthresh = asoc->peer.i.a_rwnd; 649 trans->ssthresh = asoc->peer.i.a_rwnd;
652 trans->rto = asoc->rto_initial; 650 trans->rto = asoc->rto_initial;
653 sctp_max_rto(asoc, trans); 651 sctp_max_rto(asoc, trans);
654 trans->rtt = trans->srtt = trans->rttvar = 0; 652 trans->rtt = trans->srtt = trans->rttvar = 0;
653 /* Clear the source and route cache */
655 sctp_transport_route(trans, NULL, 654 sctp_transport_route(trans, NULL,
656 sctp_sk(asoc->base.sk)); 655 sctp_sk(asoc->base.sk));
657 } 656 }
658 } 657 }
659 retval = sctp_send_asconf(asoc, chunk); 658 retval = sctp_send_asconf(asoc, chunk);
@@ -896,7 +895,6 @@ skip_mkasconf:
896 */ 895 */
897 list_for_each_entry(transport, &asoc->peer.transport_addr_list, 896 list_for_each_entry(transport, &asoc->peer.transport_addr_list,
898 transports) { 897 transports) {
899 sctp_transport_dst_release(transport);
900 sctp_transport_route(transport, NULL, 898 sctp_transport_route(transport, NULL,
901 sctp_sk(asoc->base.sk)); 899 sctp_sk(asoc->base.sk));
902 } 900 }
@@ -1894,6 +1892,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
1894 struct sctp_sndrcvinfo *sinfo) 1892 struct sctp_sndrcvinfo *sinfo)
1895{ 1893{
1896 struct sock *sk = asoc->base.sk; 1894 struct sock *sk = asoc->base.sk;
1895 struct sctp_sock *sp = sctp_sk(sk);
1897 struct net *net = sock_net(sk); 1896 struct net *net = sock_net(sk);
1898 struct sctp_datamsg *datamsg; 1897 struct sctp_datamsg *datamsg;
1899 bool wait_connect = false; 1898 bool wait_connect = false;
@@ -1912,13 +1911,16 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
1912 goto err; 1911 goto err;
1913 } 1912 }
1914 1913
1915 if (sctp_sk(sk)->disable_fragments && msg_len > asoc->frag_point) { 1914 if (sp->disable_fragments && msg_len > asoc->frag_point) {
1916 err = -EMSGSIZE; 1915 err = -EMSGSIZE;
1917 goto err; 1916 goto err;
1918 } 1917 }
1919 1918
1920 if (asoc->pmtu_pending) 1919 if (asoc->pmtu_pending) {
1921 sctp_assoc_pending_pmtu(asoc); 1920 if (sp->param_flags & SPP_PMTUD_ENABLE)
1921 sctp_assoc_sync_pmtu(asoc);
1922 asoc->pmtu_pending = 0;
1923 }
1922 1924
1923 if (sctp_wspace(asoc) < msg_len) 1925 if (sctp_wspace(asoc) < msg_len)
1924 sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); 1926 sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));
@@ -1935,7 +1937,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
1935 if (err) 1937 if (err)
1936 goto err; 1938 goto err;
1937 1939
1938 if (sctp_sk(sk)->strm_interleave) { 1940 if (sp->strm_interleave) {
1939 timeo = sock_sndtimeo(sk, 0); 1941 timeo = sock_sndtimeo(sk, 0);
1940 err = sctp_wait_for_connect(asoc, &timeo); 1942 err = sctp_wait_for_connect(asoc, &timeo);
1941 if (err) 1943 if (err)
@@ -2538,7 +2540,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
2538 trans->pathmtu = params->spp_pathmtu; 2540 trans->pathmtu = params->spp_pathmtu;
2539 sctp_assoc_sync_pmtu(asoc); 2541 sctp_assoc_sync_pmtu(asoc);
2540 } else if (asoc) { 2542 } else if (asoc) {
2541 asoc->pathmtu = params->spp_pathmtu; 2543 sctp_assoc_set_pmtu(asoc, params->spp_pathmtu);
2542 } else { 2544 } else {
2543 sp->pathmtu = params->spp_pathmtu; 2545 sp->pathmtu = params->spp_pathmtu;
2544 } 2546 }
@@ -3208,7 +3210,6 @@ static int sctp_setsockopt_mappedv4(struct sock *sk, char __user *optval, unsign
3208static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned int optlen) 3210static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned int optlen)
3209{ 3211{
3210 struct sctp_sock *sp = sctp_sk(sk); 3212 struct sctp_sock *sp = sctp_sk(sk);
3211 struct sctp_af *af = sp->pf->af;
3212 struct sctp_assoc_value params; 3213 struct sctp_assoc_value params;
3213 struct sctp_association *asoc; 3214 struct sctp_association *asoc;
3214 int val; 3215 int val;
@@ -3230,30 +3231,24 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
3230 return -EINVAL; 3231 return -EINVAL;
3231 } 3232 }
3232 3233
3234 asoc = sctp_id2assoc(sk, params.assoc_id);
3235
3233 if (val) { 3236 if (val) {
3234 int min_len, max_len; 3237 int min_len, max_len;
3238 __u16 datasize = asoc ? sctp_datachk_len(&asoc->stream) :
3239 sizeof(struct sctp_data_chunk);
3235 3240
3236 min_len = SCTP_DEFAULT_MINSEGMENT - af->net_header_len; 3241 min_len = sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT,
3237 min_len -= af->ip_options_len(sk); 3242 datasize);
3238 min_len -= sizeof(struct sctphdr) + 3243 max_len = SCTP_MAX_CHUNK_LEN - datasize;
3239 sizeof(struct sctp_data_chunk);
3240
3241 max_len = SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_data_chunk);
3242 3244
3243 if (val < min_len || val > max_len) 3245 if (val < min_len || val > max_len)
3244 return -EINVAL; 3246 return -EINVAL;
3245 } 3247 }
3246 3248
3247 asoc = sctp_id2assoc(sk, params.assoc_id);
3248 if (asoc) { 3249 if (asoc) {
3249 if (val == 0) {
3250 val = asoc->pathmtu - af->net_header_len;
3251 val -= af->ip_options_len(sk);
3252 val -= sizeof(struct sctphdr) +
3253 sctp_datachk_len(&asoc->stream);
3254 }
3255 asoc->user_frag = val; 3250 asoc->user_frag = val;
3256 asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu); 3251 sctp_assoc_update_frag_point(asoc);
3257 } else { 3252 } else {
3258 if (params.assoc_id && sctp_style(sk, UDP)) 3253 if (params.assoc_id && sctp_style(sk, UDP))
3259 return -EINVAL; 3254 return -EINVAL;
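The SCTP_MAXSEG hunk above now derives the lower bound from sctp_mtu_payload() applied to SCTP_DEFAULT_MINSEGMENT (minus the DATA-chunk size) and the upper bound from SCTP_MAX_CHUNK_LEN. A simplified userspace version of that range check (the constants and overhead figures are placeholders for the sketch, not the kernel's values):

#include <stdio.h>

#define DEFAULT_MINSEGMENT	512	/* assumed analog of SCTP_DEFAULT_MINSEGMENT */
#define MAX_CHUNK_LEN		65535	/* stand-in for SCTP_MAX_CHUNK_LEN */
#define TRANSPORT_OVERHEAD	32	/* IP + SCTP common header, for the sketch */
#define DATA_CHUNK_HDR		16

/* Accept a user-requested maximum segment only if a DATA chunk of that
 * size still fits above the minimum segment and below the chunk limit. */
static int validate_maxseg(int val)
{
	int min_len = DEFAULT_MINSEGMENT - TRANSPORT_OVERHEAD - DATA_CHUNK_HDR;
	int max_len = MAX_CHUNK_LEN - DATA_CHUNK_HDR;

	if (val == 0)
		return 0;		/* 0 means "derive it from the path MTU" */

	return (val >= min_len && val <= max_len) ? 0 : -1;
}

int main(void)
{
	printf("%d %d %d\n",
	       validate_maxseg(0),	/*  0: leave it to PMTU discovery */
	       validate_maxseg(100),	/* -1: below the minimum */
	       validate_maxseg(1400));	/*  0: fine */
	return 0;
}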
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 47f82bd794d9..445b7ef61677 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -242,9 +242,18 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
242 &transport->fl, sk); 242 &transport->fl, sk);
243 } 243 }
244 244
245 if (transport->dst) { 245 if (transport->param_flags & SPP_PMTUD_DISABLE) {
246 transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst)); 246 struct sctp_association *asoc = transport->asoc;
247 } else 247
248 if (!transport->pathmtu && asoc && asoc->pathmtu)
249 transport->pathmtu = asoc->pathmtu;
250 if (transport->pathmtu)
251 return;
252 }
253
254 if (transport->dst)
255 transport->pathmtu = sctp_dst_mtu(transport->dst);
256 else
248 transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; 257 transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
249} 258}
250 259
@@ -290,6 +299,7 @@ void sctp_transport_route(struct sctp_transport *transport,
290 struct sctp_association *asoc = transport->asoc; 299 struct sctp_association *asoc = transport->asoc;
291 struct sctp_af *af = transport->af_specific; 300 struct sctp_af *af = transport->af_specific;
292 301
302 sctp_transport_dst_release(transport);
293 af->get_dst(transport, saddr, &transport->fl, sctp_opt2sk(opt)); 303 af->get_dst(transport, saddr, &transport->fl, sctp_opt2sk(opt));
294 304
295 if (saddr) 305 if (saddr)
@@ -297,21 +307,14 @@ void sctp_transport_route(struct sctp_transport *transport,
297 else 307 else
298 af->get_saddr(opt, transport, &transport->fl); 308 af->get_saddr(opt, transport, &transport->fl);
299 309
300 if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) { 310 sctp_transport_pmtu(transport, sctp_opt2sk(opt));
301 return;
302 }
303 if (transport->dst) {
304 transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst));
305 311
306 /* Initialize sk->sk_rcv_saddr, if the transport is the 312 /* Initialize sk->sk_rcv_saddr, if the transport is the
307 * association's active path for getsockname(). 313 * association's active path for getsockname().
308 */ 314 */
309 if (asoc && (!asoc->peer.primary_path || 315 if (transport->dst && asoc &&
310 (transport == asoc->peer.active_path))) 316 (!asoc->peer.primary_path || transport == asoc->peer.active_path))
311 opt->pf->to_sk_saddr(&transport->saddr, 317 opt->pf->to_sk_saddr(&transport->saddr, asoc->base.sk);
312 asoc->base.sk);
313 } else
314 transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
315} 318}
316 319
317/* Hold a reference to a transport. */ 320/* Hold a reference to a transport. */
@@ -634,7 +637,7 @@ unsigned long sctp_transport_timeout(struct sctp_transport *trans)
634 trans->state != SCTP_PF) 637 trans->state != SCTP_PF)
635 timeout += trans->hbinterval; 638 timeout += trans->hbinterval;
636 639
637 return timeout; 640 return max_t(unsigned long, timeout, HZ / 5);
638} 641}
639 642
640/* Reset transport variables to their initial values */ 643/* Reset transport variables to their initial values */
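sctp_transport_timeout() above now returns max_t(unsigned long, timeout, HZ / 5), i.e. the computed heartbeat/T3 timeout is clamped to a 200 ms floor so a degenerate RTO can no longer arm an effectively zero-length timer. An equivalent clamp in plain C (HZ fixed to 1000 and the timeout formula simplified purely for the example):

#include <stdio.h>

#define HZ 1000UL	/* assume 1000 jiffies per second for the sketch */

static unsigned long transport_timeout(unsigned long rto,
				       unsigned long hbinterval)
{
	unsigned long timeout = rto + hbinterval;	/* simplified formula */

	/* never arm the timer for less than HZ / 5 (200 ms) */
	return timeout > HZ / 5 ? timeout : HZ / 5;
}

int main(void)
{
	printf("%lu\n", transport_timeout(0, 0));	/* clamped to 200 */
	printf("%lu\n", transport_timeout(300, 5000));	/* 5300, unchanged */
	return 0;
}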
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 544bab42f925..973b4471b532 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -8,8 +8,6 @@
8 * 8 *
9 * Initial restrictions: 9 * Initial restrictions:
10 * - support for alternate links postponed 10 * - support for alternate links postponed
11 * - partial support for non-blocking sockets only
12 * - support for urgent data postponed
13 * 11 *
14 * Copyright IBM Corp. 2016, 2018 12 * Copyright IBM Corp. 2016, 2018
15 * 13 *
@@ -29,6 +27,7 @@
29#include <net/sock.h> 27#include <net/sock.h>
30#include <net/tcp.h> 28#include <net/tcp.h>
31#include <net/smc.h> 29#include <net/smc.h>
30#include <asm/ioctls.h>
32 31
33#include "smc.h" 32#include "smc.h"
34#include "smc_clc.h" 33#include "smc_clc.h"
@@ -45,11 +44,6 @@ static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group
45 * creation 44 * creation
46 */ 45 */
47 46
48struct smc_lgr_list smc_lgr_list = { /* established link groups */
49 .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
50 .list = LIST_HEAD_INIT(smc_lgr_list.list),
51};
52
53static void smc_tcp_listen_work(struct work_struct *); 47static void smc_tcp_listen_work(struct work_struct *);
54 48
55static void smc_set_keepalive(struct sock *sk, int val) 49static void smc_set_keepalive(struct sock *sk, int val)
@@ -192,8 +186,10 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
192 sk->sk_protocol = protocol; 186 sk->sk_protocol = protocol;
193 smc = smc_sk(sk); 187 smc = smc_sk(sk);
194 INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); 188 INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
189 INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
195 INIT_LIST_HEAD(&smc->accept_q); 190 INIT_LIST_HEAD(&smc->accept_q);
196 spin_lock_init(&smc->accept_q_lock); 191 spin_lock_init(&smc->accept_q_lock);
192 spin_lock_init(&smc->conn.send_lock);
197 sk->sk_prot->hash(sk); 193 sk->sk_prot->hash(sk);
198 sk_refcnt_debug_inc(sk); 194 sk_refcnt_debug_inc(sk);
199 195
@@ -292,19 +288,28 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
292 smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); 288 smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
293} 289}
294 290
295/* register a new rmb */ 291/* register a new rmb, optionally send confirm_rkey msg to register with peer */
296static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc) 292static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
293 bool conf_rkey)
297{ 294{
298 /* register memory region for new rmb */ 295 /* register memory region for new rmb */
299 if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) { 296 if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
300 rmb_desc->regerr = 1; 297 rmb_desc->regerr = 1;
301 return -EFAULT; 298 return -EFAULT;
302 } 299 }
300 if (!conf_rkey)
301 return 0;
302 /* exchange confirm_rkey msg with peer */
303 if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
304 rmb_desc->regerr = 1;
305 return -EFAULT;
306 }
303 return 0; 307 return 0;
304} 308}
305 309
306static int smc_clnt_conf_first_link(struct smc_sock *smc) 310static int smc_clnt_conf_first_link(struct smc_sock *smc)
307{ 311{
312 struct net *net = sock_net(smc->clcsock->sk);
308 struct smc_link_group *lgr = smc->conn.lgr; 313 struct smc_link_group *lgr = smc->conn.lgr;
309 struct smc_link *link; 314 struct smc_link *link;
310 int rest; 315 int rest;
@@ -332,7 +337,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
332 337
333 smc_wr_remember_qp_attr(link); 338 smc_wr_remember_qp_attr(link);
334 339
335 if (smc_reg_rmb(link, smc->conn.rmb_desc)) 340 if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
336 return SMC_CLC_DECL_INTERR; 341 return SMC_CLC_DECL_INTERR;
337 342
338 /* send CONFIRM LINK response over RoCE fabric */ 343 /* send CONFIRM LINK response over RoCE fabric */
@@ -362,7 +367,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
362 if (rc < 0) 367 if (rc < 0)
363 return SMC_CLC_DECL_TCL; 368 return SMC_CLC_DECL_TCL;
364 369
365 link->state = SMC_LNK_ACTIVE; 370 smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
366 371
367 return 0; 372 return 0;
368} 373}
@@ -370,10 +375,13 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
370static void smc_conn_save_peer_info(struct smc_sock *smc, 375static void smc_conn_save_peer_info(struct smc_sock *smc,
371 struct smc_clc_msg_accept_confirm *clc) 376 struct smc_clc_msg_accept_confirm *clc)
372{ 377{
373 smc->conn.peer_conn_idx = clc->conn_idx; 378 int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
379
380 smc->conn.peer_rmbe_idx = clc->rmbe_idx;
374 smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token); 381 smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
375 smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size); 382 smc->conn.peer_rmbe_size = bufsize;
376 atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); 383 atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
384 smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
377} 385}
378 386
379static void smc_link_save_peer_info(struct smc_link *link, 387static void smc_link_save_peer_info(struct smc_link *link,
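smc_conn_save_peer_info() above now also derives conn.tx_off, the byte offset of this connection's element inside the peer's RMB, as bufsize * (rmbe_idx - 1). A tiny sketch spelling that out (the compressed-size decoding below is only a guess at the shift encoding and is not taken from the SMC code):

#include <stdint.h>
#include <stdio.h>

/* Guessed compressed buffer-size encoding: value n means 2^(n + 14)
 * bytes, i.e. the smallest element would be 16 KiB.  Illustrative only. */
static uint32_t uncompress_bufsize(uint8_t compressed)
{
	return 1u << (compressed + 14);
}

/* Offset of connection 'idx' (1-based) inside the peer's RMB. */
static uint32_t rmbe_tx_off(uint8_t compressed, uint8_t idx)
{
	return uncompress_bufsize(compressed) * (idx - 1);
}

int main(void)
{
	/* the third 16 KiB element starts 32 KiB into the peer RMB */
	printf("%u\n", (unsigned)rmbe_tx_off(0, 3));
	return 0;
}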
@@ -386,160 +394,186 @@ static void smc_link_save_peer_info(struct smc_link *link,
386 link->peer_mtu = clc->qp_mtu; 394 link->peer_mtu = clc->qp_mtu;
387} 395}
388 396
389/* setup for RDMA connection of client */ 397/* fall back during connect */
390static int smc_connect_rdma(struct smc_sock *smc) 398static int smc_connect_fallback(struct smc_sock *smc)
391{ 399{
392 struct smc_clc_msg_accept_confirm aclc; 400 smc->use_fallback = true;
393 int local_contact = SMC_FIRST_CONTACT; 401 smc_copy_sock_settings_to_clc(smc);
394 struct smc_ib_device *smcibdev; 402 if (smc->sk.sk_state == SMC_INIT)
395 struct smc_link *link; 403 smc->sk.sk_state = SMC_ACTIVE;
396 u8 srv_first_contact; 404 return 0;
397 int reason_code = 0; 405}
398 int rc = 0;
399 u8 ibport;
400 406
401 sock_hold(&smc->sk); /* sock put in passive closing */ 407/* decline and fall back during connect */
408static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
409{
410 int rc;
402 411
403 if (!tcp_sk(smc->clcsock->sk)->syn_smc) { 412 if (reason_code < 0) /* error, fallback is not possible */
404 /* peer has not signalled SMC-capability */ 413 return reason_code;
405 smc->use_fallback = true; 414 if (reason_code != SMC_CLC_DECL_REPLY) {
406 goto out_connected; 415 rc = smc_clc_send_decline(smc, reason_code);
416 if (rc < 0)
417 return rc;
407 } 418 }
419 return smc_connect_fallback(smc);
420}
408 421
409 /* IPSec connections opt out of SMC-R optimizations */ 422/* abort connecting */
410 if (using_ipsec(smc)) { 423static int smc_connect_abort(struct smc_sock *smc, int reason_code,
411 reason_code = SMC_CLC_DECL_IPSEC; 424 int local_contact)
412 goto decline_rdma; 425{
413 } 426 if (local_contact == SMC_FIRST_CONTACT)
427 smc_lgr_forget(smc->conn.lgr);
428 mutex_unlock(&smc_create_lgr_pending);
429 smc_conn_free(&smc->conn);
430 if (reason_code < 0 && smc->sk.sk_state == SMC_INIT)
431 sock_put(&smc->sk); /* passive closing */
432 return reason_code;
433}
434
435/* check if there is a rdma device available for this connection. */
436/* called for connect and listen */
437static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
438 u8 *ibport)
439{
440 int reason_code = 0;
414 441
415 /* PNET table look up: search active ib_device and port 442 /* PNET table look up: search active ib_device and port
416 * within same PNETID that also contains the ethernet device 443 * within same PNETID that also contains the ethernet device
417 * used for the internal TCP socket 444 * used for the internal TCP socket
418 */ 445 */
419 smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport); 446 smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport);
420 if (!smcibdev) { 447 if (!(*ibdev))
421 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ 448 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
422 goto decline_rdma; 449
423 } 450 return reason_code;
451}
452
453/* CLC handshake during connect */
454static int smc_connect_clc(struct smc_sock *smc,
455 struct smc_clc_msg_accept_confirm *aclc,
456 struct smc_ib_device *ibdev, u8 ibport)
457{
458 int rc = 0;
424 459
425 /* do inband token exchange */ 460 /* do inband token exchange */
426 reason_code = smc_clc_send_proposal(smc, smcibdev, ibport); 461 rc = smc_clc_send_proposal(smc, ibdev, ibport);
427 if (reason_code < 0) { 462 if (rc)
428 rc = reason_code; 463 return rc;
429 goto out_err;
430 }
431 if (reason_code > 0) /* configuration error */
432 goto decline_rdma;
433 /* receive SMC Accept CLC message */ 464 /* receive SMC Accept CLC message */
434 reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc), 465 return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT);
435 SMC_CLC_ACCEPT); 466}
436 if (reason_code < 0) { 467
437 rc = reason_code; 468/* setup for RDMA connection of client */
438 goto out_err; 469static int smc_connect_rdma(struct smc_sock *smc,
439 } 470 struct smc_clc_msg_accept_confirm *aclc,
440 if (reason_code > 0) 471 struct smc_ib_device *ibdev, u8 ibport)
441 goto decline_rdma; 472{
473 int local_contact = SMC_FIRST_CONTACT;
474 struct smc_link *link;
475 int reason_code = 0;
442 476
443 srv_first_contact = aclc.hdr.flag;
444 mutex_lock(&smc_create_lgr_pending); 477 mutex_lock(&smc_create_lgr_pending);
445 local_contact = smc_conn_create(smc, smcibdev, ibport, &aclc.lcl, 478 local_contact = smc_conn_create(smc, ibdev, ibport, &aclc->lcl,
446 srv_first_contact); 479 aclc->hdr.flag);
447 if (local_contact < 0) { 480 if (local_contact < 0) {
448 rc = local_contact; 481 if (local_contact == -ENOMEM)
449 if (rc == -ENOMEM)
450 reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ 482 reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
451 else if (rc == -ENOLINK) 483 else if (local_contact == -ENOLINK)
452 reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */ 484 reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
453 goto decline_rdma_unlock; 485 else
486 reason_code = SMC_CLC_DECL_INTERR; /* other error */
487 return smc_connect_abort(smc, reason_code, 0);
454 } 488 }
455 link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK]; 489 link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
456 490
457 smc_conn_save_peer_info(smc, &aclc); 491 smc_conn_save_peer_info(smc, aclc);
458 492
459 /* create send buffer and rmb */ 493 /* create send buffer and rmb */
460 rc = smc_buf_create(smc); 494 if (smc_buf_create(smc))
461 if (rc) { 495 return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
462 reason_code = SMC_CLC_DECL_MEM;
463 goto decline_rdma_unlock;
464 }
465 496
466 if (local_contact == SMC_FIRST_CONTACT) 497 if (local_contact == SMC_FIRST_CONTACT)
467 smc_link_save_peer_info(link, &aclc); 498 smc_link_save_peer_info(link, aclc);
468 499
469 rc = smc_rmb_rtoken_handling(&smc->conn, &aclc); 500 if (smc_rmb_rtoken_handling(&smc->conn, aclc))
470 if (rc) { 501 return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
471 reason_code = SMC_CLC_DECL_INTERR; 502 local_contact);
472 goto decline_rdma_unlock;
473 }
474 503
475 smc_close_init(smc); 504 smc_close_init(smc);
476 smc_rx_init(smc); 505 smc_rx_init(smc);
477 506
478 if (local_contact == SMC_FIRST_CONTACT) { 507 if (local_contact == SMC_FIRST_CONTACT) {
479 rc = smc_ib_ready_link(link); 508 if (smc_ib_ready_link(link))
480 if (rc) { 509 return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
481 reason_code = SMC_CLC_DECL_INTERR; 510 local_contact);
482 goto decline_rdma_unlock;
483 }
484 } else { 511 } else {
485 if (!smc->conn.rmb_desc->reused) { 512 if (!smc->conn.rmb_desc->reused &&
486 if (smc_reg_rmb(link, smc->conn.rmb_desc)) { 513 smc_reg_rmb(link, smc->conn.rmb_desc, true))
487 reason_code = SMC_CLC_DECL_INTERR; 514 return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
488 goto decline_rdma_unlock; 515 local_contact);
489 }
490 }
491 } 516 }
492 smc_rmb_sync_sg_for_device(&smc->conn); 517 smc_rmb_sync_sg_for_device(&smc->conn);
493 518
494 rc = smc_clc_send_confirm(smc); 519 reason_code = smc_clc_send_confirm(smc);
495 if (rc) 520 if (reason_code)
496 goto out_err_unlock; 521 return smc_connect_abort(smc, reason_code, local_contact);
522
523 smc_tx_init(smc);
497 524
498 if (local_contact == SMC_FIRST_CONTACT) { 525 if (local_contact == SMC_FIRST_CONTACT) {
499 /* QP confirmation over RoCE fabric */ 526 /* QP confirmation over RoCE fabric */
500 reason_code = smc_clnt_conf_first_link(smc); 527 reason_code = smc_clnt_conf_first_link(smc);
501 if (reason_code < 0) { 528 if (reason_code)
502 rc = reason_code; 529 return smc_connect_abort(smc, reason_code,
503 goto out_err_unlock; 530 local_contact);
504 }
505 if (reason_code > 0)
506 goto decline_rdma_unlock;
507 } 531 }
508
509 mutex_unlock(&smc_create_lgr_pending); 532 mutex_unlock(&smc_create_lgr_pending);
510 smc_tx_init(smc);
511 533
512out_connected:
513 smc_copy_sock_settings_to_clc(smc); 534 smc_copy_sock_settings_to_clc(smc);
514 if (smc->sk.sk_state == SMC_INIT) 535 if (smc->sk.sk_state == SMC_INIT)
515 smc->sk.sk_state = SMC_ACTIVE; 536 smc->sk.sk_state = SMC_ACTIVE;
516 537
517 return rc ? rc : local_contact; 538 return 0;
539}
518 540
519decline_rdma_unlock: 541/* perform steps before actually connecting */
520 if (local_contact == SMC_FIRST_CONTACT) 542static int __smc_connect(struct smc_sock *smc)
521 smc_lgr_forget(smc->conn.lgr); 543{
522 mutex_unlock(&smc_create_lgr_pending); 544 struct smc_clc_msg_accept_confirm aclc;
523 smc_conn_free(&smc->conn); 545 struct smc_ib_device *ibdev;
524decline_rdma: 546 int rc = 0;
525 /* RDMA setup failed, switch back to TCP */ 547 u8 ibport;
526 smc->use_fallback = true;
527 if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
528 rc = smc_clc_send_decline(smc, reason_code);
529 if (rc < 0)
530 goto out_err;
531 }
532 goto out_connected;
533 548
534out_err_unlock: 549 sock_hold(&smc->sk); /* sock put in passive closing */
535 if (local_contact == SMC_FIRST_CONTACT) 550
536 smc_lgr_forget(smc->conn.lgr); 551 if (smc->use_fallback)
537 mutex_unlock(&smc_create_lgr_pending); 552 return smc_connect_fallback(smc);
538 smc_conn_free(&smc->conn); 553
539out_err: 554 /* if peer has not signalled SMC-capability, fall back */
540 if (smc->sk.sk_state == SMC_INIT) 555 if (!tcp_sk(smc->clcsock->sk)->syn_smc)
541 sock_put(&smc->sk); /* passive closing */ 556 return smc_connect_fallback(smc);
542 return rc; 557
558 /* IPSec connections opt out of SMC-R optimizations */
559 if (using_ipsec(smc))
560 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
561
562 /* check if a RDMA device is available; if not, fall back */
563 if (smc_check_rdma(smc, &ibdev, &ibport))
564 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);
565
566 /* perform CLC handshake */
567 rc = smc_connect_clc(smc, &aclc, ibdev, ibport);
568 if (rc)
569 return smc_connect_decline_fallback(smc, rc);
570
571 /* connect using rdma */
572 rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
573 if (rc)
574 return smc_connect_decline_fallback(smc, rc);
575
576 return 0;
543} 577}
544 578
545static int smc_connect(struct socket *sock, struct sockaddr *addr, 579static int smc_connect(struct socket *sock, struct sockaddr *addr,
@@ -575,8 +609,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
575 if (rc) 609 if (rc)
576 goto out; 610 goto out;
577 611
578 /* setup RDMA connection */ 612 rc = __smc_connect(smc);
579 rc = smc_connect_rdma(smc);
580 if (rc < 0) 613 if (rc < 0)
581 goto out; 614 goto out;
582 else 615 else
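__smc_connect() above is now a flat sequence of checks in which every failure funnels into smc_connect_fallback() or smc_connect_decline_fallback() instead of goto-label cleanup. A compact sketch of that control-flow shape (the step names and outcomes are invented; only the structure is the point):

#include <stdio.h>

enum { STEP_OK = 0, STEP_DECLINE = 1 };

static int check_peer_capability(void)	{ return STEP_OK; }
static int check_device(void)		{ return STEP_DECLINE; }	/* pretend: no RDMA device */
static int do_handshake(void)		{ return STEP_OK; }

static int connect_fallback(const char *why)
{
	printf("falling back to TCP: %s\n", why);
	return 0;	/* the connection still succeeds, just without SMC */
}

static int smc_style_connect(void)
{
	if (check_peer_capability() != STEP_OK)
		return connect_fallback("peer not SMC capable");
	if (check_device() != STEP_OK)
		return connect_fallback("no suitable RDMA device");
	if (do_handshake() != STEP_OK)
		return connect_fallback("CLC handshake failed");

	printf("SMC connection established\n");
	return 0;
}

int main(void)
{
	return smc_style_connect();
}

Each early return is the analogue of one decline/abort helper in the patch, which is what lets the happy path read top to bottom.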
@@ -716,6 +749,7 @@ void smc_close_non_accepted(struct sock *sk)
716 749
717static int smc_serv_conf_first_link(struct smc_sock *smc) 750static int smc_serv_conf_first_link(struct smc_sock *smc)
718{ 751{
752 struct net *net = sock_net(smc->clcsock->sk);
719 struct smc_link_group *lgr = smc->conn.lgr; 753 struct smc_link_group *lgr = smc->conn.lgr;
720 struct smc_link *link; 754 struct smc_link *link;
721 int rest; 755 int rest;
@@ -723,7 +757,7 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
723 757
724 link = &lgr->lnk[SMC_SINGLE_LINK]; 758 link = &lgr->lnk[SMC_SINGLE_LINK];
725 759
726 if (smc_reg_rmb(link, smc->conn.rmb_desc)) 760 if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
727 return SMC_CLC_DECL_INTERR; 761 return SMC_CLC_DECL_INTERR;
728 762
729 /* send CONFIRM LINK request to client over the RoCE fabric */ 763 /* send CONFIRM LINK request to client over the RoCE fabric */
@@ -768,184 +802,244 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
768 return rc; 802 return rc;
769 } 803 }
770 804
771 link->state = SMC_LNK_ACTIVE; 805 smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
772 806
773 return 0; 807 return 0;
774} 808}
775 809
776/* setup for RDMA connection of server */ 810/* listen worker: finish */
777static void smc_listen_work(struct work_struct *work) 811static void smc_listen_out(struct smc_sock *new_smc)
778{ 812{
779 struct smc_sock *new_smc = container_of(work, struct smc_sock,
780 smc_listen_work);
781 struct smc_clc_msg_proposal_prefix *pclc_prfx;
782 struct socket *newclcsock = new_smc->clcsock;
783 struct smc_sock *lsmc = new_smc->listen_smc; 813 struct smc_sock *lsmc = new_smc->listen_smc;
784 struct smc_clc_msg_accept_confirm cclc;
785 int local_contact = SMC_REUSE_CONTACT;
786 struct sock *newsmcsk = &new_smc->sk; 814 struct sock *newsmcsk = &new_smc->sk;
787 struct smc_clc_msg_proposal *pclc;
788 struct smc_ib_device *smcibdev;
789 u8 buf[SMC_CLC_MAX_LEN];
790 struct smc_link *link;
791 int reason_code = 0;
792 int rc = 0;
793 u8 ibport;
794 815
795 /* check if peer is smc capable */ 816 lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
796 if (!tcp_sk(newclcsock->sk)->syn_smc) { 817 if (lsmc->sk.sk_state == SMC_LISTEN) {
797 new_smc->use_fallback = true; 818 smc_accept_enqueue(&lsmc->sk, newsmcsk);
798 goto out_connected; 819 } else { /* no longer listening */
820 smc_close_non_accepted(newsmcsk);
799 } 821 }
822 release_sock(&lsmc->sk);
800 823
801 /* do inband token exchange - 824 /* Wake up accept */
802 *wait for and receive SMC Proposal CLC message 825 lsmc->sk.sk_data_ready(&lsmc->sk);
803 */ 826 sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
804 reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf), 827}
805 SMC_CLC_PROPOSAL);
806 if (reason_code < 0)
807 goto out_err;
808 if (reason_code > 0)
809 goto decline_rdma;
810 828
811 /* IPSec connections opt out of SMC-R optimizations */ 829/* listen worker: finish in state connected */
812 if (using_ipsec(new_smc)) { 830static void smc_listen_out_connected(struct smc_sock *new_smc)
813 reason_code = SMC_CLC_DECL_IPSEC; 831{
814 goto decline_rdma; 832 struct sock *newsmcsk = &new_smc->sk;
815 }
816 833
817 /* PNET table look up: search active ib_device and port 834 sk_refcnt_debug_inc(newsmcsk);
818 * within same PNETID that also contains the ethernet device 835 if (newsmcsk->sk_state == SMC_INIT)
819 * used for the internal TCP socket 836 newsmcsk->sk_state = SMC_ACTIVE;
820 */ 837
821 smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport); 838 smc_listen_out(new_smc);
822 if (!smcibdev) { 839}
823 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ 840
824 goto decline_rdma; 841/* listen worker: finish in error state */
842static void smc_listen_out_err(struct smc_sock *new_smc)
843{
844 struct sock *newsmcsk = &new_smc->sk;
845
846 if (newsmcsk->sk_state == SMC_INIT)
847 sock_put(&new_smc->sk); /* passive closing */
848 newsmcsk->sk_state = SMC_CLOSED;
849 smc_conn_free(&new_smc->conn);
850
851 smc_listen_out(new_smc);
852}
853
854/* listen worker: decline and fall back if possible */
855static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
856 int local_contact)
857{
858 /* RDMA setup failed, switch back to TCP */
859 if (local_contact == SMC_FIRST_CONTACT)
860 smc_lgr_forget(new_smc->conn.lgr);
861 if (reason_code < 0) { /* error, no fallback possible */
862 smc_listen_out_err(new_smc);
863 return;
864 }
865 smc_conn_free(&new_smc->conn);
866 new_smc->use_fallback = true;
867 if (reason_code && reason_code != SMC_CLC_DECL_REPLY) {
868 if (smc_clc_send_decline(new_smc, reason_code) < 0) {
869 smc_listen_out_err(new_smc);
870 return;
871 }
825 } 872 }
873 smc_listen_out_connected(new_smc);
874}
875
876/* listen worker: check prefixes */
877static int smc_listen_rdma_check(struct smc_sock *new_smc,
878 struct smc_clc_msg_proposal *pclc)
879{
880 struct smc_clc_msg_proposal_prefix *pclc_prfx;
881 struct socket *newclcsock = new_smc->clcsock;
826 882
827 pclc = (struct smc_clc_msg_proposal *)&buf;
828 pclc_prfx = smc_clc_proposal_get_prefix(pclc); 883 pclc_prfx = smc_clc_proposal_get_prefix(pclc);
884 if (smc_clc_prfx_match(newclcsock, pclc_prfx))
885 return SMC_CLC_DECL_CNFERR;
829 886
830 rc = smc_clc_prfx_match(newclcsock, pclc_prfx); 887 return 0;
831 if (rc) { 888}
832 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
833 goto decline_rdma;
834 }
835 889
890/* listen worker: initialize connection and buffers */
891static int smc_listen_rdma_init(struct smc_sock *new_smc,
892 struct smc_clc_msg_proposal *pclc,
893 struct smc_ib_device *ibdev, u8 ibport,
894 int *local_contact)
895{
836 /* allocate connection / link group */ 896 /* allocate connection / link group */
837 mutex_lock(&smc_create_lgr_pending); 897 *local_contact = smc_conn_create(new_smc, ibdev, ibport, &pclc->lcl, 0);
838 local_contact = smc_conn_create(new_smc, smcibdev, ibport, &pclc->lcl, 898 if (*local_contact < 0) {
839 0); 899 if (*local_contact == -ENOMEM)
840 if (local_contact < 0) { 900 return SMC_CLC_DECL_MEM;/* insufficient memory*/
841 rc = local_contact; 901 return SMC_CLC_DECL_INTERR; /* other error */
842 if (rc == -ENOMEM)
843 reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
844 goto decline_rdma_unlock;
845 } 902 }
846 link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
847 903
848 /* create send buffer and rmb */ 904 /* create send buffer and rmb */
849 rc = smc_buf_create(new_smc); 905 if (smc_buf_create(new_smc))
850 if (rc) { 906 return SMC_CLC_DECL_MEM;
851 reason_code = SMC_CLC_DECL_MEM;
852 goto decline_rdma_unlock;
853 }
854 907
855 smc_close_init(new_smc); 908 return 0;
856 smc_rx_init(new_smc); 909}
910
911/* listen worker: register buffers */
912static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
913{
914 struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
857 915
858 if (local_contact != SMC_FIRST_CONTACT) { 916 if (local_contact != SMC_FIRST_CONTACT) {
859 if (!new_smc->conn.rmb_desc->reused) { 917 if (!new_smc->conn.rmb_desc->reused) {
860 if (smc_reg_rmb(link, new_smc->conn.rmb_desc)) { 918 if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
861 reason_code = SMC_CLC_DECL_INTERR; 919 return SMC_CLC_DECL_INTERR;
862 goto decline_rdma_unlock;
863 }
864 } 920 }
865 } 921 }
866 smc_rmb_sync_sg_for_device(&new_smc->conn); 922 smc_rmb_sync_sg_for_device(&new_smc->conn);
867 923
868 rc = smc_clc_send_accept(new_smc, local_contact); 924 return 0;
869 if (rc) 925}
870 goto out_err_unlock; 926
927/* listen worker: finish RDMA setup */
928static void smc_listen_rdma_finish(struct smc_sock *new_smc,
929 struct smc_clc_msg_accept_confirm *cclc,
930 int local_contact)
931{
932 struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
933 int reason_code = 0;
871 934
872 /* receive SMC Confirm CLC message */
873 reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
874 SMC_CLC_CONFIRM);
875 if (reason_code < 0)
876 goto out_err_unlock;
877 if (reason_code > 0)
878 goto decline_rdma_unlock;
879 smc_conn_save_peer_info(new_smc, &cclc);
880 if (local_contact == SMC_FIRST_CONTACT) 935 if (local_contact == SMC_FIRST_CONTACT)
881 smc_link_save_peer_info(link, &cclc); 936 smc_link_save_peer_info(link, cclc);
882 937
883 rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc); 938 if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
884 if (rc) {
885 reason_code = SMC_CLC_DECL_INTERR; 939 reason_code = SMC_CLC_DECL_INTERR;
886 goto decline_rdma_unlock; 940 goto decline;
887 } 941 }
888 942
889 if (local_contact == SMC_FIRST_CONTACT) { 943 if (local_contact == SMC_FIRST_CONTACT) {
890 rc = smc_ib_ready_link(link); 944 if (smc_ib_ready_link(link)) {
891 if (rc) {
892 reason_code = SMC_CLC_DECL_INTERR; 945 reason_code = SMC_CLC_DECL_INTERR;
893 goto decline_rdma_unlock; 946 goto decline;
894 } 947 }
895 /* QP confirmation over RoCE fabric */ 948 /* QP confirmation over RoCE fabric */
896 reason_code = smc_serv_conf_first_link(new_smc); 949 reason_code = smc_serv_conf_first_link(new_smc);
897 if (reason_code < 0) 950 if (reason_code)
898 /* peer is not aware of a problem */ 951 goto decline;
899 goto out_err_unlock;
900 if (reason_code > 0)
901 goto decline_rdma_unlock;
902 } 952 }
953 return;
903 954
904 smc_tx_init(new_smc); 955decline:
905 mutex_unlock(&smc_create_lgr_pending); 956 mutex_unlock(&smc_create_lgr_pending);
957 smc_listen_decline(new_smc, reason_code, local_contact);
958}
906 959
907out_connected: 960/* setup for RDMA connection of server */
908 sk_refcnt_debug_inc(newsmcsk); 961static void smc_listen_work(struct work_struct *work)
909 if (newsmcsk->sk_state == SMC_INIT) 962{
910 newsmcsk->sk_state = SMC_ACTIVE; 963 struct smc_sock *new_smc = container_of(work, struct smc_sock,
911enqueue: 964 smc_listen_work);
912 lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); 965 struct socket *newclcsock = new_smc->clcsock;
913 if (lsmc->sk.sk_state == SMC_LISTEN) { 966 struct smc_clc_msg_accept_confirm cclc;
914 smc_accept_enqueue(&lsmc->sk, newsmcsk); 967 struct smc_clc_msg_proposal *pclc;
915 } else { /* no longer listening */ 968 struct smc_ib_device *ibdev;
916 smc_close_non_accepted(newsmcsk); 969 u8 buf[SMC_CLC_MAX_LEN];
970 int local_contact = 0;
971 int reason_code = 0;
972 int rc = 0;
973 u8 ibport;
974
975 if (new_smc->use_fallback) {
976 smc_listen_out_connected(new_smc);
977 return;
917 } 978 }
918 release_sock(&lsmc->sk);
919 979
920 /* Wake up accept */ 980 /* check if peer is smc capable */
921 lsmc->sk.sk_data_ready(&lsmc->sk); 981 if (!tcp_sk(newclcsock->sk)->syn_smc) {
922 sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */ 982 new_smc->use_fallback = true;
923 return; 983 smc_listen_out_connected(new_smc);
984 return;
985 }
924 986
925decline_rdma_unlock: 987 /* do inband token exchange -
926 if (local_contact == SMC_FIRST_CONTACT) 988 * wait for and receive SMC Proposal CLC message
927 smc_lgr_forget(new_smc->conn.lgr); 989 */
928 mutex_unlock(&smc_create_lgr_pending); 990 pclc = (struct smc_clc_msg_proposal *)&buf;
929decline_rdma: 991 reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
930 /* RDMA setup failed, switch back to TCP */ 992 SMC_CLC_PROPOSAL);
931 smc_conn_free(&new_smc->conn); 993 if (reason_code) {
932 new_smc->use_fallback = true; 994 smc_listen_decline(new_smc, reason_code, 0);
933 if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { 995 return;
934 if (smc_clc_send_decline(new_smc, reason_code) < 0)
935 goto out_err;
936 } 996 }
937 goto out_connected;
938 997
939out_err_unlock: 998 /* IPSec connections opt out of SMC-R optimizations */
940 if (local_contact == SMC_FIRST_CONTACT) 999 if (using_ipsec(new_smc)) {
941 smc_lgr_forget(new_smc->conn.lgr); 1000 smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0);
1001 return;
1002 }
1003
1004 mutex_lock(&smc_create_lgr_pending);
1005 smc_close_init(new_smc);
1006 smc_rx_init(new_smc);
1007 smc_tx_init(new_smc);
1008
1009 /* check if RDMA is available */
1010 if (smc_check_rdma(new_smc, &ibdev, &ibport) ||
1011 smc_listen_rdma_check(new_smc, pclc) ||
1012 smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
1013 &local_contact) ||
1014 smc_listen_rdma_reg(new_smc, local_contact)) {
1015 /* SMC not supported, decline */
1016 mutex_unlock(&smc_create_lgr_pending);
1017 smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact);
1018 return;
1019 }
1020
1021 /* send SMC Accept CLC message */
1022 rc = smc_clc_send_accept(new_smc, local_contact);
1023 if (rc) {
1024 mutex_unlock(&smc_create_lgr_pending);
1025 smc_listen_decline(new_smc, rc, local_contact);
1026 return;
1027 }
1028
1029 /* receive SMC Confirm CLC message */
1030 reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
1031 SMC_CLC_CONFIRM);
1032 if (reason_code) {
1033 mutex_unlock(&smc_create_lgr_pending);
1034 smc_listen_decline(new_smc, reason_code, local_contact);
1035 return;
1036 }
1037
1038 /* finish worker */
1039 smc_listen_rdma_finish(new_smc, &cclc, local_contact);
1040 smc_conn_save_peer_info(new_smc, &cclc);
942 mutex_unlock(&smc_create_lgr_pending); 1041 mutex_unlock(&smc_create_lgr_pending);
943out_err: 1042 smc_listen_out_connected(new_smc);
944 if (newsmcsk->sk_state == SMC_INIT)
945 sock_put(&new_smc->sk); /* passive closing */
946 newsmcsk->sk_state = SMC_CLOSED;
947 smc_conn_free(&new_smc->conn);
948 goto enqueue; /* queue new sock with sk_err set */
949} 1043}
950 1044
951static void smc_tcp_listen_work(struct work_struct *work) 1045static void smc_tcp_listen_work(struct work_struct *work)
@@ -965,7 +1059,7 @@ static void smc_tcp_listen_work(struct work_struct *work)
965 continue; 1059 continue;
966 1060
967 new_smc->listen_smc = lsmc; 1061 new_smc->listen_smc = lsmc;
968 new_smc->use_fallback = false; /* assume rdma capability first*/ 1062 new_smc->use_fallback = lsmc->use_fallback;
969 sock_hold(lsk); /* sock_put in smc_listen_work */ 1063 sock_hold(lsk); /* sock_put in smc_listen_work */
970 INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); 1064 INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
971 smc_copy_sock_settings_to_smc(new_smc); 1065 smc_copy_sock_settings_to_smc(new_smc);
@@ -1001,7 +1095,8 @@ static int smc_listen(struct socket *sock, int backlog)
1001 * them to the clc socket -- copy smc socket options to clc socket 1095 * them to the clc socket -- copy smc socket options to clc socket
1002 */ 1096 */
1003 smc_copy_sock_settings_to_clc(smc); 1097 smc_copy_sock_settings_to_clc(smc);
1004 tcp_sk(smc->clcsock->sk)->syn_smc = 1; 1098 if (!smc->use_fallback)
1099 tcp_sk(smc->clcsock->sk)->syn_smc = 1;
1005 1100
1006 rc = kernel_listen(smc->clcsock, backlog); 1101 rc = kernel_listen(smc->clcsock, backlog);
1007 if (rc) 1102 if (rc)
@@ -1034,6 +1129,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
1034 1129
1035 if (lsmc->sk.sk_state != SMC_LISTEN) { 1130 if (lsmc->sk.sk_state != SMC_LISTEN) {
1036 rc = -EINVAL; 1131 rc = -EINVAL;
1132 release_sock(sk);
1037 goto out; 1133 goto out;
1038 } 1134 }
1039 1135
@@ -1061,9 +1157,29 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
1061 1157
1062 if (!rc) 1158 if (!rc)
1063 rc = sock_error(nsk); 1159 rc = sock_error(nsk);
1160 release_sock(sk);
1161 if (rc)
1162 goto out;
1163
1164 if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
1165 /* wait till data arrives on the socket */
1166 timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
1167 MSEC_PER_SEC);
1168 if (smc_sk(nsk)->use_fallback) {
1169 struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
1170
1171 lock_sock(clcsk);
1172 if (skb_queue_empty(&clcsk->sk_receive_queue))
1173 sk_wait_data(clcsk, &timeo, NULL);
1174 release_sock(clcsk);
1175 } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
1176 lock_sock(nsk);
1177 smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
1178 release_sock(nsk);
1179 }
1180 }
1064 1181
1065out: 1182out:
1066 release_sock(sk);
1067 sock_put(sk); /* sock_hold above */ 1183 sock_put(sk); /* sock_hold above */
1068 return rc; 1184 return rc;
1069} 1185}
@@ -1094,6 +1210,16 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1094 (sk->sk_state != SMC_APPCLOSEWAIT1) && 1210 (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1095 (sk->sk_state != SMC_INIT)) 1211 (sk->sk_state != SMC_INIT))
1096 goto out; 1212 goto out;
1213
1214 if (msg->msg_flags & MSG_FASTOPEN) {
1215 if (sk->sk_state == SMC_INIT) {
1216 smc->use_fallback = true;
1217 } else {
1218 rc = -EINVAL;
1219 goto out;
1220 }
1221 }
1222
1097 if (smc->use_fallback) 1223 if (smc->use_fallback)
1098 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len); 1224 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1099 else 1225 else
@@ -1122,10 +1248,12 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1122 goto out; 1248 goto out;
1123 } 1249 }
1124 1250
1125 if (smc->use_fallback) 1251 if (smc->use_fallback) {
1126 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags); 1252 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1127 else 1253 } else {
1128 rc = smc_rx_recvmsg(smc, msg, len, flags); 1254 msg->msg_namelen = 0;
1255 rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
1256 }
1129 1257
1130out: 1258out:
1131 release_sock(sk); 1259 release_sock(sk);
@@ -1172,7 +1300,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
1172 if (sk->sk_state == SMC_INIT && 1300 if (sk->sk_state == SMC_INIT &&
1173 mask & EPOLLOUT && 1301 mask & EPOLLOUT &&
1174 smc->clcsock->sk->sk_state != TCP_CLOSE) { 1302 smc->clcsock->sk->sk_state != TCP_CLOSE) {
1175 rc = smc_connect_rdma(smc); 1303 rc = __smc_connect(smc);
1176 if (rc < 0) 1304 if (rc < 0)
1177 mask |= EPOLLERR; 1305 mask |= EPOLLERR;
1178 /* success cases including fallback */ 1306 /* success cases including fallback */
@@ -1208,6 +1336,8 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
1208 if (sk->sk_state == SMC_APPCLOSEWAIT1) 1336 if (sk->sk_state == SMC_APPCLOSEWAIT1)
1209 mask |= EPOLLIN; 1337 mask |= EPOLLIN;
1210 } 1338 }
1339 if (smc->conn.urg_state == SMC_URG_VALID)
1340 mask |= EPOLLPRI;
1211 1341
1212 } 1342 }
1213 release_sock(sk); 1343 release_sock(sk);
@@ -1273,14 +1403,64 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
1273{ 1403{
1274 struct sock *sk = sock->sk; 1404 struct sock *sk = sock->sk;
1275 struct smc_sock *smc; 1405 struct smc_sock *smc;
1406 int val, rc;
1276 1407
1277 smc = smc_sk(sk); 1408 smc = smc_sk(sk);
1278 1409
1279 /* generic setsockopts reaching us here always apply to the 1410 /* generic setsockopts reaching us here always apply to the
1280 * CLC socket 1411 * CLC socket
1281 */ 1412 */
1282 return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, 1413 rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1283 optval, optlen); 1414 optval, optlen);
1415 if (smc->clcsock->sk->sk_err) {
1416 sk->sk_err = smc->clcsock->sk->sk_err;
1417 sk->sk_error_report(sk);
1418 }
1419 if (rc)
1420 return rc;
1421
1422 if (optlen < sizeof(int))
1423 return -EINVAL;
1424 get_user(val, (int __user *)optval);
1425
1426 lock_sock(sk);
1427 switch (optname) {
1428 case TCP_ULP:
1429 case TCP_FASTOPEN:
1430 case TCP_FASTOPEN_CONNECT:
1431 case TCP_FASTOPEN_KEY:
1432 case TCP_FASTOPEN_NO_COOKIE:
1433 /* option not supported by SMC */
1434 if (sk->sk_state == SMC_INIT) {
1435 smc->use_fallback = true;
1436 } else {
1437 if (!smc->use_fallback)
1438 rc = -EINVAL;
1439 }
1440 break;
1441 case TCP_NODELAY:
1442 if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
1443 if (val && !smc->use_fallback)
1444 mod_delayed_work(system_wq, &smc->conn.tx_work,
1445 0);
1446 }
1447 break;
1448 case TCP_CORK:
1449 if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
1450 if (!val && !smc->use_fallback)
1451 mod_delayed_work(system_wq, &smc->conn.tx_work,
1452 0);
1453 }
1454 break;
1455 case TCP_DEFER_ACCEPT:
1456 smc->sockopt_defer_accept = val;
1457 break;
1458 default:
1459 break;
1460 }
1461 release_sock(sk);
1462
1463 return rc;
1284} 1464}
1285 1465
1286static int smc_getsockopt(struct socket *sock, int level, int optname, 1466static int smc_getsockopt(struct socket *sock, int level, int optname,
@@ -1297,13 +1477,71 @@ static int smc_getsockopt(struct socket *sock, int level, int optname,
1297static int smc_ioctl(struct socket *sock, unsigned int cmd, 1477static int smc_ioctl(struct socket *sock, unsigned int cmd,
1298 unsigned long arg) 1478 unsigned long arg)
1299{ 1479{
1480 union smc_host_cursor cons, urg;
1481 struct smc_connection *conn;
1300 struct smc_sock *smc; 1482 struct smc_sock *smc;
1483 int answ;
1301 1484
1302 smc = smc_sk(sock->sk); 1485 smc = smc_sk(sock->sk);
1303 if (smc->use_fallback) 1486 conn = &smc->conn;
1487 if (smc->use_fallback) {
1488 if (!smc->clcsock)
1489 return -EBADF;
1304 return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg); 1490 return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1305 else 1491 }
1306 return sock_no_ioctl(sock, cmd, arg); 1492 switch (cmd) {
1493 case SIOCINQ: /* same as FIONREAD */
1494 if (smc->sk.sk_state == SMC_LISTEN)
1495 return -EINVAL;
1496 if (smc->sk.sk_state == SMC_INIT ||
1497 smc->sk.sk_state == SMC_CLOSED)
1498 answ = 0;
1499 else
1500 answ = atomic_read(&smc->conn.bytes_to_rcv);
1501 break;
1502 case SIOCOUTQ:
1503 /* output queue size (not send + not acked) */
1504 if (smc->sk.sk_state == SMC_LISTEN)
1505 return -EINVAL;
1506 if (smc->sk.sk_state == SMC_INIT ||
1507 smc->sk.sk_state == SMC_CLOSED)
1508 answ = 0;
1509 else
1510 answ = smc->conn.sndbuf_desc->len -
1511 atomic_read(&smc->conn.sndbuf_space);
1512 break;
1513 case SIOCOUTQNSD:
1514 /* output queue size (not send only) */
1515 if (smc->sk.sk_state == SMC_LISTEN)
1516 return -EINVAL;
1517 if (smc->sk.sk_state == SMC_INIT ||
1518 smc->sk.sk_state == SMC_CLOSED)
1519 answ = 0;
1520 else
1521 answ = smc_tx_prepared_sends(&smc->conn);
1522 break;
1523 case SIOCATMARK:
1524 if (smc->sk.sk_state == SMC_LISTEN)
1525 return -EINVAL;
1526 if (smc->sk.sk_state == SMC_INIT ||
1527 smc->sk.sk_state == SMC_CLOSED) {
1528 answ = 0;
1529 } else {
1530 smc_curs_write(&cons,
1531 smc_curs_read(&conn->local_tx_ctrl.cons, conn),
1532 conn);
1533 smc_curs_write(&urg,
1534 smc_curs_read(&conn->urg_curs, conn),
1535 conn);
1536 answ = smc_curs_diff(conn->rmb_desc->len,
1537 &cons, &urg) == 1;
1538 }
1539 break;
1540 default:
1541 return -ENOIOCTLCMD;
1542 }
1543
1544 return put_user(answ, (int __user *)arg);
1307} 1545}
1308 1546
1309static ssize_t smc_sendpage(struct socket *sock, struct page *page, 1547static ssize_t smc_sendpage(struct socket *sock, struct page *page,
@@ -1330,9 +1568,15 @@ out:
1330 return rc; 1568 return rc;
1331} 1569}
1332 1570
1571/* Map the affected portions of the rmbe into an spd, note the number of bytes
1572 * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
1573 * updates till whenever a respective page has been fully processed.
1574 * Note that subsequent recv() calls have to wait till all splice() processing
1575 * completed.
1576 */
1333static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, 1577static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1334 struct pipe_inode_info *pipe, size_t len, 1578 struct pipe_inode_info *pipe, size_t len,
1335 unsigned int flags) 1579 unsigned int flags)
1336{ 1580{
1337 struct sock *sk = sock->sk; 1581 struct sock *sk = sock->sk;
1338 struct smc_sock *smc; 1582 struct smc_sock *smc;
@@ -1340,16 +1584,34 @@ static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1340 1584
1341 smc = smc_sk(sk); 1585 smc = smc_sk(sk);
1342 lock_sock(sk); 1586 lock_sock(sk);
1343 if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED)) 1587
1588 if (sk->sk_state == SMC_INIT ||
1589 sk->sk_state == SMC_LISTEN ||
1590 sk->sk_state == SMC_CLOSED)
1591 goto out;
1592
1593 if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1594 rc = 0;
1344 goto out; 1595 goto out;
1596 }
1597
1345 if (smc->use_fallback) { 1598 if (smc->use_fallback) {
1346 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos, 1599 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1347 pipe, len, flags); 1600 pipe, len, flags);
1348 } else { 1601 } else {
1349 rc = -EOPNOTSUPP; 1602 if (*ppos) {
1603 rc = -ESPIPE;
1604 goto out;
1605 }
1606 if (flags & SPLICE_F_NONBLOCK)
1607 flags = MSG_DONTWAIT;
1608 else
1609 flags = 0;
1610 rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
1350 } 1611 }
1351out: 1612out:
1352 release_sock(sk); 1613 release_sock(sk);
1614
1353 return rc; 1615 return rc;
1354} 1616}
1355 1617
@@ -1482,18 +1744,7 @@ out_pnet:
1482 1744
1483static void __exit smc_exit(void) 1745static void __exit smc_exit(void)
1484{ 1746{
1485 struct smc_link_group *lgr, *lg; 1747 smc_core_exit();
1486 LIST_HEAD(lgr_freeing_list);
1487
1488 spin_lock_bh(&smc_lgr_list.lock);
1489 if (!list_empty(&smc_lgr_list.list))
1490 list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
1491 spin_unlock_bh(&smc_lgr_list.lock);
1492 list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
1493 list_del_init(&lgr->list);
1494 cancel_delayed_work_sync(&lgr->free_work);
1495 smc_lgr_free(lgr); /* free link group */
1496 }
1497 static_branch_disable(&tcp_have_smc); 1748 static_branch_disable(&tcp_have_smc);
1498 smc_ib_unregister_client(); 1749 smc_ib_unregister_client();
1499 sock_unregister(PF_SMC); 1750 sock_unregister(PF_SMC);
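With the af_smc.c changes above, an AF_SMC socket honours TCP_DEFER_ACCEPT via smc->sockopt_defer_accept and answers SIOCINQ/FIONREAD from conn.bytes_to_rcv. A minimal user-space sketch of the new behaviour (illustration only, not part of the patch; the AF_SMC/SMCPROTO_SMC fallbacks and the port are assumptions, error handling trimmed):

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef AF_SMC
#define AF_SMC 43               /* protocol family number from linux/socket.h */
#endif
#define SMCPROTO_SMC 0          /* SMC over IPv4, as used by af_smc.c */

int main(void)
{
        struct sockaddr_in addr = { .sin_family = AF_INET,
                                    .sin_port = htons(12345) };
        int defer = 2, avail = 0;
        int lsk, csk;

        lsk = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);
        if (lsk < 0)
                return 1;
        /* stored by smc_setsockopt() in smc->sockopt_defer_accept */
        setsockopt(lsk, IPPROTO_TCP, TCP_DEFER_ACCEPT, &defer, sizeof(defer));
        if (bind(lsk, (struct sockaddr *)&addr, sizeof(addr)) || listen(lsk, 10))
                return 1;

        /* smc_accept() now delays the return until data has arrived,
         * waiting at most 'defer' seconds
         */
        csk = accept(lsk, NULL, NULL);
        if (csk < 0)
                return 1;
        /* SIOCINQ (same as FIONREAD) is answered from conn.bytes_to_rcv */
        ioctl(csk, SIOCINQ, &avail);
        printf("bytes ready to read: %d\n", avail);
        close(csk);
        close(lsk);
        return 0;
}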
diff --git a/net/smc/smc.h b/net/smc/smc.h
index e4829a2f46ba..51ae1f10d81a 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -114,11 +114,17 @@ struct smc_host_cdc_msg { /* Connection Data Control message */
114 u8 reserved[18]; 114 u8 reserved[18];
115} __aligned(8); 115} __aligned(8);
116 116
117enum smc_urg_state {
118 SMC_URG_VALID, /* data present */
119 SMC_URG_NOTYET, /* data pending */
120 SMC_URG_READ /* data was already read */
121};
122
117struct smc_connection { 123struct smc_connection {
118 struct rb_node alert_node; 124 struct rb_node alert_node;
119 struct smc_link_group *lgr; /* link group of connection */ 125 struct smc_link_group *lgr; /* link group of connection */
120 u32 alert_token_local; /* unique conn. id */ 126 u32 alert_token_local; /* unique conn. id */
121 u8 peer_conn_idx; /* from tcp handshake */ 127 u8 peer_rmbe_idx; /* from tcp handshake */
122 int peer_rmbe_size; /* size of peer rx buffer */ 128 int peer_rmbe_size; /* size of peer rx buffer */
123 atomic_t peer_rmbe_space;/* remaining free bytes in peer 129 atomic_t peer_rmbe_space;/* remaining free bytes in peer
124 * rmbe 130 * rmbe
@@ -126,9 +132,7 @@ struct smc_connection {
126 int rtoken_idx; /* idx to peer RMB rkey/addr */ 132 int rtoken_idx; /* idx to peer RMB rkey/addr */
127 133
128 struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */ 134 struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */
129 int sndbuf_size; /* sndbuf size <== sock wmem */
130 struct smc_buf_desc *rmb_desc; /* RMBE descriptor */ 135 struct smc_buf_desc *rmb_desc; /* RMBE descriptor */
131 int rmbe_size; /* RMBE size <== sock rmem */
132 int rmbe_size_short;/* compressed notation */ 136 int rmbe_size_short;/* compressed notation */
133 int rmbe_update_limit; 137 int rmbe_update_limit;
134 /* lower limit for consumer 138 /* lower limit for consumer
@@ -153,6 +157,7 @@ struct smc_connection {
153 u16 tx_cdc_seq; /* sequence # for CDC send */ 157 u16 tx_cdc_seq; /* sequence # for CDC send */
154 spinlock_t send_lock; /* protect wr_sends */ 158 spinlock_t send_lock; /* protect wr_sends */
155 struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ 159 struct delayed_work tx_work; /* retry of smc_cdc_msg_send */
160 u32 tx_off; /* base offset in peer rmb */
156 161
157 struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl. 162 struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl.
158 * .prod cf. TCP rcv_nxt 163 * .prod cf. TCP rcv_nxt
@@ -161,9 +166,21 @@ struct smc_connection {
161 union smc_host_cursor rx_curs_confirmed; /* confirmed to peer 166 union smc_host_cursor rx_curs_confirmed; /* confirmed to peer
162 * source of snd_una ? 167 * source of snd_una ?
163 */ 168 */
169 union smc_host_cursor urg_curs; /* points at urgent byte */
170 enum smc_urg_state urg_state;
171 bool urg_tx_pend; /* urgent data staged */
172 bool urg_rx_skip_pend;
173 /* indicate urgent oob data
174 * read, but previous regular
175 * data still pending
176 */
177 char urg_rx_byte; /* urgent byte */
164 atomic_t bytes_to_rcv; /* arrived data, 178 atomic_t bytes_to_rcv; /* arrived data,
165 * not yet received 179 * not yet received
166 */ 180 */
181 atomic_t splice_pending; /* number of spliced bytes
182 * pending processing
183 */
167#ifndef KERNEL_HAS_ATOMIC64 184#ifndef KERNEL_HAS_ATOMIC64
168 spinlock_t acurs_lock; /* protect cursors */ 185 spinlock_t acurs_lock; /* protect cursors */
169#endif 186#endif
@@ -180,6 +197,10 @@ struct smc_sock { /* smc sock container */
180 struct list_head accept_q; /* sockets to be accepted */ 197 struct list_head accept_q; /* sockets to be accepted */
181 spinlock_t accept_q_lock; /* protects accept_q */ 198 spinlock_t accept_q_lock; /* protects accept_q */
182 bool use_fallback; /* fallback to tcp */ 199 bool use_fallback; /* fallback to tcp */
200 int sockopt_defer_accept;
201 /* sockopt TCP_DEFER_ACCEPT
202 * value
203 */
183 u8 wait_close_tx_prepared : 1; 204 u8 wait_close_tx_prepared : 1;
184 /* shutdown wr or close 205 /* shutdown wr or close
185 * started, waiting for unsent 206 * started, waiting for unsent
@@ -214,41 +235,6 @@ static inline u32 ntoh24(u8 *net)
214 return be32_to_cpu(t); 235 return be32_to_cpu(t);
215} 236}
216 237
217#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */
218
219#define SMC_RMBE_SIZES 16 /* number of distinct sizes for an RMBE */
220/* theoretically, the RFC states that largest size would be 512K,
221 * i.e. compressed 5 and thus 6 sizes (0..5), despite
222 * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
223 */
224
225/* convert the RMB size into the compressed notation - minimum 16K.
226 * In contrast to plain ilog2, this rounds towards the next power of 2,
227 * so the socket application gets at least its desired sndbuf / rcvbuf size.
228 */
229static inline u8 smc_compress_bufsize(int size)
230{
231 u8 compressed;
232
233 if (size <= SMC_BUF_MIN_SIZE)
234 return 0;
235
236 size = (size - 1) >> 14;
237 compressed = ilog2(size) + 1;
238 if (compressed >= SMC_RMBE_SIZES)
239 compressed = SMC_RMBE_SIZES - 1;
240 return compressed;
241}
242
243/* convert the RMB size from compressed notation into integer */
244static inline int smc_uncompress_bufsize(u8 compressed)
245{
246 u32 size;
247
248 size = 0x00000001 << (((int)compressed) + 14);
249 return (int)size;
250}
251
252#ifdef CONFIG_XFRM 238#ifdef CONFIG_XFRM
253static inline bool using_ipsec(struct smc_sock *smc) 239static inline bool using_ipsec(struct smc_sock *smc)
254{ 240{
@@ -262,12 +248,6 @@ static inline bool using_ipsec(struct smc_sock *smc)
262} 248}
263#endif 249#endif
264 250
265struct smc_clc_msg_local;
266
267void smc_conn_free(struct smc_connection *conn);
268int smc_conn_create(struct smc_sock *smc,
269 struct smc_ib_device *smcibdev, u8 ibport,
270 struct smc_clc_msg_local *lcl, int srv_first_contact);
271struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock); 251struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock);
272void smc_close_non_accepted(struct sock *sk); 252void smc_close_non_accepted(struct sock *sk);
273 253
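The urg_* members added to struct smc_connection back classic out-of-band data: smc_cdc.c latches the urgent byte and flips urg_state to SMC_URG_VALID, smc_poll() reports it as EPOLLPRI, and SIOCATMARK/MSG_OOB are served from urg_curs and urg_rx_byte. A user-space receiver would consume it the usual BSD way (hedged sketch only; nothing here is SMC-specific beyond the socket itself):

#include <poll.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/sockios.h>

/* Receive one urgent byte on a connected stream socket 'fd'.  POLLPRI is
 * raised when conn.urg_state becomes SMC_URG_VALID, SIOCATMARK is answered
 * from urg_curs vs. the consumer cursor, and recv(MSG_OOB) returns
 * conn.urg_rx_byte (SO_OOBINLINE not set).
 */
static int read_urgent_byte(int fd, char *oob)
{
        struct pollfd pfd = { .fd = fd, .events = POLLPRI };
        char buf[512];
        int at_mark = 0;

        if (poll(&pfd, 1, -1) <= 0 || !(pfd.revents & POLLPRI))
                return -1;

        /* drain ordinary data until the read pointer sits at the mark */
        for (;;) {
                if (ioctl(fd, SIOCATMARK, &at_mark) < 0)
                        return -1;
                if (at_mark)
                        break;
                if (recv(fd, buf, sizeof(buf), 0) <= 0)
                        return -1;
        }
        /* the urgent byte itself */
        return recv(fd, oob, 1, MSG_OOB) == 1 ? 0 : -1;
}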
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index b42395d24cba..a7e8d63fc8ae 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -44,13 +44,13 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
44 smc = container_of(cdcpend->conn, struct smc_sock, conn); 44 smc = container_of(cdcpend->conn, struct smc_sock, conn);
45 bh_lock_sock(&smc->sk); 45 bh_lock_sock(&smc->sk);
46 if (!wc_status) { 46 if (!wc_status) {
47 diff = smc_curs_diff(cdcpend->conn->sndbuf_size, 47 diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len,
48 &cdcpend->conn->tx_curs_fin, 48 &cdcpend->conn->tx_curs_fin,
49 &cdcpend->cursor); 49 &cdcpend->cursor);
50 /* sndbuf_space is decreased in smc_sendmsg */ 50 /* sndbuf_space is decreased in smc_sendmsg */
51 smp_mb__before_atomic(); 51 smp_mb__before_atomic();
52 atomic_add(diff, &cdcpend->conn->sndbuf_space); 52 atomic_add(diff, &cdcpend->conn->sndbuf_space);
53 /* guarantee 0 <= sndbuf_space <= sndbuf_size */ 53 /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
54 smp_mb__after_atomic(); 54 smp_mb__after_atomic();
55 smc_curs_write(&cdcpend->conn->tx_curs_fin, 55 smc_curs_write(&cdcpend->conn->tx_curs_fin,
56 smc_curs_read(&cdcpend->cursor, cdcpend->conn), 56 smc_curs_read(&cdcpend->cursor, cdcpend->conn),
@@ -82,7 +82,7 @@ static inline void smc_cdc_add_pending_send(struct smc_connection *conn,
82 sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE, 82 sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE,
83 "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)"); 83 "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)");
84 BUILD_BUG_ON_MSG( 84 BUILD_BUG_ON_MSG(
85 offsetof(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE, 85 sizeof(struct smc_cdc_msg) != SMC_WR_TX_SIZE,
86 "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()"); 86 "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
87 BUILD_BUG_ON_MSG( 87 BUILD_BUG_ON_MSG(
88 sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE, 88 sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
@@ -164,20 +164,35 @@ static inline bool smc_cdc_before(u16 seq1, u16 seq2)
164 return (s16)(seq1 - seq2) < 0; 164 return (s16)(seq1 - seq2) < 0;
165} 165}
166 166
167static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc,
168 int *diff_prod)
169{
170 struct smc_connection *conn = &smc->conn;
171 char *base;
172
173 /* new data included urgent business */
174 smc_curs_write(&conn->urg_curs,
175 smc_curs_read(&conn->local_rx_ctrl.prod, conn),
176 conn);
177 conn->urg_state = SMC_URG_VALID;
178 if (!sock_flag(&smc->sk, SOCK_URGINLINE))
179 /* we'll skip the urgent byte, so don't account for it */
180 (*diff_prod)--;
181 base = (char *)conn->rmb_desc->cpu_addr;
182 if (conn->urg_curs.count)
183 conn->urg_rx_byte = *(base + conn->urg_curs.count - 1);
184 else
185 conn->urg_rx_byte = *(base + conn->rmb_desc->len - 1);
186 sk_send_sigurg(&smc->sk);
187}
188
167static void smc_cdc_msg_recv_action(struct smc_sock *smc, 189static void smc_cdc_msg_recv_action(struct smc_sock *smc,
168 struct smc_link *link,
169 struct smc_cdc_msg *cdc) 190 struct smc_cdc_msg *cdc)
170{ 191{
171 union smc_host_cursor cons_old, prod_old; 192 union smc_host_cursor cons_old, prod_old;
172 struct smc_connection *conn = &smc->conn; 193 struct smc_connection *conn = &smc->conn;
173 int diff_cons, diff_prod; 194 int diff_cons, diff_prod;
174 195
175 if (!cdc->prod_flags.failover_validation) {
176 if (smc_cdc_before(ntohs(cdc->seqno),
177 conn->local_rx_ctrl.seqno))
178 /* received seqno is old */
179 return;
180 }
181 smc_curs_write(&prod_old, 196 smc_curs_write(&prod_old,
182 smc_curs_read(&conn->local_rx_ctrl.prod, conn), 197 smc_curs_read(&conn->local_rx_ctrl.prod, conn),
183 conn); 198 conn);
@@ -198,18 +213,28 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
198 smp_mb__after_atomic(); 213 smp_mb__after_atomic();
199 } 214 }
200 215
201 diff_prod = smc_curs_diff(conn->rmbe_size, &prod_old, 216 diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old,
202 &conn->local_rx_ctrl.prod); 217 &conn->local_rx_ctrl.prod);
203 if (diff_prod) { 218 if (diff_prod) {
219 if (conn->local_rx_ctrl.prod_flags.urg_data_present)
220 smc_cdc_handle_urg_data_arrival(smc, &diff_prod);
204 /* bytes_to_rcv is decreased in smc_recvmsg */ 221 /* bytes_to_rcv is decreased in smc_recvmsg */
205 smp_mb__before_atomic(); 222 smp_mb__before_atomic();
206 atomic_add(diff_prod, &conn->bytes_to_rcv); 223 atomic_add(diff_prod, &conn->bytes_to_rcv);
207 /* guarantee 0 <= bytes_to_rcv <= rmbe_size */ 224 /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */
208 smp_mb__after_atomic(); 225 smp_mb__after_atomic();
209 smc->sk.sk_data_ready(&smc->sk); 226 smc->sk.sk_data_ready(&smc->sk);
210 } else if ((conn->local_rx_ctrl.prod_flags.write_blocked) || 227 } else {
211 (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req)) { 228 if (conn->local_rx_ctrl.prod_flags.write_blocked ||
212 smc->sk.sk_data_ready(&smc->sk); 229 conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
230 conn->local_rx_ctrl.prod_flags.urg_data_pending) {
231 if (conn->local_rx_ctrl.prod_flags.urg_data_pending)
232 conn->urg_state = SMC_URG_NOTYET;
233 /* force immediate tx of current consumer cursor, but
234 * under send_lock to guarantee arrival in seqno-order
235 */
236 smc_tx_sndbuf_nonempty(conn);
237 }
213 } 238 }
214 239
215 /* piggy backed tx info */ 240 /* piggy backed tx info */
@@ -219,6 +244,12 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
219 /* trigger socket release if connection closed */ 244 /* trigger socket release if connection closed */
220 smc_close_wake_tx_prepared(smc); 245 smc_close_wake_tx_prepared(smc);
221 } 246 }
247 if (diff_cons && conn->urg_tx_pend &&
248 atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) {
249 /* urg data confirmed by peer, indicate we're ready for more */
250 conn->urg_tx_pend = false;
251 smc->sk.sk_write_space(&smc->sk);
252 }
222 253
223 if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { 254 if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
224 smc->sk.sk_err = ECONNRESET; 255 smc->sk.sk_err = ECONNRESET;
@@ -236,26 +267,11 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
236} 267}
237 268
238/* called under tasklet context */ 269/* called under tasklet context */
239static inline void smc_cdc_msg_recv(struct smc_cdc_msg *cdc, 270static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc)
240 struct smc_link *link, u64 wr_id)
241{ 271{
242 struct smc_link_group *lgr = container_of(link, struct smc_link_group,
243 lnk[SMC_SINGLE_LINK]);
244 struct smc_connection *connection;
245 struct smc_sock *smc;
246
247 /* lookup connection */
248 read_lock_bh(&lgr->conns_lock);
249 connection = smc_lgr_find_conn(ntohl(cdc->token), lgr);
250 if (!connection) {
251 read_unlock_bh(&lgr->conns_lock);
252 return;
253 }
254 smc = container_of(connection, struct smc_sock, conn);
255 sock_hold(&smc->sk); 272 sock_hold(&smc->sk);
256 read_unlock_bh(&lgr->conns_lock);
257 bh_lock_sock(&smc->sk); 273 bh_lock_sock(&smc->sk);
258 smc_cdc_msg_recv_action(smc, link, cdc); 274 smc_cdc_msg_recv_action(smc, cdc);
259 bh_unlock_sock(&smc->sk); 275 bh_unlock_sock(&smc->sk);
260 sock_put(&smc->sk); /* no free sk in softirq-context */ 276 sock_put(&smc->sk); /* no free sk in softirq-context */
261} 277}
@@ -266,12 +282,31 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
266{ 282{
267 struct smc_link *link = (struct smc_link *)wc->qp->qp_context; 283 struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
268 struct smc_cdc_msg *cdc = buf; 284 struct smc_cdc_msg *cdc = buf;
285 struct smc_connection *conn;
286 struct smc_link_group *lgr;
287 struct smc_sock *smc;
269 288
270 if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved)) 289 if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved))
271 return; /* short message */ 290 return; /* short message */
272 if (cdc->len != SMC_WR_TX_SIZE) 291 if (cdc->len != SMC_WR_TX_SIZE)
273 return; /* invalid message */ 292 return; /* invalid message */
274 smc_cdc_msg_recv(cdc, link, wc->wr_id); 293
294 /* lookup connection */
295 lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
296 read_lock_bh(&lgr->conns_lock);
297 conn = smc_lgr_find_conn(ntohl(cdc->token), lgr);
298 read_unlock_bh(&lgr->conns_lock);
299 if (!conn)
300 return;
301 smc = container_of(conn, struct smc_sock, conn);
302
303 if (!cdc->prod_flags.failover_validation) {
304 if (smc_cdc_before(ntohs(cdc->seqno),
305 conn->local_rx_ctrl.seqno))
306 /* received seqno is old */
307 return;
308 }
309 smc_cdc_msg_recv(smc, cdc);
275} 310}
276 311
277static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = { 312static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = {
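The urgent-byte lookup in smc_cdc_handle_urg_data_arrival() above is plain ring-buffer arithmetic: the producer cursor points one past the byte just written, so the urgent byte lives at count - 1, wrapping to the last byte of the RMBE when count is 0. A tiny stand-alone model of that index math (illustration only, not kernel code):

#include <assert.h>
#include <stddef.h>

/* 'count' is the producer cursor offset into an RMBE of 'len' bytes,
 * pointing one past the byte that was just written.
 */
static size_t urg_byte_offset(size_t count, size_t len)
{
        return count ? count - 1 : len - 1;
}

int main(void)
{
        assert(urg_byte_offset(1, 16384) == 0);       /* first byte of the buffer */
        assert(urg_byte_offset(0, 16384) == 16383);   /* cursor wrapped: last byte */
        assert(urg_byte_offset(4096, 16384) == 4095);
        return 0;
}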
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index ab240b37ad11..f60082fee5b8 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -48,7 +48,7 @@ struct smc_cdc_msg {
48 struct smc_cdc_producer_flags prod_flags; 48 struct smc_cdc_producer_flags prod_flags;
49 struct smc_cdc_conn_state_flags conn_state_flags; 49 struct smc_cdc_conn_state_flags conn_state_flags;
50 u8 reserved[18]; 50 u8 reserved[18];
51} __aligned(8); 51} __packed; /* format defined in RFC7609 */
52 52
53static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn) 53static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn)
54{ 54{
@@ -146,6 +146,19 @@ static inline int smc_curs_diff(unsigned int size,
146 return max_t(int, 0, (new->count - old->count)); 146 return max_t(int, 0, (new->count - old->count));
147} 147}
148 148
149/* calculate cursor difference between old and new - returns negative
150 * value in case old > new
151 */
152static inline int smc_curs_comp(unsigned int size,
153 union smc_host_cursor *old,
154 union smc_host_cursor *new)
155{
156 if (old->wrap > new->wrap ||
157 (old->wrap == new->wrap && old->count > new->count))
158 return -smc_curs_diff(size, new, old);
159 return smc_curs_diff(size, old, new);
160}
161
149static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer, 162static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer,
150 union smc_host_cursor *local, 163 union smc_host_cursor *local,
151 struct smc_connection *conn) 164 struct smc_connection *conn)
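smc_curs_comp() extends smc_curs_diff() with a sign: it goes negative when the 'old' cursor is already ahead of 'new'. A simplified user-space model of both helpers for cursors at most one wrap apart (the real smc_curs_diff() lives earlier in this header; this is an illustration, not the kernel code):

#include <assert.h>

/* A cursor into a ring of 'size' bytes is (wrap, count); wrap increments
 * each time the producer laps the buffer.
 */
struct cursor { unsigned int wrap; unsigned int count; };

static int curs_diff(unsigned int size, struct cursor old, struct cursor new)
{
        if (old.wrap != new.wrap)               /* producer wrapped once */
                return size - old.count + new.count;
        return new.count > old.count ? new.count - old.count : 0;
}

/* like the new smc_curs_comp(): negative when old lies ahead of new */
static int curs_comp(unsigned int size, struct cursor old, struct cursor new)
{
        if (old.wrap > new.wrap ||
            (old.wrap == new.wrap && old.count > new.count))
                return -curs_diff(size, new, old);
        return curs_diff(size, old, new);
}

int main(void)
{
        struct cursor a = { .wrap = 3, .count = 100 };
        struct cursor b = { .wrap = 3, .count = 160 };
        struct cursor c = { .wrap = 4, .count =  10 };

        assert(curs_diff(16384, a, b) == 60);                /* same lap         */
        assert(curs_diff(16384, b, c) == 16384 - 160 + 10);  /* across the wrap  */
        assert(curs_comp(16384, b, a) == -60);               /* old ahead of new */
        return 0;
}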
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 3a988c22f627..717449b1da0b 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -316,7 +316,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
316 if (clcm->type == SMC_CLC_DECLINE) { 316 if (clcm->type == SMC_CLC_DECLINE) {
317 reason_code = SMC_CLC_DECL_REPLY; 317 reason_code = SMC_CLC_DECL_REPLY;
318 if (((struct smc_clc_msg_decline *)buf)->hdr.flag) { 318 if (((struct smc_clc_msg_decline *)buf)->hdr.flag) {
319 smc->conn.lgr->sync_err = true; 319 smc->conn.lgr->sync_err = 1;
320 smc_lgr_terminate(smc->conn.lgr); 320 smc_lgr_terminate(smc->conn.lgr);
321 } 321 }
322 } 322 }
@@ -442,7 +442,7 @@ int smc_clc_send_confirm(struct smc_sock *smc)
442 hton24(cclc.qpn, link->roce_qp->qp_num); 442 hton24(cclc.qpn, link->roce_qp->qp_num);
443 cclc.rmb_rkey = 443 cclc.rmb_rkey =
444 htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); 444 htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
445 cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */ 445 cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */
446 cclc.rmbe_alert_token = htonl(conn->alert_token_local); 446 cclc.rmbe_alert_token = htonl(conn->alert_token_local);
447 cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); 447 cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
448 cclc.rmbe_size = conn->rmbe_size_short; 448 cclc.rmbe_size = conn->rmbe_size_short;
@@ -494,7 +494,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
494 hton24(aclc.qpn, link->roce_qp->qp_num); 494 hton24(aclc.qpn, link->roce_qp->qp_num);
495 aclc.rmb_rkey = 495 aclc.rmb_rkey =
496 htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); 496 htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
497 aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */ 497 aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */
498 aclc.rmbe_alert_token = htonl(conn->alert_token_local); 498 aclc.rmbe_alert_token = htonl(conn->alert_token_local);
499 aclc.qp_mtu = link->path_mtu; 499 aclc.qp_mtu = link->path_mtu;
500 aclc.rmbe_size = conn->rmbe_size_short, 500 aclc.rmbe_size = conn->rmbe_size_short,
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 63bf1dc2c1f9..41ff9ea96139 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -97,7 +97,7 @@ struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
97 struct smc_clc_msg_local lcl; 97 struct smc_clc_msg_local lcl;
98 u8 qpn[3]; /* QP number */ 98 u8 qpn[3]; /* QP number */
99 __be32 rmb_rkey; /* RMB rkey */ 99 __be32 rmb_rkey; /* RMB rkey */
100 u8 conn_idx; /* Connection index, which RMBE in RMB */ 100 u8 rmbe_idx; /* Index of RMBE in RMB */
101 __be32 rmbe_alert_token;/* unique connection id */ 101 __be32 rmbe_alert_token;/* unique connection id */
102#if defined(__BIG_ENDIAN_BITFIELD) 102#if defined(__BIG_ENDIAN_BITFIELD)
103 u8 rmbe_size : 4, /* RMBE buf size (compressed notation) */ 103 u8 rmbe_size : 4, /* RMBE buf size (compressed notation) */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index d4bd01bb44e1..add82b0266f3 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -28,12 +28,16 @@
28 28
29#define SMC_LGR_NUM_INCR 256 29#define SMC_LGR_NUM_INCR 256
30#define SMC_LGR_FREE_DELAY_SERV (600 * HZ) 30#define SMC_LGR_FREE_DELAY_SERV (600 * HZ)
31#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10) 31#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
32 32
33static u32 smc_lgr_num; /* unique link group number */ 33static struct smc_lgr_list smc_lgr_list = { /* established link groups */
34 .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
35 .list = LIST_HEAD_INIT(smc_lgr_list.list),
36 .num = 0,
37};
34 38
35static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk, 39static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
36 bool is_rmb); 40 struct smc_buf_desc *buf_desc);
37 41
38static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) 42static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
39{ 43{
@@ -148,8 +152,11 @@ static void smc_lgr_free_work(struct work_struct *work)
148 list_del_init(&lgr->list); /* remove from smc_lgr_list */ 152 list_del_init(&lgr->list); /* remove from smc_lgr_list */
149free: 153free:
150 spin_unlock_bh(&smc_lgr_list.lock); 154 spin_unlock_bh(&smc_lgr_list.lock);
151 if (!delayed_work_pending(&lgr->free_work)) 155 if (!delayed_work_pending(&lgr->free_work)) {
156 if (lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE)
157 smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
152 smc_lgr_free(lgr); 158 smc_lgr_free(lgr);
159 }
153} 160}
154 161
155/* create a new SMC link group */ 162/* create a new SMC link group */
@@ -169,7 +176,7 @@ static int smc_lgr_create(struct smc_sock *smc,
169 goto out; 176 goto out;
170 } 177 }
171 lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; 178 lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
172 lgr->sync_err = false; 179 lgr->sync_err = 0;
173 memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); 180 memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
174 lgr->vlan_id = vlan_id; 181 lgr->vlan_id = vlan_id;
175 rwlock_init(&lgr->sndbufs_lock); 182 rwlock_init(&lgr->sndbufs_lock);
@@ -178,8 +185,8 @@ static int smc_lgr_create(struct smc_sock *smc,
178 INIT_LIST_HEAD(&lgr->sndbufs[i]); 185 INIT_LIST_HEAD(&lgr->sndbufs[i]);
179 INIT_LIST_HEAD(&lgr->rmbs[i]); 186 INIT_LIST_HEAD(&lgr->rmbs[i]);
180 } 187 }
181 smc_lgr_num += SMC_LGR_NUM_INCR; 188 smc_lgr_list.num += SMC_LGR_NUM_INCR;
182 memcpy(&lgr->id, (u8 *)&smc_lgr_num, SMC_LGR_ID_SIZE); 189 memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
183 INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); 190 INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
184 lgr->conns_all = RB_ROOT; 191 lgr->conns_all = RB_ROOT;
185 192
@@ -194,9 +201,12 @@ static int smc_lgr_create(struct smc_sock *smc,
194 smc_ib_setup_per_ibdev(smcibdev); 201 smc_ib_setup_per_ibdev(smcibdev);
195 get_random_bytes(rndvec, sizeof(rndvec)); 202 get_random_bytes(rndvec, sizeof(rndvec));
196 lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); 203 lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
197 rc = smc_wr_alloc_link_mem(lnk); 204 rc = smc_llc_link_init(lnk);
198 if (rc) 205 if (rc)
199 goto free_lgr; 206 goto free_lgr;
207 rc = smc_wr_alloc_link_mem(lnk);
208 if (rc)
209 goto clear_llc_lnk;
200 rc = smc_ib_create_protection_domain(lnk); 210 rc = smc_ib_create_protection_domain(lnk);
201 if (rc) 211 if (rc)
202 goto free_link_mem; 212 goto free_link_mem;
@@ -206,10 +216,6 @@ static int smc_lgr_create(struct smc_sock *smc,
206 rc = smc_wr_create_link(lnk); 216 rc = smc_wr_create_link(lnk);
207 if (rc) 217 if (rc)
208 goto destroy_qp; 218 goto destroy_qp;
209 init_completion(&lnk->llc_confirm);
210 init_completion(&lnk->llc_confirm_resp);
211 init_completion(&lnk->llc_add);
212 init_completion(&lnk->llc_add_resp);
213 219
214 smc->conn.lgr = lgr; 220 smc->conn.lgr = lgr;
215 rwlock_init(&lgr->conns_lock); 221 rwlock_init(&lgr->conns_lock);
@@ -224,6 +230,8 @@ dealloc_pd:
224 smc_ib_dealloc_protection_domain(lnk); 230 smc_ib_dealloc_protection_domain(lnk);
225free_link_mem: 231free_link_mem:
226 smc_wr_free_link_mem(lnk); 232 smc_wr_free_link_mem(lnk);
233clear_llc_lnk:
234 smc_llc_link_clear(lnk);
227free_lgr: 235free_lgr:
228 kfree(lgr); 236 kfree(lgr);
229out: 237out:
@@ -232,26 +240,21 @@ out:
232 240
233static void smc_buf_unuse(struct smc_connection *conn) 241static void smc_buf_unuse(struct smc_connection *conn)
234{ 242{
235 if (conn->sndbuf_desc) { 243 if (conn->sndbuf_desc)
236 conn->sndbuf_desc->used = 0; 244 conn->sndbuf_desc->used = 0;
237 conn->sndbuf_size = 0;
238 }
239 if (conn->rmb_desc) { 245 if (conn->rmb_desc) {
240 if (!conn->rmb_desc->regerr) { 246 if (!conn->rmb_desc->regerr) {
241 conn->rmb_desc->reused = 1; 247 conn->rmb_desc->reused = 1;
242 conn->rmb_desc->used = 0; 248 conn->rmb_desc->used = 0;
243 conn->rmbe_size = 0;
244 } else { 249 } else {
245 /* buf registration failed, reuse not possible */ 250 /* buf registration failed, reuse not possible */
246 struct smc_link_group *lgr = conn->lgr; 251 struct smc_link_group *lgr = conn->lgr;
247 struct smc_link *lnk;
248 252
249 write_lock_bh(&lgr->rmbs_lock); 253 write_lock_bh(&lgr->rmbs_lock);
250 list_del(&conn->rmb_desc->list); 254 list_del(&conn->rmb_desc->list);
251 write_unlock_bh(&lgr->rmbs_lock); 255 write_unlock_bh(&lgr->rmbs_lock);
252 256
253 lnk = &lgr->lnk[SMC_SINGLE_LINK]; 257 smc_buf_free(lgr, true, conn->rmb_desc);
254 smc_buf_free(conn->rmb_desc, lnk, true);
255 } 258 }
256 } 259 }
257} 260}
@@ -269,6 +272,7 @@ void smc_conn_free(struct smc_connection *conn)
269static void smc_link_clear(struct smc_link *lnk) 272static void smc_link_clear(struct smc_link *lnk)
270{ 273{
271 lnk->peer_qpn = 0; 274 lnk->peer_qpn = 0;
275 smc_llc_link_clear(lnk);
272 smc_ib_modify_qp_reset(lnk); 276 smc_ib_modify_qp_reset(lnk);
273 smc_wr_free_link(lnk); 277 smc_wr_free_link(lnk);
274 smc_ib_destroy_queue_pair(lnk); 278 smc_ib_destroy_queue_pair(lnk);
@@ -276,9 +280,11 @@ static void smc_link_clear(struct smc_link *lnk)
276 smc_wr_free_link_mem(lnk); 280 smc_wr_free_link_mem(lnk);
277} 281}
278 282
279static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk, 283static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
280 bool is_rmb) 284 struct smc_buf_desc *buf_desc)
281{ 285{
286 struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
287
282 if (is_rmb) { 288 if (is_rmb) {
283 if (buf_desc->mr_rx[SMC_SINGLE_LINK]) 289 if (buf_desc->mr_rx[SMC_SINGLE_LINK])
284 smc_ib_put_memory_region( 290 smc_ib_put_memory_region(
@@ -290,14 +296,13 @@ static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk,
290 DMA_TO_DEVICE); 296 DMA_TO_DEVICE);
291 } 297 }
292 sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]); 298 sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]);
293 if (buf_desc->cpu_addr) 299 if (buf_desc->pages)
294 free_pages((unsigned long)buf_desc->cpu_addr, buf_desc->order); 300 __free_pages(buf_desc->pages, buf_desc->order);
295 kfree(buf_desc); 301 kfree(buf_desc);
296} 302}
297 303
298static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) 304static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
299{ 305{
300 struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
301 struct smc_buf_desc *buf_desc, *bf_desc; 306 struct smc_buf_desc *buf_desc, *bf_desc;
302 struct list_head *buf_list; 307 struct list_head *buf_list;
303 int i; 308 int i;
@@ -310,7 +315,7 @@ static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
310 list_for_each_entry_safe(buf_desc, bf_desc, buf_list, 315 list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
311 list) { 316 list) {
312 list_del(&buf_desc->list); 317 list_del(&buf_desc->list);
313 smc_buf_free(buf_desc, lnk, is_rmb); 318 smc_buf_free(lgr, is_rmb, buf_desc);
314 } 319 }
315 } 320 }
316} 321}
@@ -341,13 +346,18 @@ void smc_lgr_forget(struct smc_link_group *lgr)
341} 346}
342 347
343/* terminate linkgroup abnormally */ 348/* terminate linkgroup abnormally */
344void smc_lgr_terminate(struct smc_link_group *lgr) 349static void __smc_lgr_terminate(struct smc_link_group *lgr)
345{ 350{
346 struct smc_connection *conn; 351 struct smc_connection *conn;
347 struct smc_sock *smc; 352 struct smc_sock *smc;
348 struct rb_node *node; 353 struct rb_node *node;
349 354
350 smc_lgr_forget(lgr); 355 if (lgr->terminating)
356 return; /* lgr already terminating */
357 lgr->terminating = 1;
358 if (!list_empty(&lgr->list)) /* forget lgr */
359 list_del_init(&lgr->list);
360 smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
351 361
352 write_lock_bh(&lgr->conns_lock); 362 write_lock_bh(&lgr->conns_lock);
353 node = rb_first(&lgr->conns_all); 363 node = rb_first(&lgr->conns_all);
@@ -368,13 +378,35 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
368 smc_lgr_schedule_free_work(lgr); 378 smc_lgr_schedule_free_work(lgr);
369} 379}
370 380
381void smc_lgr_terminate(struct smc_link_group *lgr)
382{
383 spin_lock_bh(&smc_lgr_list.lock);
384 __smc_lgr_terminate(lgr);
385 spin_unlock_bh(&smc_lgr_list.lock);
386}
387
388/* Called when IB port is terminated */
389void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
390{
391 struct smc_link_group *lgr, *l;
392
393 spin_lock_bh(&smc_lgr_list.lock);
394 list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
395 if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
396 lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
397 __smc_lgr_terminate(lgr);
398 }
399 spin_unlock_bh(&smc_lgr_list.lock);
400}
401
371/* Determine vlan of internal TCP socket. 402/* Determine vlan of internal TCP socket.
372 * @vlan_id: address to store the determined vlan id into 403 * @vlan_id: address to store the determined vlan id into
373 */ 404 */
374static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) 405static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
375{ 406{
376 struct dst_entry *dst = sk_dst_get(clcsock->sk); 407 struct dst_entry *dst = sk_dst_get(clcsock->sk);
377 int rc = 0; 408 struct net_device *ndev;
409 int i, nest_lvl, rc = 0;
378 410
379 *vlan_id = 0; 411 *vlan_id = 0;
380 if (!dst) { 412 if (!dst) {
@@ -386,8 +418,27 @@ static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
386 goto out_rel; 418 goto out_rel;
387 } 419 }
388 420
389 if (is_vlan_dev(dst->dev)) 421 ndev = dst->dev;
390 *vlan_id = vlan_dev_vlan_id(dst->dev); 422 if (is_vlan_dev(ndev)) {
423 *vlan_id = vlan_dev_vlan_id(ndev);
424 goto out_rel;
425 }
426
427 rtnl_lock();
428 nest_lvl = dev_get_nest_level(ndev);
429 for (i = 0; i < nest_lvl; i++) {
430 struct list_head *lower = &ndev->adj_list.lower;
431
432 if (list_empty(lower))
433 break;
434 lower = lower->next;
435 ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
436 if (is_vlan_dev(ndev)) {
437 *vlan_id = vlan_dev_vlan_id(ndev);
438 break;
439 }
440 }
441 rtnl_unlock();
391 442
392out_rel: 443out_rel:
393 dst_release(dst); 444 dst_release(dst);
@@ -432,10 +483,10 @@ int smc_conn_create(struct smc_sock *smc,
432 struct smc_clc_msg_local *lcl, int srv_first_contact) 483 struct smc_clc_msg_local *lcl, int srv_first_contact)
433{ 484{
434 struct smc_connection *conn = &smc->conn; 485 struct smc_connection *conn = &smc->conn;
486 int local_contact = SMC_FIRST_CONTACT;
435 struct smc_link_group *lgr; 487 struct smc_link_group *lgr;
436 unsigned short vlan_id; 488 unsigned short vlan_id;
437 enum smc_lgr_role role; 489 enum smc_lgr_role role;
438 int local_contact = SMC_FIRST_CONTACT;
439 int rc = 0; 490 int rc = 0;
440 491
441 role = smc->listen_smc ? SMC_SERV : SMC_CLNT; 492 role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
@@ -493,6 +544,7 @@ create:
493 } 544 }
494 conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; 545 conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
495 conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; 546 conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
547 conn->urg_state = SMC_URG_READ;
496#ifndef KERNEL_HAS_ATOMIC64 548#ifndef KERNEL_HAS_ATOMIC64
497 spin_lock_init(&conn->acurs_lock); 549 spin_lock_init(&conn->acurs_lock);
498#endif 550#endif
@@ -501,14 +553,39 @@ out:
501 return rc ? rc : local_contact; 553 return rc ? rc : local_contact;
502} 554}
503 555
556/* convert the RMB size into the compressed notation - minimum 16K.
557 * In contrast to plain ilog2, this rounds towards the next power of 2,
558 * so the socket application gets at least its desired sndbuf / rcvbuf size.
559 */
560static u8 smc_compress_bufsize(int size)
561{
562 u8 compressed;
563
564 if (size <= SMC_BUF_MIN_SIZE)
565 return 0;
566
567 size = (size - 1) >> 14;
568 compressed = ilog2(size) + 1;
569 if (compressed >= SMC_RMBE_SIZES)
570 compressed = SMC_RMBE_SIZES - 1;
571 return compressed;
572}
573
574/* convert the RMB size from compressed notation into integer */
575int smc_uncompress_bufsize(u8 compressed)
576{
577 u32 size;
578
579 size = 0x00000001 << (((int)compressed) + 14);
580 return (int)size;
581}
582
504/* try to reuse a sndbuf or rmb description slot for a certain 583/* try to reuse a sndbuf or rmb description slot for a certain
505 * buffer size; if not available, return NULL 584 * buffer size; if not available, return NULL
506 */ 585 */
507static inline 586static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
508struct smc_buf_desc *smc_buf_get_slot(struct smc_link_group *lgr, 587 rwlock_t *lock,
509 int compressed_bufsize, 588 struct list_head *buf_list)
510 rwlock_t *lock,
511 struct list_head *buf_list)
512{ 589{
513 struct smc_buf_desc *buf_slot; 590 struct smc_buf_desc *buf_slot;
514 591
@@ -544,23 +621,23 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
544 if (!buf_desc) 621 if (!buf_desc)
545 return ERR_PTR(-ENOMEM); 622 return ERR_PTR(-ENOMEM);
546 623
547 buf_desc->cpu_addr = 624 buf_desc->order = get_order(bufsize);
548 (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | 625 buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
549 __GFP_NOMEMALLOC | 626 __GFP_NOMEMALLOC | __GFP_COMP |
550 __GFP_NORETRY | __GFP_ZERO, 627 __GFP_NORETRY | __GFP_ZERO,
551 get_order(bufsize)); 628 buf_desc->order);
552 if (!buf_desc->cpu_addr) { 629 if (!buf_desc->pages) {
553 kfree(buf_desc); 630 kfree(buf_desc);
554 return ERR_PTR(-EAGAIN); 631 return ERR_PTR(-EAGAIN);
555 } 632 }
556 buf_desc->order = get_order(bufsize); 633 buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);
557 634
558 /* build the sg table from the pages */ 635 /* build the sg table from the pages */
559 lnk = &lgr->lnk[SMC_SINGLE_LINK]; 636 lnk = &lgr->lnk[SMC_SINGLE_LINK];
560 rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1, 637 rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1,
561 GFP_KERNEL); 638 GFP_KERNEL);
562 if (rc) { 639 if (rc) {
563 smc_buf_free(buf_desc, lnk, is_rmb); 640 smc_buf_free(lgr, is_rmb, buf_desc);
564 return ERR_PTR(rc); 641 return ERR_PTR(rc);
565 } 642 }
566 sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl, 643 sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl,
@@ -571,7 +648,7 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
571 is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); 648 is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
572 /* SMC protocol depends on mapping to one DMA address only */ 649 /* SMC protocol depends on mapping to one DMA address only */
573 if (rc != 1) { 650 if (rc != 1) {
574 smc_buf_free(buf_desc, lnk, is_rmb); 651 smc_buf_free(lgr, is_rmb, buf_desc);
575 return ERR_PTR(-EAGAIN); 652 return ERR_PTR(-EAGAIN);
576 } 653 }
577 654
@@ -582,19 +659,20 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
582 IB_ACCESS_LOCAL_WRITE, 659 IB_ACCESS_LOCAL_WRITE,
583 buf_desc); 660 buf_desc);
584 if (rc) { 661 if (rc) {
585 smc_buf_free(buf_desc, lnk, is_rmb); 662 smc_buf_free(lgr, is_rmb, buf_desc);
586 return ERR_PTR(rc); 663 return ERR_PTR(rc);
587 } 664 }
588 } 665 }
589 666
667 buf_desc->len = bufsize;
590 return buf_desc; 668 return buf_desc;
591} 669}
592 670
593static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) 671static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
594{ 672{
673 struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
595 struct smc_connection *conn = &smc->conn; 674 struct smc_connection *conn = &smc->conn;
596 struct smc_link_group *lgr = conn->lgr; 675 struct smc_link_group *lgr = conn->lgr;
597 struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
598 struct list_head *buf_list; 676 struct list_head *buf_list;
599 int bufsize, bufsize_short; 677 int bufsize, bufsize_short;
600 int sk_buf_size; 678 int sk_buf_size;
@@ -622,7 +700,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
622 continue; 700 continue;
623 701
624 /* check for reusable slot in the link group */ 702 /* check for reusable slot in the link group */
625 buf_desc = smc_buf_get_slot(lgr, bufsize_short, lock, buf_list); 703 buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
626 if (buf_desc) { 704 if (buf_desc) {
627 memset(buf_desc->cpu_addr, 0, bufsize); 705 memset(buf_desc->cpu_addr, 0, bufsize);
628 break; /* found reusable slot */ 706 break; /* found reusable slot */
@@ -646,14 +724,12 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
646 724
647 if (is_rmb) { 725 if (is_rmb) {
648 conn->rmb_desc = buf_desc; 726 conn->rmb_desc = buf_desc;
649 conn->rmbe_size = bufsize;
650 conn->rmbe_size_short = bufsize_short; 727 conn->rmbe_size_short = bufsize_short;
651 smc->sk.sk_rcvbuf = bufsize * 2; 728 smc->sk.sk_rcvbuf = bufsize * 2;
652 atomic_set(&conn->bytes_to_rcv, 0); 729 atomic_set(&conn->bytes_to_rcv, 0);
653 conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize); 730 conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize);
654 } else { 731 } else {
655 conn->sndbuf_desc = buf_desc; 732 conn->sndbuf_desc = buf_desc;
656 conn->sndbuf_size = bufsize;
657 smc->sk.sk_sndbuf = bufsize * 2; 733 smc->sk.sk_sndbuf = bufsize * 2;
658 atomic_set(&conn->sndbuf_space, bufsize); 734 atomic_set(&conn->sndbuf_space, bufsize);
659 } 735 }
@@ -709,8 +785,7 @@ int smc_buf_create(struct smc_sock *smc)
709 /* create rmb */ 785 /* create rmb */
710 rc = __smc_buf_create(smc, true); 786 rc = __smc_buf_create(smc, true);
711 if (rc) 787 if (rc)
712 smc_buf_free(smc->conn.sndbuf_desc, 788 smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
713 &smc->conn.lgr->lnk[SMC_SINGLE_LINK], false);
714 return rc; 789 return rc;
715} 790}
716 791
@@ -777,3 +852,21 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn,
777 return conn->rtoken_idx; 852 return conn->rtoken_idx;
778 return 0; 853 return 0;
779} 854}
855
856/* Called (from smc_exit) when module is removed */
857void smc_core_exit(void)
858{
859 struct smc_link_group *lgr, *lg;
860 LIST_HEAD(lgr_freeing_list);
861
862 spin_lock_bh(&smc_lgr_list.lock);
863 if (!list_empty(&smc_lgr_list.list))
864 list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
865 spin_unlock_bh(&smc_lgr_list.lock);
866 list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
867 list_del_init(&lgr->list);
868 smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
869 cancel_delayed_work_sync(&lgr->free_work);
870 smc_lgr_free(lgr); /* free link group */
871 }
872}
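smc_compress_bufsize()/smc_uncompress_bufsize(), now private to smc_core.c with only the uncompress helper exported, encode the RMBE size as a small exponent relative to the 16K minimum, rounding requests up to the next power of two. Rebuilt as a stand-alone program (ilog2() replaced by a portable loop, otherwise following the hunk above), the mapping looks like this:

#include <stdio.h>

#define SMC_BUF_MIN_SIZE 16384   /* minimum RMB size, see smc_core.h */
#define SMC_RMBE_SIZES   16      /* number of distinct compressed values */

static unsigned int ilog2_u32(unsigned int v)   /* portable stand-in for ilog2() */
{
        unsigned int l = 0;

        while (v >>= 1)
                l++;
        return l;
}

/* same rounding as smc_compress_bufsize(): next power of two, >= 16K */
static unsigned char compress_bufsize(int size)
{
        unsigned char compressed;

        if (size <= SMC_BUF_MIN_SIZE)
                return 0;
        size = (size - 1) >> 14;
        compressed = ilog2_u32(size) + 1;
        if (compressed >= SMC_RMBE_SIZES)
                compressed = SMC_RMBE_SIZES - 1;
        return compressed;
}

static int uncompress_bufsize(unsigned char compressed)
{
        return 1 << (compressed + 14);
}

int main(void)
{
        int sizes[] = { 8192, 16384, 16385, 65536, 262144, 1 << 20 };
        unsigned int i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                unsigned char c = compress_bufsize(sizes[i]);

                printf("request %7d -> compressed %2u -> RMBE %7d\n",
                       sizes[i], c, uncompress_bufsize(c));
        }
        return 0;
}

So an application asking for, say, a 60K receive buffer ends up with a 64K RMBE (compressed value 2).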
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 5dfcb15d529f..93cb3523bf50 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -23,10 +23,9 @@
23struct smc_lgr_list { /* list of link group definition */ 23struct smc_lgr_list { /* list of link group definition */
24 struct list_head list; 24 struct list_head list;
25 spinlock_t lock; /* protects list of link groups */ 25 spinlock_t lock; /* protects list of link groups */
26 u32 num; /* unique link group number */
26}; 27};
27 28
28extern struct smc_lgr_list smc_lgr_list; /* list of link groups */
29
30enum smc_lgr_role { /* possible roles of a link group */ 29enum smc_lgr_role { /* possible roles of a link group */
31 SMC_CLNT, /* client */ 30 SMC_CLNT, /* client */
32 SMC_SERV /* server */ 31 SMC_SERV /* server */
@@ -79,6 +78,7 @@ struct smc_link {
79 dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ 78 dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */
80 u64 wr_rx_id; /* seq # of last recv WR */ 79 u64 wr_rx_id; /* seq # of last recv WR */
81 u32 wr_rx_cnt; /* number of WR recv buffers */ 80 u32 wr_rx_cnt; /* number of WR recv buffers */
81 unsigned long wr_rx_tstamp; /* jiffies when last buf rx */
82 82
83 struct ib_reg_wr wr_reg; /* WR register memory region */ 83 struct ib_reg_wr wr_reg; /* WR register memory region */
84 wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ 84 wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */
@@ -95,12 +95,18 @@ struct smc_link {
95 u8 link_id; /* unique # within link group */ 95 u8 link_id; /* unique # within link group */
96 96
97 enum smc_link_state state; /* state of link */ 97 enum smc_link_state state; /* state of link */
98 struct workqueue_struct *llc_wq; /* single thread work queue */
98 struct completion llc_confirm; /* wait for rx of conf link */ 99 struct completion llc_confirm; /* wait for rx of conf link */
99 struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */ 100 struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */
100 int llc_confirm_rc; /* rc from confirm link msg */ 101 int llc_confirm_rc; /* rc from confirm link msg */
101 int llc_confirm_resp_rc; /* rc from conf_resp msg */ 102 int llc_confirm_resp_rc; /* rc from conf_resp msg */
102 struct completion llc_add; /* wait for rx of add link */ 103 struct completion llc_add; /* wait for rx of add link */
103 struct completion llc_add_resp; /* wait for rx of add link rsp*/ 104 struct completion llc_add_resp; /* wait for rx of add link rsp*/
105 struct delayed_work llc_testlink_wrk; /* testlink worker */
106 struct completion llc_testlink_resp; /* wait for rx of testlink */
107 int llc_testlink_time; /* testlink interval */
108 struct completion llc_confirm_rkey; /* wait 4 rx of cnf rkey */
109 int llc_confirm_rkey_rc; /* rc from cnf rkey msg */
104}; 110};
105 111
106/* For now we just allow one parallel link per link group. The SMC protocol 112/* For now we just allow one parallel link per link group. The SMC protocol
@@ -116,6 +122,8 @@ struct smc_link {
116struct smc_buf_desc { 122struct smc_buf_desc {
117 struct list_head list; 123 struct list_head list;
118 void *cpu_addr; /* virtual address of buffer */ 124 void *cpu_addr; /* virtual address of buffer */
125 struct page *pages;
126 int len; /* length of buffer */
119 struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];/* virtual buffer */ 127 struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];/* virtual buffer */
120 struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; 128 struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX];
121 /* for rmb only: memory region 129 /* for rmb only: memory region
@@ -133,6 +141,12 @@ struct smc_rtoken { /* address/key of remote RMB */
133}; 141};
134 142
135#define SMC_LGR_ID_SIZE 4 143#define SMC_LGR_ID_SIZE 4
144#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */
145#define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */
146/* theoretically, the RFC states that largest size would be 512K,
147 * i.e. compressed 5 and thus 6 sizes (0..5), despite
148 * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
149 */
136 150
137struct smc_link_group { 151struct smc_link_group {
138 struct list_head list; 152 struct list_head list;
@@ -158,7 +172,8 @@ struct smc_link_group {
158 172
159 u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ 173 u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */
160 struct delayed_work free_work; /* delayed freeing of an lgr */ 174 struct delayed_work free_work; /* delayed freeing of an lgr */
161 bool sync_err; /* lgr no longer fits to peer */ 175 u8 sync_err : 1; /* lgr no longer fits to peer */
176 u8 terminating : 1;/* lgr is terminating */
162}; 177};
163 178
164/* Find the connection associated with the given alert token in the link group. 179/* Find the connection associated with the given alert token in the link group.
@@ -196,11 +211,14 @@ static inline struct smc_connection *smc_lgr_find_conn(
196 211
197struct smc_sock; 212struct smc_sock;
198struct smc_clc_msg_accept_confirm; 213struct smc_clc_msg_accept_confirm;
214struct smc_clc_msg_local;
199 215
200void smc_lgr_free(struct smc_link_group *lgr); 216void smc_lgr_free(struct smc_link_group *lgr);
201void smc_lgr_forget(struct smc_link_group *lgr); 217void smc_lgr_forget(struct smc_link_group *lgr);
202void smc_lgr_terminate(struct smc_link_group *lgr); 218void smc_lgr_terminate(struct smc_link_group *lgr);
219void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport);
203int smc_buf_create(struct smc_sock *smc); 220int smc_buf_create(struct smc_sock *smc);
221int smc_uncompress_bufsize(u8 compressed);
204int smc_rmb_rtoken_handling(struct smc_connection *conn, 222int smc_rmb_rtoken_handling(struct smc_connection *conn,
205 struct smc_clc_msg_accept_confirm *clc); 223 struct smc_clc_msg_accept_confirm *clc);
206int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey); 224int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey);
@@ -209,4 +227,9 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn);
209void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); 227void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn);
210void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); 228void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn);
211void smc_rmb_sync_sg_for_device(struct smc_connection *conn); 229void smc_rmb_sync_sg_for_device(struct smc_connection *conn);
230void smc_conn_free(struct smc_connection *conn);
231int smc_conn_create(struct smc_sock *smc,
232 struct smc_ib_device *smcibdev, u8 ibport,
233 struct smc_clc_msg_local *lcl, int srv_first_contact);
234void smc_core_exit(void);
212#endif 235#endif
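The new SMC_BUF_MIN_SIZE / SMC_RMBE_SIZES constants together with the smc_uncompress_bufsize() declaration describe RMB/RMBE sizes as a small compressed exponent rather than a byte count. A minimal standalone sketch of such a decompression, assuming each compressed step simply doubles the 16 KB minimum (an illustration consistent with the constants and the comment above, not necessarily the kernel's exact implementation):

/* Illustrative only: map a 4-bit compressed RMBE size to bytes, assuming
 * each step doubles SMC_BUF_MIN_SIZE. compressed == 0 yields 16 KB and
 * compressed == 5 yields 512 KB, the RFC maximum mentioned in the header.
 */
#define EXAMPLE_BUF_MIN_SIZE	16384
#define EXAMPLE_RMBE_SIZES	16

static int example_uncompress_bufsize(unsigned char compressed)
{
	if (compressed >= EXAMPLE_RMBE_SIZES)	/* 4-bit field: 0..15 */
		return -1;
	return EXAMPLE_BUF_MIN_SIZE << compressed;
}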
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index 427b91c1c964..839354402215 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -38,17 +38,27 @@ static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
38{ 38{
39 struct smc_sock *smc = smc_sk(sk); 39 struct smc_sock *smc = smc_sk(sk);
40 40
41 r->diag_family = sk->sk_family;
42 if (!smc->clcsock) 41 if (!smc->clcsock)
43 return; 42 return;
44 r->id.idiag_sport = htons(smc->clcsock->sk->sk_num); 43 r->id.idiag_sport = htons(smc->clcsock->sk->sk_num);
45 r->id.idiag_dport = smc->clcsock->sk->sk_dport; 44 r->id.idiag_dport = smc->clcsock->sk->sk_dport;
46 r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if; 45 r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if;
47 sock_diag_save_cookie(sk, r->id.idiag_cookie); 46 sock_diag_save_cookie(sk, r->id.idiag_cookie);
48 memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); 47 if (sk->sk_protocol == SMCPROTO_SMC) {
49 memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); 48 r->diag_family = PF_INET;
50 r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr; 49 memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
51 r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr; 50 memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
51 r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
52 r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
53#if IS_ENABLED(CONFIG_IPV6)
54 } else if (sk->sk_protocol == SMCPROTO_SMC6) {
55 r->diag_family = PF_INET6;
56 memcpy(&r->id.idiag_src, &smc->clcsock->sk->sk_v6_rcv_saddr,
57 sizeof(smc->clcsock->sk->sk_v6_rcv_saddr));
58 memcpy(&r->id.idiag_dst, &smc->clcsock->sk->sk_v6_daddr,
59 sizeof(smc->clcsock->sk->sk_v6_daddr));
60#endif
61 }
52} 62}
53 63
54static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, 64static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
@@ -91,8 +101,9 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
91 struct smc_connection *conn = &smc->conn; 101 struct smc_connection *conn = &smc->conn;
92 struct smc_diag_conninfo cinfo = { 102 struct smc_diag_conninfo cinfo = {
93 .token = conn->alert_token_local, 103 .token = conn->alert_token_local,
94 .sndbuf_size = conn->sndbuf_size, 104 .sndbuf_size = conn->sndbuf_desc ?
95 .rmbe_size = conn->rmbe_size, 105 conn->sndbuf_desc->len : 0,
106 .rmbe_size = conn->rmb_desc ? conn->rmb_desc->len : 0,
96 .peer_rmbe_size = conn->peer_rmbe_size, 107 .peer_rmbe_size = conn->peer_rmbe_size,
97 108
98 .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap, 109 .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap,
@@ -153,7 +164,8 @@ errout:
153 return -EMSGSIZE; 164 return -EMSGSIZE;
154} 165}
155 166
156static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) 167static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb,
168 struct netlink_callback *cb)
157{ 169{
158 struct net *net = sock_net(skb->sk); 170 struct net *net = sock_net(skb->sk);
159 struct nlattr *bc = NULL; 171 struct nlattr *bc = NULL;
@@ -161,8 +173,8 @@ static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
161 struct sock *sk; 173 struct sock *sk;
162 int rc = 0; 174 int rc = 0;
163 175
164 read_lock(&smc_proto.h.smc_hash->lock); 176 read_lock(&prot->h.smc_hash->lock);
165 head = &smc_proto.h.smc_hash->ht; 177 head = &prot->h.smc_hash->ht;
166 if (hlist_empty(head)) 178 if (hlist_empty(head))
167 goto out; 179 goto out;
168 180
@@ -175,7 +187,17 @@ static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
175 } 187 }
176 188
177out: 189out:
178 read_unlock(&smc_proto.h.smc_hash->lock); 190 read_unlock(&prot->h.smc_hash->lock);
191 return rc;
192}
193
194static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
195{
196 int rc = 0;
197
198 rc = smc_diag_dump_proto(&smc_proto, skb, cb);
199 if (!rc)
200 rc = smc_diag_dump_proto(&smc_proto6, skb, cb);
179 return rc; 201 return rc;
180} 202}
181 203
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 26df554f7588..0eed7ab9f28b 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -143,17 +143,6 @@ out:
143 return rc; 143 return rc;
144} 144}
145 145
146static void smc_ib_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
147{
148 struct smc_link_group *lgr, *l;
149
150 list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
151 if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
152 lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
153 smc_lgr_terminate(lgr);
154 }
155}
156
157/* process context wrapper for might_sleep smc_ib_remember_port_attr */ 146/* process context wrapper for might_sleep smc_ib_remember_port_attr */
158static void smc_ib_port_event_work(struct work_struct *work) 147static void smc_ib_port_event_work(struct work_struct *work)
159{ 148{
@@ -165,7 +154,7 @@ static void smc_ib_port_event_work(struct work_struct *work)
165 smc_ib_remember_port_attr(smcibdev, port_idx + 1); 154 smc_ib_remember_port_attr(smcibdev, port_idx + 1);
166 clear_bit(port_idx, &smcibdev->port_event_mask); 155 clear_bit(port_idx, &smcibdev->port_event_mask);
167 if (!smc_ib_port_active(smcibdev, port_idx + 1)) 156 if (!smc_ib_port_active(smcibdev, port_idx + 1))
168 smc_ib_port_terminate(smcibdev, port_idx + 1); 157 smc_port_terminate(smcibdev, port_idx + 1);
169 } 158 }
170} 159}
171 160
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index ea4b21981b4b..5800a6b43d83 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -214,12 +214,11 @@ int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[],
214 return rc; 214 return rc;
215} 215}
216 216
217/* send ADD LINK request or response */ 217/* send LLC confirm rkey request */
218int smc_llc_send_add_link(struct smc_link *link, u8 mac[], 218static int smc_llc_send_confirm_rkey(struct smc_link *link,
219 union ib_gid *gid, 219 struct smc_buf_desc *rmb_desc)
220 enum smc_llc_reqresp reqresp)
221{ 220{
222 struct smc_llc_msg_add_link *addllc; 221 struct smc_llc_msg_confirm_rkey *rkeyllc;
223 struct smc_wr_tx_pend_priv *pend; 222 struct smc_wr_tx_pend_priv *pend;
224 struct smc_wr_buf *wr_buf; 223 struct smc_wr_buf *wr_buf;
225 int rc; 224 int rc;
@@ -227,7 +226,25 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[],
227 rc = smc_llc_add_pending_send(link, &wr_buf, &pend); 226 rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
228 if (rc) 227 if (rc)
229 return rc; 228 return rc;
230 addllc = (struct smc_llc_msg_add_link *)wr_buf; 229 rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf;
230 memset(rkeyllc, 0, sizeof(*rkeyllc));
231 rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY;
232 rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey);
233 rkeyllc->rtoken[0].rmb_key =
234 htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
235 rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64(
236 (u64)sg_dma_address(rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
237 /* send llc message */
238 rc = smc_wr_tx_send(link, pend);
239 return rc;
240}
241
242/* prepare an add link message */
243static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc,
244 struct smc_link *link, u8 mac[],
245 union ib_gid *gid,
246 enum smc_llc_reqresp reqresp)
247{
231 memset(addllc, 0, sizeof(*addllc)); 248 memset(addllc, 0, sizeof(*addllc));
232 addllc->hd.common.type = SMC_LLC_ADD_LINK; 249 addllc->hd.common.type = SMC_LLC_ADD_LINK;
233 addllc->hd.length = sizeof(struct smc_llc_msg_add_link); 250 addllc->hd.length = sizeof(struct smc_llc_msg_add_link);
@@ -239,16 +256,14 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[],
239 } 256 }
240 memcpy(addllc->sender_mac, mac, ETH_ALEN); 257 memcpy(addllc->sender_mac, mac, ETH_ALEN);
241 memcpy(addllc->sender_gid, gid, SMC_GID_SIZE); 258 memcpy(addllc->sender_gid, gid, SMC_GID_SIZE);
242 /* send llc message */
243 rc = smc_wr_tx_send(link, pend);
244 return rc;
245} 259}
246 260
247/* send DELETE LINK request or response */ 261/* send ADD LINK request or response */
248int smc_llc_send_delete_link(struct smc_link *link, 262int smc_llc_send_add_link(struct smc_link *link, u8 mac[],
249 enum smc_llc_reqresp reqresp) 263 union ib_gid *gid,
264 enum smc_llc_reqresp reqresp)
250{ 265{
251 struct smc_llc_msg_del_link *delllc; 266 struct smc_llc_msg_add_link *addllc;
252 struct smc_wr_tx_pend_priv *pend; 267 struct smc_wr_tx_pend_priv *pend;
253 struct smc_wr_buf *wr_buf; 268 struct smc_wr_buf *wr_buf;
254 int rc; 269 int rc;
@@ -256,7 +271,18 @@ int smc_llc_send_delete_link(struct smc_link *link,
256 rc = smc_llc_add_pending_send(link, &wr_buf, &pend); 271 rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
257 if (rc) 272 if (rc)
258 return rc; 273 return rc;
259 delllc = (struct smc_llc_msg_del_link *)wr_buf; 274 addllc = (struct smc_llc_msg_add_link *)wr_buf;
275 smc_llc_prep_add_link(addllc, link, mac, gid, reqresp);
276 /* send llc message */
277 rc = smc_wr_tx_send(link, pend);
278 return rc;
279}
280
281/* prepare a delete link message */
282static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc,
283 struct smc_link *link,
284 enum smc_llc_reqresp reqresp)
285{
260 memset(delllc, 0, sizeof(*delllc)); 286 memset(delllc, 0, sizeof(*delllc));
261 delllc->hd.common.type = SMC_LLC_DELETE_LINK; 287 delllc->hd.common.type = SMC_LLC_DELETE_LINK;
262 delllc->hd.length = sizeof(struct smc_llc_msg_add_link); 288 delllc->hd.length = sizeof(struct smc_llc_msg_add_link);
@@ -266,14 +292,29 @@ int smc_llc_send_delete_link(struct smc_link *link,
266 delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; 292 delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL;
267 delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; 293 delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
268 delllc->link_num = link->link_id; 294 delllc->link_num = link->link_id;
295}
296
297/* send DELETE LINK request or response */
298int smc_llc_send_delete_link(struct smc_link *link,
299 enum smc_llc_reqresp reqresp)
300{
301 struct smc_llc_msg_del_link *delllc;
302 struct smc_wr_tx_pend_priv *pend;
303 struct smc_wr_buf *wr_buf;
304 int rc;
305
306 rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
307 if (rc)
308 return rc;
309 delllc = (struct smc_llc_msg_del_link *)wr_buf;
310 smc_llc_prep_delete_link(delllc, link, reqresp);
269 /* send llc message */ 311 /* send llc message */
270 rc = smc_wr_tx_send(link, pend); 312 rc = smc_wr_tx_send(link, pend);
271 return rc; 313 return rc;
272} 314}
273 315
274/* send LLC test link request or response */ 316/* send LLC test link request */
275int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16], 317static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16])
276 enum smc_llc_reqresp reqresp)
277{ 318{
278 struct smc_llc_msg_test_link *testllc; 319 struct smc_llc_msg_test_link *testllc;
279 struct smc_wr_tx_pend_priv *pend; 320 struct smc_wr_tx_pend_priv *pend;
@@ -287,28 +328,52 @@ int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16],
287 memset(testllc, 0, sizeof(*testllc)); 328 memset(testllc, 0, sizeof(*testllc));
288 testllc->hd.common.type = SMC_LLC_TEST_LINK; 329 testllc->hd.common.type = SMC_LLC_TEST_LINK;
289 testllc->hd.length = sizeof(struct smc_llc_msg_test_link); 330 testllc->hd.length = sizeof(struct smc_llc_msg_test_link);
290 if (reqresp == SMC_LLC_RESP)
291 testllc->hd.flags |= SMC_LLC_FLAG_RESP;
292 memcpy(testllc->user_data, user_data, sizeof(testllc->user_data)); 331 memcpy(testllc->user_data, user_data, sizeof(testllc->user_data));
293 /* send llc message */ 332 /* send llc message */
294 rc = smc_wr_tx_send(link, pend); 333 rc = smc_wr_tx_send(link, pend);
295 return rc; 334 return rc;
296} 335}
297 336
298/* send a prepared message */ 337struct smc_llc_send_work {
299static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen) 338 struct work_struct work;
339 struct smc_link *link;
340 int llclen;
341 union smc_llc_msg llcbuf;
342};
343
344/* worker that sends a prepared message */
345static void smc_llc_send_message_work(struct work_struct *work)
300{ 346{
347 struct smc_llc_send_work *llcwrk = container_of(work,
348 struct smc_llc_send_work, work);
301 struct smc_wr_tx_pend_priv *pend; 349 struct smc_wr_tx_pend_priv *pend;
302 struct smc_wr_buf *wr_buf; 350 struct smc_wr_buf *wr_buf;
303 int rc; 351 int rc;
304 352
305 rc = smc_llc_add_pending_send(link, &wr_buf, &pend); 353 if (llcwrk->link->state == SMC_LNK_INACTIVE)
354 goto out;
355 rc = smc_llc_add_pending_send(llcwrk->link, &wr_buf, &pend);
306 if (rc) 356 if (rc)
307 return rc; 357 goto out;
308 memcpy(wr_buf, llcbuf, llclen); 358 memcpy(wr_buf, &llcwrk->llcbuf, llcwrk->llclen);
309 /* send llc message */ 359 smc_wr_tx_send(llcwrk->link, pend);
310 rc = smc_wr_tx_send(link, pend); 360out:
311 return rc; 361 kfree(llcwrk);
362}
363
364/* copy llcbuf and schedule an llc send on link */
365static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen)
366{
367 struct smc_llc_send_work *wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC);
368
369 if (!wrk)
370 return -ENOMEM;
371 INIT_WORK(&wrk->work, smc_llc_send_message_work);
372 wrk->link = link;
373 wrk->llclen = llclen;
374 memcpy(&wrk->llcbuf, llcbuf, llclen);
375 queue_work(link->llc_wq, &wrk->work);
376 return 0;
312} 377}
313 378
314/********************************* receive ***********************************/ 379/********************************* receive ***********************************/
@@ -359,17 +424,18 @@ static void smc_llc_rx_add_link(struct smc_link *link,
359 } 424 }
360 425
361 if (lgr->role == SMC_SERV) { 426 if (lgr->role == SMC_SERV) {
362 smc_llc_send_add_link(link, 427 smc_llc_prep_add_link(llc, link,
363 link->smcibdev->mac[link->ibport - 1], 428 link->smcibdev->mac[link->ibport - 1],
364 &link->smcibdev->gid[link->ibport - 1], 429 &link->smcibdev->gid[link->ibport - 1],
365 SMC_LLC_REQ); 430 SMC_LLC_REQ);
366 431
367 } else { 432 } else {
368 smc_llc_send_add_link(link, 433 smc_llc_prep_add_link(llc, link,
369 link->smcibdev->mac[link->ibport - 1], 434 link->smcibdev->mac[link->ibport - 1],
370 &link->smcibdev->gid[link->ibport - 1], 435 &link->smcibdev->gid[link->ibport - 1],
371 SMC_LLC_RESP); 436 SMC_LLC_RESP);
372 } 437 }
438 smc_llc_send_message(link, llc, sizeof(*llc));
373 } 439 }
374} 440}
375 441
@@ -385,9 +451,11 @@ static void smc_llc_rx_delete_link(struct smc_link *link,
385 } else { 451 } else {
386 if (lgr->role == SMC_SERV) { 452 if (lgr->role == SMC_SERV) {
387 smc_lgr_forget(lgr); 453 smc_lgr_forget(lgr);
388 smc_llc_send_delete_link(link, SMC_LLC_REQ); 454 smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ);
455 smc_llc_send_message(link, llc, sizeof(*llc));
389 } else { 456 } else {
390 smc_llc_send_delete_link(link, SMC_LLC_RESP); 457 smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP);
458 smc_llc_send_message(link, llc, sizeof(*llc));
391 smc_lgr_terminate(lgr); 459 smc_lgr_terminate(lgr);
392 } 460 }
393 } 461 }
@@ -397,9 +465,11 @@ static void smc_llc_rx_test_link(struct smc_link *link,
397 struct smc_llc_msg_test_link *llc) 465 struct smc_llc_msg_test_link *llc)
398{ 466{
399 if (llc->hd.flags & SMC_LLC_FLAG_RESP) { 467 if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
400 /* unused as long as we don't send this type of msg */ 468 if (link->state == SMC_LNK_ACTIVE)
469 complete(&link->llc_testlink_resp);
401 } else { 470 } else {
402 smc_llc_send_test_link(link, llc->user_data, SMC_LLC_RESP); 471 llc->hd.flags |= SMC_LLC_FLAG_RESP;
472 smc_llc_send_message(link, llc, sizeof(*llc));
403 } 473 }
404} 474}
405 475
@@ -412,7 +482,9 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link,
412 lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); 482 lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
413 483
414 if (llc->hd.flags & SMC_LLC_FLAG_RESP) { 484 if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
415 /* unused as long as we don't send this type of msg */ 485 link->llc_confirm_rkey_rc = llc->hd.flags &
486 SMC_LLC_FLAG_RKEY_NEG;
487 complete(&link->llc_confirm_rkey);
416 } else { 488 } else {
417 rc = smc_rtoken_add(lgr, 489 rc = smc_rtoken_add(lgr,
418 llc->rtoken[0].rmb_vaddr, 490 llc->rtoken[0].rmb_vaddr,
@@ -423,7 +495,7 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link,
423 llc->hd.flags |= SMC_LLC_FLAG_RESP; 495 llc->hd.flags |= SMC_LLC_FLAG_RESP;
424 if (rc < 0) 496 if (rc < 0)
425 llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; 497 llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG;
426 smc_llc_send_message(link, (void *)llc, sizeof(*llc)); 498 smc_llc_send_message(link, llc, sizeof(*llc));
427 } 499 }
428} 500}
429 501
@@ -435,7 +507,7 @@ static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link,
435 } else { 507 } else {
436 /* ignore rtokens for other links, we have only one link */ 508 /* ignore rtokens for other links, we have only one link */
437 llc->hd.flags |= SMC_LLC_FLAG_RESP; 509 llc->hd.flags |= SMC_LLC_FLAG_RESP;
438 smc_llc_send_message(link, (void *)llc, sizeof(*llc)); 510 smc_llc_send_message(link, llc, sizeof(*llc));
439 } 511 }
440} 512}
441 513
@@ -463,7 +535,7 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link,
463 } 535 }
464 536
465 llc->hd.flags |= SMC_LLC_FLAG_RESP; 537 llc->hd.flags |= SMC_LLC_FLAG_RESP;
466 smc_llc_send_message(link, (void *)llc, sizeof(*llc)); 538 smc_llc_send_message(link, llc, sizeof(*llc));
467 } 539 }
468} 540}
469 541
@@ -476,6 +548,8 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
476 return; /* short message */ 548 return; /* short message */
477 if (llc->raw.hdr.length != sizeof(*llc)) 549 if (llc->raw.hdr.length != sizeof(*llc))
478 return; /* invalid message */ 550 return; /* invalid message */
551 if (link->state == SMC_LNK_INACTIVE)
552 return; /* link not active, drop msg */
479 553
480 switch (llc->raw.hdr.common.type) { 554 switch (llc->raw.hdr.common.type) {
481 case SMC_LLC_TEST_LINK: 555 case SMC_LLC_TEST_LINK:
@@ -502,6 +576,100 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
502 } 576 }
503} 577}
504 578
579/***************************** worker, utils *********************************/
580
581static void smc_llc_testlink_work(struct work_struct *work)
582{
583 struct smc_link *link = container_of(to_delayed_work(work),
584 struct smc_link, llc_testlink_wrk);
585 unsigned long next_interval;
586 struct smc_link_group *lgr;
587 unsigned long expire_time;
588 u8 user_data[16] = { 0 };
589 int rc;
590
591 lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
592 if (link->state != SMC_LNK_ACTIVE)
593 return; /* don't reschedule worker */
594 expire_time = link->wr_rx_tstamp + link->llc_testlink_time;
595 if (time_is_after_jiffies(expire_time)) {
596 next_interval = expire_time - jiffies;
597 goto out;
598 }
599 reinit_completion(&link->llc_testlink_resp);
600 smc_llc_send_test_link(link, user_data);
601 /* receive TEST LINK response over RoCE fabric */
602 rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp,
603 SMC_LLC_WAIT_TIME);
604 if (rc <= 0) {
605 smc_lgr_terminate(lgr);
606 return;
607 }
608 next_interval = link->llc_testlink_time;
609out:
610 queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk,
611 next_interval);
612}
613
614int smc_llc_link_init(struct smc_link *link)
615{
616 struct smc_link_group *lgr = container_of(link, struct smc_link_group,
617 lnk[SMC_SINGLE_LINK]);
618 link->llc_wq = alloc_ordered_workqueue("llc_wq-%x:%x)", WQ_MEM_RECLAIM,
619 *((u32 *)lgr->id),
620 link->link_id);
621 if (!link->llc_wq)
622 return -ENOMEM;
623 init_completion(&link->llc_confirm);
624 init_completion(&link->llc_confirm_resp);
625 init_completion(&link->llc_add);
626 init_completion(&link->llc_add_resp);
627 init_completion(&link->llc_confirm_rkey);
628 init_completion(&link->llc_testlink_resp);
629 INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work);
630 return 0;
631}
632
633void smc_llc_link_active(struct smc_link *link, int testlink_time)
634{
635 link->state = SMC_LNK_ACTIVE;
636 if (testlink_time) {
637 link->llc_testlink_time = testlink_time * HZ;
638 queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk,
639 link->llc_testlink_time);
640 }
641}
642
643/* called in tasklet context */
644void smc_llc_link_inactive(struct smc_link *link)
645{
646 link->state = SMC_LNK_INACTIVE;
647 cancel_delayed_work(&link->llc_testlink_wrk);
648}
649
650/* called in worker context */
651void smc_llc_link_clear(struct smc_link *link)
652{
653 flush_workqueue(link->llc_wq);
654 destroy_workqueue(link->llc_wq);
655}
656
657/* register a new rtoken at the remote peer */
658int smc_llc_do_confirm_rkey(struct smc_link *link,
659 struct smc_buf_desc *rmb_desc)
660{
661 int rc;
662
663 reinit_completion(&link->llc_confirm_rkey);
664 smc_llc_send_confirm_rkey(link, rmb_desc);
665 /* receive CONFIRM RKEY response from server over RoCE fabric */
666 rc = wait_for_completion_interruptible_timeout(&link->llc_confirm_rkey,
667 SMC_LLC_WAIT_TIME);
668 if (rc <= 0 || link->llc_confirm_rkey_rc)
669 return -EFAULT;
670 return 0;
671}
672
505/***************************** init, exit, misc ******************************/ 673/***************************** init, exit, misc ******************************/
506 674
507static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { 675static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
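The TEST LINK keepalive added above is driven by the new llc_testlink_wrk worker: it uses wr_rx_tstamp so a link with recent receive traffic is never probed and the worker simply re-arms for the remaining part of the interval, while an idle link gets a TEST LINK message and the link group is terminated if no response arrives within SMC_LLC_WAIT_TIME. A small hedged sketch of just the scheduling decision (function and parameter names below are hypothetical):

/* Hedged sketch of the keepalive decision with hypothetical names.
 * Returns the delay in jiffies until the worker should run again, or 0
 * when the link has been idle for a full interval and a TEST LINK probe
 * should be sent now. time_after() is the usual <linux/jiffies.h> helper.
 */
static unsigned long example_testlink_delay(unsigned long now,
					    unsigned long last_rx,
					    unsigned long interval)
{
	unsigned long expire = last_rx + interval;

	if (time_after(expire, now))	/* traffic seen recently */
		return expire - now;	/* sleep for the remainder */
	return 0;			/* idle: probe the link */
}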
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index e4a7d5e234d5..65c8645e96a1 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -42,8 +42,12 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], union ib_gid *gid,
42 enum smc_llc_reqresp reqresp); 42 enum smc_llc_reqresp reqresp);
43int smc_llc_send_delete_link(struct smc_link *link, 43int smc_llc_send_delete_link(struct smc_link *link,
44 enum smc_llc_reqresp reqresp); 44 enum smc_llc_reqresp reqresp);
45int smc_llc_send_test_link(struct smc_link *lnk, u8 user_data[16], 45int smc_llc_link_init(struct smc_link *link);
46 enum smc_llc_reqresp reqresp); 46void smc_llc_link_active(struct smc_link *link, int testlink_time);
47void smc_llc_link_inactive(struct smc_link *link);
48void smc_llc_link_clear(struct smc_link *link);
49int smc_llc_do_confirm_rkey(struct smc_link *link,
50 struct smc_buf_desc *rmb_desc);
47int smc_llc_init(void) __init; 51int smc_llc_init(void) __init;
48 52
49#endif /* SMC_LLC_H */ 53#endif /* SMC_LLC_H */
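Together these exports form a small per-link lifecycle: smc_llc_link_init() allocates the ordered llc_wq and completions, smc_llc_link_active() arms the keepalive, smc_llc_link_inactive() stops it (callable from tasklet context), and smc_llc_link_clear() flushes and destroys the workqueue. A hedged sketch of the expected call order from the link-group side (the example_* wrappers are hypothetical placeholders; only the smc_llc_* calls come from this patch):

static int example_link_bringup(struct smc_link *lnk, int testlink_time)
{
	int rc;

	rc = smc_llc_link_init(lnk);	/* alloc llc_wq, init completions */
	if (rc)
		return rc;
	/* ... CONFIRM LINK handshake succeeds here ... */
	smc_llc_link_active(lnk, testlink_time);	/* arm keepalive */
	return 0;
}

static void example_link_teardown(struct smc_link *lnk)
{
	smc_llc_link_inactive(lnk);	/* may run in tasklet context */
	smc_llc_link_clear(lnk);	/* worker context: flush + destroy wq */
}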
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index eff4e0d0bb31..3d77b383cccd 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -22,11 +22,10 @@
22#include "smc_tx.h" /* smc_tx_consumer_update() */ 22#include "smc_tx.h" /* smc_tx_consumer_update() */
23#include "smc_rx.h" 23#include "smc_rx.h"
24 24
25/* callback implementation for sk.sk_data_ready() 25/* callback implementation to wakeup consumers blocked with smc_rx_wait().
26 * to wakeup rcvbuf consumers that blocked with smc_rx_wait_data().
27 * indirectly called by smc_cdc_msg_recv_action(). 26 * indirectly called by smc_cdc_msg_recv_action().
28 */ 27 */
29static void smc_rx_data_ready(struct sock *sk) 28static void smc_rx_wake_up(struct sock *sk)
30{ 29{
31 struct socket_wq *wq; 30 struct socket_wq *wq;
32 31
@@ -44,28 +43,180 @@ static void smc_rx_data_ready(struct sock *sk)
44 rcu_read_unlock(); 43 rcu_read_unlock();
45} 44}
46 45
46/* Update consumer cursor
47 * @conn connection to update
48 * @cons consumer cursor
49 * @len number of Bytes consumed
50 * Returns:
51 * 1 if we should end our receive, 0 otherwise
52 */
53static int smc_rx_update_consumer(struct smc_sock *smc,
54 union smc_host_cursor cons, size_t len)
55{
56 struct smc_connection *conn = &smc->conn;
57 struct sock *sk = &smc->sk;
58 bool force = false;
59 int diff, rc = 0;
60
61 smc_curs_add(conn->rmb_desc->len, &cons, len);
62
63 /* did we process urgent data? */
64 if (conn->urg_state == SMC_URG_VALID || conn->urg_rx_skip_pend) {
65 diff = smc_curs_comp(conn->rmb_desc->len, &cons,
66 &conn->urg_curs);
67 if (sock_flag(sk, SOCK_URGINLINE)) {
68 if (diff == 0) {
69 force = true;
70 rc = 1;
71 conn->urg_state = SMC_URG_READ;
72 }
73 } else {
74 if (diff == 1) {
75 /* skip urgent byte */
76 force = true;
77 smc_curs_add(conn->rmb_desc->len, &cons, 1);
78 conn->urg_rx_skip_pend = false;
79 } else if (diff < -1)
80 /* we read past urgent byte */
81 conn->urg_state = SMC_URG_READ;
82 }
83 }
84
85 smc_curs_write(&conn->local_tx_ctrl.cons, smc_curs_read(&cons, conn),
86 conn);
87
88 /* send consumer cursor update if required */
89 /* similar to advertising new TCP rcv_wnd if required */
90 smc_tx_consumer_update(conn, force);
91
92 return rc;
93}
94
95static void smc_rx_update_cons(struct smc_sock *smc, size_t len)
96{
97 struct smc_connection *conn = &smc->conn;
98 union smc_host_cursor cons;
99
100 smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn),
101 conn);
102 smc_rx_update_consumer(smc, cons, len);
103}
104
105struct smc_spd_priv {
106 struct smc_sock *smc;
107 size_t len;
108};
109
110static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe,
111 struct pipe_buffer *buf)
112{
113 struct smc_spd_priv *priv = (struct smc_spd_priv *)buf->private;
114 struct smc_sock *smc = priv->smc;
115 struct smc_connection *conn;
116 struct sock *sk = &smc->sk;
117
118 if (sk->sk_state == SMC_CLOSED ||
119 sk->sk_state == SMC_PEERFINCLOSEWAIT ||
120 sk->sk_state == SMC_APPFINCLOSEWAIT)
121 goto out;
122 conn = &smc->conn;
123 lock_sock(sk);
124 smc_rx_update_cons(smc, priv->len);
125 release_sock(sk);
126 if (atomic_sub_and_test(priv->len, &conn->splice_pending))
127 smc_rx_wake_up(sk);
128out:
129 kfree(priv);
130 put_page(buf->page);
131 sock_put(sk);
132}
133
134static int smc_rx_pipe_buf_nosteal(struct pipe_inode_info *pipe,
135 struct pipe_buffer *buf)
136{
137 return 1;
138}
139
140static const struct pipe_buf_operations smc_pipe_ops = {
141 .can_merge = 0,
142 .confirm = generic_pipe_buf_confirm,
143 .release = smc_rx_pipe_buf_release,
144 .steal = smc_rx_pipe_buf_nosteal,
145 .get = generic_pipe_buf_get
146};
147
148static void smc_rx_spd_release(struct splice_pipe_desc *spd,
149 unsigned int i)
150{
151 put_page(spd->pages[i]);
152}
153
154static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len,
155 struct smc_sock *smc)
156{
157 struct splice_pipe_desc spd;
158 struct partial_page partial;
159 struct smc_spd_priv *priv;
160 struct page *page;
161 int bytes;
162
163 page = virt_to_page(smc->conn.rmb_desc->cpu_addr);
164 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
165 if (!priv)
166 return -ENOMEM;
167 priv->len = len;
168 priv->smc = smc;
169 partial.offset = src - (char *)smc->conn.rmb_desc->cpu_addr;
170 partial.len = len;
171 partial.private = (unsigned long)priv;
172
173 spd.nr_pages_max = 1;
174 spd.nr_pages = 1;
175 spd.pages = &page;
176 spd.partial = &partial;
177 spd.ops = &smc_pipe_ops;
178 spd.spd_release = smc_rx_spd_release;
179
180 bytes = splice_to_pipe(pipe, &spd);
181 if (bytes > 0) {
182 sock_hold(&smc->sk);
183 get_page(smc->conn.rmb_desc->pages);
184 atomic_add(bytes, &smc->conn.splice_pending);
185 }
186
187 return bytes;
188}
189
190static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn)
191{
192 return atomic_read(&conn->bytes_to_rcv) &&
193 !atomic_read(&conn->splice_pending);
194}
195
47/* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted 196/* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted
48 * @smc smc socket 197 * @smc smc socket
49 * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout 198 * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout
199 * @fcrit add'l criterion to evaluate as function pointer
50 * Returns: 200 * Returns:
51 * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown. 201 * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown.
52 * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted). 202 * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted).
53 */ 203 */
54static int smc_rx_wait_data(struct smc_sock *smc, long *timeo) 204int smc_rx_wait(struct smc_sock *smc, long *timeo,
205 int (*fcrit)(struct smc_connection *conn))
55{ 206{
56 DEFINE_WAIT_FUNC(wait, woken_wake_function); 207 DEFINE_WAIT_FUNC(wait, woken_wake_function);
57 struct smc_connection *conn = &smc->conn; 208 struct smc_connection *conn = &smc->conn;
58 struct sock *sk = &smc->sk; 209 struct sock *sk = &smc->sk;
59 int rc; 210 int rc;
60 211
61 if (atomic_read(&conn->bytes_to_rcv)) 212 if (fcrit(conn))
62 return 1; 213 return 1;
63 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 214 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
64 add_wait_queue(sk_sleep(sk), &wait); 215 add_wait_queue(sk_sleep(sk), &wait);
65 rc = sk_wait_event(sk, timeo, 216 rc = sk_wait_event(sk, timeo,
66 sk->sk_err || 217 sk->sk_err ||
67 sk->sk_shutdown & RCV_SHUTDOWN || 218 sk->sk_shutdown & RCV_SHUTDOWN ||
68 atomic_read(&conn->bytes_to_rcv) || 219 fcrit(conn) ||
69 smc_cdc_rxed_any_close_or_senddone(conn), 220 smc_cdc_rxed_any_close_or_senddone(conn),
70 &wait); 221 &wait);
71 remove_wait_queue(sk_sleep(sk), &wait); 222 remove_wait_queue(sk_sleep(sk), &wait);
@@ -73,65 +224,115 @@ static int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
73 return rc; 224 return rc;
74} 225}
75 226
76/* rcvbuf consumer: main API called by socket layer. 227static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len,
77 * called under sk lock. 228 int flags)
229{
230 struct smc_connection *conn = &smc->conn;
231 union smc_host_cursor cons;
232 struct sock *sk = &smc->sk;
233 int rc = 0;
234
235 if (sock_flag(sk, SOCK_URGINLINE) ||
236 !(conn->urg_state == SMC_URG_VALID) ||
237 conn->urg_state == SMC_URG_READ)
238 return -EINVAL;
239
240 if (conn->urg_state == SMC_URG_VALID) {
241 if (!(flags & MSG_PEEK))
242 smc->conn.urg_state = SMC_URG_READ;
243 msg->msg_flags |= MSG_OOB;
244 if (len > 0) {
245 if (!(flags & MSG_TRUNC))
246 rc = memcpy_to_msg(msg, &conn->urg_rx_byte, 1);
247 len = 1;
248 smc_curs_write(&cons,
249 smc_curs_read(&conn->local_tx_ctrl.cons,
250 conn),
251 conn);
252 if (smc_curs_diff(conn->rmb_desc->len, &cons,
253 &conn->urg_curs) > 1)
254 conn->urg_rx_skip_pend = true;
255 /* Urgent Byte was already accounted for, but trigger
256 * skipping the urgent byte in non-inline case
257 */
258 if (!(flags & MSG_PEEK))
259 smc_rx_update_consumer(smc, cons, 0);
260 } else {
261 msg->msg_flags |= MSG_TRUNC;
262 }
263
264 return rc ? -EFAULT : len;
265 }
266
267 if (sk->sk_state == SMC_CLOSED || sk->sk_shutdown & RCV_SHUTDOWN)
268 return 0;
269
270 return -EAGAIN;
271}
272
273/* smc_rx_recvmsg - receive data from RMBE
274 * @msg: copy data to receive buffer
275 * @pipe: copy data to pipe if set - indicates splice() call
276 *
277 * rcvbuf consumer: main API called by socket layer.
278 * Called under sk lock.
78 */ 279 */
79int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, 280int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
80 int flags) 281 struct pipe_inode_info *pipe, size_t len, int flags)
81{ 282{
82 size_t copylen, read_done = 0, read_remaining = len; 283 size_t copylen, read_done = 0, read_remaining = len;
83 size_t chunk_len, chunk_off, chunk_len_sum; 284 size_t chunk_len, chunk_off, chunk_len_sum;
84 struct smc_connection *conn = &smc->conn; 285 struct smc_connection *conn = &smc->conn;
286 int (*func)(struct smc_connection *conn);
85 union smc_host_cursor cons; 287 union smc_host_cursor cons;
86 int readable, chunk; 288 int readable, chunk;
87 char *rcvbuf_base; 289 char *rcvbuf_base;
88 struct sock *sk; 290 struct sock *sk;
291 int splbytes;
89 long timeo; 292 long timeo;
90 int target; /* Read at least these many bytes */ 293 int target; /* Read at least these many bytes */
91 int rc; 294 int rc;
92 295
93 if (unlikely(flags & MSG_ERRQUEUE)) 296 if (unlikely(flags & MSG_ERRQUEUE))
94 return -EINVAL; /* future work for sk.sk_family == AF_SMC */ 297 return -EINVAL; /* future work for sk.sk_family == AF_SMC */
95 if (flags & MSG_OOB)
96 return -EINVAL; /* future work */
97 298
98 sk = &smc->sk; 299 sk = &smc->sk;
99 if (sk->sk_state == SMC_LISTEN) 300 if (sk->sk_state == SMC_LISTEN)
100 return -ENOTCONN; 301 return -ENOTCONN;
302 if (flags & MSG_OOB)
303 return smc_rx_recv_urg(smc, msg, len, flags);
101 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 304 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
102 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); 305 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
103 306
104 msg->msg_namelen = 0;
105 /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */ 307 /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */
106 rcvbuf_base = conn->rmb_desc->cpu_addr; 308 rcvbuf_base = conn->rmb_desc->cpu_addr;
107 309
108 do { /* while (read_remaining) */ 310 do { /* while (read_remaining) */
109 if (read_done >= target) 311 if (read_done >= target || (pipe && read_done))
110 break; 312 break;
111 313
112 if (atomic_read(&conn->bytes_to_rcv)) 314 if (atomic_read(&conn->bytes_to_rcv))
113 goto copy; 315 goto copy;
316 else if (conn->urg_state == SMC_URG_VALID)
317 /* we received a single urgent Byte - skip */
318 smc_rx_update_cons(smc, 0);
319
320 if (sk->sk_shutdown & RCV_SHUTDOWN ||
321 smc_cdc_rxed_any_close_or_senddone(conn) ||
322 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort)
323 break;
114 324
115 if (read_done) { 325 if (read_done) {
116 if (sk->sk_err || 326 if (sk->sk_err ||
117 sk->sk_state == SMC_CLOSED || 327 sk->sk_state == SMC_CLOSED ||
118 sk->sk_shutdown & RCV_SHUTDOWN ||
119 !timeo || 328 !timeo ||
120 signal_pending(current) || 329 signal_pending(current))
121 smc_cdc_rxed_any_close_or_senddone(conn) ||
122 conn->local_tx_ctrl.conn_state_flags.
123 peer_conn_abort)
124 break; 330 break;
125 } else { 331 } else {
126 if (sk->sk_err) { 332 if (sk->sk_err) {
127 read_done = sock_error(sk); 333 read_done = sock_error(sk);
128 break; 334 break;
129 } 335 }
130 if (sk->sk_shutdown & RCV_SHUTDOWN ||
131 smc_cdc_rxed_any_close_or_senddone(conn) ||
132 conn->local_tx_ctrl.conn_state_flags.
133 peer_conn_abort)
134 break;
135 if (sk->sk_state == SMC_CLOSED) { 336 if (sk->sk_state == SMC_CLOSED) {
136 if (!sock_flag(sk, SOCK_DONE)) { 337 if (!sock_flag(sk, SOCK_DONE)) {
137 /* This occurs when user tries to read 338 /* This occurs when user tries to read
@@ -150,32 +351,56 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
150 return -EAGAIN; 351 return -EAGAIN;
151 } 352 }
152 353
153 if (!atomic_read(&conn->bytes_to_rcv)) { 354 if (!smc_rx_data_available(conn)) {
154 smc_rx_wait_data(smc, &timeo); 355 smc_rx_wait(smc, &timeo, smc_rx_data_available);
155 continue; 356 continue;
156 } 357 }
157 358
158copy: 359copy:
159 /* initialize variables for 1st iteration of subsequent loop */ 360 /* initialize variables for 1st iteration of subsequent loop */
160 /* could be just 1 byte, even after smc_rx_wait_data above */ 361 /* could be just 1 byte, even after waiting on data above */
161 readable = atomic_read(&conn->bytes_to_rcv); 362 readable = atomic_read(&conn->bytes_to_rcv);
162 /* not more than what user space asked for */ 363 splbytes = atomic_read(&conn->splice_pending);
163 copylen = min_t(size_t, read_remaining, readable); 364 if (!readable || (msg && splbytes)) {
365 if (splbytes)
366 func = smc_rx_data_available_and_no_splice_pend;
367 else
368 func = smc_rx_data_available;
369 smc_rx_wait(smc, &timeo, func);
370 continue;
371 }
372
164 smc_curs_write(&cons, 373 smc_curs_write(&cons,
165 smc_curs_read(&conn->local_tx_ctrl.cons, conn), 374 smc_curs_read(&conn->local_tx_ctrl.cons, conn),
166 conn); 375 conn);
376 /* subsequent splice() calls pick up where previous left */
377 if (splbytes)
378 smc_curs_add(conn->rmb_desc->len, &cons, splbytes);
379 if (conn->urg_state == SMC_URG_VALID &&
380 sock_flag(&smc->sk, SOCK_URGINLINE) &&
381 readable > 1)
382 readable--; /* always stop at urgent Byte */
383 /* not more than what user space asked for */
384 copylen = min_t(size_t, read_remaining, readable);
167 /* determine chunks where to read from rcvbuf */ 385 /* determine chunks where to read from rcvbuf */
168 /* either unwrapped case, or 1st chunk of wrapped case */ 386 /* either unwrapped case, or 1st chunk of wrapped case */
169 chunk_len = min_t(size_t, 387 chunk_len = min_t(size_t, copylen, conn->rmb_desc->len -
170 copylen, conn->rmbe_size - cons.count); 388 cons.count);
171 chunk_len_sum = chunk_len; 389 chunk_len_sum = chunk_len;
172 chunk_off = cons.count; 390 chunk_off = cons.count;
173 smc_rmb_sync_sg_for_cpu(conn); 391 smc_rmb_sync_sg_for_cpu(conn);
174 for (chunk = 0; chunk < 2; chunk++) { 392 for (chunk = 0; chunk < 2; chunk++) {
175 if (!(flags & MSG_TRUNC)) { 393 if (!(flags & MSG_TRUNC)) {
176 rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off, 394 if (msg) {
177 chunk_len); 395 rc = memcpy_to_msg(msg, rcvbuf_base +
178 if (rc) { 396 chunk_off,
397 chunk_len);
398 } else {
399 rc = smc_rx_splice(pipe, rcvbuf_base +
400 chunk_off, chunk_len,
401 smc);
402 }
403 if (rc < 0) {
179 if (!read_done) 404 if (!read_done)
180 read_done = -EFAULT; 405 read_done = -EFAULT;
181 smc_rmb_sync_sg_for_device(conn); 406 smc_rmb_sync_sg_for_device(conn);
@@ -196,18 +421,13 @@ copy:
196 421
197 /* update cursors */ 422 /* update cursors */
198 if (!(flags & MSG_PEEK)) { 423 if (!(flags & MSG_PEEK)) {
199 smc_curs_add(conn->rmbe_size, &cons, copylen);
200 /* increased in recv tasklet smc_cdc_msg_rcv() */ 424 /* increased in recv tasklet smc_cdc_msg_rcv() */
201 smp_mb__before_atomic(); 425 smp_mb__before_atomic();
202 atomic_sub(copylen, &conn->bytes_to_rcv); 426 atomic_sub(copylen, &conn->bytes_to_rcv);
203 /* guarantee 0 <= bytes_to_rcv <= rmbe_size */ 427 /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */
204 smp_mb__after_atomic(); 428 smp_mb__after_atomic();
205 smc_curs_write(&conn->local_tx_ctrl.cons, 429 if (msg && smc_rx_update_consumer(smc, cons, copylen))
206 smc_curs_read(&cons, conn), 430 goto out;
207 conn);
208 /* send consumer cursor update if required */
209 /* similar to advertising new TCP rcv_wnd if required */
210 smc_tx_consumer_update(conn);
211 } 431 }
212 } while (read_remaining); 432 } while (read_remaining);
213out: 433out:
@@ -217,5 +437,7 @@ out:
217/* Initialize receive properties on connection establishment. NB: not __init! */ 437/* Initialize receive properties on connection establishment. NB: not __init! */
218void smc_rx_init(struct smc_sock *smc) 438void smc_rx_init(struct smc_sock *smc)
219{ 439{
220 smc->sk.sk_data_ready = smc_rx_data_ready; 440 smc->sk.sk_data_ready = smc_rx_wake_up;
441 atomic_set(&smc->conn.splice_pending, 0);
442 smc->conn.urg_state = SMC_URG_READ;
221} 443}
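smc_rx_recvmsg() now also serves splice(): the RMB page is referenced into the pipe, splice_pending tracks the bytes whose consumer-cursor update is deferred until the pipe buffer is released, and blocked readers are woken once that count drops back to zero. From user space this is just the regular splice() system call; a minimal hedged usage example (error handling trimmed, sockfd assumed to be a connected AF_SMC socket):

/* User-space sketch: move data from a connected SMC socket into a pipe
 * without copying it through a user buffer. Standard Linux calls only.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static ssize_t example_drain_to_pipe(int sockfd, int pipe_wr, size_t len)
{
	/* the kernel defers the SMC consumer-cursor update until this
	 * pipe buffer is consumed and released again
	 */
	return splice(sockfd, NULL, pipe_wr, NULL, len, SPLICE_F_MOVE);
}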
diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h
index 3a32b59bf06c..db823c97d824 100644
--- a/net/smc/smc_rx.h
+++ b/net/smc/smc_rx.h
@@ -18,7 +18,14 @@
18#include "smc.h" 18#include "smc.h"
19 19
20void smc_rx_init(struct smc_sock *smc); 20void smc_rx_init(struct smc_sock *smc);
21int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, 21
22 int flags); 22int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
23 struct pipe_inode_info *pipe, size_t len, int flags);
24int smc_rx_wait(struct smc_sock *smc, long *timeo,
25 int (*fcrit)(struct smc_connection *conn));
26static inline int smc_rx_data_available(struct smc_connection *conn)
27{
28 return atomic_read(&conn->bytes_to_rcv);
29}
23 30
 24#endif /* SMC_RX_H */ 31#endif /* SMC_RX_H */
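smc_rx_wait() now takes its wake-up condition as a predicate over the connection, so the same wait loop serves the plain receive path (smc_rx_data_available) and the splice path, which additionally requires that no earlier splice is still pending. Any further criterion can be expressed the same way; a hedged sketch with a hypothetical predicate:

/* Hedged sketch: the predicate below is hypothetical and not part of this
 * patch; it would wake the reader only once urgent data was signalled.
 */
static int example_urg_signalled(struct smc_connection *conn)
{
	return conn->urg_state == SMC_URG_VALID;
}

/* usage, inside a receive path holding the sock lock:
 *	smc_rx_wait(smc, &timeo, example_urg_signalled);
 */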
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 72f004c9c9b1..cee666400752 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -19,6 +19,7 @@
19#include <linux/sched/signal.h> 19#include <linux/sched/signal.h>
20 20
21#include <net/sock.h> 21#include <net/sock.h>
22#include <net/tcp.h>
22 23
23#include "smc.h" 24#include "smc.h"
24#include "smc_wr.h" 25#include "smc_wr.h"
@@ -26,11 +27,12 @@
26#include "smc_tx.h" 27#include "smc_tx.h"
27 28
28#define SMC_TX_WORK_DELAY HZ 29#define SMC_TX_WORK_DELAY HZ
30#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */
29 31
30/***************************** sndbuf producer *******************************/ 32/***************************** sndbuf producer *******************************/
31 33
32/* callback implementation for sk.sk_write_space() 34/* callback implementation for sk.sk_write_space()
33 * to wakeup sndbuf producers that blocked with smc_tx_wait_memory(). 35 * to wakeup sndbuf producers that blocked with smc_tx_wait().
34 * called under sk_socket lock. 36 * called under sk_socket lock.
35 */ 37 */
36static void smc_tx_write_space(struct sock *sk) 38static void smc_tx_write_space(struct sock *sk)
@@ -54,7 +56,7 @@ static void smc_tx_write_space(struct sock *sk)
54 } 56 }
55} 57}
56 58
57/* Wakeup sndbuf producers that blocked with smc_tx_wait_memory(). 59/* Wakeup sndbuf producers that blocked with smc_tx_wait().
58 * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space(). 60 * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
59 */ 61 */
60void smc_tx_sndbuf_nonfull(struct smc_sock *smc) 62void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
@@ -64,8 +66,10 @@ void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
64 smc->sk.sk_write_space(&smc->sk); 66 smc->sk.sk_write_space(&smc->sk);
65} 67}
66 68
67/* blocks sndbuf producer until at least one byte of free space available */ 69/* blocks sndbuf producer until at least one byte of free space available
68static int smc_tx_wait_memory(struct smc_sock *smc, int flags) 70 * or urgent Byte was consumed
71 */
72static int smc_tx_wait(struct smc_sock *smc, int flags)
69{ 73{
70 DEFINE_WAIT_FUNC(wait, woken_wake_function); 74 DEFINE_WAIT_FUNC(wait, woken_wake_function);
71 struct smc_connection *conn = &smc->conn; 75 struct smc_connection *conn = &smc->conn;
@@ -101,20 +105,28 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
101 break; 105 break;
102 } 106 }
103 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 107 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
104 if (atomic_read(&conn->sndbuf_space)) 108 if (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend)
105 break; /* at least 1 byte of free space available */ 109 break; /* at least 1 byte of free & no urgent data */
106 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 110 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
107 sk_wait_event(sk, &timeo, 111 sk_wait_event(sk, &timeo,
108 sk->sk_err || 112 sk->sk_err ||
109 (sk->sk_shutdown & SEND_SHUTDOWN) || 113 (sk->sk_shutdown & SEND_SHUTDOWN) ||
110 smc_cdc_rxed_any_close(conn) || 114 smc_cdc_rxed_any_close(conn) ||
111 atomic_read(&conn->sndbuf_space), 115 (atomic_read(&conn->sndbuf_space) &&
116 !conn->urg_tx_pend),
112 &wait); 117 &wait);
113 } 118 }
114 remove_wait_queue(sk_sleep(sk), &wait); 119 remove_wait_queue(sk_sleep(sk), &wait);
115 return rc; 120 return rc;
116} 121}
117 122
123static bool smc_tx_is_corked(struct smc_sock *smc)
124{
125 struct tcp_sock *tp = tcp_sk(smc->clcsock->sk);
126
127 return (tp->nonagle & TCP_NAGLE_CORK) ? true : false;
128}
129
118/* sndbuf producer: main API called by socket layer. 130/* sndbuf producer: main API called by socket layer.
119 * called under sock lock. 131 * called under sock lock.
120 */ 132 */
@@ -148,8 +160,11 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
148 if (smc_cdc_rxed_any_close(conn)) 160 if (smc_cdc_rxed_any_close(conn))
149 return send_done ?: -ECONNRESET; 161 return send_done ?: -ECONNRESET;
150 162
151 if (!atomic_read(&conn->sndbuf_space)) { 163 if (msg->msg_flags & MSG_OOB)
152 rc = smc_tx_wait_memory(smc, msg->msg_flags); 164 conn->local_tx_ctrl.prod_flags.urg_data_pending = 1;
165
166 if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) {
167 rc = smc_tx_wait(smc, msg->msg_flags);
153 if (rc) { 168 if (rc) {
154 if (send_done) 169 if (send_done)
155 return send_done; 170 return send_done;
@@ -159,7 +174,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
159 } 174 }
160 175
161 /* initialize variables for 1st iteration of subsequent loop */ 176 /* initialize variables for 1st iteration of subsequent loop */
162 /* could be just 1 byte, even after smc_tx_wait_memory above */ 177 /* could be just 1 byte, even after smc_tx_wait above */
163 writespace = atomic_read(&conn->sndbuf_space); 178 writespace = atomic_read(&conn->sndbuf_space);
164 /* not more than what user space asked for */ 179 /* not more than what user space asked for */
165 copylen = min_t(size_t, send_remaining, writespace); 180 copylen = min_t(size_t, send_remaining, writespace);
@@ -171,8 +186,8 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
171 tx_cnt_prep = prep.count; 186 tx_cnt_prep = prep.count;
172 /* determine chunks where to write into sndbuf */ 187 /* determine chunks where to write into sndbuf */
173 /* either unwrapped case, or 1st chunk of wrapped case */ 188 /* either unwrapped case, or 1st chunk of wrapped case */
174 chunk_len = min_t(size_t, 189 chunk_len = min_t(size_t, copylen, conn->sndbuf_desc->len -
175 copylen, conn->sndbuf_size - tx_cnt_prep); 190 tx_cnt_prep);
176 chunk_len_sum = chunk_len; 191 chunk_len_sum = chunk_len;
177 chunk_off = tx_cnt_prep; 192 chunk_off = tx_cnt_prep;
178 smc_sndbuf_sync_sg_for_cpu(conn); 193 smc_sndbuf_sync_sg_for_cpu(conn);
@@ -197,19 +212,30 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
197 } 212 }
198 smc_sndbuf_sync_sg_for_device(conn); 213 smc_sndbuf_sync_sg_for_device(conn);
199 /* update cursors */ 214 /* update cursors */
200 smc_curs_add(conn->sndbuf_size, &prep, copylen); 215 smc_curs_add(conn->sndbuf_desc->len, &prep, copylen);
201 smc_curs_write(&conn->tx_curs_prep, 216 smc_curs_write(&conn->tx_curs_prep,
202 smc_curs_read(&prep, conn), 217 smc_curs_read(&prep, conn),
203 conn); 218 conn);
204 /* increased in send tasklet smc_cdc_tx_handler() */ 219 /* increased in send tasklet smc_cdc_tx_handler() */
205 smp_mb__before_atomic(); 220 smp_mb__before_atomic();
206 atomic_sub(copylen, &conn->sndbuf_space); 221 atomic_sub(copylen, &conn->sndbuf_space);
207 /* guarantee 0 <= sndbuf_space <= sndbuf_size */ 222 /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
208 smp_mb__after_atomic(); 223 smp_mb__after_atomic();
209 /* since we just produced more new data into sndbuf, 224 /* since we just produced more new data into sndbuf,
210 * trigger sndbuf consumer: RDMA write into peer RMBE and CDC 225 * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
211 */ 226 */
212 smc_tx_sndbuf_nonempty(conn); 227 if ((msg->msg_flags & MSG_OOB) && !send_remaining)
228 conn->urg_tx_pend = true;
229 if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) &&
230 (atomic_read(&conn->sndbuf_space) >
231 (conn->sndbuf_desc->len >> 1)))
232 /* for a corked socket defer the RDMA writes if there
233 * is still sufficient sndbuf_space available
234 */
235 schedule_delayed_work(&conn->tx_work,
236 SMC_TX_CORK_DELAY);
237 else
238 smc_tx_sndbuf_nonempty(conn);
213 } /* while (msg_data_left(msg)) */ 239 } /* while (msg_data_left(msg)) */
214 240
215 return send_done; 241 return send_done;
@@ -243,7 +269,7 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
243 rdma_wr.remote_addr = 269 rdma_wr.remote_addr =
244 lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr + 270 lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr +
245 /* RMBE within RMB */ 271 /* RMBE within RMB */
246 ((conn->peer_conn_idx - 1) * conn->peer_rmbe_size) + 272 conn->tx_off +
247 /* offset within RMBE */ 273 /* offset within RMBE */
248 peer_rmbe_offset; 274 peer_rmbe_offset;
249 rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; 275 rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
@@ -268,7 +294,7 @@ static inline void smc_tx_advance_cursors(struct smc_connection *conn,
268 atomic_sub(len, &conn->peer_rmbe_space); 294 atomic_sub(len, &conn->peer_rmbe_space);
269 /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */ 295 /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
270 smp_mb__after_atomic(); 296 smp_mb__after_atomic();
271 smc_curs_add(conn->sndbuf_size, sent, len); 297 smc_curs_add(conn->sndbuf_desc->len, sent, len);
272} 298}
273 299
274/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit; 300/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
@@ -281,6 +307,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
281 union smc_host_cursor sent, prep, prod, cons; 307 union smc_host_cursor sent, prep, prod, cons;
282 struct ib_sge sges[SMC_IB_MAX_SEND_SGE]; 308 struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
283 struct smc_link_group *lgr = conn->lgr; 309 struct smc_link_group *lgr = conn->lgr;
310 struct smc_cdc_producer_flags *pflags;
284 int to_send, rmbespace; 311 int to_send, rmbespace;
285 struct smc_link *link; 312 struct smc_link *link;
286 dma_addr_t dma_addr; 313 dma_addr_t dma_addr;
@@ -291,7 +318,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
291 smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn); 318 smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
292 smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn); 319 smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
293 /* cf. wmem_alloc - (snd_max - snd_una) */ 320 /* cf. wmem_alloc - (snd_max - snd_una) */
294 to_send = smc_curs_diff(conn->sndbuf_size, &sent, &prep); 321 to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
295 if (to_send <= 0) 322 if (to_send <= 0)
296 return 0; 323 return 0;
297 324
@@ -308,7 +335,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
308 conn); 335 conn);
309 336
310 /* if usable snd_wnd closes ask peer to advertise once it opens again */ 337 /* if usable snd_wnd closes ask peer to advertise once it opens again */
311 conn->local_tx_ctrl.prod_flags.write_blocked = (to_send >= rmbespace); 338 pflags = &conn->local_tx_ctrl.prod_flags;
339 pflags->write_blocked = (to_send >= rmbespace);
312 /* cf. usable snd_wnd */ 340 /* cf. usable snd_wnd */
313 len = min(to_send, rmbespace); 341 len = min(to_send, rmbespace);
314 342
@@ -333,12 +361,12 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
333 dst_len_sum = dst_len; 361 dst_len_sum = dst_len;
334 src_off = sent.count; 362 src_off = sent.count;
335 /* dst_len determines the maximum src_len */ 363 /* dst_len determines the maximum src_len */
336 if (sent.count + dst_len <= conn->sndbuf_size) { 364 if (sent.count + dst_len <= conn->sndbuf_desc->len) {
337 /* unwrapped src case: single chunk of entire dst_len */ 365 /* unwrapped src case: single chunk of entire dst_len */
338 src_len = dst_len; 366 src_len = dst_len;
339 } else { 367 } else {
340 /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */ 368 /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */
341 src_len = conn->sndbuf_size - sent.count; 369 src_len = conn->sndbuf_desc->len - sent.count;
342 } 370 }
343 src_len_sum = src_len; 371 src_len_sum = src_len;
344 dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); 372 dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl);
@@ -350,8 +378,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
350 sges[srcchunk].lkey = link->roce_pd->local_dma_lkey; 378 sges[srcchunk].lkey = link->roce_pd->local_dma_lkey;
351 num_sges++; 379 num_sges++;
352 src_off += src_len; 380 src_off += src_len;
353 if (src_off >= conn->sndbuf_size) 381 if (src_off >= conn->sndbuf_desc->len)
354 src_off -= conn->sndbuf_size; 382 src_off -= conn->sndbuf_desc->len;
355 /* modulo in send ring */ 383 /* modulo in send ring */
356 if (src_len_sum == dst_len) 384 if (src_len_sum == dst_len)
357 break; /* either on 1st or 2nd iteration */ 385 break; /* either on 1st or 2nd iteration */
@@ -369,10 +397,12 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
369 dst_len = len - dst_len; /* remainder */ 397 dst_len = len - dst_len; /* remainder */
370 dst_len_sum += dst_len; 398 dst_len_sum += dst_len;
371 src_len = min_t(int, 399 src_len = min_t(int,
372 dst_len, conn->sndbuf_size - sent.count); 400 dst_len, conn->sndbuf_desc->len - sent.count);
373 src_len_sum = src_len; 401 src_len_sum = src_len;
374 } 402 }
375 403
404 if (conn->urg_tx_pend && len == to_send)
405 pflags->urg_data_present = 1;
376 smc_tx_advance_cursors(conn, &prod, &sent, len); 406 smc_tx_advance_cursors(conn, &prod, &sent, len);
377 /* update connection's cursors with advanced local cursors */ 407 /* update connection's cursors with advanced local cursors */
378 smc_curs_write(&conn->local_tx_ctrl.prod, 408 smc_curs_write(&conn->local_tx_ctrl.prod,
@@ -392,6 +422,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
392 */ 422 */
393int smc_tx_sndbuf_nonempty(struct smc_connection *conn) 423int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
394{ 424{
425 struct smc_cdc_producer_flags *pflags;
395 struct smc_cdc_tx_pend *pend; 426 struct smc_cdc_tx_pend *pend;
396 struct smc_wr_buf *wr_buf; 427 struct smc_wr_buf *wr_buf;
397 int rc; 428 int rc;
@@ -409,20 +440,27 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
409 } 440 }
410 rc = 0; 441 rc = 0;
411 if (conn->alert_token_local) /* connection healthy */ 442 if (conn->alert_token_local) /* connection healthy */
412 schedule_delayed_work(&conn->tx_work, 443 mod_delayed_work(system_wq, &conn->tx_work,
413 SMC_TX_WORK_DELAY); 444 SMC_TX_WORK_DELAY);
414 } 445 }
415 goto out_unlock; 446 goto out_unlock;
416 } 447 }
417 448
418 rc = smc_tx_rdma_writes(conn); 449 if (!conn->local_tx_ctrl.prod_flags.urg_data_present) {
419 if (rc) { 450 rc = smc_tx_rdma_writes(conn);
420 smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], 451 if (rc) {
421 (struct smc_wr_tx_pend_priv *)pend); 452 smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
422 goto out_unlock; 453 (struct smc_wr_tx_pend_priv *)pend);
454 goto out_unlock;
455 }
423 } 456 }
424 457
425 rc = smc_cdc_msg_send(conn, wr_buf, pend); 458 rc = smc_cdc_msg_send(conn, wr_buf, pend);
459 pflags = &conn->local_tx_ctrl.prod_flags;
460 if (!rc && pflags->urg_data_present) {
461 pflags->urg_data_pending = 0;
462 pflags->urg_data_present = 0;
463 }
426 464
427out_unlock: 465out_unlock:
428 spin_unlock_bh(&conn->send_lock); 466 spin_unlock_bh(&conn->send_lock);
@@ -432,7 +470,7 @@ out_unlock:
432/* Wakeup sndbuf consumers from process context 470/* Wakeup sndbuf consumers from process context
433 * since there is more data to transmit 471 * since there is more data to transmit
434 */ 472 */
435static void smc_tx_work(struct work_struct *work) 473void smc_tx_work(struct work_struct *work)
436{ 474{
437 struct smc_connection *conn = container_of(to_delayed_work(work), 475 struct smc_connection *conn = container_of(to_delayed_work(work),
438 struct smc_connection, 476 struct smc_connection,
@@ -455,7 +493,7 @@ out:
455 release_sock(&smc->sk); 493 release_sock(&smc->sk);
456} 494}
457 495
458void smc_tx_consumer_update(struct smc_connection *conn) 496void smc_tx_consumer_update(struct smc_connection *conn, bool force)
459{ 497{
460 union smc_host_cursor cfed, cons; 498 union smc_host_cursor cfed, cons;
461 int to_confirm; 499 int to_confirm;
@@ -466,11 +504,12 @@ void smc_tx_consumer_update(struct smc_connection *conn)
466 smc_curs_write(&cfed, 504 smc_curs_write(&cfed,
467 smc_curs_read(&conn->rx_curs_confirmed, conn), 505 smc_curs_read(&conn->rx_curs_confirmed, conn),
468 conn); 506 conn);
469 to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons); 507 to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons);
470 508
471 if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || 509 if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
510 force ||
472 ((to_confirm > conn->rmbe_update_limit) && 511 ((to_confirm > conn->rmbe_update_limit) &&
473 ((to_confirm > (conn->rmbe_size / 2)) || 512 ((to_confirm > (conn->rmb_desc->len / 2)) ||
474 conn->local_rx_ctrl.prod_flags.write_blocked))) { 513 conn->local_rx_ctrl.prod_flags.write_blocked))) {
475 if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && 514 if ((smc_cdc_get_slot_and_msg_send(conn) < 0) &&
476 conn->alert_token_local) { /* connection healthy */ 515 conn->alert_token_local) { /* connection healthy */
@@ -494,6 +533,4 @@ void smc_tx_consumer_update(struct smc_connection *conn)
494void smc_tx_init(struct smc_sock *smc) 533void smc_tx_init(struct smc_sock *smc)
495{ 534{
496 smc->sk.sk_write_space = smc_tx_write_space; 535 smc->sk.sk_write_space = smc_tx_write_space;
497 INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
498 spin_lock_init(&smc->conn.send_lock);
499} 536}
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
index 78255964fa4d..9d2238909fa0 100644
--- a/net/smc/smc_tx.h
+++ b/net/smc/smc_tx.h
@@ -24,13 +24,14 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn)
24 24
25 smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn); 25 smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
26 smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn); 26 smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
27 return smc_curs_diff(conn->sndbuf_size, &sent, &prep); 27 return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
28} 28}
29 29
30void smc_tx_work(struct work_struct *work);
30void smc_tx_init(struct smc_sock *smc); 31void smc_tx_init(struct smc_sock *smc);
31int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); 32int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
32int smc_tx_sndbuf_nonempty(struct smc_connection *conn); 33int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
33void smc_tx_sndbuf_nonfull(struct smc_sock *smc); 34void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
34void smc_tx_consumer_update(struct smc_connection *conn); 35void smc_tx_consumer_update(struct smc_connection *conn, bool force);
35 36
36#endif /* SMC_TX_H */ 37#endif /* SMC_TX_H */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 1b8af23e6e2b..cc7c1bb60fe8 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -376,6 +376,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
376 for (i = 0; i < num; i++) { 376 for (i = 0; i < num; i++) {
377 link = wc[i].qp->qp_context; 377 link = wc[i].qp->qp_context;
378 if (wc[i].status == IB_WC_SUCCESS) { 378 if (wc[i].status == IB_WC_SUCCESS) {
379 link->wr_rx_tstamp = jiffies;
379 smc_wr_rx_demultiplex(&wc[i]); 380 smc_wr_rx_demultiplex(&wc[i]);
380 smc_wr_rx_post(link); /* refill WR RX */ 381 smc_wr_rx_post(link); /* refill WR RX */
381 } else { 382 } else {
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 092bebc70048..1a9695183599 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -512,6 +512,19 @@ int strp_init(struct strparser *strp, struct sock *sk,
512} 512}
513EXPORT_SYMBOL_GPL(strp_init); 513EXPORT_SYMBOL_GPL(strp_init);
514 514
515/* Sock process lock held (lock_sock) */
516void __strp_unpause(struct strparser *strp)
517{
518 strp->paused = 0;
519
520 if (strp->need_bytes) {
521 if (strp_peek_len(strp) < strp->need_bytes)
522 return;
523 }
524 strp_read_sock(strp);
525}
526EXPORT_SYMBOL_GPL(__strp_unpause);
527
515void strp_unpause(struct strparser *strp) 528void strp_unpause(struct strparser *strp)
516{ 529{
517 strp->paused = 0; 530 strp->paused = 0;
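
The __strp_unpause() helper added above is a variant of strp_unpause() for callers that already hold the socket process lock: instead of deferring the read to the parser's workqueue it parses pending input synchronously via strp_read_sock(), honouring a pending need_bytes threshold. A minimal, hypothetical caller sketch (the "example_" name is a placeholder, not part of this patch):

	#include <net/sock.h>
	#include <net/strparser.h>

	/* Resume parsing while serialized on the socket, e.g. after the ULP
	 * has drained whatever made the parser pause.
	 */
	static void example_rx_resume(struct sock *sk, struct strparser *strp)
	{
		lock_sock(sk);
		/* ... consume queued records ... */
		__strp_unpause(strp);	/* reads further input inline if enough bytes are queued */
		release_sock(sk);
	}
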
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index f7d47c89d658..2dfb492a7c94 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -697,6 +697,9 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg,
697 goto prop_msg_full; 697 goto prop_msg_full;
698 if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->window)) 698 if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->window))
699 goto prop_msg_full; 699 goto prop_msg_full;
700 if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP)
701 if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, bearer->mtu))
702 goto prop_msg_full;
700 703
701 nla_nest_end(msg->skb, prop); 704 nla_nest_end(msg->skb, prop);
702 705
@@ -979,12 +982,23 @@ int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info)
979 982
980 if (props[TIPC_NLA_PROP_TOL]) { 983 if (props[TIPC_NLA_PROP_TOL]) {
981 b->tolerance = nla_get_u32(props[TIPC_NLA_PROP_TOL]); 984 b->tolerance = nla_get_u32(props[TIPC_NLA_PROP_TOL]);
982 tipc_node_apply_tolerance(net, b); 985 tipc_node_apply_property(net, b, TIPC_NLA_PROP_TOL);
983 } 986 }
984 if (props[TIPC_NLA_PROP_PRIO]) 987 if (props[TIPC_NLA_PROP_PRIO])
985 b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); 988 b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
986 if (props[TIPC_NLA_PROP_WIN]) 989 if (props[TIPC_NLA_PROP_WIN])
987 b->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); 990 b->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
991 if (props[TIPC_NLA_PROP_MTU]) {
992 if (b->media->type_id != TIPC_MEDIA_TYPE_UDP)
993 return -EINVAL;
994#ifdef CONFIG_TIPC_MEDIA_UDP
995 if (tipc_udp_mtu_bad(nla_get_u32
996 (props[TIPC_NLA_PROP_MTU])))
997 return -EINVAL;
998 b->mtu = nla_get_u32(props[TIPC_NLA_PROP_MTU]);
999 tipc_node_apply_property(net, b, TIPC_NLA_PROP_MTU);
1000#endif
1001 }
988 } 1002 }
989 1003
990 return 0; 1004 return 0;
@@ -1029,6 +1043,9 @@ static int __tipc_nl_add_media(struct tipc_nl_msg *msg,
1029 goto prop_msg_full; 1043 goto prop_msg_full;
1030 if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->window)) 1044 if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->window))
1031 goto prop_msg_full; 1045 goto prop_msg_full;
1046 if (media->type_id == TIPC_MEDIA_TYPE_UDP)
1047 if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, media->mtu))
1048 goto prop_msg_full;
1032 1049
1033 nla_nest_end(msg->skb, prop); 1050 nla_nest_end(msg->skb, prop);
1034 nla_nest_end(msg->skb, attrs); 1051 nla_nest_end(msg->skb, attrs);
@@ -1158,6 +1175,16 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info)
1158 m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); 1175 m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
1159 if (props[TIPC_NLA_PROP_WIN]) 1176 if (props[TIPC_NLA_PROP_WIN])
1160 m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); 1177 m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
1178 if (props[TIPC_NLA_PROP_MTU]) {
1179 if (m->type_id != TIPC_MEDIA_TYPE_UDP)
1180 return -EINVAL;
1181#ifdef CONFIG_TIPC_MEDIA_UDP
1182 if (tipc_udp_mtu_bad(nla_get_u32
1183 (props[TIPC_NLA_PROP_MTU])))
1184 return -EINVAL;
1185 m->mtu = nla_get_u32(props[TIPC_NLA_PROP_MTU]);
1186#endif
1187 }
1161 } 1188 }
1162 1189
1163 return 0; 1190 return 0;
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 6efcee63a381..394290cbbb1d 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -94,6 +94,8 @@ struct tipc_bearer;
94 * @priority: default link (and bearer) priority 94 * @priority: default link (and bearer) priority
95 * @tolerance: default time (in ms) before declaring link failure 95 * @tolerance: default time (in ms) before declaring link failure
96 * @window: default window (in packets) before declaring link congestion 96 * @window: default window (in packets) before declaring link congestion
97 * @mtu: max packet size bearer can support for media type not dependent on
98 * underlying device MTU
97 * @type_id: TIPC media identifier 99 * @type_id: TIPC media identifier
98 * @hwaddr_len: TIPC media address len 100 * @hwaddr_len: TIPC media address len
99 * @name: media name 101 * @name: media name
@@ -118,6 +120,7 @@ struct tipc_media {
118 u32 priority; 120 u32 priority;
119 u32 tolerance; 121 u32 tolerance;
120 u32 window; 122 u32 window;
123 u32 mtu;
121 u32 type_id; 124 u32 type_id;
122 u32 hwaddr_len; 125 u32 hwaddr_len;
123 char name[TIPC_MAX_MEDIA_NAME]; 126 char name[TIPC_MAX_MEDIA_NAME];
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index dd1c4fa2eb78..bebe88cae07b 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -136,12 +136,12 @@ static struct tipc_service *tipc_service_create(u32 type, struct hlist_head *hd)
136} 136}
137 137
138/** 138/**
139 * tipc_service_find_range - find service range matching a service instance 139 * tipc_service_first_range - find first service range in tree matching instance
140 * 140 *
141 * Very time-critical, so binary search through range rb tree 141 * Very time-critical, so binary search through range rb tree
142 */ 142 */
143static struct service_range *tipc_service_find_range(struct tipc_service *sc, 143static struct service_range *tipc_service_first_range(struct tipc_service *sc,
144 u32 instance) 144 u32 instance)
145{ 145{
146 struct rb_node *n = sc->ranges.rb_node; 146 struct rb_node *n = sc->ranges.rb_node;
147 struct service_range *sr; 147 struct service_range *sr;
@@ -158,6 +158,30 @@ static struct service_range *tipc_service_find_range(struct tipc_service *sc,
158 return NULL; 158 return NULL;
159} 159}
160 160
161/* tipc_service_find_range - find service range matching publication parameters
162 */
163static struct service_range *tipc_service_find_range(struct tipc_service *sc,
164 u32 lower, u32 upper)
165{
166 struct rb_node *n = sc->ranges.rb_node;
167 struct service_range *sr;
168
169 sr = tipc_service_first_range(sc, lower);
170 if (!sr)
171 return NULL;
172
173 /* Look for exact match */
174 for (n = &sr->tree_node; n; n = rb_next(n)) {
175 sr = container_of(n, struct service_range, tree_node);
176 if (sr->upper == upper)
177 break;
178 }
179 if (!n || sr->lower != lower || sr->upper != upper)
180 return NULL;
181
182 return sr;
183}
184
161static struct service_range *tipc_service_create_range(struct tipc_service *sc, 185static struct service_range *tipc_service_create_range(struct tipc_service *sc,
162 u32 lower, u32 upper) 186 u32 lower, u32 upper)
163{ 187{
@@ -238,54 +262,19 @@ err:
238/** 262/**
239 * tipc_service_remove_publ - remove a publication from a service 263 * tipc_service_remove_publ - remove a publication from a service
240 */ 264 */
241static struct publication *tipc_service_remove_publ(struct net *net, 265static struct publication *tipc_service_remove_publ(struct service_range *sr,
242 struct tipc_service *sc, 266 u32 node, u32 key)
243 u32 lower, u32 upper,
244 u32 node, u32 key,
245 struct service_range **rng)
246{ 267{
247 struct tipc_subscription *sub, *tmp;
248 struct service_range *sr;
249 struct publication *p; 268 struct publication *p;
250 bool found = false;
251 bool last = false;
252 struct rb_node *n;
253
254 sr = tipc_service_find_range(sc, lower);
255 if (!sr)
256 return NULL;
257 269
258 /* Find exact matching service range */
259 for (n = &sr->tree_node; n; n = rb_next(n)) {
260 sr = container_of(n, struct service_range, tree_node);
261 if (sr->upper == upper)
262 break;
263 }
264 if (!n || sr->lower != lower || sr->upper != upper)
265 return NULL;
266
267 /* Find publication, if it exists */
268 list_for_each_entry(p, &sr->all_publ, all_publ) { 270 list_for_each_entry(p, &sr->all_publ, all_publ) {
269 if (p->key != key || (node && node != p->node)) 271 if (p->key != key || (node && node != p->node))
270 continue; 272 continue;
271 found = true; 273 list_del(&p->all_publ);
272 break; 274 list_del(&p->local_publ);
275 return p;
273 } 276 }
274 if (!found) 277 return NULL;
275 return NULL;
276
277 list_del(&p->all_publ);
278 list_del(&p->local_publ);
279 if (list_empty(&sr->all_publ))
280 last = true;
281
282 /* Notify any waiting subscriptions */
283 list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) {
284 tipc_sub_report_overlap(sub, p->lower, p->upper, TIPC_WITHDRAWN,
285 p->port, p->node, p->scope, last);
286 }
287 *rng = sr;
288 return p;
289} 278}
290 279
291/** 280/**
@@ -376,17 +365,31 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
376 u32 node, u32 key) 365 u32 node, u32 key)
377{ 366{
378 struct tipc_service *sc = tipc_service_find(net, type); 367 struct tipc_service *sc = tipc_service_find(net, type);
368 struct tipc_subscription *sub, *tmp;
379 struct service_range *sr = NULL; 369 struct service_range *sr = NULL;
380 struct publication *p = NULL; 370 struct publication *p = NULL;
371 bool last;
381 372
382 if (!sc) 373 if (!sc)
383 return NULL; 374 return NULL;
384 375
385 spin_lock_bh(&sc->lock); 376 spin_lock_bh(&sc->lock);
386 p = tipc_service_remove_publ(net, sc, lower, upper, node, key, &sr); 377 sr = tipc_service_find_range(sc, lower, upper);
378 if (!sr)
379 goto exit;
380 p = tipc_service_remove_publ(sr, node, key);
381 if (!p)
382 goto exit;
383
384 /* Notify any waiting subscriptions */
385 last = list_empty(&sr->all_publ);
386 list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) {
387 tipc_sub_report_overlap(sub, lower, upper, TIPC_WITHDRAWN,
388 p->port, node, p->scope, last);
389 }
387 390
388 /* Remove service range item if this was its last publication */ 391 /* Remove service range item if this was its last publication */
389 if (sr && list_empty(&sr->all_publ)) { 392 if (list_empty(&sr->all_publ)) {
390 rb_erase(&sr->tree_node, &sc->ranges); 393 rb_erase(&sr->tree_node, &sc->ranges);
391 kfree(sr); 394 kfree(sr);
392 } 395 }
@@ -396,6 +399,7 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
396 hlist_del_init_rcu(&sc->service_list); 399 hlist_del_init_rcu(&sc->service_list);
397 kfree_rcu(sc, rcu); 400 kfree_rcu(sc, rcu);
398 } 401 }
402exit:
399 spin_unlock_bh(&sc->lock); 403 spin_unlock_bh(&sc->lock);
400 return p; 404 return p;
401} 405}
@@ -437,7 +441,7 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *dnode)
437 goto not_found; 441 goto not_found;
438 442
439 spin_lock_bh(&sc->lock); 443 spin_lock_bh(&sc->lock);
440 sr = tipc_service_find_range(sc, instance); 444 sr = tipc_service_first_range(sc, instance);
441 if (unlikely(!sr)) 445 if (unlikely(!sr))
442 goto no_match; 446 goto no_match;
443 447
@@ -484,7 +488,7 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope,
484 488
485 spin_lock_bh(&sc->lock); 489 spin_lock_bh(&sc->lock);
486 490
487 sr = tipc_service_find_range(sc, instance); 491 sr = tipc_service_first_range(sc, instance);
488 if (!sr) 492 if (!sr)
489 goto no_match; 493 goto no_match;
490 494
@@ -756,8 +760,7 @@ static void tipc_service_delete(struct net *net, struct tipc_service *sc)
756 spin_lock_bh(&sc->lock); 760 spin_lock_bh(&sc->lock);
757 rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) { 761 rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) {
758 list_for_each_entry_safe(p, tmp, &sr->all_publ, all_publ) { 762 list_for_each_entry_safe(p, tmp, &sr->all_publ, all_publ) {
759 tipc_service_remove_publ(net, sc, p->lower, p->upper, 763 tipc_service_remove_publ(sr, p->node, p->key);
760 p->node, p->key, &sr);
761 kfree_rcu(p, rcu); 764 kfree_rcu(p, rcu);
762 } 765 }
763 rb_erase(&sr->tree_node, &sc->ranges); 766 rb_erase(&sr->tree_node, &sc->ranges);
diff --git a/net/tipc/node.c b/net/tipc/node.c
index f29549de9245..6a44eb812baf 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -195,6 +195,27 @@ int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel)
195 return mtu; 195 return mtu;
196} 196}
197 197
198bool tipc_node_get_id(struct net *net, u32 addr, u8 *id)
199{
200 u8 *own_id = tipc_own_id(net);
201 struct tipc_node *n;
202
203 if (!own_id)
204 return true;
205
206 if (addr == tipc_own_addr(net)) {
207 memcpy(id, own_id, TIPC_NODEID_LEN);
208 return true;
209 }
210 n = tipc_node_find(net, addr);
211 if (!n)
212 return false;
213
214 memcpy(id, &n->peer_id, TIPC_NODEID_LEN);
215 tipc_node_put(n);
216 return true;
217}
218
198u16 tipc_node_get_capabilities(struct net *net, u32 addr) 219u16 tipc_node_get_capabilities(struct net *net, u32 addr)
199{ 220{
200 struct tipc_node *n; 221 struct tipc_node *n;
@@ -1681,7 +1702,8 @@ discard:
1681 kfree_skb(skb); 1702 kfree_skb(skb);
1682} 1703}
1683 1704
1684void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b) 1705void tipc_node_apply_property(struct net *net, struct tipc_bearer *b,
1706 int prop)
1685{ 1707{
1686 struct tipc_net *tn = tipc_net(net); 1708 struct tipc_net *tn = tipc_net(net);
1687 int bearer_id = b->identity; 1709 int bearer_id = b->identity;
@@ -1696,8 +1718,13 @@ void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b)
1696 list_for_each_entry_rcu(n, &tn->node_list, list) { 1718 list_for_each_entry_rcu(n, &tn->node_list, list) {
1697 tipc_node_write_lock(n); 1719 tipc_node_write_lock(n);
1698 e = &n->links[bearer_id]; 1720 e = &n->links[bearer_id];
1699 if (e->link) 1721 if (e->link) {
1700 tipc_link_set_tolerance(e->link, b->tolerance, &xmitq); 1722 if (prop == TIPC_NLA_PROP_TOL)
1723 tipc_link_set_tolerance(e->link, b->tolerance,
1724 &xmitq);
1725 else if (prop == TIPC_NLA_PROP_MTU)
1726 tipc_link_set_mtu(e->link, b->mtu);
1727 }
1701 tipc_node_write_unlock(n); 1728 tipc_node_write_unlock(n);
1702 tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr); 1729 tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr);
1703 } 1730 }
diff --git a/net/tipc/node.h b/net/tipc/node.h
index f24b83500df1..846c8f240872 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -60,6 +60,7 @@ enum {
60#define INVALID_BEARER_ID -1 60#define INVALID_BEARER_ID -1
61 61
62void tipc_node_stop(struct net *net); 62void tipc_node_stop(struct net *net);
63bool tipc_node_get_id(struct net *net, u32 addr, u8 *id);
63u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr); 64u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr);
64void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128, 65void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128,
65 struct tipc_bearer *bearer, 66 struct tipc_bearer *bearer,
@@ -67,7 +68,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128,
67 struct tipc_media_addr *maddr, 68 struct tipc_media_addr *maddr,
68 bool *respond, bool *dupl_addr); 69 bool *respond, bool *dupl_addr);
69void tipc_node_delete_links(struct net *net, int bearer_id); 70void tipc_node_delete_links(struct net *net, int bearer_id);
70void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b); 71void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, int prop);
71int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node, 72int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node,
72 char *linkname, size_t len); 73 char *linkname, size_t len);
73int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, 74int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode,
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 3bb45042e833..14a5d055717d 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2970,7 +2970,8 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt,
2970 2970
2971static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 2971static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2972{ 2972{
2973 struct sock *sk = sock->sk; 2973 struct net *net = sock_net(sock->sk);
2974 struct tipc_sioc_nodeid_req nr = {0};
2974 struct tipc_sioc_ln_req lnr; 2975 struct tipc_sioc_ln_req lnr;
2975 void __user *argp = (void __user *)arg; 2976 void __user *argp = (void __user *)arg;
2976 2977
@@ -2978,7 +2979,7 @@ static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2978 case SIOCGETLINKNAME: 2979 case SIOCGETLINKNAME:
2979 if (copy_from_user(&lnr, argp, sizeof(lnr))) 2980 if (copy_from_user(&lnr, argp, sizeof(lnr)))
2980 return -EFAULT; 2981 return -EFAULT;
2981 if (!tipc_node_get_linkname(sock_net(sk), 2982 if (!tipc_node_get_linkname(net,
2982 lnr.bearer_id & 0xffff, lnr.peer, 2983 lnr.bearer_id & 0xffff, lnr.peer,
2983 lnr.linkname, TIPC_MAX_LINK_NAME)) { 2984 lnr.linkname, TIPC_MAX_LINK_NAME)) {
2984 if (copy_to_user(argp, &lnr, sizeof(lnr))) 2985 if (copy_to_user(argp, &lnr, sizeof(lnr)))
@@ -2986,6 +2987,14 @@ static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2986 return 0; 2987 return 0;
2987 } 2988 }
2988 return -EADDRNOTAVAIL; 2989 return -EADDRNOTAVAIL;
2990 case SIOCGETNODEID:
2991 if (copy_from_user(&nr, argp, sizeof(nr)))
2992 return -EFAULT;
2993 if (!tipc_node_get_id(net, nr.peer, nr.node_id))
2994 return -EADDRNOTAVAIL;
2995 if (copy_to_user(argp, &nr, sizeof(nr)))
2996 return -EFAULT;
2997 return 0;
2989 default: 2998 default:
2990 return -ENOIOCTLCMD; 2999 return -ENOIOCTLCMD;
2991 } 3000 }
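
The new SIOCGETNODEID case copies a tipc_sioc_nodeid_req from user space, resolves the peer's node identity via tipc_node_get_id(), and copies the structure back (failing with -EADDRNOTAVAIL for an unknown peer). A hedged user-space sketch, assuming the uapi <linux/tipc.h> exports SIOCGETNODEID, TIPC_NODEID_LEN and the .peer/.node_id layout used here:

	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/tipc.h>

	/* Print the node identity of a known peer address, using an already
	 * opened AF_TIPC socket descriptor.
	 */
	static int print_node_id(int tipc_sd, __u32 peer_addr)
	{
		struct tipc_sioc_nodeid_req nr;
		int i;

		memset(&nr, 0, sizeof(nr));
		nr.peer = peer_addr;
		if (ioctl(tipc_sd, SIOCGETNODEID, &nr) < 0)
			return -1;	/* errno EADDRNOTAVAIL: peer not known */
		for (i = 0; i < TIPC_NODEID_LEN; i++)
			printf("%02x", (unsigned char)nr.node_id[i]);
		printf("\n");
		return 0;
	}
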
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index e7d91f5d5cae..9783101bc4a9 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -713,8 +713,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
713 err = -EINVAL; 713 err = -EINVAL;
714 goto err; 714 goto err;
715 } 715 }
716 b->mtu = dev->mtu - sizeof(struct iphdr) 716 b->mtu = b->media->mtu;
717 - sizeof(struct udphdr);
718#if IS_ENABLED(CONFIG_IPV6) 717#if IS_ENABLED(CONFIG_IPV6)
719 } else if (local.proto == htons(ETH_P_IPV6)) { 718 } else if (local.proto == htons(ETH_P_IPV6)) {
720 udp_conf.family = AF_INET6; 719 udp_conf.family = AF_INET6;
@@ -803,6 +802,7 @@ struct tipc_media udp_media_info = {
803 .priority = TIPC_DEF_LINK_PRI, 802 .priority = TIPC_DEF_LINK_PRI,
804 .tolerance = TIPC_DEF_LINK_TOL, 803 .tolerance = TIPC_DEF_LINK_TOL,
805 .window = TIPC_DEF_LINK_WIN, 804 .window = TIPC_DEF_LINK_WIN,
805 .mtu = TIPC_DEF_LINK_UDP_MTU,
806 .type_id = TIPC_MEDIA_TYPE_UDP, 806 .type_id = TIPC_MEDIA_TYPE_UDP,
807 .hwaddr_len = 0, 807 .hwaddr_len = 0,
808 .name = "udp" 808 .name = "udp"
diff --git a/net/tipc/udp_media.h b/net/tipc/udp_media.h
index 281bbae87726..e7455cc73e16 100644
--- a/net/tipc/udp_media.h
+++ b/net/tipc/udp_media.h
@@ -38,9 +38,23 @@
38#ifndef _TIPC_UDP_MEDIA_H 38#ifndef _TIPC_UDP_MEDIA_H
39#define _TIPC_UDP_MEDIA_H 39#define _TIPC_UDP_MEDIA_H
40 40
41#include <linux/ip.h>
42#include <linux/udp.h>
43
41int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr); 44int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr);
42int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b); 45int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b);
43int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb); 46int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb);
44 47
48/* check if configured MTU is too low for tipc headers */
49static inline bool tipc_udp_mtu_bad(u32 mtu)
50{
51 if (mtu >= (TIPC_MIN_BEARER_MTU + sizeof(struct iphdr) +
52 sizeof(struct udphdr)))
53 return false;
54
55 pr_warn("MTU too low for tipc bearer\n");
56 return true;
57}
58
45#endif 59#endif
46#endif 60#endif
diff --git a/net/tls/Kconfig b/net/tls/Kconfig
index 89b8745a986f..73f05ece53d0 100644
--- a/net/tls/Kconfig
+++ b/net/tls/Kconfig
@@ -14,3 +14,13 @@ config TLS
14 encryption handling of the TLS protocol to be done in-kernel. 14 encryption handling of the TLS protocol to be done in-kernel.
15 15
16 If unsure, say N. 16 If unsure, say N.
17
18config TLS_DEVICE
19 bool "Transport Layer Security HW offload"
20 depends on TLS
21 select SOCK_VALIDATE_XMIT
22 default n
23 help
24 Enable kernel support for HW offload of the TLS protocol.
25
26 If unsure, say N.
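
TLS_DEVICE is a bool that depends on TLS and selects SOCK_VALIDATE_XMIT, so both symbols must be enabled for the tls_device objects added to the Makefile below to be built. A hedged .config fragment:

	# in-kernel TLS record layer plus the new TX hardware offload path
	CONFIG_TLS=m
	CONFIG_TLS_DEVICE=y
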
diff --git a/net/tls/Makefile b/net/tls/Makefile
index a930fd1c4f7b..4d6b728a67d0 100644
--- a/net/tls/Makefile
+++ b/net/tls/Makefile
@@ -5,3 +5,5 @@
5obj-$(CONFIG_TLS) += tls.o 5obj-$(CONFIG_TLS) += tls.o
6 6
7tls-y := tls_main.o tls_sw.o 7tls-y := tls_main.o tls_sw.o
8
9tls-$(CONFIG_TLS_DEVICE) += tls_device.o tls_device_fallback.o
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
new file mode 100644
index 000000000000..a7a8f8e20ff3
--- /dev/null
+++ b/net/tls/tls_device.c
@@ -0,0 +1,766 @@
1/* Copyright (c) 2018, Mellanox Technologies All rights reserved.
2 *
3 * This software is available to you under a choice of one of two
4 * licenses. You may choose to be licensed under the terms of the GNU
5 * General Public License (GPL) Version 2, available from the file
6 * COPYING in the main directory of this source tree, or the
7 * OpenIB.org BSD license below:
8 *
9 * Redistribution and use in source and binary forms, with or
10 * without modification, are permitted provided that the following
11 * conditions are met:
12 *
13 * - Redistributions of source code must retain the above
14 * copyright notice, this list of conditions and the following
15 * disclaimer.
16 *
17 * - Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials
20 * provided with the distribution.
21 *
22 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
23 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
24 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
25 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
26 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
27 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
28 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29 * SOFTWARE.
30 */
31
32#include <crypto/aead.h>
33#include <linux/highmem.h>
34#include <linux/module.h>
35#include <linux/netdevice.h>
36#include <net/dst.h>
37#include <net/inet_connection_sock.h>
38#include <net/tcp.h>
39#include <net/tls.h>
40
41/* device_offload_lock is used to synchronize tls_dev_add
42 * against NETDEV_DOWN notifications.
43 */
44static DECLARE_RWSEM(device_offload_lock);
45
46static void tls_device_gc_task(struct work_struct *work);
47
48static DECLARE_WORK(tls_device_gc_work, tls_device_gc_task);
49static LIST_HEAD(tls_device_gc_list);
50static LIST_HEAD(tls_device_list);
51static DEFINE_SPINLOCK(tls_device_lock);
52
53static void tls_device_free_ctx(struct tls_context *ctx)
54{
55 struct tls_offload_context *offload_ctx = tls_offload_ctx(ctx);
56
57 kfree(offload_ctx);
58 kfree(ctx);
59}
60
61static void tls_device_gc_task(struct work_struct *work)
62{
63 struct tls_context *ctx, *tmp;
64 unsigned long flags;
65 LIST_HEAD(gc_list);
66
67 spin_lock_irqsave(&tls_device_lock, flags);
68 list_splice_init(&tls_device_gc_list, &gc_list);
69 spin_unlock_irqrestore(&tls_device_lock, flags);
70
71 list_for_each_entry_safe(ctx, tmp, &gc_list, list) {
72 struct net_device *netdev = ctx->netdev;
73
74 if (netdev) {
75 netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
76 TLS_OFFLOAD_CTX_DIR_TX);
77 dev_put(netdev);
78 }
79
80 list_del(&ctx->list);
81 tls_device_free_ctx(ctx);
82 }
83}
84
85static void tls_device_queue_ctx_destruction(struct tls_context *ctx)
86{
87 unsigned long flags;
88
89 spin_lock_irqsave(&tls_device_lock, flags);
90 list_move_tail(&ctx->list, &tls_device_gc_list);
91
92 /* schedule_work inside the spinlock
93 * to make sure tls_device_down waits for that work.
94 */
95 schedule_work(&tls_device_gc_work);
96
97 spin_unlock_irqrestore(&tls_device_lock, flags);
98}
99
100/* We assume that the socket is already connected */
101static struct net_device *get_netdev_for_sock(struct sock *sk)
102{
103 struct dst_entry *dst = sk_dst_get(sk);
104 struct net_device *netdev = NULL;
105
106 if (likely(dst)) {
107 netdev = dst->dev;
108 dev_hold(netdev);
109 }
110
111 dst_release(dst);
112
113 return netdev;
114}
115
116static void destroy_record(struct tls_record_info *record)
117{
118 int nr_frags = record->num_frags;
119 skb_frag_t *frag;
120
121 while (nr_frags-- > 0) {
122 frag = &record->frags[nr_frags];
123 __skb_frag_unref(frag);
124 }
125 kfree(record);
126}
127
128static void delete_all_records(struct tls_offload_context *offload_ctx)
129{
130 struct tls_record_info *info, *temp;
131
132 list_for_each_entry_safe(info, temp, &offload_ctx->records_list, list) {
133 list_del(&info->list);
134 destroy_record(info);
135 }
136
137 offload_ctx->retransmit_hint = NULL;
138}
139
140static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq)
141{
142 struct tls_context *tls_ctx = tls_get_ctx(sk);
143 struct tls_record_info *info, *temp;
144 struct tls_offload_context *ctx;
145 u64 deleted_records = 0;
146 unsigned long flags;
147
148 if (!tls_ctx)
149 return;
150
151 ctx = tls_offload_ctx(tls_ctx);
152
153 spin_lock_irqsave(&ctx->lock, flags);
154 info = ctx->retransmit_hint;
155 if (info && !before(acked_seq, info->end_seq)) {
156 ctx->retransmit_hint = NULL;
157 list_del(&info->list);
158 destroy_record(info);
159 deleted_records++;
160 }
161
162 list_for_each_entry_safe(info, temp, &ctx->records_list, list) {
163 if (before(acked_seq, info->end_seq))
164 break;
165 list_del(&info->list);
166
167 destroy_record(info);
168 deleted_records++;
169 }
170
171 ctx->unacked_record_sn += deleted_records;
172 spin_unlock_irqrestore(&ctx->lock, flags);
173}
174
175/* At this point, there should be no references on this
176 * socket and no in-flight SKBs associated with this
177 * socket, so it is safe to free all the resources.
178 */
179void tls_device_sk_destruct(struct sock *sk)
180{
181 struct tls_context *tls_ctx = tls_get_ctx(sk);
182 struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
183
184 if (ctx->open_record)
185 destroy_record(ctx->open_record);
186
187 delete_all_records(ctx);
188 crypto_free_aead(ctx->aead_send);
189 ctx->sk_destruct(sk);
190 clean_acked_data_disable(inet_csk(sk));
191
192 if (refcount_dec_and_test(&tls_ctx->refcount))
193 tls_device_queue_ctx_destruction(tls_ctx);
194}
195EXPORT_SYMBOL(tls_device_sk_destruct);
196
197static void tls_append_frag(struct tls_record_info *record,
198 struct page_frag *pfrag,
199 int size)
200{
201 skb_frag_t *frag;
202
203 frag = &record->frags[record->num_frags - 1];
204 if (frag->page.p == pfrag->page &&
205 frag->page_offset + frag->size == pfrag->offset) {
206 frag->size += size;
207 } else {
208 ++frag;
209 frag->page.p = pfrag->page;
210 frag->page_offset = pfrag->offset;
211 frag->size = size;
212 ++record->num_frags;
213 get_page(pfrag->page);
214 }
215
216 pfrag->offset += size;
217 record->len += size;
218}
219
220static int tls_push_record(struct sock *sk,
221 struct tls_context *ctx,
222 struct tls_offload_context *offload_ctx,
223 struct tls_record_info *record,
224 struct page_frag *pfrag,
225 int flags,
226 unsigned char record_type)
227{
228 struct tcp_sock *tp = tcp_sk(sk);
229 struct page_frag dummy_tag_frag;
230 skb_frag_t *frag;
231 int i;
232
233 /* fill prepend */
234 frag = &record->frags[0];
235 tls_fill_prepend(ctx,
236 skb_frag_address(frag),
237 record->len - ctx->tx.prepend_size,
238 record_type);
239
240 /* HW doesn't care about the data in the tag, because it fills it. */
241 dummy_tag_frag.page = skb_frag_page(frag);
242 dummy_tag_frag.offset = 0;
243
244 tls_append_frag(record, &dummy_tag_frag, ctx->tx.tag_size);
245 record->end_seq = tp->write_seq + record->len;
246 spin_lock_irq(&offload_ctx->lock);
247 list_add_tail(&record->list, &offload_ctx->records_list);
248 spin_unlock_irq(&offload_ctx->lock);
249 offload_ctx->open_record = NULL;
250 set_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags);
251 tls_advance_record_sn(sk, &ctx->tx);
252
253 for (i = 0; i < record->num_frags; i++) {
254 frag = &record->frags[i];
255 sg_unmark_end(&offload_ctx->sg_tx_data[i]);
256 sg_set_page(&offload_ctx->sg_tx_data[i], skb_frag_page(frag),
257 frag->size, frag->page_offset);
258 sk_mem_charge(sk, frag->size);
259 get_page(skb_frag_page(frag));
260 }
261 sg_mark_end(&offload_ctx->sg_tx_data[record->num_frags - 1]);
262
263 /* all ready, send */
264 return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags);
265}
266
267static int tls_create_new_record(struct tls_offload_context *offload_ctx,
268 struct page_frag *pfrag,
269 size_t prepend_size)
270{
271 struct tls_record_info *record;
272 skb_frag_t *frag;
273
274 record = kmalloc(sizeof(*record), GFP_KERNEL);
275 if (!record)
276 return -ENOMEM;
277
278 frag = &record->frags[0];
279 __skb_frag_set_page(frag, pfrag->page);
280 frag->page_offset = pfrag->offset;
281 skb_frag_size_set(frag, prepend_size);
282
283 get_page(pfrag->page);
284 pfrag->offset += prepend_size;
285
286 record->num_frags = 1;
287 record->len = prepend_size;
288 offload_ctx->open_record = record;
289 return 0;
290}
291
292static int tls_do_allocation(struct sock *sk,
293 struct tls_offload_context *offload_ctx,
294 struct page_frag *pfrag,
295 size_t prepend_size)
296{
297 int ret;
298
299 if (!offload_ctx->open_record) {
300 if (unlikely(!skb_page_frag_refill(prepend_size, pfrag,
301 sk->sk_allocation))) {
302 sk->sk_prot->enter_memory_pressure(sk);
303 sk_stream_moderate_sndbuf(sk);
304 return -ENOMEM;
305 }
306
307 ret = tls_create_new_record(offload_ctx, pfrag, prepend_size);
308 if (ret)
309 return ret;
310
311 if (pfrag->size > pfrag->offset)
312 return 0;
313 }
314
315 if (!sk_page_frag_refill(sk, pfrag))
316 return -ENOMEM;
317
318 return 0;
319}
320
321static int tls_push_data(struct sock *sk,
322 struct iov_iter *msg_iter,
323 size_t size, int flags,
324 unsigned char record_type)
325{
326 struct tls_context *tls_ctx = tls_get_ctx(sk);
327 struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
328 int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST;
329 int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE);
330 struct tls_record_info *record = ctx->open_record;
331 struct page_frag *pfrag;
332 size_t orig_size = size;
333 u32 max_open_record_len;
334 int copy, rc = 0;
335 bool done = false;
336 long timeo;
337
338 if (flags &
339 ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST))
340 return -ENOTSUPP;
341
342 if (sk->sk_err)
343 return -sk->sk_err;
344
345 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
346 rc = tls_complete_pending_work(sk, tls_ctx, flags, &timeo);
347 if (rc < 0)
348 return rc;
349
350 pfrag = sk_page_frag(sk);
351
352 /* TLS_HEADER_SIZE is not counted as part of the TLS record, and
353 * we need to leave room for an authentication tag.
354 */
355 max_open_record_len = TLS_MAX_PAYLOAD_SIZE +
356 tls_ctx->tx.prepend_size;
357 do {
358 rc = tls_do_allocation(sk, ctx, pfrag,
359 tls_ctx->tx.prepend_size);
360 if (rc) {
361 rc = sk_stream_wait_memory(sk, &timeo);
362 if (!rc)
363 continue;
364
365 record = ctx->open_record;
366 if (!record)
367 break;
368handle_error:
369 if (record_type != TLS_RECORD_TYPE_DATA) {
370 /* avoid sending partial
371 * record with type !=
372 * application_data
373 */
374 size = orig_size;
375 destroy_record(record);
376 ctx->open_record = NULL;
377 } else if (record->len > tls_ctx->tx.prepend_size) {
378 goto last_record;
379 }
380
381 break;
382 }
383
384 record = ctx->open_record;
385 copy = min_t(size_t, size, (pfrag->size - pfrag->offset));
386 copy = min_t(size_t, copy, (max_open_record_len - record->len));
387
388 if (copy_from_iter_nocache(page_address(pfrag->page) +
389 pfrag->offset,
390 copy, msg_iter) != copy) {
391 rc = -EFAULT;
392 goto handle_error;
393 }
394 tls_append_frag(record, pfrag, copy);
395
396 size -= copy;
397 if (!size) {
398last_record:
399 tls_push_record_flags = flags;
400 if (more) {
401 tls_ctx->pending_open_record_frags =
402 record->num_frags;
403 break;
404 }
405
406 done = true;
407 }
408
409 if (done || record->len >= max_open_record_len ||
410 (record->num_frags >= MAX_SKB_FRAGS - 1)) {
411 rc = tls_push_record(sk,
412 tls_ctx,
413 ctx,
414 record,
415 pfrag,
416 tls_push_record_flags,
417 record_type);
418 if (rc < 0)
419 break;
420 }
421 } while (!done);
422
423 if (orig_size - size > 0)
424 rc = orig_size - size;
425
426 return rc;
427}
428
429int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
430{
431 unsigned char record_type = TLS_RECORD_TYPE_DATA;
432 int rc;
433
434 lock_sock(sk);
435
436 if (unlikely(msg->msg_controllen)) {
437 rc = tls_proccess_cmsg(sk, msg, &record_type);
438 if (rc)
439 goto out;
440 }
441
442 rc = tls_push_data(sk, &msg->msg_iter, size,
443 msg->msg_flags, record_type);
444
445out:
446 release_sock(sk);
447 return rc;
448}
449
450int tls_device_sendpage(struct sock *sk, struct page *page,
451 int offset, size_t size, int flags)
452{
453 struct iov_iter msg_iter;
454 char *kaddr = kmap(page);
455 struct kvec iov;
456 int rc;
457
458 if (flags & MSG_SENDPAGE_NOTLAST)
459 flags |= MSG_MORE;
460
461 lock_sock(sk);
462
463 if (flags & MSG_OOB) {
464 rc = -ENOTSUPP;
465 goto out;
466 }
467
468 iov.iov_base = kaddr + offset;
469 iov.iov_len = size;
470 iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, &iov, 1, size);
471 rc = tls_push_data(sk, &msg_iter, size,
472 flags, TLS_RECORD_TYPE_DATA);
473 kunmap(page);
474
475out:
476 release_sock(sk);
477 return rc;
478}
479
480struct tls_record_info *tls_get_record(struct tls_offload_context *context,
481 u32 seq, u64 *p_record_sn)
482{
483 u64 record_sn = context->hint_record_sn;
484 struct tls_record_info *info;
485
486 info = context->retransmit_hint;
487 if (!info ||
488 before(seq, info->end_seq - info->len)) {
489 /* if retransmit_hint is irrelevant start
 490 * from the beginning of the list

491 */
492 info = list_first_entry(&context->records_list,
493 struct tls_record_info, list);
494 record_sn = context->unacked_record_sn;
495 }
496
497 list_for_each_entry_from(info, &context->records_list, list) {
498 if (before(seq, info->end_seq)) {
499 if (!context->retransmit_hint ||
500 after(info->end_seq,
501 context->retransmit_hint->end_seq)) {
502 context->hint_record_sn = record_sn;
503 context->retransmit_hint = info;
504 }
505 *p_record_sn = record_sn;
506 return info;
507 }
508 record_sn++;
509 }
510
511 return NULL;
512}
513EXPORT_SYMBOL(tls_get_record);
514
515static int tls_device_push_pending_record(struct sock *sk, int flags)
516{
517 struct iov_iter msg_iter;
518
519 iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, NULL, 0, 0);
520 return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA);
521}
522
523int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
524{
525 u16 nonce_size, tag_size, iv_size, rec_seq_size;
526 struct tls_record_info *start_marker_record;
527 struct tls_offload_context *offload_ctx;
528 struct tls_crypto_info *crypto_info;
529 struct net_device *netdev;
530 char *iv, *rec_seq;
531 struct sk_buff *skb;
532 int rc = -EINVAL;
533 __be64 rcd_sn;
534
535 if (!ctx)
536 goto out;
537
538 if (ctx->priv_ctx_tx) {
539 rc = -EEXIST;
540 goto out;
541 }
542
543 start_marker_record = kmalloc(sizeof(*start_marker_record), GFP_KERNEL);
544 if (!start_marker_record) {
545 rc = -ENOMEM;
546 goto out;
547 }
548
549 offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE, GFP_KERNEL);
550 if (!offload_ctx) {
551 rc = -ENOMEM;
552 goto free_marker_record;
553 }
554
555 crypto_info = &ctx->crypto_send;
556 switch (crypto_info->cipher_type) {
557 case TLS_CIPHER_AES_GCM_128:
558 nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE;
559 tag_size = TLS_CIPHER_AES_GCM_128_TAG_SIZE;
560 iv_size = TLS_CIPHER_AES_GCM_128_IV_SIZE;
561 iv = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->iv;
562 rec_seq_size = TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE;
563 rec_seq =
564 ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->rec_seq;
565 break;
566 default:
567 rc = -EINVAL;
568 goto free_offload_ctx;
569 }
570
571 ctx->tx.prepend_size = TLS_HEADER_SIZE + nonce_size;
572 ctx->tx.tag_size = tag_size;
573 ctx->tx.overhead_size = ctx->tx.prepend_size + ctx->tx.tag_size;
574 ctx->tx.iv_size = iv_size;
575 ctx->tx.iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
576 GFP_KERNEL);
577 if (!ctx->tx.iv) {
578 rc = -ENOMEM;
579 goto free_offload_ctx;
580 }
581
582 memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
583
584 ctx->tx.rec_seq_size = rec_seq_size;
585 ctx->tx.rec_seq = kmalloc(rec_seq_size, GFP_KERNEL);
586 if (!ctx->tx.rec_seq) {
587 rc = -ENOMEM;
588 goto free_iv;
589 }
590 memcpy(ctx->tx.rec_seq, rec_seq, rec_seq_size);
591
592 rc = tls_sw_fallback_init(sk, offload_ctx, crypto_info);
593 if (rc)
594 goto free_rec_seq;
595
596 /* start at rec_seq - 1 to account for the start marker record */
597 memcpy(&rcd_sn, ctx->tx.rec_seq, sizeof(rcd_sn));
598 offload_ctx->unacked_record_sn = be64_to_cpu(rcd_sn) - 1;
599
600 start_marker_record->end_seq = tcp_sk(sk)->write_seq;
601 start_marker_record->len = 0;
602 start_marker_record->num_frags = 0;
603
604 INIT_LIST_HEAD(&offload_ctx->records_list);
605 list_add_tail(&start_marker_record->list, &offload_ctx->records_list);
606 spin_lock_init(&offload_ctx->lock);
607 sg_init_table(offload_ctx->sg_tx_data,
608 ARRAY_SIZE(offload_ctx->sg_tx_data));
609
610 clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked);
611 ctx->push_pending_record = tls_device_push_pending_record;
612 offload_ctx->sk_destruct = sk->sk_destruct;
613
614 /* TLS offload is greatly simplified if we don't send
615 * SKBs where only part of the payload needs to be encrypted.
616 * So mark the last skb in the write queue as end of record.
617 */
618 skb = tcp_write_queue_tail(sk);
619 if (skb)
620 TCP_SKB_CB(skb)->eor = 1;
621
622 refcount_set(&ctx->refcount, 1);
623
624 /* We support starting offload on multiple sockets
625 * concurrently, so we only need a read lock here.
626 * This lock must precede get_netdev_for_sock to prevent races between
627 * NETDEV_DOWN and setsockopt.
628 */
629 down_read(&device_offload_lock);
630 netdev = get_netdev_for_sock(sk);
631 if (!netdev) {
632 pr_err_ratelimited("%s: netdev not found\n", __func__);
633 rc = -EINVAL;
634 goto release_lock;
635 }
636
637 if (!(netdev->features & NETIF_F_HW_TLS_TX)) {
638 rc = -ENOTSUPP;
639 goto release_netdev;
640 }
641
642 /* Avoid offloading if the device is down
643 * We don't want to offload new flows after
644 * the NETDEV_DOWN event
645 */
646 if (!(netdev->flags & IFF_UP)) {
647 rc = -EINVAL;
648 goto release_netdev;
649 }
650
651 ctx->priv_ctx_tx = offload_ctx;
652 rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX,
653 &ctx->crypto_send,
654 tcp_sk(sk)->write_seq);
655 if (rc)
656 goto release_netdev;
657
658 ctx->netdev = netdev;
659
660 spin_lock_irq(&tls_device_lock);
661 list_add_tail(&ctx->list, &tls_device_list);
662 spin_unlock_irq(&tls_device_lock);
663
664 sk->sk_validate_xmit_skb = tls_validate_xmit_skb;
665 /* following this assignment tls_is_sk_tx_device_offloaded
666 * will return true and the context might be accessed
667 * by the netdev's xmit function.
668 */
669 smp_store_release(&sk->sk_destruct,
670 &tls_device_sk_destruct);
671 up_read(&device_offload_lock);
672 goto out;
673
674release_netdev:
675 dev_put(netdev);
676release_lock:
677 up_read(&device_offload_lock);
678 clean_acked_data_disable(inet_csk(sk));
679 crypto_free_aead(offload_ctx->aead_send);
680free_rec_seq:
681 kfree(ctx->tx.rec_seq);
682free_iv:
683 kfree(ctx->tx.iv);
684free_offload_ctx:
685 kfree(offload_ctx);
686 ctx->priv_ctx_tx = NULL;
687free_marker_record:
688 kfree(start_marker_record);
689out:
690 return rc;
691}
692
693static int tls_device_down(struct net_device *netdev)
694{
695 struct tls_context *ctx, *tmp;
696 unsigned long flags;
697 LIST_HEAD(list);
698
699 /* Request a write lock to block new offload attempts */
700 down_write(&device_offload_lock);
701
702 spin_lock_irqsave(&tls_device_lock, flags);
703 list_for_each_entry_safe(ctx, tmp, &tls_device_list, list) {
704 if (ctx->netdev != netdev ||
705 !refcount_inc_not_zero(&ctx->refcount))
706 continue;
707
708 list_move(&ctx->list, &list);
709 }
710 spin_unlock_irqrestore(&tls_device_lock, flags);
711
712 list_for_each_entry_safe(ctx, tmp, &list, list) {
713 netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
714 TLS_OFFLOAD_CTX_DIR_TX);
715 ctx->netdev = NULL;
716 dev_put(netdev);
717 list_del_init(&ctx->list);
718
719 if (refcount_dec_and_test(&ctx->refcount))
720 tls_device_free_ctx(ctx);
721 }
722
723 up_write(&device_offload_lock);
724
725 flush_work(&tls_device_gc_work);
726
727 return NOTIFY_DONE;
728}
729
730static int tls_dev_event(struct notifier_block *this, unsigned long event,
731 void *ptr)
732{
733 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
734
735 if (!(dev->features & NETIF_F_HW_TLS_TX))
736 return NOTIFY_DONE;
737
738 switch (event) {
739 case NETDEV_REGISTER:
740 case NETDEV_FEAT_CHANGE:
741 if (dev->tlsdev_ops &&
742 dev->tlsdev_ops->tls_dev_add &&
743 dev->tlsdev_ops->tls_dev_del)
744 return NOTIFY_DONE;
745 else
746 return NOTIFY_BAD;
747 case NETDEV_DOWN:
748 return tls_device_down(dev);
749 }
750 return NOTIFY_DONE;
751}
752
753static struct notifier_block tls_dev_notifier = {
754 .notifier_call = tls_dev_event,
755};
756
757void __init tls_device_init(void)
758{
759 register_netdevice_notifier(&tls_dev_notifier);
760}
761
762void __exit tls_device_cleanup(void)
763{
764 unregister_netdevice_notifier(&tls_dev_notifier);
765 flush_work(&tls_device_gc_work);
766}
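
tls_dev_event() above treats a device that advertises NETIF_F_HW_TLS_TX without both tls_dev_add and tls_dev_del as broken (NOTIFY_BAD), so a driver opting into TX offload provides both callbacks before setting the feature bit. A hedged driver-side sketch; the "foo_" names are placeholders and the prototypes are inferred from the call sites in tls_set_device_offload() and tls_device_down():

	#include <linux/netdevice.h>
	#include <net/tls.h>

	static int foo_tls_dev_add(struct net_device *netdev, struct sock *sk,
				   enum tls_offload_ctx_dir direction,
				   struct tls_crypto_info *crypto_info,
				   u32 start_offload_tcp_sn)
	{
		/* program the NIC with the TX crypto state for this socket */
		return 0;
	}

	static void foo_tls_dev_del(struct net_device *netdev,
				    struct tls_context *tls_ctx,
				    enum tls_offload_ctx_dir direction)
	{
		/* release the per-connection hardware state */
	}

	static const struct tlsdev_ops foo_tlsdev_ops = {
		.tls_dev_add = foo_tls_dev_add,
		.tls_dev_del = foo_tls_dev_del,
	};

	static void foo_enable_tls_offload(struct net_device *netdev)
	{
		netdev->tlsdev_ops = &foo_tlsdev_ops;
		netdev->hw_features |= NETIF_F_HW_TLS_TX;
		netdev->features |= NETIF_F_HW_TLS_TX;
	}
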
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
new file mode 100644
index 000000000000..748914abdb60
--- /dev/null
+++ b/net/tls/tls_device_fallback.c
@@ -0,0 +1,450 @@
1/* Copyright (c) 2018, Mellanox Technologies All rights reserved.
2 *
3 * This software is available to you under a choice of one of two
4 * licenses. You may choose to be licensed under the terms of the GNU
5 * General Public License (GPL) Version 2, available from the file
6 * COPYING in the main directory of this source tree, or the
7 * OpenIB.org BSD license below:
8 *
9 * Redistribution and use in source and binary forms, with or
10 * without modification, are permitted provided that the following
11 * conditions are met:
12 *
13 * - Redistributions of source code must retain the above
14 * copyright notice, this list of conditions and the following
15 * disclaimer.
16 *
17 * - Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials
20 * provided with the distribution.
21 *
22 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
23 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
24 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
25 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
26 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
27 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
28 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29 * SOFTWARE.
30 */
31
32#include <net/tls.h>
33#include <crypto/aead.h>
34#include <crypto/scatterwalk.h>
35#include <net/ip6_checksum.h>
36
37static void chain_to_walk(struct scatterlist *sg, struct scatter_walk *walk)
38{
39 struct scatterlist *src = walk->sg;
40 int diff = walk->offset - src->offset;
41
42 sg_set_page(sg, sg_page(src),
43 src->length - diff, walk->offset);
44
45 scatterwalk_crypto_chain(sg, sg_next(src), 0, 2);
46}
47
48static int tls_enc_record(struct aead_request *aead_req,
49 struct crypto_aead *aead, char *aad,
50 char *iv, __be64 rcd_sn,
51 struct scatter_walk *in,
52 struct scatter_walk *out, int *in_len)
53{
54 unsigned char buf[TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE];
55 struct scatterlist sg_in[3];
56 struct scatterlist sg_out[3];
57 u16 len;
58 int rc;
59
60 len = min_t(int, *in_len, ARRAY_SIZE(buf));
61
62 scatterwalk_copychunks(buf, in, len, 0);
63 scatterwalk_copychunks(buf, out, len, 1);
64
65 *in_len -= len;
66 if (!*in_len)
67 return 0;
68
69 scatterwalk_pagedone(in, 0, 1);
70 scatterwalk_pagedone(out, 1, 1);
71
72 len = buf[4] | (buf[3] << 8);
73 len -= TLS_CIPHER_AES_GCM_128_IV_SIZE;
74
75 tls_make_aad(aad, len - TLS_CIPHER_AES_GCM_128_TAG_SIZE,
76 (char *)&rcd_sn, sizeof(rcd_sn), buf[0]);
77
78 memcpy(iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, buf + TLS_HEADER_SIZE,
79 TLS_CIPHER_AES_GCM_128_IV_SIZE);
80
81 sg_init_table(sg_in, ARRAY_SIZE(sg_in));
82 sg_init_table(sg_out, ARRAY_SIZE(sg_out));
83 sg_set_buf(sg_in, aad, TLS_AAD_SPACE_SIZE);
84 sg_set_buf(sg_out, aad, TLS_AAD_SPACE_SIZE);
85 chain_to_walk(sg_in + 1, in);
86 chain_to_walk(sg_out + 1, out);
87
88 *in_len -= len;
89 if (*in_len < 0) {
90 *in_len += TLS_CIPHER_AES_GCM_128_TAG_SIZE;
91 /* the input buffer doesn't contain the entire record.
92 * trim len accordingly. The resulting authentication tag
93 * will contain garbage, but we don't care, so we won't
94 * include any of it in the output skb
95 * Note that we assume the output buffer length
 96 * is larger than input buffer length + tag size
97 */
98 if (*in_len < 0)
99 len += *in_len;
100
101 *in_len = 0;
102 }
103
104 if (*in_len) {
105 scatterwalk_copychunks(NULL, in, len, 2);
106 scatterwalk_pagedone(in, 0, 1);
107 scatterwalk_copychunks(NULL, out, len, 2);
108 scatterwalk_pagedone(out, 1, 1);
109 }
110
111 len -= TLS_CIPHER_AES_GCM_128_TAG_SIZE;
112 aead_request_set_crypt(aead_req, sg_in, sg_out, len, iv);
113
114 rc = crypto_aead_encrypt(aead_req);
115
116 return rc;
117}
118
119static void tls_init_aead_request(struct aead_request *aead_req,
120 struct crypto_aead *aead)
121{
122 aead_request_set_tfm(aead_req, aead);
123 aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
124}
125
126static struct aead_request *tls_alloc_aead_request(struct crypto_aead *aead,
127 gfp_t flags)
128{
129 unsigned int req_size = sizeof(struct aead_request) +
130 crypto_aead_reqsize(aead);
131 struct aead_request *aead_req;
132
133 aead_req = kzalloc(req_size, flags);
134 if (aead_req)
135 tls_init_aead_request(aead_req, aead);
136 return aead_req;
137}
138
139static int tls_enc_records(struct aead_request *aead_req,
140 struct crypto_aead *aead, struct scatterlist *sg_in,
141 struct scatterlist *sg_out, char *aad, char *iv,
142 u64 rcd_sn, int len)
143{
144 struct scatter_walk out, in;
145 int rc;
146
147 scatterwalk_start(&in, sg_in);
148 scatterwalk_start(&out, sg_out);
149
150 do {
151 rc = tls_enc_record(aead_req, aead, aad, iv,
152 cpu_to_be64(rcd_sn), &in, &out, &len);
153 rcd_sn++;
154
155 } while (rc == 0 && len);
156
157 scatterwalk_done(&in, 0, 0);
158 scatterwalk_done(&out, 1, 0);
159
160 return rc;
161}
162
163/* Can't use icsk->icsk_af_ops->send_check here because the ip addresses
164 * might have been changed by NAT.
165 */
166static void update_chksum(struct sk_buff *skb, int headln)
167{
168 struct tcphdr *th = tcp_hdr(skb);
169 int datalen = skb->len - headln;
170 const struct ipv6hdr *ipv6h;
171 const struct iphdr *iph;
172
173 /* We only changed the payload so if we are using partial we don't
174 * need to update anything.
175 */
176 if (likely(skb->ip_summed == CHECKSUM_PARTIAL))
177 return;
178
179 skb->ip_summed = CHECKSUM_PARTIAL;
180 skb->csum_start = skb_transport_header(skb) - skb->head;
181 skb->csum_offset = offsetof(struct tcphdr, check);
182
183 if (skb->sk->sk_family == AF_INET6) {
184 ipv6h = ipv6_hdr(skb);
185 th->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
186 datalen, IPPROTO_TCP, 0);
187 } else {
188 iph = ip_hdr(skb);
189 th->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen,
190 IPPROTO_TCP, 0);
191 }
192}
193
194static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln)
195{
196 skb_copy_header(nskb, skb);
197
198 skb_put(nskb, skb->len);
199 memcpy(nskb->data, skb->data, headln);
200 update_chksum(nskb, headln);
201
202 nskb->destructor = skb->destructor;
203 nskb->sk = skb->sk;
204 skb->destructor = NULL;
205 skb->sk = NULL;
206 refcount_add(nskb->truesize - skb->truesize,
207 &nskb->sk->sk_wmem_alloc);
208}
209
210/* This function may be called after the user socket is already
211 * closed so make sure we don't use anything freed during
212 * tls_sk_proto_close here
213 */
214
215static int fill_sg_in(struct scatterlist *sg_in,
216 struct sk_buff *skb,
217 struct tls_offload_context *ctx,
218 u64 *rcd_sn,
219 s32 *sync_size,
220 int *resync_sgs)
221{
222 int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
223 int payload_len = skb->len - tcp_payload_offset;
224 u32 tcp_seq = ntohl(tcp_hdr(skb)->seq);
225 struct tls_record_info *record;
226 unsigned long flags;
227 int remaining;
228 int i;
229
230 spin_lock_irqsave(&ctx->lock, flags);
231 record = tls_get_record(ctx, tcp_seq, rcd_sn);
232 if (!record) {
233 spin_unlock_irqrestore(&ctx->lock, flags);
234 WARN(1, "Record not found for seq %u\n", tcp_seq);
235 return -EINVAL;
236 }
237
238 *sync_size = tcp_seq - tls_record_start_seq(record);
239 if (*sync_size < 0) {
240 int is_start_marker = tls_record_is_start_marker(record);
241
242 spin_unlock_irqrestore(&ctx->lock, flags);
243 /* This should only occur if the relevant record was
244 * already acked. In that case it should be ok
245 * to drop the packet and avoid retransmission.
246 *
247 * There is a corner case where the packet contains
248 * both an acked and a non-acked record.
249 * We currently don't handle that case and rely
 250 * on TCP to retransmit a packet that doesn't contain
251 * already acked payload.
252 */
253 if (!is_start_marker)
254 *sync_size = 0;
255 return -EINVAL;
256 }
257
258 remaining = *sync_size;
259 for (i = 0; remaining > 0; i++) {
260 skb_frag_t *frag = &record->frags[i];
261
262 __skb_frag_ref(frag);
263 sg_set_page(sg_in + i, skb_frag_page(frag),
264 skb_frag_size(frag), frag->page_offset);
265
266 remaining -= skb_frag_size(frag);
267
268 if (remaining < 0)
269 sg_in[i].length += remaining;
270 }
271 *resync_sgs = i;
272
273 spin_unlock_irqrestore(&ctx->lock, flags);
274 if (skb_to_sgvec(skb, &sg_in[i], tcp_payload_offset, payload_len) < 0)
275 return -EINVAL;
276
277 return 0;
278}
279
280static void fill_sg_out(struct scatterlist sg_out[3], void *buf,
281 struct tls_context *tls_ctx,
282 struct sk_buff *nskb,
283 int tcp_payload_offset,
284 int payload_len,
285 int sync_size,
286 void *dummy_buf)
287{
288 sg_set_buf(&sg_out[0], dummy_buf, sync_size);
289 sg_set_buf(&sg_out[1], nskb->data + tcp_payload_offset, payload_len);
290 /* Add room for authentication tag produced by crypto */
291 dummy_buf += sync_size;
292 sg_set_buf(&sg_out[2], dummy_buf, TLS_CIPHER_AES_GCM_128_TAG_SIZE);
293}
294
295static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx,
296 struct scatterlist sg_out[3],
297 struct scatterlist *sg_in,
298 struct sk_buff *skb,
299 s32 sync_size, u64 rcd_sn)
300{
301 int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
302 struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
303 int payload_len = skb->len - tcp_payload_offset;
304 void *buf, *iv, *aad, *dummy_buf;
305 struct aead_request *aead_req;
306 struct sk_buff *nskb = NULL;
307 int buf_len;
308
309 aead_req = tls_alloc_aead_request(ctx->aead_send, GFP_ATOMIC);
310 if (!aead_req)
311 return NULL;
312
313 buf_len = TLS_CIPHER_AES_GCM_128_SALT_SIZE +
314 TLS_CIPHER_AES_GCM_128_IV_SIZE +
315 TLS_AAD_SPACE_SIZE +
316 sync_size +
317 TLS_CIPHER_AES_GCM_128_TAG_SIZE;
318 buf = kmalloc(buf_len, GFP_ATOMIC);
319 if (!buf)
320 goto free_req;
321
322 iv = buf;
323 memcpy(iv, tls_ctx->crypto_send_aes_gcm_128.salt,
324 TLS_CIPHER_AES_GCM_128_SALT_SIZE);
325 aad = buf + TLS_CIPHER_AES_GCM_128_SALT_SIZE +
326 TLS_CIPHER_AES_GCM_128_IV_SIZE;
327 dummy_buf = aad + TLS_AAD_SPACE_SIZE;
328
329 nskb = alloc_skb(skb_headroom(skb) + skb->len, GFP_ATOMIC);
330 if (!nskb)
331 goto free_buf;
332
333 skb_reserve(nskb, skb_headroom(skb));
334
335 fill_sg_out(sg_out, buf, tls_ctx, nskb, tcp_payload_offset,
336 payload_len, sync_size, dummy_buf);
337
338 if (tls_enc_records(aead_req, ctx->aead_send, sg_in, sg_out, aad, iv,
339 rcd_sn, sync_size + payload_len) < 0)
340 goto free_nskb;
341
342 complete_skb(nskb, skb, tcp_payload_offset);
343
344 /* validate_xmit_skb_list assumes that if the skb wasn't segmented
345 * nskb->prev will point to the skb itself
346 */
347 nskb->prev = nskb;
348
349free_buf:
350 kfree(buf);
351free_req:
352 kfree(aead_req);
353 return nskb;
354free_nskb:
355 kfree_skb(nskb);
356 nskb = NULL;
357 goto free_buf;
358}
359
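The buffer allocated in tls_enc_skb() above is a single kmalloc() carved into the salt/IV, the AAD scratch space, the re-encrypted "sync" bytes and the GCM tag, back to back. A minimal standalone sketch of that offset arithmetic, not part of the patch, assuming the usual AES-GCM-128 sizes (salt 4, IV 8, AAD space 13, tag 16) purely for illustration:

/* Standalone sketch, not part of the patch: how the single kmalloc() buffer
 * in tls_enc_skb() above is laid out.  The sizes mirror the AES-GCM-128
 * constants used by net/tls; treat the exact numbers as illustrative.
 */
#include <stdio.h>

#define SALT_SIZE 4
#define IV_SIZE   8
#define AAD_SPACE 13
#define TAG_SIZE  16

static void show_layout(int sync_size)
{
	int buf_len = SALT_SIZE + IV_SIZE + AAD_SPACE + sync_size + TAG_SIZE;

	printf("buf_len   = %d\n", buf_len);
	printf("iv        = buf + 0   (salt copied here, per-record IV follows)\n");
	printf("aad       = buf + %d\n", SALT_SIZE + IV_SIZE);
	printf("dummy_buf = buf + %d  (re-encrypted sync bytes land here)\n",
	       SALT_SIZE + IV_SIZE + AAD_SPACE);
	printf("tag       = buf + %d\n",
	       SALT_SIZE + IV_SIZE + AAD_SPACE + sync_size);
}

int main(void)
{
	show_layout(1400);	/* e.g. one MSS worth of already-transmitted record data */
	return 0;
}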
360static struct sk_buff *tls_sw_fallback(struct sock *sk, struct sk_buff *skb)
361{
362 int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
363 struct tls_context *tls_ctx = tls_get_ctx(sk);
364 struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
365 int payload_len = skb->len - tcp_payload_offset;
366 struct scatterlist *sg_in, sg_out[3];
367 struct sk_buff *nskb = NULL;
368 int sg_in_max_elements;
369 int resync_sgs = 0;
370 s32 sync_size = 0;
371 u64 rcd_sn;
372
373 /* worst case is:
374 * MAX_SKB_FRAGS in tls_record_info
375 * MAX_SKB_FRAGS + 1 in SKB head and frags.
376 */
377 sg_in_max_elements = 2 * MAX_SKB_FRAGS + 1;
378
379 if (!payload_len)
380 return skb;
381
382 sg_in = kmalloc_array(sg_in_max_elements, sizeof(*sg_in), GFP_ATOMIC);
383 if (!sg_in)
384 goto free_orig;
385
386 sg_init_table(sg_in, sg_in_max_elements);
387 sg_init_table(sg_out, ARRAY_SIZE(sg_out));
388
389 if (fill_sg_in(sg_in, skb, ctx, &rcd_sn, &sync_size, &resync_sgs)) {
390 /* bypass packets before kernel TLS socket option was set */
391 if (sync_size < 0 && payload_len <= -sync_size)
392 nskb = skb_get(skb);
393 goto put_sg;
394 }
395
396 nskb = tls_enc_skb(tls_ctx, sg_out, sg_in, skb, sync_size, rcd_sn);
397
398put_sg:
399 while (resync_sgs)
400 put_page(sg_page(&sg_in[--resync_sgs]));
401 kfree(sg_in);
402free_orig:
403 kfree_skb(skb);
404 return nskb;
405}
406
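The worst-case comment inside tls_sw_fallback() above sizes sg_in for the frags of the cached TLS record plus the linear head and frags of the skb being retransmitted. A worked example of that arithmetic, assuming MAX_SKB_FRAGS == 17 (the common value with 4 KiB pages):

/* Worked example only; MAX_SKB_FRAGS is an assumption, not taken from the patch. */
#include <stdio.h>

#define MAX_SKB_FRAGS 17

int main(void)
{
	int record_frags = MAX_SKB_FRAGS;	/* frags of the cached TLS record      */
	int skb_entries  = MAX_SKB_FRAGS + 1;	/* linear head + frags of the skb      */
	int sg_in_max    = record_frags + skb_entries;

	printf("sg_in_max_elements = %d (== 2 * MAX_SKB_FRAGS + 1 = %d)\n",
	       sg_in_max, 2 * MAX_SKB_FRAGS + 1);
	return 0;
}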
407struct sk_buff *tls_validate_xmit_skb(struct sock *sk,
408 struct net_device *dev,
409 struct sk_buff *skb)
410{
411 if (dev == tls_get_ctx(sk)->netdev)
412 return skb;
413
414 return tls_sw_fallback(sk, skb);
415}
416
417int tls_sw_fallback_init(struct sock *sk,
418 struct tls_offload_context *offload_ctx,
419 struct tls_crypto_info *crypto_info)
420{
421 const u8 *key;
422 int rc;
423
424 offload_ctx->aead_send =
425 crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC);
426 if (IS_ERR(offload_ctx->aead_send)) {
427 rc = PTR_ERR(offload_ctx->aead_send);
428 pr_err_ratelimited("crypto_alloc_aead failed rc=%d\n", rc);
429 offload_ctx->aead_send = NULL;
430 goto err_out;
431 }
432
433 key = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->key;
434
435 rc = crypto_aead_setkey(offload_ctx->aead_send, key,
436 TLS_CIPHER_AES_GCM_128_KEY_SIZE);
437 if (rc)
438 goto free_aead;
439
440 rc = crypto_aead_setauthsize(offload_ctx->aead_send,
441 TLS_CIPHER_AES_GCM_128_TAG_SIZE);
442 if (rc)
443 goto free_aead;
444
445 return 0;
446free_aead:
447 crypto_free_aead(offload_ctx->aead_send);
448err_out:
449 return rc;
450}
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 20cd93be6236..301f22430469 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -51,12 +51,12 @@ enum {
51 TLSV6, 51 TLSV6,
52 TLS_NUM_PROTS, 52 TLS_NUM_PROTS,
53}; 53};
54
55enum { 54enum {
56 TLS_BASE, 55 TLS_BASE,
57 TLS_SW_TX, 56 TLS_SW,
58 TLS_SW_RX, 57#ifdef CONFIG_TLS_DEVICE
59 TLS_SW_RXTX, 58 TLS_HW,
59#endif
60 TLS_HW_RECORD, 60 TLS_HW_RECORD,
61 TLS_NUM_CONFIG, 61 TLS_NUM_CONFIG,
62}; 62};
@@ -65,14 +65,14 @@ static struct proto *saved_tcpv6_prot;
65static DEFINE_MUTEX(tcpv6_prot_mutex); 65static DEFINE_MUTEX(tcpv6_prot_mutex);
66static LIST_HEAD(device_list); 66static LIST_HEAD(device_list);
67static DEFINE_MUTEX(device_mutex); 67static DEFINE_MUTEX(device_mutex);
68static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG]; 68static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG];
69static struct proto_ops tls_sw_proto_ops; 69static struct proto_ops tls_sw_proto_ops;
70 70
71static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx) 71static void update_sk_prot(struct sock *sk, struct tls_context *ctx)
72{ 72{
73 int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4; 73 int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
74 74
75 sk->sk_prot = &tls_prots[ip_ver][ctx->conf]; 75 sk->sk_prot = &tls_prots[ip_ver][ctx->tx_conf][ctx->rx_conf];
76} 76}
77 77
78int wait_on_pending_writer(struct sock *sk, long *timeo) 78int wait_on_pending_writer(struct sock *sk, long *timeo)
@@ -254,7 +254,8 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
254 lock_sock(sk); 254 lock_sock(sk);
255 sk_proto_close = ctx->sk_proto_close; 255 sk_proto_close = ctx->sk_proto_close;
256 256
257 if (ctx->conf == TLS_BASE || ctx->conf == TLS_HW_RECORD) { 257 if ((ctx->tx_conf == TLS_HW_RECORD && ctx->rx_conf == TLS_HW_RECORD) ||
258 (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE)) {
258 free_ctx = true; 259 free_ctx = true;
259 goto skip_tx_cleanup; 260 goto skip_tx_cleanup;
260 } 261 }
@@ -275,15 +276,26 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
275 } 276 }
276 } 277 }
277 278
278 kfree(ctx->tx.rec_seq); 279 /* We need these for tls_sw_fallback handling of other packets */
279 kfree(ctx->tx.iv); 280 if (ctx->tx_conf == TLS_SW) {
280 kfree(ctx->rx.rec_seq); 281 kfree(ctx->tx.rec_seq);
281 kfree(ctx->rx.iv); 282 kfree(ctx->tx.iv);
283 tls_sw_free_resources_tx(sk);
284 }
282 285
283 if (ctx->conf == TLS_SW_TX || 286 if (ctx->rx_conf == TLS_SW) {
284 ctx->conf == TLS_SW_RX || 287 kfree(ctx->rx.rec_seq);
285 ctx->conf == TLS_SW_RXTX) { 288 kfree(ctx->rx.iv);
286 tls_sw_free_resources(sk); 289 tls_sw_free_resources_rx(sk);
290 }
291
292#ifdef CONFIG_TLS_DEVICE
293 if (ctx->tx_conf != TLS_HW) {
294#else
295 {
296#endif
297 kfree(ctx);
298 ctx = NULL;
287 } 299 }
288 300
289skip_tx_cleanup: 301skip_tx_cleanup:
@@ -446,25 +458,29 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
446 goto err_crypto_info; 458 goto err_crypto_info;
447 } 459 }
448 460
449 /* currently SW is default, we will have ethtool in future */
450 if (tx) { 461 if (tx) {
451 rc = tls_set_sw_offload(sk, ctx, 1); 462#ifdef CONFIG_TLS_DEVICE
452 if (ctx->conf == TLS_SW_RX) 463 rc = tls_set_device_offload(sk, ctx);
453 conf = TLS_SW_RXTX; 464 conf = TLS_HW;
454 else 465 if (rc) {
455 conf = TLS_SW_TX; 466#else
467 {
468#endif
469 rc = tls_set_sw_offload(sk, ctx, 1);
470 conf = TLS_SW;
471 }
456 } else { 472 } else {
457 rc = tls_set_sw_offload(sk, ctx, 0); 473 rc = tls_set_sw_offload(sk, ctx, 0);
458 if (ctx->conf == TLS_SW_TX) 474 conf = TLS_SW;
459 conf = TLS_SW_RXTX;
460 else
461 conf = TLS_SW_RX;
462 } 475 }
463 476
464 if (rc) 477 if (rc)
465 goto err_crypto_info; 478 goto err_crypto_info;
466 479
467 ctx->conf = conf; 480 if (tx)
481 ctx->tx_conf = conf;
482 else
483 ctx->rx_conf = conf;
468 update_sk_prot(sk, ctx); 484 update_sk_prot(sk, ctx);
469 if (tx) { 485 if (tx) {
470 ctx->sk_write_space = sk->sk_write_space; 486 ctx->sk_write_space = sk->sk_write_space;
@@ -540,7 +556,8 @@ static int tls_hw_prot(struct sock *sk)
540 ctx->hash = sk->sk_prot->hash; 556 ctx->hash = sk->sk_prot->hash;
541 ctx->unhash = sk->sk_prot->unhash; 557 ctx->unhash = sk->sk_prot->unhash;
542 ctx->sk_proto_close = sk->sk_prot->close; 558 ctx->sk_proto_close = sk->sk_prot->close;
543 ctx->conf = TLS_HW_RECORD; 559 ctx->rx_conf = TLS_HW_RECORD;
560 ctx->tx_conf = TLS_HW_RECORD;
544 update_sk_prot(sk, ctx); 561 update_sk_prot(sk, ctx);
545 rc = 1; 562 rc = 1;
546 break; 563 break;
@@ -584,29 +601,40 @@ static int tls_hw_hash(struct sock *sk)
584 return err; 601 return err;
585} 602}
586 603
587static void build_protos(struct proto *prot, struct proto *base) 604static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
605 struct proto *base)
588{ 606{
589 prot[TLS_BASE] = *base; 607 prot[TLS_BASE][TLS_BASE] = *base;
590 prot[TLS_BASE].setsockopt = tls_setsockopt; 608 prot[TLS_BASE][TLS_BASE].setsockopt = tls_setsockopt;
591 prot[TLS_BASE].getsockopt = tls_getsockopt; 609 prot[TLS_BASE][TLS_BASE].getsockopt = tls_getsockopt;
592 prot[TLS_BASE].close = tls_sk_proto_close; 610 prot[TLS_BASE][TLS_BASE].close = tls_sk_proto_close;
593 611
594 prot[TLS_SW_TX] = prot[TLS_BASE]; 612 prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
595 prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; 613 prot[TLS_SW][TLS_BASE].sendmsg = tls_sw_sendmsg;
596 prot[TLS_SW_TX].sendpage = tls_sw_sendpage; 614 prot[TLS_SW][TLS_BASE].sendpage = tls_sw_sendpage;
597 615
598 prot[TLS_SW_RX] = prot[TLS_BASE]; 616 prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE];
599 prot[TLS_SW_RX].recvmsg = tls_sw_recvmsg; 617 prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg;
600 prot[TLS_SW_RX].close = tls_sk_proto_close; 618 prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close;
601 619
602 prot[TLS_SW_RXTX] = prot[TLS_SW_TX]; 620 prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE];
603 prot[TLS_SW_RXTX].recvmsg = tls_sw_recvmsg; 621 prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg;
604 prot[TLS_SW_RXTX].close = tls_sk_proto_close; 622 prot[TLS_SW][TLS_SW].close = tls_sk_proto_close;
605 623
606 prot[TLS_HW_RECORD] = *base; 624#ifdef CONFIG_TLS_DEVICE
607 prot[TLS_HW_RECORD].hash = tls_hw_hash; 625 prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
608 prot[TLS_HW_RECORD].unhash = tls_hw_unhash; 626 prot[TLS_HW][TLS_BASE].sendmsg = tls_device_sendmsg;
609 prot[TLS_HW_RECORD].close = tls_sk_proto_close; 627 prot[TLS_HW][TLS_BASE].sendpage = tls_device_sendpage;
628
629 prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW];
630 prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg;
631 prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage;
632#endif
633
634 prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base;
635 prot[TLS_HW_RECORD][TLS_HW_RECORD].hash = tls_hw_hash;
636 prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash = tls_hw_unhash;
637 prot[TLS_HW_RECORD][TLS_HW_RECORD].close = tls_sk_proto_close;
610} 638}
611 639
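For readers following the tls_prots change above: the table is now indexed by both the TX and the RX configuration, so update_sk_prot() can install a distinct struct proto for every combination with a single pointer assignment. A toy sketch of that selection, with stand-in names rather than the real proto structures:

/* Illustrative sketch only; the enum values and strings are stand-ins,
 * not the kernel's TLS_BASE/TLS_SW/TLS_HW proto table.
 */
#include <stdio.h>

enum conf { BASE, SW, HW, NUM_CONFIG };

static const char *proto_name[NUM_CONFIG][NUM_CONFIG];

static void build(void)
{
	proto_name[BASE][BASE] = "plain TCP plus TLS setsockopt hooks";
	proto_name[SW][BASE]   = "SW encrypt on TX only";
	proto_name[BASE][SW]   = "SW decrypt on RX only";
	proto_name[SW][SW]     = "SW crypto in both directions";
	proto_name[HW][BASE]   = "device TX offload, plain RX";
	proto_name[HW][SW]     = "device TX offload, SW RX";
}

int main(void)
{
	enum conf tx = HW, rx = SW;

	build();
	/* what update_sk_prot() conceptually does for a given tls_context */
	printf("selected proto: %s\n", proto_name[tx][rx]);
	return 0;
}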
612static int tls_init(struct sock *sk) 640static int tls_init(struct sock *sk)
@@ -637,7 +665,7 @@ static int tls_init(struct sock *sk)
637 ctx->getsockopt = sk->sk_prot->getsockopt; 665 ctx->getsockopt = sk->sk_prot->getsockopt;
638 ctx->sk_proto_close = sk->sk_prot->close; 666 ctx->sk_proto_close = sk->sk_prot->close;
639 667
640 /* Build IPv6 TLS whenever the address of tcpv6_prot changes */ 668 /* Build IPv6 TLS whenever the address of tcpv6_prot changes */
641 if (ip_ver == TLSV6 && 669 if (ip_ver == TLSV6 &&
642 unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { 670 unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
643 mutex_lock(&tcpv6_prot_mutex); 671 mutex_lock(&tcpv6_prot_mutex);
@@ -648,7 +676,8 @@ static int tls_init(struct sock *sk)
648 mutex_unlock(&tcpv6_prot_mutex); 676 mutex_unlock(&tcpv6_prot_mutex);
649 } 677 }
650 678
651 ctx->conf = TLS_BASE; 679 ctx->tx_conf = TLS_BASE;
680 ctx->rx_conf = TLS_BASE;
652 update_sk_prot(sk, ctx); 681 update_sk_prot(sk, ctx);
653out: 682out:
654 return rc; 683 return rc;
@@ -686,6 +715,9 @@ static int __init tls_register(void)
686 tls_sw_proto_ops.poll = tls_sw_poll; 715 tls_sw_proto_ops.poll = tls_sw_poll;
687 tls_sw_proto_ops.splice_read = tls_sw_splice_read; 716 tls_sw_proto_ops.splice_read = tls_sw_splice_read;
688 717
718#ifdef CONFIG_TLS_DEVICE
719 tls_device_init();
720#endif
689 tcp_register_ulp(&tcp_tls_ulp_ops); 721 tcp_register_ulp(&tcp_tls_ulp_ops);
690 722
691 return 0; 723 return 0;
@@ -694,6 +726,9 @@ static int __init tls_register(void)
694static void __exit tls_unregister(void) 726static void __exit tls_unregister(void)
695{ 727{
696 tcp_unregister_ulp(&tcp_tls_ulp_ops); 728 tcp_unregister_ulp(&tcp_tls_ulp_ops);
729#ifdef CONFIG_TLS_DEVICE
730 tls_device_cleanup();
731#endif
697} 732}
698 733
699module_init(tls_register); 734module_init(tls_register);
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index e1c93ce74e0f..8ca57d01b18f 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -52,7 +52,7 @@ static int tls_do_decryption(struct sock *sk,
52 gfp_t flags) 52 gfp_t flags)
53{ 53{
54 struct tls_context *tls_ctx = tls_get_ctx(sk); 54 struct tls_context *tls_ctx = tls_get_ctx(sk);
55 struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); 55 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
56 struct strp_msg *rxm = strp_msg(skb); 56 struct strp_msg *rxm = strp_msg(skb);
57 struct aead_request *aead_req; 57 struct aead_request *aead_req;
58 58
@@ -122,7 +122,7 @@ out:
122static void trim_both_sgl(struct sock *sk, int target_size) 122static void trim_both_sgl(struct sock *sk, int target_size)
123{ 123{
124 struct tls_context *tls_ctx = tls_get_ctx(sk); 124 struct tls_context *tls_ctx = tls_get_ctx(sk);
125 struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); 125 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
126 126
127 trim_sg(sk, ctx->sg_plaintext_data, 127 trim_sg(sk, ctx->sg_plaintext_data,
128 &ctx->sg_plaintext_num_elem, 128 &ctx->sg_plaintext_num_elem,
@@ -141,7 +141,7 @@ static void trim_both_sgl(struct sock *sk, int target_size)
141static int alloc_encrypted_sg(struct sock *sk, int len) 141static int alloc_encrypted_sg(struct sock *sk, int len)
142{ 142{
143 struct tls_context *tls_ctx = tls_get_ctx(sk); 143 struct tls_context *tls_ctx = tls_get_ctx(sk);
144 struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); 144 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
145 int rc = 0; 145 int rc = 0;
146 146
147 rc = sk_alloc_sg(sk, len, 147 rc = sk_alloc_sg(sk, len,
@@ -155,7 +155,7 @@ static int alloc_encrypted_sg(struct sock *sk, int len)
155static int alloc_plaintext_sg(struct sock *sk, int len) 155static int alloc_plaintext_sg(struct sock *sk, int len)
156{ 156{
157 struct tls_context *tls_ctx = tls_get_ctx(sk); 157 struct tls_context *tls_ctx = tls_get_ctx(sk);
158 struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); 158 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
159 int rc = 0; 159 int rc = 0;
160 160
161 rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, 0, 161 rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, 0,
@@ -181,7 +181,7 @@ static void free_sg(struct sock *sk, struct scatterlist *sg,
181static void tls_free_both_sg(struct sock *sk) 181static void tls_free_both_sg(struct sock *sk)
182{ 182{
183 struct tls_context *tls_ctx = tls_get_ctx(sk); 183 struct tls_context *tls_ctx = tls_get_ctx(sk);
184 struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); 184 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
185 185
186 free_sg(sk, ctx->sg_encrypted_data, &ctx->sg_encrypted_num_elem, 186 free_sg(sk, ctx->sg_encrypted_data, &ctx->sg_encrypted_num_elem,
187 &ctx->sg_encrypted_size); 187 &ctx->sg_encrypted_size);
@@ -191,7 +191,7 @@ static void tls_free_both_sg(struct sock *sk)
191} 191}
192 192
193static int tls_do_encryption(struct tls_context *tls_ctx, 193static int tls_do_encryption(struct tls_context *tls_ctx,
194 struct tls_sw_context *ctx, size_t data_len, 194 struct tls_sw_context_tx *ctx, size_t data_len,
195 gfp_t flags) 195 gfp_t flags)
196{ 196{
197 unsigned int req_size = sizeof(struct aead_request) + 197 unsigned int req_size = sizeof(struct aead_request) +
@@ -227,7 +227,7 @@ static int tls_push_record(struct sock *sk, int flags,
227 unsigned char record_type) 227 unsigned char record_type)
228{ 228{
229 struct tls_context *tls_ctx = tls_get_ctx(sk); 229 struct tls_context *tls_ctx = tls_get_ctx(sk);
230 struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); 230 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
231 int rc; 231 int rc;
232 232
233 sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1); 233 sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1);
@@ -339,7 +339,7 @@ static int memcopy_from_iter(struct sock *sk, struct iov_iter *from,
339 int bytes) 339 int bytes)
340{ 340{
341 struct tls_context *tls_ctx = tls_get_ctx(sk); 341 struct tls_context *tls_ctx = tls_get_ctx(sk);
342 struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); 342 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
343 struct scatterlist *sg = ctx->sg_plaintext_data; 343 struct scatterlist *sg = ctx->sg_plaintext_data;
344 int copy, i, rc = 0; 344 int copy, i, rc = 0;
345 345
@@ -367,7 +367,7 @@ out:
367int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) 367int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
368{ 368{
369 struct tls_context *tls_ctx = tls_get_ctx(sk); 369 struct tls_context *tls_ctx = tls_get_ctx(sk);
370 struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); 370 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
371 int ret = 0; 371 int ret = 0;
372 int required_size; 372 int required_size;
373 long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 373 long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
@@ -522,7 +522,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
522 int offset, size_t size, int flags) 522 int offset, size_t size, int flags)
523{ 523{
524 struct tls_context *tls_ctx = tls_get_ctx(sk); 524 struct tls_context *tls_ctx = tls_get_ctx(sk);
525 struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); 525 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
526 int ret = 0; 526 int ret = 0;
527 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 527 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
528 bool eor; 528 bool eor;
@@ -636,7 +636,7 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
636 long timeo, int *err) 636 long timeo, int *err)
637{ 637{
638 struct tls_context *tls_ctx = tls_get_ctx(sk); 638 struct tls_context *tls_ctx = tls_get_ctx(sk);
639 struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); 639 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
640 struct sk_buff *skb; 640 struct sk_buff *skb;
641 DEFINE_WAIT_FUNC(wait, woken_wake_function); 641 DEFINE_WAIT_FUNC(wait, woken_wake_function);
642 642
@@ -674,7 +674,7 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
674 struct scatterlist *sgout) 674 struct scatterlist *sgout)
675{ 675{
676 struct tls_context *tls_ctx = tls_get_ctx(sk); 676 struct tls_context *tls_ctx = tls_get_ctx(sk);
677 struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); 677 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
678 char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + MAX_IV_SIZE]; 678 char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + MAX_IV_SIZE];
679 struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2]; 679 struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2];
680 struct scatterlist *sgin = &sgin_arr[0]; 680 struct scatterlist *sgin = &sgin_arr[0];
@@ -692,8 +692,7 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
692 if (!sgout) { 692 if (!sgout) {
693 nsg = skb_cow_data(skb, 0, &unused) + 1; 693 nsg = skb_cow_data(skb, 0, &unused) + 1;
694 sgin = kmalloc_array(nsg, sizeof(*sgin), sk->sk_allocation); 694 sgin = kmalloc_array(nsg, sizeof(*sgin), sk->sk_allocation);
695 if (!sgout) 695 sgout = sgin;
696 sgout = sgin;
697 } 696 }
698 697
699 sg_init_table(sgin, nsg); 698 sg_init_table(sgin, nsg);
@@ -723,7 +722,7 @@ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb,
723 unsigned int len) 722 unsigned int len)
724{ 723{
725 struct tls_context *tls_ctx = tls_get_ctx(sk); 724 struct tls_context *tls_ctx = tls_get_ctx(sk);
726 struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); 725 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
727 struct strp_msg *rxm = strp_msg(skb); 726 struct strp_msg *rxm = strp_msg(skb);
728 727
729 if (len < rxm->full_len) { 728 if (len < rxm->full_len) {
@@ -736,7 +735,7 @@ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb,
736 /* Finished with message */ 735 /* Finished with message */
737 ctx->recv_pkt = NULL; 736 ctx->recv_pkt = NULL;
738 kfree_skb(skb); 737 kfree_skb(skb);
739 strp_unpause(&ctx->strp); 738 __strp_unpause(&ctx->strp);
740 739
741 return true; 740 return true;
742} 741}
@@ -749,7 +748,7 @@ int tls_sw_recvmsg(struct sock *sk,
749 int *addr_len) 748 int *addr_len)
750{ 749{
751 struct tls_context *tls_ctx = tls_get_ctx(sk); 750 struct tls_context *tls_ctx = tls_get_ctx(sk);
752 struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); 751 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
753 unsigned char control; 752 unsigned char control;
754 struct strp_msg *rxm; 753 struct strp_msg *rxm;
755 struct sk_buff *skb; 754 struct sk_buff *skb;
@@ -869,7 +868,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
869 size_t len, unsigned int flags) 868 size_t len, unsigned int flags)
870{ 869{
871 struct tls_context *tls_ctx = tls_get_ctx(sock->sk); 870 struct tls_context *tls_ctx = tls_get_ctx(sock->sk);
872 struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); 871 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
873 struct strp_msg *rxm = NULL; 872 struct strp_msg *rxm = NULL;
874 struct sock *sk = sock->sk; 873 struct sock *sk = sock->sk;
875 struct sk_buff *skb; 874 struct sk_buff *skb;
@@ -922,7 +921,7 @@ unsigned int tls_sw_poll(struct file *file, struct socket *sock,
922 unsigned int ret; 921 unsigned int ret;
923 struct sock *sk = sock->sk; 922 struct sock *sk = sock->sk;
924 struct tls_context *tls_ctx = tls_get_ctx(sk); 923 struct tls_context *tls_ctx = tls_get_ctx(sk);
925 struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); 924 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
926 925
927 /* Grab POLLOUT and POLLHUP from the underlying socket */ 926 /* Grab POLLOUT and POLLHUP from the underlying socket */
928 ret = ctx->sk_poll(file, sock, wait); 927 ret = ctx->sk_poll(file, sock, wait);
@@ -938,7 +937,7 @@ unsigned int tls_sw_poll(struct file *file, struct socket *sock,
938static int tls_read_size(struct strparser *strp, struct sk_buff *skb) 937static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
939{ 938{
940 struct tls_context *tls_ctx = tls_get_ctx(strp->sk); 939 struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
941 struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); 940 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
942 char header[tls_ctx->rx.prepend_size]; 941 char header[tls_ctx->rx.prepend_size];
943 struct strp_msg *rxm = strp_msg(skb); 942 struct strp_msg *rxm = strp_msg(skb);
944 size_t cipher_overhead; 943 size_t cipher_overhead;
@@ -987,7 +986,7 @@ read_failure:
987static void tls_queue(struct strparser *strp, struct sk_buff *skb) 986static void tls_queue(struct strparser *strp, struct sk_buff *skb)
988{ 987{
989 struct tls_context *tls_ctx = tls_get_ctx(strp->sk); 988 struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
990 struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); 989 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
991 struct strp_msg *rxm; 990 struct strp_msg *rxm;
992 991
993 rxm = strp_msg(skb); 992 rxm = strp_msg(skb);
@@ -1003,18 +1002,28 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb)
1003static void tls_data_ready(struct sock *sk) 1002static void tls_data_ready(struct sock *sk)
1004{ 1003{
1005 struct tls_context *tls_ctx = tls_get_ctx(sk); 1004 struct tls_context *tls_ctx = tls_get_ctx(sk);
1006 struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); 1005 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
1007 1006
1008 strp_data_ready(&ctx->strp); 1007 strp_data_ready(&ctx->strp);
1009} 1008}
1010 1009
1011void tls_sw_free_resources(struct sock *sk) 1010void tls_sw_free_resources_tx(struct sock *sk)
1012{ 1011{
1013 struct tls_context *tls_ctx = tls_get_ctx(sk); 1012 struct tls_context *tls_ctx = tls_get_ctx(sk);
1014 struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); 1013 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
1015 1014
1016 if (ctx->aead_send) 1015 if (ctx->aead_send)
1017 crypto_free_aead(ctx->aead_send); 1016 crypto_free_aead(ctx->aead_send);
1017 tls_free_both_sg(sk);
1018
1019 kfree(ctx);
1020}
1021
1022void tls_sw_free_resources_rx(struct sock *sk)
1023{
1024 struct tls_context *tls_ctx = tls_get_ctx(sk);
1025 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
1026
1018 if (ctx->aead_recv) { 1027 if (ctx->aead_recv) {
1019 if (ctx->recv_pkt) { 1028 if (ctx->recv_pkt) {
1020 kfree_skb(ctx->recv_pkt); 1029 kfree_skb(ctx->recv_pkt);
@@ -1030,10 +1039,7 @@ void tls_sw_free_resources(struct sock *sk)
1030 lock_sock(sk); 1039 lock_sock(sk);
1031 } 1040 }
1032 1041
1033 tls_free_both_sg(sk);
1034
1035 kfree(ctx); 1042 kfree(ctx);
1036 kfree(tls_ctx);
1037} 1043}
1038 1044
1039int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) 1045int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
@@ -1041,7 +1047,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
1041 char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE]; 1047 char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE];
1042 struct tls_crypto_info *crypto_info; 1048 struct tls_crypto_info *crypto_info;
1043 struct tls12_crypto_info_aes_gcm_128 *gcm_128_info; 1049 struct tls12_crypto_info_aes_gcm_128 *gcm_128_info;
1044 struct tls_sw_context *sw_ctx; 1050 struct tls_sw_context_tx *sw_ctx_tx = NULL;
1051 struct tls_sw_context_rx *sw_ctx_rx = NULL;
1045 struct cipher_context *cctx; 1052 struct cipher_context *cctx;
1046 struct crypto_aead **aead; 1053 struct crypto_aead **aead;
1047 struct strp_callbacks cb; 1054 struct strp_callbacks cb;
@@ -1054,27 +1061,32 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
1054 goto out; 1061 goto out;
1055 } 1062 }
1056 1063
1057 if (!ctx->priv_ctx) { 1064 if (tx) {
1058 sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL); 1065 sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL);
1059 if (!sw_ctx) { 1066 if (!sw_ctx_tx) {
1060 rc = -ENOMEM; 1067 rc = -ENOMEM;
1061 goto out; 1068 goto out;
1062 } 1069 }
1063 crypto_init_wait(&sw_ctx->async_wait); 1070 crypto_init_wait(&sw_ctx_tx->async_wait);
1071 ctx->priv_ctx_tx = sw_ctx_tx;
1064 } else { 1072 } else {
1065 sw_ctx = ctx->priv_ctx; 1073 sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL);
1074 if (!sw_ctx_rx) {
1075 rc = -ENOMEM;
1076 goto out;
1077 }
1078 crypto_init_wait(&sw_ctx_rx->async_wait);
1079 ctx->priv_ctx_rx = sw_ctx_rx;
1066 } 1080 }
1067 1081
1068 ctx->priv_ctx = (struct tls_offload_context *)sw_ctx;
1069
1070 if (tx) { 1082 if (tx) {
1071 crypto_info = &ctx->crypto_send; 1083 crypto_info = &ctx->crypto_send;
1072 cctx = &ctx->tx; 1084 cctx = &ctx->tx;
1073 aead = &sw_ctx->aead_send; 1085 aead = &sw_ctx_tx->aead_send;
1074 } else { 1086 } else {
1075 crypto_info = &ctx->crypto_recv; 1087 crypto_info = &ctx->crypto_recv;
1076 cctx = &ctx->rx; 1088 cctx = &ctx->rx;
1077 aead = &sw_ctx->aead_recv; 1089 aead = &sw_ctx_rx->aead_recv;
1078 } 1090 }
1079 1091
1080 switch (crypto_info->cipher_type) { 1092 switch (crypto_info->cipher_type) {
@@ -1121,22 +1133,24 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
1121 } 1133 }
1122 memcpy(cctx->rec_seq, rec_seq, rec_seq_size); 1134 memcpy(cctx->rec_seq, rec_seq, rec_seq_size);
1123 1135
1124 if (tx) { 1136 if (sw_ctx_tx) {
1125 sg_init_table(sw_ctx->sg_encrypted_data, 1137 sg_init_table(sw_ctx_tx->sg_encrypted_data,
1126 ARRAY_SIZE(sw_ctx->sg_encrypted_data)); 1138 ARRAY_SIZE(sw_ctx_tx->sg_encrypted_data));
1127 sg_init_table(sw_ctx->sg_plaintext_data, 1139 sg_init_table(sw_ctx_tx->sg_plaintext_data,
1128 ARRAY_SIZE(sw_ctx->sg_plaintext_data)); 1140 ARRAY_SIZE(sw_ctx_tx->sg_plaintext_data));
1129 1141
1130 sg_init_table(sw_ctx->sg_aead_in, 2); 1142 sg_init_table(sw_ctx_tx->sg_aead_in, 2);
1131 sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space, 1143 sg_set_buf(&sw_ctx_tx->sg_aead_in[0], sw_ctx_tx->aad_space,
1132 sizeof(sw_ctx->aad_space)); 1144 sizeof(sw_ctx_tx->aad_space));
1133 sg_unmark_end(&sw_ctx->sg_aead_in[1]); 1145 sg_unmark_end(&sw_ctx_tx->sg_aead_in[1]);
1134 sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data); 1146 sg_chain(sw_ctx_tx->sg_aead_in, 2,
1135 sg_init_table(sw_ctx->sg_aead_out, 2); 1147 sw_ctx_tx->sg_plaintext_data);
1136 sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space, 1148 sg_init_table(sw_ctx_tx->sg_aead_out, 2);
1137 sizeof(sw_ctx->aad_space)); 1149 sg_set_buf(&sw_ctx_tx->sg_aead_out[0], sw_ctx_tx->aad_space,
1138 sg_unmark_end(&sw_ctx->sg_aead_out[1]); 1150 sizeof(sw_ctx_tx->aad_space));
1139 sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data); 1151 sg_unmark_end(&sw_ctx_tx->sg_aead_out[1]);
1152 sg_chain(sw_ctx_tx->sg_aead_out, 2,
1153 sw_ctx_tx->sg_encrypted_data);
1140 } 1154 }
1141 1155
1142 if (!*aead) { 1156 if (!*aead) {
@@ -1161,22 +1175,22 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
1161 if (rc) 1175 if (rc)
1162 goto free_aead; 1176 goto free_aead;
1163 1177
1164 if (!tx) { 1178 if (sw_ctx_rx) {
1165 /* Set up strparser */ 1179 /* Set up strparser */
1166 memset(&cb, 0, sizeof(cb)); 1180 memset(&cb, 0, sizeof(cb));
1167 cb.rcv_msg = tls_queue; 1181 cb.rcv_msg = tls_queue;
1168 cb.parse_msg = tls_read_size; 1182 cb.parse_msg = tls_read_size;
1169 1183
1170 strp_init(&sw_ctx->strp, sk, &cb); 1184 strp_init(&sw_ctx_rx->strp, sk, &cb);
1171 1185
1172 write_lock_bh(&sk->sk_callback_lock); 1186 write_lock_bh(&sk->sk_callback_lock);
1173 sw_ctx->saved_data_ready = sk->sk_data_ready; 1187 sw_ctx_rx->saved_data_ready = sk->sk_data_ready;
1174 sk->sk_data_ready = tls_data_ready; 1188 sk->sk_data_ready = tls_data_ready;
1175 write_unlock_bh(&sk->sk_callback_lock); 1189 write_unlock_bh(&sk->sk_callback_lock);
1176 1190
1177 sw_ctx->sk_poll = sk->sk_socket->ops->poll; 1191 sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll;
1178 1192
1179 strp_check_rcv(&sw_ctx->strp); 1193 strp_check_rcv(&sw_ctx_rx->strp);
1180 } 1194 }
1181 1195
1182 goto out; 1196 goto out;
@@ -1188,11 +1202,16 @@ free_rec_seq:
1188 kfree(cctx->rec_seq); 1202 kfree(cctx->rec_seq);
1189 cctx->rec_seq = NULL; 1203 cctx->rec_seq = NULL;
1190free_iv: 1204free_iv:
1191 kfree(ctx->tx.iv); 1205 kfree(cctx->iv);
1192 ctx->tx.iv = NULL; 1206 cctx->iv = NULL;
1193free_priv: 1207free_priv:
1194 kfree(ctx->priv_ctx); 1208 if (tx) {
1195 ctx->priv_ctx = NULL; 1209 kfree(ctx->priv_ctx_tx);
1210 ctx->priv_ctx_tx = NULL;
1211 } else {
1212 kfree(ctx->priv_ctx_rx);
1213 ctx->priv_ctx_rx = NULL;
1214 }
1196out: 1215out:
1197 return rc; 1216 return rc;
1198} 1217}
diff --git a/net/wireless/core.c b/net/wireless/core.c
index c0fd8a85e7f7..5fe35aafdd9c 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -725,6 +725,10 @@ int wiphy_register(struct wiphy *wiphy)
725 (!rdev->ops->set_pmk || !rdev->ops->del_pmk))) 725 (!rdev->ops->set_pmk || !rdev->ops->del_pmk)))
726 return -EINVAL; 726 return -EINVAL;
727 727
728 if (WARN_ON(!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_FW_ROAM) &&
729 rdev->ops->update_connect_params))
730 return -EINVAL;
731
728 if (wiphy->addresses) 732 if (wiphy->addresses)
729 memcpy(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN); 733 memcpy(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN);
730 734
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 7c5135a92d76..07514ca011b2 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4,6 +4,7 @@
4 * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> 4 * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
5 * Copyright 2013-2014 Intel Mobile Communications GmbH 5 * Copyright 2013-2014 Intel Mobile Communications GmbH
6 * Copyright 2015-2017 Intel Deutschland GmbH 6 * Copyright 2015-2017 Intel Deutschland GmbH
7 * Copyright (C) 2018 Intel Corporation
7 */ 8 */
8 9
9#include <linux/if.h> 10#include <linux/if.h>
@@ -423,6 +424,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
423 [NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN }, 424 [NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN },
424 [NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG }, 425 [NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG },
425 [NL80211_ATTR_EXTERNAL_AUTH_SUPPORT] = { .type = NLA_FLAG }, 426 [NL80211_ATTR_EXTERNAL_AUTH_SUPPORT] = { .type = NLA_FLAG },
427
428 [NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 },
429 [NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 },
430 [NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 },
426}; 431};
427 432
428/* policy for the key attributes */ 433/* policy for the key attributes */
@@ -645,7 +650,43 @@ static inline void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
645 return genlmsg_put(skb, portid, seq, &nl80211_fam, flags, cmd); 650 return genlmsg_put(skb, portid, seq, &nl80211_fam, flags, cmd);
646} 651}
647 652
648static int nl80211_msg_put_channel(struct sk_buff *msg, 653static int nl80211_msg_put_wmm_rules(struct sk_buff *msg,
654 const struct ieee80211_reg_rule *rule)
655{
656 int j;
657 struct nlattr *nl_wmm_rules =
658 nla_nest_start(msg, NL80211_FREQUENCY_ATTR_WMM);
659
660 if (!nl_wmm_rules)
661 goto nla_put_failure;
662
663 for (j = 0; j < IEEE80211_NUM_ACS; j++) {
664 struct nlattr *nl_wmm_rule = nla_nest_start(msg, j);
665
666 if (!nl_wmm_rule)
667 goto nla_put_failure;
668
669 if (nla_put_u16(msg, NL80211_WMMR_CW_MIN,
670 rule->wmm_rule->client[j].cw_min) ||
671 nla_put_u16(msg, NL80211_WMMR_CW_MAX,
672 rule->wmm_rule->client[j].cw_max) ||
673 nla_put_u8(msg, NL80211_WMMR_AIFSN,
674 rule->wmm_rule->client[j].aifsn) ||
675 nla_put_u8(msg, NL80211_WMMR_TXOP,
676 rule->wmm_rule->client[j].cot))
677 goto nla_put_failure;
678
679 nla_nest_end(msg, nl_wmm_rule);
680 }
681 nla_nest_end(msg, nl_wmm_rules);
682
683 return 0;
684
685nla_put_failure:
686 return -ENOBUFS;
687}
688
689static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
649 struct ieee80211_channel *chan, 690 struct ieee80211_channel *chan,
650 bool large) 691 bool large)
651{ 692{
@@ -721,12 +762,55 @@ static int nl80211_msg_put_channel(struct sk_buff *msg,
721 DBM_TO_MBM(chan->max_power))) 762 DBM_TO_MBM(chan->max_power)))
722 goto nla_put_failure; 763 goto nla_put_failure;
723 764
765 if (large) {
766 const struct ieee80211_reg_rule *rule =
767 freq_reg_info(wiphy, chan->center_freq);
768
769 if (!IS_ERR(rule) && rule->wmm_rule) {
770 if (nl80211_msg_put_wmm_rules(msg, rule))
771 goto nla_put_failure;
772 }
773 }
774
724 return 0; 775 return 0;
725 776
726 nla_put_failure: 777 nla_put_failure:
727 return -ENOBUFS; 778 return -ENOBUFS;
728} 779}
729 780
781static bool nl80211_put_txq_stats(struct sk_buff *msg,
782 struct cfg80211_txq_stats *txqstats,
783 int attrtype)
784{
785 struct nlattr *txqattr;
786
787#define PUT_TXQVAL_U32(attr, memb) do { \
788 if (txqstats->filled & BIT(NL80211_TXQ_STATS_ ## attr) && \
789 nla_put_u32(msg, NL80211_TXQ_STATS_ ## attr, txqstats->memb)) \
790 return false; \
791 } while (0)
792
793 txqattr = nla_nest_start(msg, attrtype);
794 if (!txqattr)
795 return false;
796
797 PUT_TXQVAL_U32(BACKLOG_BYTES, backlog_bytes);
798 PUT_TXQVAL_U32(BACKLOG_PACKETS, backlog_packets);
799 PUT_TXQVAL_U32(FLOWS, flows);
800 PUT_TXQVAL_U32(DROPS, drops);
801 PUT_TXQVAL_U32(ECN_MARKS, ecn_marks);
802 PUT_TXQVAL_U32(OVERLIMIT, overlimit);
803 PUT_TXQVAL_U32(OVERMEMORY, overmemory);
804 PUT_TXQVAL_U32(COLLISIONS, collisions);
805 PUT_TXQVAL_U32(TX_BYTES, tx_bytes);
806 PUT_TXQVAL_U32(TX_PACKETS, tx_packets);
807 PUT_TXQVAL_U32(MAX_FLOWS, max_flows);
808 nla_nest_end(msg, txqattr);
809
810#undef PUT_TXQVAL_U32
811 return true;
812}
813
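PUT_TXQVAL_U32() above only emits a member when its bit is set in txqstats->filled. A self-contained sketch of that filled-bitmap pattern with stand-in types (not the cfg80211 or netlink API):

/* Sketch with hypothetical stand-in types; printf takes the place of nla_put_u32(). */
#include <stdio.h>
#include <stdint.h>

enum { STAT_BACKLOG_BYTES, STAT_FLOWS, STAT_DROPS };

struct txq_stats {
	uint32_t filled;			/* bitmap of valid members */
	uint32_t backlog_bytes, flows, drops;
};

#define PUT_U32(s, bit, memb)						\
	do {								\
		if ((s)->filled & (1u << (bit)))			\
			printf(#memb " = %u\n", (unsigned)(s)->memb);	\
	} while (0)

int main(void)
{
	struct txq_stats s = { .filled = (1u << STAT_FLOWS) | (1u << STAT_DROPS),
			       .flows = 42, .drops = 3 };

	PUT_U32(&s, STAT_BACKLOG_BYTES, backlog_bytes);	/* skipped: bit not set */
	PUT_U32(&s, STAT_FLOWS, flows);
	PUT_U32(&s, STAT_DROPS, drops);
	return 0;
}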
730/* netlink command implementations */ 814/* netlink command implementations */
731 815
732struct key_parse { 816struct key_parse {
@@ -1631,7 +1715,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
1631 chan = &sband->channels[i]; 1715 chan = &sband->channels[i];
1632 1716
1633 if (nl80211_msg_put_channel( 1717 if (nl80211_msg_put_channel(
1634 msg, chan, 1718 msg, &rdev->wiphy, chan,
1635 state->split)) 1719 state->split))
1636 goto nla_put_failure; 1720 goto nla_put_failure;
1637 1721
@@ -1926,6 +2010,28 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
1926 rdev->wiphy.nan_supported_bands)) 2010 rdev->wiphy.nan_supported_bands))
1927 goto nla_put_failure; 2011 goto nla_put_failure;
1928 2012
2013 if (wiphy_ext_feature_isset(&rdev->wiphy,
2014 NL80211_EXT_FEATURE_TXQS)) {
2015 struct cfg80211_txq_stats txqstats = {};
2016 int res;
2017
2018 res = rdev_get_txq_stats(rdev, NULL, &txqstats);
2019 if (!res &&
2020 !nl80211_put_txq_stats(msg, &txqstats,
2021 NL80211_ATTR_TXQ_STATS))
2022 goto nla_put_failure;
2023
2024 if (nla_put_u32(msg, NL80211_ATTR_TXQ_LIMIT,
2025 rdev->wiphy.txq_limit))
2026 goto nla_put_failure;
2027 if (nla_put_u32(msg, NL80211_ATTR_TXQ_MEMORY_LIMIT,
2028 rdev->wiphy.txq_memory_limit))
2029 goto nla_put_failure;
2030 if (nla_put_u32(msg, NL80211_ATTR_TXQ_QUANTUM,
2031 rdev->wiphy.txq_quantum))
2032 goto nla_put_failure;
2033 }
2034
1929 /* done */ 2035 /* done */
1930 state->split_start = 0; 2036 state->split_start = 0;
1931 break; 2037 break;
@@ -2303,6 +2409,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
2303 u8 retry_short = 0, retry_long = 0; 2409 u8 retry_short = 0, retry_long = 0;
2304 u32 frag_threshold = 0, rts_threshold = 0; 2410 u32 frag_threshold = 0, rts_threshold = 0;
2305 u8 coverage_class = 0; 2411 u8 coverage_class = 0;
2412 u32 txq_limit = 0, txq_memory_limit = 0, txq_quantum = 0;
2306 2413
2307 ASSERT_RTNL(); 2414 ASSERT_RTNL();
2308 2415
@@ -2509,10 +2616,38 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
2509 changed |= WIPHY_PARAM_DYN_ACK; 2616 changed |= WIPHY_PARAM_DYN_ACK;
2510 } 2617 }
2511 2618
2619 if (info->attrs[NL80211_ATTR_TXQ_LIMIT]) {
2620 if (!wiphy_ext_feature_isset(&rdev->wiphy,
2621 NL80211_EXT_FEATURE_TXQS))
2622 return -EOPNOTSUPP;
2623 txq_limit = nla_get_u32(
2624 info->attrs[NL80211_ATTR_TXQ_LIMIT]);
2625 changed |= WIPHY_PARAM_TXQ_LIMIT;
2626 }
2627
2628 if (info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]) {
2629 if (!wiphy_ext_feature_isset(&rdev->wiphy,
2630 NL80211_EXT_FEATURE_TXQS))
2631 return -EOPNOTSUPP;
2632 txq_memory_limit = nla_get_u32(
2633 info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]);
2634 changed |= WIPHY_PARAM_TXQ_MEMORY_LIMIT;
2635 }
2636
2637 if (info->attrs[NL80211_ATTR_TXQ_QUANTUM]) {
2638 if (!wiphy_ext_feature_isset(&rdev->wiphy,
2639 NL80211_EXT_FEATURE_TXQS))
2640 return -EOPNOTSUPP;
2641 txq_quantum = nla_get_u32(
2642 info->attrs[NL80211_ATTR_TXQ_QUANTUM]);
2643 changed |= WIPHY_PARAM_TXQ_QUANTUM;
2644 }
2645
2512 if (changed) { 2646 if (changed) {
2513 u8 old_retry_short, old_retry_long; 2647 u8 old_retry_short, old_retry_long;
2514 u32 old_frag_threshold, old_rts_threshold; 2648 u32 old_frag_threshold, old_rts_threshold;
2515 u8 old_coverage_class; 2649 u8 old_coverage_class;
2650 u32 old_txq_limit, old_txq_memory_limit, old_txq_quantum;
2516 2651
2517 if (!rdev->ops->set_wiphy_params) 2652 if (!rdev->ops->set_wiphy_params)
2518 return -EOPNOTSUPP; 2653 return -EOPNOTSUPP;
@@ -2522,6 +2657,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
2522 old_frag_threshold = rdev->wiphy.frag_threshold; 2657 old_frag_threshold = rdev->wiphy.frag_threshold;
2523 old_rts_threshold = rdev->wiphy.rts_threshold; 2658 old_rts_threshold = rdev->wiphy.rts_threshold;
2524 old_coverage_class = rdev->wiphy.coverage_class; 2659 old_coverage_class = rdev->wiphy.coverage_class;
2660 old_txq_limit = rdev->wiphy.txq_limit;
2661 old_txq_memory_limit = rdev->wiphy.txq_memory_limit;
2662 old_txq_quantum = rdev->wiphy.txq_quantum;
2525 2663
2526 if (changed & WIPHY_PARAM_RETRY_SHORT) 2664 if (changed & WIPHY_PARAM_RETRY_SHORT)
2527 rdev->wiphy.retry_short = retry_short; 2665 rdev->wiphy.retry_short = retry_short;
@@ -2533,6 +2671,12 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
2533 rdev->wiphy.rts_threshold = rts_threshold; 2671 rdev->wiphy.rts_threshold = rts_threshold;
2534 if (changed & WIPHY_PARAM_COVERAGE_CLASS) 2672 if (changed & WIPHY_PARAM_COVERAGE_CLASS)
2535 rdev->wiphy.coverage_class = coverage_class; 2673 rdev->wiphy.coverage_class = coverage_class;
2674 if (changed & WIPHY_PARAM_TXQ_LIMIT)
2675 rdev->wiphy.txq_limit = txq_limit;
2676 if (changed & WIPHY_PARAM_TXQ_MEMORY_LIMIT)
2677 rdev->wiphy.txq_memory_limit = txq_memory_limit;
2678 if (changed & WIPHY_PARAM_TXQ_QUANTUM)
2679 rdev->wiphy.txq_quantum = txq_quantum;
2536 2680
2537 result = rdev_set_wiphy_params(rdev, changed); 2681 result = rdev_set_wiphy_params(rdev, changed);
2538 if (result) { 2682 if (result) {
@@ -2541,6 +2685,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
2541 rdev->wiphy.frag_threshold = old_frag_threshold; 2685 rdev->wiphy.frag_threshold = old_frag_threshold;
2542 rdev->wiphy.rts_threshold = old_rts_threshold; 2686 rdev->wiphy.rts_threshold = old_rts_threshold;
2543 rdev->wiphy.coverage_class = old_coverage_class; 2687 rdev->wiphy.coverage_class = old_coverage_class;
2688 rdev->wiphy.txq_limit = old_txq_limit;
2689 rdev->wiphy.txq_memory_limit = old_txq_memory_limit;
2690 rdev->wiphy.txq_quantum = old_txq_quantum;
2544 return result; 2691 return result;
2545 } 2692 }
2546 } 2693 }
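The TXQ parameter handling above follows the existing nl80211_set_wiphy() pattern: remember the old values, apply the new ones, call the driver, and restore everything if the driver rejects the change. A generic sketch of that save/apply/rollback flow, with hypothetical stand-in types rather than cfg80211 structures:

/* Illustrative sketch; driver_apply() and struct params are invented here. */
#include <stdio.h>

struct params { unsigned int txq_limit, txq_quantum; };

static int driver_apply(const struct params *p)
{
	/* pretend the driver rejects a zero quantum */
	return p->txq_quantum ? 0 : -1;
}

static int set_params(struct params *cur, unsigned int limit, unsigned int quantum)
{
	struct params old = *cur;	/* 1. remember the old values */

	cur->txq_limit = limit;		/* 2. apply the new ones      */
	cur->txq_quantum = quantum;

	if (driver_apply(cur)) {	/* 3. driver callback         */
		*cur = old;		/* 4. roll back on failure    */
		return -1;
	}
	return 0;
}

int main(void)
{
	struct params p = { .txq_limit = 1000, .txq_quantum = 300 };

	if (set_params(&p, 2000, 0))
		printf("rejected, kept limit=%u quantum=%u\n",
		       p.txq_limit, p.txq_quantum);
	return 0;
}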
@@ -2662,6 +2809,16 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
2662 } 2809 }
2663 wdev_unlock(wdev); 2810 wdev_unlock(wdev);
2664 2811
2812 if (rdev->ops->get_txq_stats) {
2813 struct cfg80211_txq_stats txqstats = {};
2814 int ret = rdev_get_txq_stats(rdev, wdev, &txqstats);
2815
2816 if (ret == 0 &&
2817 !nl80211_put_txq_stats(msg, &txqstats,
2818 NL80211_ATTR_TXQ_STATS))
2819 goto nla_put_failure;
2820 }
2821
2665 genlmsg_end(msg, hdr); 2822 genlmsg_end(msg, hdr);
2666 return 0; 2823 return 0;
2667 2824
@@ -4494,11 +4651,14 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
4494 PUT_SINFO_U64(BEACON_RX, rx_beacon); 4651 PUT_SINFO_U64(BEACON_RX, rx_beacon);
4495 PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8); 4652 PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8);
4496 PUT_SINFO(ACK_SIGNAL, ack_signal, u8); 4653 PUT_SINFO(ACK_SIGNAL, ack_signal, u8);
4654 if (wiphy_ext_feature_isset(&rdev->wiphy,
4655 NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT))
4656 PUT_SINFO(DATA_ACK_SIGNAL_AVG, avg_ack_signal, s8);
4497 4657
4498#undef PUT_SINFO 4658#undef PUT_SINFO
4499#undef PUT_SINFO_U64 4659#undef PUT_SINFO_U64
4500 4660
4501 if (sinfo->filled & BIT(NL80211_STA_INFO_TID_STATS)) { 4661 if (sinfo->pertid) {
4502 struct nlattr *tidsattr; 4662 struct nlattr *tidsattr;
4503 int tid; 4663 int tid;
4504 4664
@@ -4532,6 +4692,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
4532 PUT_TIDVAL_U64(TX_MSDU_FAILED, tx_msdu_failed); 4692 PUT_TIDVAL_U64(TX_MSDU_FAILED, tx_msdu_failed);
4533 4693
4534#undef PUT_TIDVAL_U64 4694#undef PUT_TIDVAL_U64
4695 if ((tidstats->filled &
4696 BIT(NL80211_TID_STATS_TXQ_STATS)) &&
4697 !nl80211_put_txq_stats(msg, &tidstats->txq_stats,
4698 NL80211_TID_STATS_TXQ_STATS))
4699 goto nla_put_failure;
4700
4535 nla_nest_end(msg, tidattr); 4701 nla_nest_end(msg, tidattr);
4536 } 4702 }
4537 4703
@@ -4545,10 +4711,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
4545 sinfo->assoc_req_ies)) 4711 sinfo->assoc_req_ies))
4546 goto nla_put_failure; 4712 goto nla_put_failure;
4547 4713
4714 cfg80211_sinfo_release_content(sinfo);
4548 genlmsg_end(msg, hdr); 4715 genlmsg_end(msg, hdr);
4549 return 0; 4716 return 0;
4550 4717
4551 nla_put_failure: 4718 nla_put_failure:
4719 cfg80211_sinfo_release_content(sinfo);
4552 genlmsg_cancel(msg, hdr); 4720 genlmsg_cancel(msg, hdr);
4553 return -EMSGSIZE; 4721 return -EMSGSIZE;
4554} 4722}
@@ -4630,8 +4798,10 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
4630 return err; 4798 return err;
4631 4799
4632 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 4800 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
4633 if (!msg) 4801 if (!msg) {
4802 cfg80211_sinfo_release_content(&sinfo);
4634 return -ENOMEM; 4803 return -ENOMEM;
4804 }
4635 4805
4636 if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION, 4806 if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION,
4637 info->snd_portid, info->snd_seq, 0, 4807 info->snd_portid, info->snd_seq, 0,
@@ -7930,7 +8100,15 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb)
7930 8100
7931 wdev_lock(wdev); 8101 wdev_lock(wdev);
7932 spin_lock_bh(&rdev->bss_lock); 8102 spin_lock_bh(&rdev->bss_lock);
7933 cfg80211_bss_expire(rdev); 8103
8104 /*
8105 * dump_scan will be called multiple times to break up the scan results
8106 * into multiple messages. It is unlikely that any more bss-es will be
8107 * expired after the first call, so only call this on the
8108 * first dump_scan invocation.
8109 */
8110 if (start == 0)
8111 cfg80211_bss_expire(rdev);
7934 8112
7935 cb->seq = rdev->bss_generation; 8113 cb->seq = rdev->bss_generation;
7936 8114
@@ -8336,6 +8514,10 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
8336 const u8 *bssid, *ssid; 8514 const u8 *bssid, *ssid;
8337 int err, ssid_len = 0; 8515 int err, ssid_len = 0;
8338 8516
8517 if (dev->ieee80211_ptr->conn_owner_nlportid &&
8518 dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
8519 return -EPERM;
8520
8339 if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) 8521 if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
8340 return -EINVAL; 8522 return -EINVAL;
8341 8523
@@ -8458,6 +8640,10 @@ static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info)
8458 u16 reason_code; 8640 u16 reason_code;
8459 bool local_state_change; 8641 bool local_state_change;
8460 8642
8643 if (dev->ieee80211_ptr->conn_owner_nlportid &&
8644 dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
8645 return -EPERM;
8646
8461 if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) 8647 if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
8462 return -EINVAL; 8648 return -EINVAL;
8463 8649
@@ -8505,6 +8691,10 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
8505 u16 reason_code; 8691 u16 reason_code;
8506 bool local_state_change; 8692 bool local_state_change;
8507 8693
8694 if (dev->ieee80211_ptr->conn_owner_nlportid &&
8695 dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
8696 return -EPERM;
8697
8508 if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) 8698 if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
8509 return -EINVAL; 8699 return -EINVAL;
8510 8700
@@ -9251,6 +9441,8 @@ static int nl80211_update_connect_params(struct sk_buff *skb,
9251 struct cfg80211_registered_device *rdev = info->user_ptr[0]; 9441 struct cfg80211_registered_device *rdev = info->user_ptr[0];
9252 struct net_device *dev = info->user_ptr[1]; 9442 struct net_device *dev = info->user_ptr[1];
9253 struct wireless_dev *wdev = dev->ieee80211_ptr; 9443 struct wireless_dev *wdev = dev->ieee80211_ptr;
9444 bool fils_sk_offload;
9445 u32 auth_type;
9254 u32 changed = 0; 9446 u32 changed = 0;
9255 int ret; 9447 int ret;
9256 9448
@@ -9265,6 +9457,56 @@ static int nl80211_update_connect_params(struct sk_buff *skb,
9265 changed |= UPDATE_ASSOC_IES; 9457 changed |= UPDATE_ASSOC_IES;
9266 } 9458 }
9267 9459
9460 fils_sk_offload = wiphy_ext_feature_isset(&rdev->wiphy,
9461 NL80211_EXT_FEATURE_FILS_SK_OFFLOAD);
9462
9463 /*
9464 * when the driver supports fils-sk offload, all attributes must be
9465 * provided. So the else covers "fils-sk-not-all" and
9466 * "no-fils-sk-any".
9467 */
9468 if (fils_sk_offload &&
9469 info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] &&
9470 info->attrs[NL80211_ATTR_FILS_ERP_REALM] &&
9471 info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] &&
9472 info->attrs[NL80211_ATTR_FILS_ERP_RRK]) {
9473 connect.fils_erp_username =
9474 nla_data(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]);
9475 connect.fils_erp_username_len =
9476 nla_len(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]);
9477 connect.fils_erp_realm =
9478 nla_data(info->attrs[NL80211_ATTR_FILS_ERP_REALM]);
9479 connect.fils_erp_realm_len =
9480 nla_len(info->attrs[NL80211_ATTR_FILS_ERP_REALM]);
9481 connect.fils_erp_next_seq_num =
9482 nla_get_u16(
9483 info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM]);
9484 connect.fils_erp_rrk =
9485 nla_data(info->attrs[NL80211_ATTR_FILS_ERP_RRK]);
9486 connect.fils_erp_rrk_len =
9487 nla_len(info->attrs[NL80211_ATTR_FILS_ERP_RRK]);
9488 changed |= UPDATE_FILS_ERP_INFO;
9489 } else if (info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] ||
9490 info->attrs[NL80211_ATTR_FILS_ERP_REALM] ||
9491 info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] ||
9492 info->attrs[NL80211_ATTR_FILS_ERP_RRK]) {
9493 return -EINVAL;
9494 }
9495
9496 if (info->attrs[NL80211_ATTR_AUTH_TYPE]) {
9497 auth_type = nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]);
9498 if (!nl80211_valid_auth_type(rdev, auth_type,
9499 NL80211_CMD_CONNECT))
9500 return -EINVAL;
9501
9502 if (auth_type == NL80211_AUTHTYPE_FILS_SK &&
9503 fils_sk_offload && !(changed & UPDATE_FILS_ERP_INFO))
9504 return -EINVAL;
9505
9506 connect.auth_type = auth_type;
9507 changed |= UPDATE_AUTH_TYPE;
9508 }
9509
9268 wdev_lock(dev->ieee80211_ptr); 9510 wdev_lock(dev->ieee80211_ptr);
9269 if (!wdev->current_bss) 9511 if (!wdev->current_bss)
9270 ret = -ENOLINK; 9512 ret = -ENOLINK;
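The FILS ERP handling added in this hunk is all-or-nothing: the attributes are only consumed when the driver advertises FILS_SK_OFFLOAD and all four are present; otherwise the presence of any one of them is an error. A small sketch of that rule, with plain booleans standing in for the netlink attributes:

/* Illustrative sketch; the argument names are stand-ins for attribute presence. */
#include <stdbool.h>
#include <stdio.h>

static int apply_fils_erp(bool offload_supported,
			  bool have_username, bool have_realm,
			  bool have_seq, bool have_rrk)
{
	bool all = have_username && have_realm && have_seq && have_rrk;
	bool any = have_username || have_realm || have_seq || have_rrk;

	if (offload_supported && all)
		return 0;	/* consume attributes, set UPDATE_FILS_ERP_INFO   */
	if (any)
		return -1;	/* partial set, or offload unsupported: -EINVAL   */
	return 0;		/* nothing supplied: nothing to update            */
}

int main(void)
{
	printf("%d\n", apply_fils_erp(true, true, true, true, true));		/* 0  */
	printf("%d\n", apply_fils_erp(true, true, false, true, true));		/* -1 */
	printf("%d\n", apply_fils_erp(false, false, false, false, false));	/* 0  */
	return 0;
}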
@@ -9282,6 +9524,10 @@ static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info)
9282 u16 reason; 9524 u16 reason;
9283 int ret; 9525 int ret;
9284 9526
9527 if (dev->ieee80211_ptr->conn_owner_nlportid &&
9528 dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
9529 return -EPERM;
9530
9285 if (!info->attrs[NL80211_ATTR_REASON_CODE]) 9531 if (!info->attrs[NL80211_ATTR_REASON_CODE])
9286 reason = WLAN_REASON_DEAUTH_LEAVING; 9532 reason = WLAN_REASON_DEAUTH_LEAVING;
9287 else 9533 else
@@ -14028,8 +14274,8 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
14028 void *hdr; 14274 void *hdr;
14029 14275
14030 msg = nlmsg_new(100 + cr->req_ie_len + cr->resp_ie_len + 14276 msg = nlmsg_new(100 + cr->req_ie_len + cr->resp_ie_len +
14031 cr->fils_kek_len + cr->pmk_len + 14277 cr->fils.kek_len + cr->fils.pmk_len +
14032 (cr->pmkid ? WLAN_PMKID_LEN : 0), gfp); 14278 (cr->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp);
14033 if (!msg) 14279 if (!msg)
14034 return; 14280 return;
14035 14281
@@ -14055,17 +14301,17 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
14055 (cr->resp_ie && 14301 (cr->resp_ie &&
14056 nla_put(msg, NL80211_ATTR_RESP_IE, cr->resp_ie_len, 14302 nla_put(msg, NL80211_ATTR_RESP_IE, cr->resp_ie_len,
14057 cr->resp_ie)) || 14303 cr->resp_ie)) ||
14058 (cr->update_erp_next_seq_num && 14304 (cr->fils.update_erp_next_seq_num &&
14059 nla_put_u16(msg, NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM, 14305 nla_put_u16(msg, NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM,
14060 cr->fils_erp_next_seq_num)) || 14306 cr->fils.erp_next_seq_num)) ||
14061 (cr->status == WLAN_STATUS_SUCCESS && 14307 (cr->status == WLAN_STATUS_SUCCESS &&
14062 ((cr->fils_kek && 14308 ((cr->fils.kek &&
14063 nla_put(msg, NL80211_ATTR_FILS_KEK, cr->fils_kek_len, 14309 nla_put(msg, NL80211_ATTR_FILS_KEK, cr->fils.kek_len,
14064 cr->fils_kek)) || 14310 cr->fils.kek)) ||
14065 (cr->pmk && 14311 (cr->fils.pmk &&
14066 nla_put(msg, NL80211_ATTR_PMK, cr->pmk_len, cr->pmk)) || 14312 nla_put(msg, NL80211_ATTR_PMK, cr->fils.pmk_len, cr->fils.pmk)) ||
14067 (cr->pmkid && 14313 (cr->fils.pmkid &&
14068 nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, cr->pmkid))))) 14314 nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, cr->fils.pmkid)))))
14069 goto nla_put_failure; 14315 goto nla_put_failure;
14070 14316
14071 genlmsg_end(msg, hdr); 14317 genlmsg_end(msg, hdr);
@@ -14086,7 +14332,9 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
14086 void *hdr; 14332 void *hdr;
14087 const u8 *bssid = info->bss ? info->bss->bssid : info->bssid; 14333 const u8 *bssid = info->bss ? info->bss->bssid : info->bssid;
14088 14334
14089 msg = nlmsg_new(100 + info->req_ie_len + info->resp_ie_len, gfp); 14335 msg = nlmsg_new(100 + info->req_ie_len + info->resp_ie_len +
14336 info->fils.kek_len + info->fils.pmk_len +
14337 (info->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp);
14090 if (!msg) 14338 if (!msg)
14091 return; 14339 return;
14092 14340
@@ -14104,7 +14352,17 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
14104 info->req_ie)) || 14352 info->req_ie)) ||
14105 (info->resp_ie && 14353 (info->resp_ie &&
14106 nla_put(msg, NL80211_ATTR_RESP_IE, info->resp_ie_len, 14354 nla_put(msg, NL80211_ATTR_RESP_IE, info->resp_ie_len,
14107 info->resp_ie))) 14355 info->resp_ie)) ||
14356 (info->fils.update_erp_next_seq_num &&
14357 nla_put_u16(msg, NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM,
14358 info->fils.erp_next_seq_num)) ||
14359 (info->fils.kek &&
14360 nla_put(msg, NL80211_ATTR_FILS_KEK, info->fils.kek_len,
14361 info->fils.kek)) ||
14362 (info->fils.pmk &&
14363 nla_put(msg, NL80211_ATTR_PMK, info->fils.pmk_len, info->fils.pmk)) ||
14364 (info->fils.pmkid &&
14365 nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, info->fils.pmkid)))
14108 goto nla_put_failure; 14366 goto nla_put_failure;
14109 14367
14110 genlmsg_end(msg, hdr); 14368 genlmsg_end(msg, hdr);
@@ -14321,7 +14579,8 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy,
14321 nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_BEFORE); 14579 nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_BEFORE);
14322 if (!nl_freq) 14580 if (!nl_freq)
14323 goto nla_put_failure; 14581 goto nla_put_failure;
14324 if (nl80211_msg_put_channel(msg, channel_before, false)) 14582
14583 if (nl80211_msg_put_channel(msg, wiphy, channel_before, false))
14325 goto nla_put_failure; 14584 goto nla_put_failure;
14326 nla_nest_end(msg, nl_freq); 14585 nla_nest_end(msg, nl_freq);
14327 14586
@@ -14329,7 +14588,8 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy,
14329 nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_AFTER); 14588 nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_AFTER);
14330 if (!nl_freq) 14589 if (!nl_freq)
14331 goto nla_put_failure; 14590 goto nla_put_failure;
14332 if (nl80211_msg_put_channel(msg, channel_after, false)) 14591
14592 if (nl80211_msg_put_channel(msg, wiphy, channel_after, false))
14333 goto nla_put_failure; 14593 goto nla_put_failure;
14334 nla_nest_end(msg, nl_freq); 14594 nla_nest_end(msg, nl_freq);
14335 14595
@@ -14456,8 +14716,10 @@ void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr,
14456 trace_cfg80211_del_sta(dev, mac_addr); 14716 trace_cfg80211_del_sta(dev, mac_addr);
14457 14717
14458 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); 14718 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
14459 if (!msg) 14719 if (!msg) {
14720 cfg80211_sinfo_release_content(sinfo);
14460 return; 14721 return;
14722 }
14461 14723
14462 if (nl80211_send_station(msg, NL80211_CMD_DEL_STATION, 0, 0, 0, 14724 if (nl80211_send_station(msg, NL80211_CMD_DEL_STATION, 0, 0, 0,
14463 rdev, dev, mac_addr, sinfo) < 0) { 14725 rdev, dev, mac_addr, sinfo) < 0) {
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 87479a53411b..364f5d67f05b 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -586,6 +586,18 @@ rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev,
586 return ret; 586 return ret;
587} 587}
588 588
589static inline int
590rdev_get_txq_stats(struct cfg80211_registered_device *rdev,
591 struct wireless_dev *wdev,
592 struct cfg80211_txq_stats *txqstats)
593{
594 int ret;
595 trace_rdev_get_txq_stats(&rdev->wiphy, wdev);
596 ret = rdev->ops->get_txq_stats(&rdev->wiphy, wdev, txqstats);
597 trace_rdev_return_int(&rdev->wiphy, ret);
598 return ret;
599}
600
589static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev) 601static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev)
590{ 602{
591 trace_rdev_rfkill_poll(&rdev->wiphy); 603 trace_rdev_rfkill_poll(&rdev->wiphy);
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 5fcec5c94eb7..bbe6298e4bb9 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -1656,7 +1656,7 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator)
1656 case NL80211_REGDOM_SET_BY_DRIVER: 1656 case NL80211_REGDOM_SET_BY_DRIVER:
1657 return "driver"; 1657 return "driver";
1658 case NL80211_REGDOM_SET_BY_COUNTRY_IE: 1658 case NL80211_REGDOM_SET_BY_COUNTRY_IE:
1659 return "country IE"; 1659 return "country element";
1660 default: 1660 default:
1661 WARN_ON(1); 1661 WARN_ON(1);
1662 return "bug"; 1662 return "bug";
@@ -2622,7 +2622,7 @@ reg_process_hint_country_ie(struct wiphy *wiphy,
2622 * This doesn't happen yet, not sure we 2622 * This doesn't happen yet, not sure we
2623 * ever want to support it for this case. 2623 * ever want to support it for this case.
2624 */ 2624 */
2625 WARN_ONCE(1, "Unexpected intersection for country IEs"); 2625 WARN_ONCE(1, "Unexpected intersection for country elements");
2626 return REG_REQ_IGNORE; 2626 return REG_REQ_IGNORE;
2627 } 2627 }
2628 2628
@@ -2772,6 +2772,21 @@ out_free:
2772 reg_free_request(reg_request); 2772 reg_free_request(reg_request);
2773} 2773}
2774 2774
2775static void notify_self_managed_wiphys(struct regulatory_request *request)
2776{
2777 struct cfg80211_registered_device *rdev;
2778 struct wiphy *wiphy;
2779
2780 list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
2781 wiphy = &rdev->wiphy;
2782 if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED &&
2783 request->initiator == NL80211_REGDOM_SET_BY_USER &&
2784 request->user_reg_hint_type ==
2785 NL80211_USER_REG_HINT_CELL_BASE)
2786 reg_call_notifier(wiphy, request);
2787 }
2788}
2789
2775static bool reg_only_self_managed_wiphys(void) 2790static bool reg_only_self_managed_wiphys(void)
2776{ 2791{
2777 struct cfg80211_registered_device *rdev; 2792 struct cfg80211_registered_device *rdev;
@@ -2823,6 +2838,7 @@ static void reg_process_pending_hints(void)
2823 2838
2824 spin_unlock(&reg_requests_lock); 2839 spin_unlock(&reg_requests_lock);
2825 2840
2841 notify_self_managed_wiphys(reg_request);
2826 if (reg_only_self_managed_wiphys()) { 2842 if (reg_only_self_managed_wiphys()) {
2827 reg_free_request(reg_request); 2843 reg_free_request(reg_request);
2828 return; 2844 return;
@@ -3387,7 +3403,7 @@ bool reg_supported_dfs_region(enum nl80211_dfs_regions dfs_region)
3387 case NL80211_DFS_JP: 3403 case NL80211_DFS_JP:
3388 return true; 3404 return true;
3389 default: 3405 default:
3390 pr_debug("Ignoring uknown DFS master region: %d\n", dfs_region); 3406 pr_debug("Ignoring unknown DFS master region: %d\n", dfs_region);
3391 return false; 3407 return false;
3392 } 3408 }
3393} 3409}
@@ -3702,17 +3718,26 @@ EXPORT_SYMBOL(regulatory_set_wiphy_regd_sync_rtnl);
3702 3718
3703void wiphy_regulatory_register(struct wiphy *wiphy) 3719void wiphy_regulatory_register(struct wiphy *wiphy)
3704{ 3720{
3705 struct regulatory_request *lr; 3721 struct regulatory_request *lr = get_last_request();
3706 3722
3707 /* self-managed devices ignore external hints */ 3723 /* self-managed devices ignore beacon hints and country IE */
3708 if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) 3724 if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) {
3709 wiphy->regulatory_flags |= REGULATORY_DISABLE_BEACON_HINTS | 3725 wiphy->regulatory_flags |= REGULATORY_DISABLE_BEACON_HINTS |
3710 REGULATORY_COUNTRY_IE_IGNORE; 3726 REGULATORY_COUNTRY_IE_IGNORE;
3711 3727
3728 /*
3729 * The last request may have been received before this
3730 * registration call. Call the driver notifier if
3731 * initiator is USER and user type is CELL_BASE.
3732 */
3733 if (lr->initiator == NL80211_REGDOM_SET_BY_USER &&
3734 lr->user_reg_hint_type == NL80211_USER_REG_HINT_CELL_BASE)
3735 reg_call_notifier(wiphy, lr);
3736 }
3737
3712 if (!reg_dev_ignore_cell_hint(wiphy)) 3738 if (!reg_dev_ignore_cell_hint(wiphy))
3713 reg_num_devs_support_basehint++; 3739 reg_num_devs_support_basehint++;
3714 3740
3715 lr = get_last_request();
3716 wiphy_update_regulatory(wiphy, lr->initiator); 3741 wiphy_update_regulatory(wiphy, lr->initiator);
3717 wiphy_all_share_dfs_chan_state(wiphy); 3742 wiphy_all_share_dfs_chan_state(wiphy);
3718} 3743}
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 5df6b33db786..d536b07582f8 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -803,8 +803,8 @@ void cfg80211_connect_done(struct net_device *dev,
803 803
804 ev = kzalloc(sizeof(*ev) + (params->bssid ? ETH_ALEN : 0) + 804 ev = kzalloc(sizeof(*ev) + (params->bssid ? ETH_ALEN : 0) +
805 params->req_ie_len + params->resp_ie_len + 805 params->req_ie_len + params->resp_ie_len +
806 params->fils_kek_len + params->pmk_len + 806 params->fils.kek_len + params->fils.pmk_len +
807 (params->pmkid ? WLAN_PMKID_LEN : 0), gfp); 807 (params->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp);
808 if (!ev) { 808 if (!ev) {
809 cfg80211_put_bss(wdev->wiphy, params->bss); 809 cfg80211_put_bss(wdev->wiphy, params->bss);
810 return; 810 return;
@@ -831,27 +831,29 @@ void cfg80211_connect_done(struct net_device *dev,
831 params->resp_ie_len); 831 params->resp_ie_len);
832 next += params->resp_ie_len; 832 next += params->resp_ie_len;
833 } 833 }
834 if (params->fils_kek_len) { 834 if (params->fils.kek_len) {
835 ev->cr.fils_kek = next; 835 ev->cr.fils.kek = next;
836 ev->cr.fils_kek_len = params->fils_kek_len; 836 ev->cr.fils.kek_len = params->fils.kek_len;
837 memcpy((void *)ev->cr.fils_kek, params->fils_kek, 837 memcpy((void *)ev->cr.fils.kek, params->fils.kek,
838 params->fils_kek_len); 838 params->fils.kek_len);
839 next += params->fils_kek_len; 839 next += params->fils.kek_len;
840 } 840 }
841 if (params->pmk_len) { 841 if (params->fils.pmk_len) {
842 ev->cr.pmk = next; 842 ev->cr.fils.pmk = next;
843 ev->cr.pmk_len = params->pmk_len; 843 ev->cr.fils.pmk_len = params->fils.pmk_len;
844 memcpy((void *)ev->cr.pmk, params->pmk, params->pmk_len); 844 memcpy((void *)ev->cr.fils.pmk, params->fils.pmk,
845 next += params->pmk_len; 845 params->fils.pmk_len);
846 next += params->fils.pmk_len;
846 } 847 }
847 if (params->pmkid) { 848 if (params->fils.pmkid) {
848 ev->cr.pmkid = next; 849 ev->cr.fils.pmkid = next;
849 memcpy((void *)ev->cr.pmkid, params->pmkid, WLAN_PMKID_LEN); 850 memcpy((void *)ev->cr.fils.pmkid, params->fils.pmkid,
851 WLAN_PMKID_LEN);
850 next += WLAN_PMKID_LEN; 852 next += WLAN_PMKID_LEN;
851 } 853 }
852 ev->cr.update_erp_next_seq_num = params->update_erp_next_seq_num; 854 ev->cr.fils.update_erp_next_seq_num = params->fils.update_erp_next_seq_num;
853 if (params->update_erp_next_seq_num) 855 if (params->fils.update_erp_next_seq_num)
854 ev->cr.fils_erp_next_seq_num = params->fils_erp_next_seq_num; 856 ev->cr.fils.erp_next_seq_num = params->fils.erp_next_seq_num;
855 if (params->bss) 857 if (params->bss)
856 cfg80211_hold_bss(bss_from_pub(params->bss)); 858 cfg80211_hold_bss(bss_from_pub(params->bss));
857 ev->cr.bss = params->bss; 859 ev->cr.bss = params->bss;
@@ -930,6 +932,7 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
930 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); 932 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
931 struct cfg80211_event *ev; 933 struct cfg80211_event *ev;
932 unsigned long flags; 934 unsigned long flags;
935 u8 *next;
933 936
934 if (!info->bss) { 937 if (!info->bss) {
935 info->bss = cfg80211_get_bss(wdev->wiphy, info->channel, 938 info->bss = cfg80211_get_bss(wdev->wiphy, info->channel,
@@ -942,19 +945,52 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
942 if (WARN_ON(!info->bss)) 945 if (WARN_ON(!info->bss))
943 return; 946 return;
944 947
945 ev = kzalloc(sizeof(*ev) + info->req_ie_len + info->resp_ie_len, gfp); 948 ev = kzalloc(sizeof(*ev) + info->req_ie_len + info->resp_ie_len +
949 info->fils.kek_len + info->fils.pmk_len +
950 (info->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp);
946 if (!ev) { 951 if (!ev) {
947 cfg80211_put_bss(wdev->wiphy, info->bss); 952 cfg80211_put_bss(wdev->wiphy, info->bss);
948 return; 953 return;
949 } 954 }
950 955
951 ev->type = EVENT_ROAMED; 956 ev->type = EVENT_ROAMED;
952 ev->rm.req_ie = ((u8 *)ev) + sizeof(*ev); 957 next = ((u8 *)ev) + sizeof(*ev);
953 ev->rm.req_ie_len = info->req_ie_len; 958 if (info->req_ie_len) {
954 memcpy((void *)ev->rm.req_ie, info->req_ie, info->req_ie_len); 959 ev->rm.req_ie = next;
955 ev->rm.resp_ie = ((u8 *)ev) + sizeof(*ev) + info->req_ie_len; 960 ev->rm.req_ie_len = info->req_ie_len;
956 ev->rm.resp_ie_len = info->resp_ie_len; 961 memcpy((void *)ev->rm.req_ie, info->req_ie, info->req_ie_len);
957 memcpy((void *)ev->rm.resp_ie, info->resp_ie, info->resp_ie_len); 962 next += info->req_ie_len;
963 }
964 if (info->resp_ie_len) {
965 ev->rm.resp_ie = next;
966 ev->rm.resp_ie_len = info->resp_ie_len;
967 memcpy((void *)ev->rm.resp_ie, info->resp_ie,
968 info->resp_ie_len);
969 next += info->resp_ie_len;
970 }
971 if (info->fils.kek_len) {
972 ev->rm.fils.kek = next;
973 ev->rm.fils.kek_len = info->fils.kek_len;
974 memcpy((void *)ev->rm.fils.kek, info->fils.kek,
975 info->fils.kek_len);
976 next += info->fils.kek_len;
977 }
978 if (info->fils.pmk_len) {
979 ev->rm.fils.pmk = next;
980 ev->rm.fils.pmk_len = info->fils.pmk_len;
981 memcpy((void *)ev->rm.fils.pmk, info->fils.pmk,
982 info->fils.pmk_len);
983 next += info->fils.pmk_len;
984 }
985 if (info->fils.pmkid) {
986 ev->rm.fils.pmkid = next;
987 memcpy((void *)ev->rm.fils.pmkid, info->fils.pmkid,
988 WLAN_PMKID_LEN);
989 next += WLAN_PMKID_LEN;
990 }
991 ev->rm.fils.update_erp_next_seq_num = info->fils.update_erp_next_seq_num;
992 if (info->fils.update_erp_next_seq_num)
993 ev->rm.fils.erp_next_seq_num = info->fils.erp_next_seq_num;
958 ev->rm.bss = info->bss; 994 ev->rm.bss = info->bss;
959 995
960 spin_lock_irqsave(&wdev->event_lock, flags); 996 spin_lock_irqsave(&wdev->event_lock, flags);
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 55fb279a5196..2b417a2fe63f 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -3243,6 +3243,20 @@ TRACE_EVENT(rdev_set_multicast_to_unicast,
3243 WIPHY_PR_ARG, NETDEV_PR_ARG, 3243 WIPHY_PR_ARG, NETDEV_PR_ARG,
3244 BOOL_TO_STR(__entry->enabled)) 3244 BOOL_TO_STR(__entry->enabled))
3245); 3245);
3246
3247TRACE_EVENT(rdev_get_txq_stats,
3248 TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
3249 TP_ARGS(wiphy, wdev),
3250 TP_STRUCT__entry(
3251 WIPHY_ENTRY
3252 WDEV_ENTRY
3253 ),
3254 TP_fast_assign(
3255 WIPHY_ASSIGN;
3256 WDEV_ASSIGN;
3257 ),
3258 TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG)
3259);
3246#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ 3260#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
3247 3261
3248#undef TRACE_INCLUDE_PATH 3262#undef TRACE_INCLUDE_PATH
diff --git a/net/wireless/util.c b/net/wireless/util.c
index d112e9a89364..b5bb1c309914 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -1787,6 +1787,17 @@ bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
1787 return false; 1787 return false;
1788} 1788}
1789 1789
1790int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp)
1791{
1792 sinfo->pertid = kcalloc(sizeof(*(sinfo->pertid)),
1793 IEEE80211_NUM_TIDS + 1, gfp);
1794 if (!sinfo->pertid)
1795 return -ENOMEM;
1796
1797 return 0;
1798}
1799EXPORT_SYMBOL(cfg80211_sinfo_alloc_tid_stats);
1800
1790/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ 1801/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
1791/* Ethernet-II snap header (RFC1042 for most EtherTypes) */ 1802/* Ethernet-II snap header (RFC1042 for most EtherTypes) */
1792const unsigned char rfc1042_header[] __aligned(2) = 1803const unsigned char rfc1042_header[] __aligned(2) =
diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig
new file mode 100644
index 000000000000..90e4a7152854
--- /dev/null
+++ b/net/xdp/Kconfig
@@ -0,0 +1,7 @@
1config XDP_SOCKETS
2 bool "XDP sockets"
3 depends on BPF_SYSCALL
4 default n
5 help
6	  XDP sockets allow a channel between XDP programs and
7 userspace applications.
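The Kconfig help above describes AF_XDP sockets as a channel between XDP programs and user-space applications. As a rough illustration of the user-space side of that channel — not part of this patch, with the interface name, ring sizes and the AF_XDP/SOL_XDP fallback values assumed rather than taken from the series — a minimal socket creation and UMEM registration might look like the following sketch (error handling omitted):

#include <sys/socket.h>
#include <linux/if_xdp.h>
#include <net/if.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#ifndef AF_XDP
#define AF_XDP 44	/* assumption: value added to include/linux/socket.h */
#endif
#ifndef SOL_XDP
#define SOL_XDP 283	/* assumption: value added to include/linux/socket.h */
#endif

#define NUM_FRAMES	4096
#define FRAME_SIZE	2048	/* power of two, >= 2048, <= page size */

int main(void)
{
	struct xdp_umem_reg mr;
	struct sockaddr_xdp sxdp;
	void *umem_area;
	int fd, ring_size = 1024;

	fd = socket(AF_XDP, SOCK_RAW, 0);

	/* Page-aligned buffer area, as required by xdp_umem_reg() */
	posix_memalign(&umem_area, getpagesize(), NUM_FRAMES * FRAME_SIZE);

	memset(&mr, 0, sizeof(mr));
	mr.addr = (unsigned long)umem_area;
	mr.len = NUM_FRAMES * FRAME_SIZE;
	mr.chunk_size = FRAME_SIZE;
	mr.headroom = 0;
	setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));

	/* Fill/completion rings for the UMEM, RX ring for the socket */
	setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &ring_size, sizeof(ring_size));
	setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &ring_size, sizeof(ring_size));
	setsockopt(fd, SOL_XDP, XDP_RX_RING, &ring_size, sizeof(ring_size));

	memset(&sxdp, 0, sizeof(sxdp));
	sxdp.sxdp_family = AF_XDP;
	sxdp.sxdp_ifindex = if_nametoindex("eth0");	/* illustrative interface */
	sxdp.sxdp_queue_id = 0;
	sxdp.sxdp_flags = 0;	/* XDP_COPY / XDP_ZEROCOPY optional */
	bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));

	return 0;
}

The ordering mirrors what xsk_bind() below expects: at least one of the RX/TX rings must exist, and for a non-shared UMEM both the fill and completion rings must be set up before bind().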
diff --git a/net/xdp/Makefile b/net/xdp/Makefile
new file mode 100644
index 000000000000..04f073146256
--- /dev/null
+++ b/net/xdp/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
new file mode 100644
index 000000000000..7eb4948a38d2
--- /dev/null
+++ b/net/xdp/xdp_umem.c
@@ -0,0 +1,361 @@
1// SPDX-License-Identifier: GPL-2.0
2/* XDP user-space packet buffer
3 * Copyright(c) 2018 Intel Corporation.
4 */
5
6#include <linux/init.h>
7#include <linux/sched/mm.h>
8#include <linux/sched/signal.h>
9#include <linux/sched/task.h>
10#include <linux/uaccess.h>
11#include <linux/slab.h>
12#include <linux/bpf.h>
13#include <linux/mm.h>
14
15#include "xdp_umem.h"
16#include "xsk_queue.h"
17
18#define XDP_UMEM_MIN_CHUNK_SIZE 2048
19
20void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
21{
22 unsigned long flags;
23
24 spin_lock_irqsave(&umem->xsk_list_lock, flags);
25 list_add_rcu(&xs->list, &umem->xsk_list);
26 spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
27}
28
29void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
30{
31 unsigned long flags;
32
33 if (xs->dev) {
34 spin_lock_irqsave(&umem->xsk_list_lock, flags);
35 list_del_rcu(&xs->list);
36 spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
37
38 if (umem->zc)
39 synchronize_net();
40 }
41}
42
43int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
44 u32 queue_id, u16 flags)
45{
46 bool force_zc, force_copy;
47 struct netdev_bpf bpf;
48 int err;
49
50 force_zc = flags & XDP_ZEROCOPY;
51 force_copy = flags & XDP_COPY;
52
53 if (force_zc && force_copy)
54 return -EINVAL;
55
56 if (force_copy)
57 return 0;
58
59 dev_hold(dev);
60
61 if (dev->netdev_ops->ndo_bpf && dev->netdev_ops->ndo_xsk_async_xmit) {
62 bpf.command = XDP_QUERY_XSK_UMEM;
63
64 rtnl_lock();
65 err = dev->netdev_ops->ndo_bpf(dev, &bpf);
66 rtnl_unlock();
67
68 if (err) {
69 dev_put(dev);
70 return force_zc ? -ENOTSUPP : 0;
71 }
72
73 bpf.command = XDP_SETUP_XSK_UMEM;
74 bpf.xsk.umem = umem;
75 bpf.xsk.queue_id = queue_id;
76
77 rtnl_lock();
78 err = dev->netdev_ops->ndo_bpf(dev, &bpf);
79 rtnl_unlock();
80
81 if (err) {
82 dev_put(dev);
83 return force_zc ? err : 0; /* fail or fallback */
84 }
85
86 umem->dev = dev;
87 umem->queue_id = queue_id;
88 umem->zc = true;
89 return 0;
90 }
91
92 dev_put(dev);
93 return force_zc ? -ENOTSUPP : 0; /* fail or fallback */
94}
95
96static void xdp_umem_clear_dev(struct xdp_umem *umem)
97{
98 struct netdev_bpf bpf;
99 int err;
100
101 if (umem->dev) {
102 bpf.command = XDP_SETUP_XSK_UMEM;
103 bpf.xsk.umem = NULL;
104 bpf.xsk.queue_id = umem->queue_id;
105
106 rtnl_lock();
107 err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
108 rtnl_unlock();
109
110 if (err)
111 WARN(1, "failed to disable umem!\n");
112
113 dev_put(umem->dev);
114 umem->dev = NULL;
115 }
116}
117
118static void xdp_umem_unpin_pages(struct xdp_umem *umem)
119{
120 unsigned int i;
121
122 for (i = 0; i < umem->npgs; i++) {
123 struct page *page = umem->pgs[i];
124
125 set_page_dirty_lock(page);
126 put_page(page);
127 }
128
129 kfree(umem->pgs);
130 umem->pgs = NULL;
131}
132
133static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
134{
135 atomic_long_sub(umem->npgs, &umem->user->locked_vm);
136 free_uid(umem->user);
137}
138
139static void xdp_umem_release(struct xdp_umem *umem)
140{
141 struct task_struct *task;
142 struct mm_struct *mm;
143
144 xdp_umem_clear_dev(umem);
145
146 if (umem->fq) {
147 xskq_destroy(umem->fq);
148 umem->fq = NULL;
149 }
150
151 if (umem->cq) {
152 xskq_destroy(umem->cq);
153 umem->cq = NULL;
154 }
155
156 xdp_umem_unpin_pages(umem);
157
158 task = get_pid_task(umem->pid, PIDTYPE_PID);
159 put_pid(umem->pid);
160 if (!task)
161 goto out;
162 mm = get_task_mm(task);
163 put_task_struct(task);
164 if (!mm)
165 goto out;
166
167 mmput(mm);
168 kfree(umem->pages);
169 umem->pages = NULL;
170
171 xdp_umem_unaccount_pages(umem);
172out:
173 kfree(umem);
174}
175
176static void xdp_umem_release_deferred(struct work_struct *work)
177{
178 struct xdp_umem *umem = container_of(work, struct xdp_umem, work);
179
180 xdp_umem_release(umem);
181}
182
183void xdp_get_umem(struct xdp_umem *umem)
184{
185 refcount_inc(&umem->users);
186}
187
188void xdp_put_umem(struct xdp_umem *umem)
189{
190 if (!umem)
191 return;
192
193 if (refcount_dec_and_test(&umem->users)) {
194 INIT_WORK(&umem->work, xdp_umem_release_deferred);
195 schedule_work(&umem->work);
196 }
197}
198
199static int xdp_umem_pin_pages(struct xdp_umem *umem)
200{
201 unsigned int gup_flags = FOLL_WRITE;
202 long npgs;
203 int err;
204
205 umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL);
206 if (!umem->pgs)
207 return -ENOMEM;
208
209 down_write(&current->mm->mmap_sem);
210 npgs = get_user_pages(umem->address, umem->npgs,
211 gup_flags, &umem->pgs[0], NULL);
212 up_write(&current->mm->mmap_sem);
213
214 if (npgs != umem->npgs) {
215 if (npgs >= 0) {
216 umem->npgs = npgs;
217 err = -ENOMEM;
218 goto out_pin;
219 }
220 err = npgs;
221 goto out_pgs;
222 }
223 return 0;
224
225out_pin:
226 xdp_umem_unpin_pages(umem);
227out_pgs:
228 kfree(umem->pgs);
229 umem->pgs = NULL;
230 return err;
231}
232
233static int xdp_umem_account_pages(struct xdp_umem *umem)
234{
235 unsigned long lock_limit, new_npgs, old_npgs;
236
237 if (capable(CAP_IPC_LOCK))
238 return 0;
239
240 lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
241 umem->user = get_uid(current_user());
242
243 do {
244 old_npgs = atomic_long_read(&umem->user->locked_vm);
245 new_npgs = old_npgs + umem->npgs;
246 if (new_npgs > lock_limit) {
247 free_uid(umem->user);
248 umem->user = NULL;
249 return -ENOBUFS;
250 }
251 } while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
252 new_npgs) != old_npgs);
253 return 0;
254}
255
256static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
257{
258 u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
259 unsigned int chunks, chunks_per_page;
260 u64 addr = mr->addr, size = mr->len;
261 int size_chk, err, i;
262
263 if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
264 /* Strictly speaking we could support this, if:
265		 * - huge pages, or
266 * - using an IOMMU, or
267 * - making sure the memory area is consecutive
268 * but for now, we simply say "computer says no".
269 */
270 return -EINVAL;
271 }
272
273 if (!is_power_of_2(chunk_size))
274 return -EINVAL;
275
276 if (!PAGE_ALIGNED(addr)) {
277 /* Memory area has to be page size aligned. For
278 * simplicity, this might change.
279 */
280 return -EINVAL;
281 }
282
283 if ((addr + size) < addr)
284 return -EINVAL;
285
286 chunks = (unsigned int)div_u64(size, chunk_size);
287 if (chunks == 0)
288 return -EINVAL;
289
290 chunks_per_page = PAGE_SIZE / chunk_size;
291 if (chunks < chunks_per_page || chunks % chunks_per_page)
292 return -EINVAL;
293
294 headroom = ALIGN(headroom, 64);
295
296 size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
297 if (size_chk < 0)
298 return -EINVAL;
299
300 umem->pid = get_task_pid(current, PIDTYPE_PID);
301 umem->address = (unsigned long)addr;
302 umem->props.chunk_mask = ~((u64)chunk_size - 1);
303 umem->props.size = size;
304 umem->headroom = headroom;
305 umem->chunk_size_nohr = chunk_size - headroom;
306 umem->npgs = size / PAGE_SIZE;
307 umem->pgs = NULL;
308 umem->user = NULL;
309 INIT_LIST_HEAD(&umem->xsk_list);
310 spin_lock_init(&umem->xsk_list_lock);
311
312 refcount_set(&umem->users, 1);
313
314 err = xdp_umem_account_pages(umem);
315 if (err)
316 goto out;
317
318 err = xdp_umem_pin_pages(umem);
319 if (err)
320 goto out_account;
321
322 umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL);
323 if (!umem->pages) {
324 err = -ENOMEM;
325 goto out_account;
326 }
327
328 for (i = 0; i < umem->npgs; i++)
329 umem->pages[i].addr = page_address(umem->pgs[i]);
330
331 return 0;
332
333out_account:
334 xdp_umem_unaccount_pages(umem);
335out:
336 put_pid(umem->pid);
337 return err;
338}
339
340struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
341{
342 struct xdp_umem *umem;
343 int err;
344
345 umem = kzalloc(sizeof(*umem), GFP_KERNEL);
346 if (!umem)
347 return ERR_PTR(-ENOMEM);
348
349 err = xdp_umem_reg(umem, mr);
350 if (err) {
351 kfree(umem);
352 return ERR_PTR(err);
353 }
354
355 return umem;
356}
357
358bool xdp_umem_validate_queues(struct xdp_umem *umem)
359{
360 return umem->fq && umem->cq;
361}
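xdp_umem_reg() above only accepts a UMEM whose chunks are power-of-two sized between 2048 bytes and a page, whose base address is page aligned, whose area is a whole number of pages fully packed with chunks, and whose (64-byte-aligned) headroom still leaves room for XDP_PACKET_HEADROOM. A hedged user-space mirror of those checks — useful for validating parameters before issuing XDP_UMEM_REG, and assuming XDP_PACKET_HEADROOM is 256 as in the kernel's <net/xdp.h> — could look like this:

#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>

#define XDP_UMEM_MIN_CHUNK_SIZE	2048
#define XDP_PACKET_HEADROOM	256	/* assumption: kernel-internal value */

static bool umem_params_ok(uint64_t addr, uint64_t len,
			   uint32_t chunk_size, uint32_t headroom)
{
	uint64_t page_size = (uint64_t)getpagesize();
	uint64_t chunks, chunks_per_page;

	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > page_size)
		return false;			/* chunk must fit in one page */
	if (chunk_size & (chunk_size - 1))
		return false;			/* power of two only */
	if (addr & (page_size - 1))
		return false;			/* area must be page aligned */
	if (addr + len < addr)
		return false;			/* no address wraparound */

	chunks = len / chunk_size;
	chunks_per_page = page_size / chunk_size;
	if (!chunks || chunks < chunks_per_page || chunks % chunks_per_page)
		return false;			/* whole pages, fully packed */

	headroom = (headroom + 63) & ~63U;	/* kernel aligns headroom to 64 */
	return chunk_size >= headroom + XDP_PACKET_HEADROOM;
}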
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
new file mode 100644
index 000000000000..f11560334f88
--- /dev/null
+++ b/net/xdp/xdp_umem.h
@@ -0,0 +1,30 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/* XDP user-space packet buffer
3 * Copyright(c) 2018 Intel Corporation.
4 */
5
6#ifndef XDP_UMEM_H_
7#define XDP_UMEM_H_
8
9#include <net/xdp_sock.h>
10
11static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
12{
13 return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
14}
15
16static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
17{
18 return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1));
19}
20
21int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
22 u32 queue_id, u16 flags);
23bool xdp_umem_validate_queues(struct xdp_umem *umem);
24void xdp_get_umem(struct xdp_umem *umem);
25void xdp_put_umem(struct xdp_umem *umem);
26void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs);
27void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs);
28struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr);
29
30#endif /* XDP_UMEM_H_ */
diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h
new file mode 100644
index 000000000000..40eab10dfc49
--- /dev/null
+++ b/net/xdp/xdp_umem_props.h
@@ -0,0 +1,14 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/* XDP user-space packet buffer
3 * Copyright(c) 2018 Intel Corporation.
4 */
5
6#ifndef XDP_UMEM_PROPS_H_
7#define XDP_UMEM_PROPS_H_
8
9struct xdp_umem_props {
10 u64 chunk_mask;
11 u64 size;
12};
13
14#endif /* XDP_UMEM_PROPS_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
new file mode 100644
index 000000000000..c6ed2454f7ce
--- /dev/null
+++ b/net/xdp/xsk.c
@@ -0,0 +1,788 @@
1// SPDX-License-Identifier: GPL-2.0
2/* XDP sockets
3 *
4 * AF_XDP sockets allow a channel between XDP programs and userspace
5 * applications.
6 * Copyright(c) 2018 Intel Corporation.
7 *
8 * Author(s): Björn Töpel <bjorn.topel@intel.com>
9 * Magnus Karlsson <magnus.karlsson@intel.com>
10 */
11
12#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13
14#include <linux/if_xdp.h>
15#include <linux/init.h>
16#include <linux/sched/mm.h>
17#include <linux/sched/signal.h>
18#include <linux/sched/task.h>
19#include <linux/socket.h>
20#include <linux/file.h>
21#include <linux/uaccess.h>
22#include <linux/net.h>
23#include <linux/netdevice.h>
24#include <linux/rculist.h>
25#include <net/xdp_sock.h>
26#include <net/xdp.h>
27
28#include "xsk_queue.h"
29#include "xdp_umem.h"
30
31#define TX_BATCH_SIZE 16
32
33static struct xdp_sock *xdp_sk(struct sock *sk)
34{
35 return (struct xdp_sock *)sk;
36}
37
38bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
39{
40 return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
41 READ_ONCE(xs->umem->fq);
42}
43
44u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
45{
46 return xskq_peek_addr(umem->fq, addr);
47}
48EXPORT_SYMBOL(xsk_umem_peek_addr);
49
50void xsk_umem_discard_addr(struct xdp_umem *umem)
51{
52 xskq_discard_addr(umem->fq);
53}
54EXPORT_SYMBOL(xsk_umem_discard_addr);
55
56static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
57{
58 void *buffer;
59 u64 addr;
60 int err;
61
62 if (!xskq_peek_addr(xs->umem->fq, &addr) ||
63 len > xs->umem->chunk_size_nohr) {
64 xs->rx_dropped++;
65 return -ENOSPC;
66 }
67
68 addr += xs->umem->headroom;
69
70 buffer = xdp_umem_get_data(xs->umem, addr);
71 memcpy(buffer, xdp->data, len);
72 err = xskq_produce_batch_desc(xs->rx, addr, len);
73 if (!err) {
74 xskq_discard_addr(xs->umem->fq);
75 xdp_return_buff(xdp);
76 return 0;
77 }
78
79 xs->rx_dropped++;
80 return err;
81}
82
83static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
84{
85 int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);
86
87 if (err) {
88 xdp_return_buff(xdp);
89 xs->rx_dropped++;
90 }
91
92 return err;
93}
94
95int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
96{
97 u32 len;
98
99 if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
100 return -EINVAL;
101
102 len = xdp->data_end - xdp->data;
103
104 return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
105 __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
106}
107
108void xsk_flush(struct xdp_sock *xs)
109{
110 xskq_produce_flush_desc(xs->rx);
111 xs->sk.sk_data_ready(&xs->sk);
112}
113
114int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
115{
116 u32 len = xdp->data_end - xdp->data;
117 void *buffer;
118 u64 addr;
119 int err;
120
121 if (!xskq_peek_addr(xs->umem->fq, &addr) ||
122 len > xs->umem->chunk_size_nohr) {
123 xs->rx_dropped++;
124 return -ENOSPC;
125 }
126
127 addr += xs->umem->headroom;
128
129 buffer = xdp_umem_get_data(xs->umem, addr);
130 memcpy(buffer, xdp->data, len);
131 err = xskq_produce_batch_desc(xs->rx, addr, len);
132 if (!err) {
133 xskq_discard_addr(xs->umem->fq);
134 xsk_flush(xs);
135 return 0;
136 }
137
138 xs->rx_dropped++;
139 return err;
140}
141
142void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
143{
144 xskq_produce_flush_addr_n(umem->cq, nb_entries);
145}
146EXPORT_SYMBOL(xsk_umem_complete_tx);
147
148void xsk_umem_consume_tx_done(struct xdp_umem *umem)
149{
150 struct xdp_sock *xs;
151
152 rcu_read_lock();
153 list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
154 xs->sk.sk_write_space(&xs->sk);
155 }
156 rcu_read_unlock();
157}
158EXPORT_SYMBOL(xsk_umem_consume_tx_done);
159
160bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
161{
162 struct xdp_desc desc;
163 struct xdp_sock *xs;
164
165 rcu_read_lock();
166 list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
167 if (!xskq_peek_desc(xs->tx, &desc))
168 continue;
169
170 if (xskq_produce_addr_lazy(umem->cq, desc.addr))
171 goto out;
172
173 *dma = xdp_umem_get_dma(umem, desc.addr);
174 *len = desc.len;
175
176 xskq_discard_desc(xs->tx);
177 rcu_read_unlock();
178 return true;
179 }
180
181out:
182 rcu_read_unlock();
183 return false;
184}
185EXPORT_SYMBOL(xsk_umem_consume_tx);
186
187static int xsk_zc_xmit(struct sock *sk)
188{
189 struct xdp_sock *xs = xdp_sk(sk);
190 struct net_device *dev = xs->dev;
191
192 return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
193}
194
195static void xsk_destruct_skb(struct sk_buff *skb)
196{
197 u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
198 struct xdp_sock *xs = xdp_sk(skb->sk);
199
200 WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
201
202 sock_wfree(skb);
203}
204
205static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
206 size_t total_len)
207{
208 u32 max_batch = TX_BATCH_SIZE;
209 struct xdp_sock *xs = xdp_sk(sk);
210 bool sent_frame = false;
211 struct xdp_desc desc;
212 struct sk_buff *skb;
213 int err = 0;
214
215 if (unlikely(!xs->tx))
216 return -ENOBUFS;
217
218 mutex_lock(&xs->mutex);
219
220 while (xskq_peek_desc(xs->tx, &desc)) {
221 char *buffer;
222 u64 addr;
223 u32 len;
224
225 if (max_batch-- == 0) {
226 err = -EAGAIN;
227 goto out;
228 }
229
230 if (xskq_reserve_addr(xs->umem->cq)) {
231 err = -EAGAIN;
232 goto out;
233 }
234
235 len = desc.len;
236 if (unlikely(len > xs->dev->mtu)) {
237 err = -EMSGSIZE;
238 goto out;
239 }
240
241 if (xs->queue_id >= xs->dev->real_num_tx_queues) {
242 err = -ENXIO;
243 goto out;
244 }
245
246 skb = sock_alloc_send_skb(sk, len, 1, &err);
247 if (unlikely(!skb)) {
248 err = -EAGAIN;
249 goto out;
250 }
251
252 skb_put(skb, len);
253 addr = desc.addr;
254 buffer = xdp_umem_get_data(xs->umem, addr);
255 err = skb_store_bits(skb, 0, buffer, len);
256 if (unlikely(err)) {
257 kfree_skb(skb);
258 goto out;
259 }
260
261 skb->dev = xs->dev;
262 skb->priority = sk->sk_priority;
263 skb->mark = sk->sk_mark;
264 skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
265 skb->destructor = xsk_destruct_skb;
266
267 err = dev_direct_xmit(skb, xs->queue_id);
268 /* Ignore NET_XMIT_CN as packet might have been sent */
269 if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
270 err = -EAGAIN;
271 /* SKB consumed by dev_direct_xmit() */
272 goto out;
273 }
274
275 sent_frame = true;
276 xskq_discard_desc(xs->tx);
277 }
278
279out:
280 if (sent_frame)
281 sk->sk_write_space(sk);
282
283 mutex_unlock(&xs->mutex);
284 return err;
285}
286
287static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
288{
289 bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
290 struct sock *sk = sock->sk;
291 struct xdp_sock *xs = xdp_sk(sk);
292
293 if (unlikely(!xs->dev))
294 return -ENXIO;
295 if (unlikely(!(xs->dev->flags & IFF_UP)))
296 return -ENETDOWN;
297 if (need_wait)
298 return -EOPNOTSUPP;
299
300 return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
301}
302
303static __poll_t xsk_poll_mask(struct socket *sock, __poll_t events)
304{
305 __poll_t mask = datagram_poll_mask(sock, events);
306 struct sock *sk = sock->sk;
307 struct xdp_sock *xs = xdp_sk(sk);
308
309 if (xs->rx && !xskq_empty_desc(xs->rx))
310 mask |= POLLIN | POLLRDNORM;
311 if (xs->tx && !xskq_full_desc(xs->tx))
312 mask |= POLLOUT | POLLWRNORM;
313
314 return mask;
315}
316
317static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
318 bool umem_queue)
319{
320 struct xsk_queue *q;
321
322 if (entries == 0 || *queue || !is_power_of_2(entries))
323 return -EINVAL;
324
325 q = xskq_create(entries, umem_queue);
326 if (!q)
327 return -ENOMEM;
328
329 /* Make sure queue is ready before it can be seen by others */
330 smp_wmb();
331 *queue = q;
332 return 0;
333}
334
335static int xsk_release(struct socket *sock)
336{
337 struct sock *sk = sock->sk;
338 struct xdp_sock *xs = xdp_sk(sk);
339 struct net *net;
340
341 if (!sk)
342 return 0;
343
344 net = sock_net(sk);
345
346 local_bh_disable();
347 sock_prot_inuse_add(net, sk->sk_prot, -1);
348 local_bh_enable();
349
350 if (xs->dev) {
351 /* Wait for driver to stop using the xdp socket. */
352 synchronize_net();
353 dev_put(xs->dev);
354 xs->dev = NULL;
355 }
356
357 sock_orphan(sk);
358 sock->sk = NULL;
359
360 sk_refcnt_debug_release(sk);
361 sock_put(sk);
362
363 return 0;
364}
365
366static struct socket *xsk_lookup_xsk_from_fd(int fd)
367{
368 struct socket *sock;
369 int err;
370
371 sock = sockfd_lookup(fd, &err);
372 if (!sock)
373 return ERR_PTR(-ENOTSOCK);
374
375 if (sock->sk->sk_family != PF_XDP) {
376 sockfd_put(sock);
377 return ERR_PTR(-ENOPROTOOPT);
378 }
379
380 return sock;
381}
382
383static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
384{
385 struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
386 struct sock *sk = sock->sk;
387 struct xdp_sock *xs = xdp_sk(sk);
388 struct net_device *dev;
389 u32 flags, qid;
390 int err = 0;
391
392 if (addr_len < sizeof(struct sockaddr_xdp))
393 return -EINVAL;
394 if (sxdp->sxdp_family != AF_XDP)
395 return -EINVAL;
396
397 mutex_lock(&xs->mutex);
398 if (xs->dev) {
399 err = -EBUSY;
400 goto out_release;
401 }
402
403 dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
404 if (!dev) {
405 err = -ENODEV;
406 goto out_release;
407 }
408
409 if (!xs->rx && !xs->tx) {
410 err = -EINVAL;
411 goto out_unlock;
412 }
413
414 qid = sxdp->sxdp_queue_id;
415
416 if ((xs->rx && qid >= dev->real_num_rx_queues) ||
417 (xs->tx && qid >= dev->real_num_tx_queues)) {
418 err = -EINVAL;
419 goto out_unlock;
420 }
421
422 flags = sxdp->sxdp_flags;
423
424 if (flags & XDP_SHARED_UMEM) {
425 struct xdp_sock *umem_xs;
426 struct socket *sock;
427
428 if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
429 /* Cannot specify flags for shared sockets. */
430 err = -EINVAL;
431 goto out_unlock;
432 }
433
434 if (xs->umem) {
435 /* We have already our own. */
436 err = -EINVAL;
437 goto out_unlock;
438 }
439
440 sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
441 if (IS_ERR(sock)) {
442 err = PTR_ERR(sock);
443 goto out_unlock;
444 }
445
446 umem_xs = xdp_sk(sock->sk);
447 if (!umem_xs->umem) {
448 /* No umem to inherit. */
449 err = -EBADF;
450 sockfd_put(sock);
451 goto out_unlock;
452 } else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
453 err = -EINVAL;
454 sockfd_put(sock);
455 goto out_unlock;
456 }
457
458 xdp_get_umem(umem_xs->umem);
459 xs->umem = umem_xs->umem;
460 sockfd_put(sock);
461 } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
462 err = -EINVAL;
463 goto out_unlock;
464 } else {
465 /* This xsk has its own umem. */
466 xskq_set_umem(xs->umem->fq, &xs->umem->props);
467 xskq_set_umem(xs->umem->cq, &xs->umem->props);
468
469 err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
470 if (err)
471 goto out_unlock;
472 }
473
474 xs->dev = dev;
475 xs->zc = xs->umem->zc;
476 xs->queue_id = qid;
477 xskq_set_umem(xs->rx, &xs->umem->props);
478 xskq_set_umem(xs->tx, &xs->umem->props);
479 xdp_add_sk_umem(xs->umem, xs);
480
481out_unlock:
482 if (err)
483 dev_put(dev);
484out_release:
485 mutex_unlock(&xs->mutex);
486 return err;
487}
488
489static int xsk_setsockopt(struct socket *sock, int level, int optname,
490 char __user *optval, unsigned int optlen)
491{
492 struct sock *sk = sock->sk;
493 struct xdp_sock *xs = xdp_sk(sk);
494 int err;
495
496 if (level != SOL_XDP)
497 return -ENOPROTOOPT;
498
499 switch (optname) {
500 case XDP_RX_RING:
501 case XDP_TX_RING:
502 {
503 struct xsk_queue **q;
504 int entries;
505
506 if (optlen < sizeof(entries))
507 return -EINVAL;
508 if (copy_from_user(&entries, optval, sizeof(entries)))
509 return -EFAULT;
510
511 mutex_lock(&xs->mutex);
512 q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
513 err = xsk_init_queue(entries, q, false);
514 mutex_unlock(&xs->mutex);
515 return err;
516 }
517 case XDP_UMEM_REG:
518 {
519 struct xdp_umem_reg mr;
520 struct xdp_umem *umem;
521
522 if (copy_from_user(&mr, optval, sizeof(mr)))
523 return -EFAULT;
524
525 mutex_lock(&xs->mutex);
526 if (xs->umem) {
527 mutex_unlock(&xs->mutex);
528 return -EBUSY;
529 }
530
531 umem = xdp_umem_create(&mr);
532 if (IS_ERR(umem)) {
533 mutex_unlock(&xs->mutex);
534 return PTR_ERR(umem);
535 }
536
537 /* Make sure umem is ready before it can be seen by others */
538 smp_wmb();
539 xs->umem = umem;
540 mutex_unlock(&xs->mutex);
541 return 0;
542 }
543 case XDP_UMEM_FILL_RING:
544 case XDP_UMEM_COMPLETION_RING:
545 {
546 struct xsk_queue **q;
547 int entries;
548
549 if (copy_from_user(&entries, optval, sizeof(entries)))
550 return -EFAULT;
551
552 mutex_lock(&xs->mutex);
553 if (!xs->umem) {
554 mutex_unlock(&xs->mutex);
555 return -EINVAL;
556 }
557
558 q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
559 &xs->umem->cq;
560 err = xsk_init_queue(entries, q, true);
561 mutex_unlock(&xs->mutex);
562 return err;
563 }
564 default:
565 break;
566 }
567
568 return -ENOPROTOOPT;
569}
570
571static int xsk_getsockopt(struct socket *sock, int level, int optname,
572 char __user *optval, int __user *optlen)
573{
574 struct sock *sk = sock->sk;
575 struct xdp_sock *xs = xdp_sk(sk);
576 int len;
577
578 if (level != SOL_XDP)
579 return -ENOPROTOOPT;
580
581 if (get_user(len, optlen))
582 return -EFAULT;
583 if (len < 0)
584 return -EINVAL;
585
586 switch (optname) {
587 case XDP_STATISTICS:
588 {
589 struct xdp_statistics stats;
590
591 if (len < sizeof(stats))
592 return -EINVAL;
593
594 mutex_lock(&xs->mutex);
595 stats.rx_dropped = xs->rx_dropped;
596 stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
597 stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
598 mutex_unlock(&xs->mutex);
599
600 if (copy_to_user(optval, &stats, sizeof(stats)))
601 return -EFAULT;
602 if (put_user(sizeof(stats), optlen))
603 return -EFAULT;
604
605 return 0;
606 }
607 case XDP_MMAP_OFFSETS:
608 {
609 struct xdp_mmap_offsets off;
610
611 if (len < sizeof(off))
612 return -EINVAL;
613
614 off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
615 off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
616 off.rx.desc = offsetof(struct xdp_rxtx_ring, desc);
617 off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
618 off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
619 off.tx.desc = offsetof(struct xdp_rxtx_ring, desc);
620
621 off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
622 off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
623 off.fr.desc = offsetof(struct xdp_umem_ring, desc);
624 off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
625 off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
626 off.cr.desc = offsetof(struct xdp_umem_ring, desc);
627
628 len = sizeof(off);
629 if (copy_to_user(optval, &off, len))
630 return -EFAULT;
631 if (put_user(len, optlen))
632 return -EFAULT;
633
634 return 0;
635 }
636 default:
637 break;
638 }
639
640 return -EOPNOTSUPP;
641}
642
643static int xsk_mmap(struct file *file, struct socket *sock,
644 struct vm_area_struct *vma)
645{
646 unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
647 unsigned long size = vma->vm_end - vma->vm_start;
648 struct xdp_sock *xs = xdp_sk(sock->sk);
649 struct xsk_queue *q = NULL;
650 struct xdp_umem *umem;
651 unsigned long pfn;
652 struct page *qpg;
653
654 if (offset == XDP_PGOFF_RX_RING) {
655 q = READ_ONCE(xs->rx);
656 } else if (offset == XDP_PGOFF_TX_RING) {
657 q = READ_ONCE(xs->tx);
658 } else {
659 umem = READ_ONCE(xs->umem);
660 if (!umem)
661 return -EINVAL;
662
663 if (offset == XDP_UMEM_PGOFF_FILL_RING)
664 q = READ_ONCE(umem->fq);
665 else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
666 q = READ_ONCE(umem->cq);
667 }
668
669 if (!q)
670 return -EINVAL;
671
672 qpg = virt_to_head_page(q->ring);
673 if (size > (PAGE_SIZE << compound_order(qpg)))
674 return -EINVAL;
675
676 pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
677 return remap_pfn_range(vma, vma->vm_start, pfn,
678 size, vma->vm_page_prot);
679}
680
681static struct proto xsk_proto = {
682 .name = "XDP",
683 .owner = THIS_MODULE,
684 .obj_size = sizeof(struct xdp_sock),
685};
686
687static const struct proto_ops xsk_proto_ops = {
688 .family = PF_XDP,
689 .owner = THIS_MODULE,
690 .release = xsk_release,
691 .bind = xsk_bind,
692 .connect = sock_no_connect,
693 .socketpair = sock_no_socketpair,
694 .accept = sock_no_accept,
695 .getname = sock_no_getname,
696 .poll_mask = xsk_poll_mask,
697 .ioctl = sock_no_ioctl,
698 .listen = sock_no_listen,
699 .shutdown = sock_no_shutdown,
700 .setsockopt = xsk_setsockopt,
701 .getsockopt = xsk_getsockopt,
702 .sendmsg = xsk_sendmsg,
703 .recvmsg = sock_no_recvmsg,
704 .mmap = xsk_mmap,
705 .sendpage = sock_no_sendpage,
706};
707
708static void xsk_destruct(struct sock *sk)
709{
710 struct xdp_sock *xs = xdp_sk(sk);
711
712 if (!sock_flag(sk, SOCK_DEAD))
713 return;
714
715 xskq_destroy(xs->rx);
716 xskq_destroy(xs->tx);
717 xdp_del_sk_umem(xs->umem, xs);
718 xdp_put_umem(xs->umem);
719
720 sk_refcnt_debug_dec(sk);
721}
722
723static int xsk_create(struct net *net, struct socket *sock, int protocol,
724 int kern)
725{
726 struct sock *sk;
727 struct xdp_sock *xs;
728
729 if (!ns_capable(net->user_ns, CAP_NET_RAW))
730 return -EPERM;
731 if (sock->type != SOCK_RAW)
732 return -ESOCKTNOSUPPORT;
733
734 if (protocol)
735 return -EPROTONOSUPPORT;
736
737 sock->state = SS_UNCONNECTED;
738
739 sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
740 if (!sk)
741 return -ENOBUFS;
742
743 sock->ops = &xsk_proto_ops;
744
745 sock_init_data(sock, sk);
746
747 sk->sk_family = PF_XDP;
748
749 sk->sk_destruct = xsk_destruct;
750 sk_refcnt_debug_inc(sk);
751
752 xs = xdp_sk(sk);
753 mutex_init(&xs->mutex);
754
755 local_bh_disable();
756 sock_prot_inuse_add(net, &xsk_proto, 1);
757 local_bh_enable();
758
759 return 0;
760}
761
762static const struct net_proto_family xsk_family_ops = {
763 .family = PF_XDP,
764 .create = xsk_create,
765 .owner = THIS_MODULE,
766};
767
768static int __init xsk_init(void)
769{
770 int err;
771
772 err = proto_register(&xsk_proto, 0 /* no slab */);
773 if (err)
774 goto out;
775
776 err = sock_register(&xsk_family_ops);
777 if (err)
778 goto out_proto;
779
780 return 0;
781
782out_proto:
783 proto_unregister(&xsk_proto);
784out:
785 return err;
786}
787
788fs_initcall(xsk_init);
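The XDP_MMAP_OFFSETS getsockopt and xsk_mmap() above define how the rings reach user space: each ring is mapped at a fixed page offset on the socket fd, and the reported offsets locate the producer/consumer indices and the descriptor array inside that mapping. A hedged sketch of mapping the RX ring — assuming an AF_XDP socket on which XDP_RX_RING has already been set with ring_size entries, and with SOL_XDP defined as an assumption for older libc headers — might be:

#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>
#include <stdint.h>

#ifndef SOL_XDP
#define SOL_XDP 283	/* assumption: value from include/linux/socket.h */
#endif

struct rx_ring {
	uint32_t *producer;	/* advanced by the kernel */
	uint32_t *consumer;	/* advanced by user space */
	struct xdp_desc *descs;
	uint32_t mask;
};

static int map_rx_ring(int fd, unsigned int ring_size, struct rx_ring *rx)
{
	struct xdp_mmap_offsets off;
	socklen_t optlen = sizeof(off);
	void *map;

	if (getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen))
		return -1;

	map = mmap(NULL, off.rx.desc + ring_size * sizeof(struct xdp_desc),
		   PROT_READ | PROT_WRITE, MAP_SHARED,
		   fd, XDP_PGOFF_RX_RING);
	if (map == MAP_FAILED)
		return -1;

	rx->producer = (uint32_t *)((char *)map + off.rx.producer);
	rx->consumer = (uint32_t *)((char *)map + off.rx.consumer);
	rx->descs = (struct xdp_desc *)((char *)map + off.rx.desc);
	rx->mask = ring_size - 1;
	return 0;
}

Draining the ring then follows the producer/consumer scheme in xsk_queue.h below: entries between *consumer and *producer are valid, and user space bumps *consumer once a descriptor has been handled.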
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
new file mode 100644
index 000000000000..6c32e92e98fc
--- /dev/null
+++ b/net/xdp/xsk_queue.c
@@ -0,0 +1,63 @@
1// SPDX-License-Identifier: GPL-2.0
2/* XDP user-space ring structure
3 * Copyright(c) 2018 Intel Corporation.
4 */
5
6#include <linux/slab.h>
7
8#include "xsk_queue.h"
9
10void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props)
11{
12 if (!q)
13 return;
14
15 q->umem_props = *umem_props;
16}
17
18static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
19{
20 return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u64);
21}
22
23static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q)
24{
25 return sizeof(struct xdp_ring) + q->nentries * sizeof(struct xdp_desc);
26}
27
28struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
29{
30 struct xsk_queue *q;
31 gfp_t gfp_flags;
32 size_t size;
33
34 q = kzalloc(sizeof(*q), GFP_KERNEL);
35 if (!q)
36 return NULL;
37
38 q->nentries = nentries;
39 q->ring_mask = nentries - 1;
40
41 gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN |
42 __GFP_COMP | __GFP_NORETRY;
43 size = umem_queue ? xskq_umem_get_ring_size(q) :
44 xskq_rxtx_get_ring_size(q);
45
46 q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags,
47 get_order(size));
48 if (!q->ring) {
49 kfree(q);
50 return NULL;
51 }
52
53 return q;
54}
55
56void xskq_destroy(struct xsk_queue *q)
57{
58 if (!q)
59 return;
60
61 page_frag_free(q->ring);
62 kfree(q);
63}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
new file mode 100644
index 000000000000..ef6a6f0ec949
--- /dev/null
+++ b/net/xdp/xsk_queue.h
@@ -0,0 +1,265 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/* XDP user-space ring structure
3 * Copyright(c) 2018 Intel Corporation.
4 */
5
6#ifndef _LINUX_XSK_QUEUE_H
7#define _LINUX_XSK_QUEUE_H
8
9#include <linux/types.h>
10#include <linux/if_xdp.h>
11#include <net/xdp_sock.h>
12
13#define RX_BATCH_SIZE 16
14#define LAZY_UPDATE_THRESHOLD 128
15
16struct xdp_ring {
17 u32 producer ____cacheline_aligned_in_smp;
18 u32 consumer ____cacheline_aligned_in_smp;
19};
20
21/* Used for the RX and TX queues for packets */
22struct xdp_rxtx_ring {
23 struct xdp_ring ptrs;
24 struct xdp_desc desc[0] ____cacheline_aligned_in_smp;
25};
26
27/* Used for the fill and completion queues for buffers */
28struct xdp_umem_ring {
29 struct xdp_ring ptrs;
30 u64 desc[0] ____cacheline_aligned_in_smp;
31};
32
33struct xsk_queue {
34 struct xdp_umem_props umem_props;
35 u32 ring_mask;
36 u32 nentries;
37 u32 prod_head;
38 u32 prod_tail;
39 u32 cons_head;
40 u32 cons_tail;
41 struct xdp_ring *ring;
42 u64 invalid_descs;
43};
44
45/* Common functions operating for both RXTX and umem queues */
46
47static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q)
48{
49 return q ? q->invalid_descs : 0;
50}
51
52static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt)
53{
54 u32 entries = q->prod_tail - q->cons_tail;
55
56 if (entries == 0) {
57 /* Refresh the local pointer */
58 q->prod_tail = READ_ONCE(q->ring->producer);
59 entries = q->prod_tail - q->cons_tail;
60 }
61
62 return (entries > dcnt) ? dcnt : entries;
63}
64
65static inline u32 xskq_nb_free_lazy(struct xsk_queue *q, u32 producer)
66{
67 return q->nentries - (producer - q->cons_tail);
68}
69
70static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
71{
72 u32 free_entries = xskq_nb_free_lazy(q, producer);
73
74 if (free_entries >= dcnt)
75 return free_entries;
76
77 /* Refresh the local tail pointer */
78 q->cons_tail = READ_ONCE(q->ring->consumer);
79 return q->nentries - (producer - q->cons_tail);
80}
81
82/* UMEM queue */
83
84static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
85{
86 if (addr >= q->umem_props.size) {
87 q->invalid_descs++;
88 return false;
89 }
90
91 return true;
92}
93
94static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
95{
96 while (q->cons_tail != q->cons_head) {
97 struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
98 unsigned int idx = q->cons_tail & q->ring_mask;
99
100 *addr = READ_ONCE(ring->desc[idx]) & q->umem_props.chunk_mask;
101 if (xskq_is_valid_addr(q, *addr))
102 return addr;
103
104 q->cons_tail++;
105 }
106
107 return NULL;
108}
109
110static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
111{
112 if (q->cons_tail == q->cons_head) {
113 WRITE_ONCE(q->ring->consumer, q->cons_tail);
114 q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
115
116 /* Order consumer and data */
117 smp_rmb();
118 }
119
120 return xskq_validate_addr(q, addr);
121}
122
123static inline void xskq_discard_addr(struct xsk_queue *q)
124{
125 q->cons_tail++;
126}
127
128static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr)
129{
130 struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
131
132 if (xskq_nb_free(q, q->prod_tail, LAZY_UPDATE_THRESHOLD) == 0)
133 return -ENOSPC;
134
135 ring->desc[q->prod_tail++ & q->ring_mask] = addr;
136
137 /* Order producer and data */
138 smp_wmb();
139
140 WRITE_ONCE(q->ring->producer, q->prod_tail);
141 return 0;
142}
143
144static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr)
145{
146 struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
147
148 if (xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD) == 0)
149 return -ENOSPC;
150
151 ring->desc[q->prod_head++ & q->ring_mask] = addr;
152 return 0;
153}
154
155static inline void xskq_produce_flush_addr_n(struct xsk_queue *q,
156 u32 nb_entries)
157{
158 /* Order producer and data */
159 smp_wmb();
160
161 q->prod_tail += nb_entries;
162 WRITE_ONCE(q->ring->producer, q->prod_tail);
163}
164
165static inline int xskq_reserve_addr(struct xsk_queue *q)
166{
167 if (xskq_nb_free(q, q->prod_head, 1) == 0)
168 return -ENOSPC;
169
170 q->prod_head++;
171 return 0;
172}
173
174/* Rx/Tx queue */
175
176static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
177{
178 if (!xskq_is_valid_addr(q, d->addr))
179 return false;
180
181 if (((d->addr + d->len) & q->umem_props.chunk_mask) !=
182 (d->addr & q->umem_props.chunk_mask)) {
183 q->invalid_descs++;
184 return false;
185 }
186
187 return true;
188}
189
190static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
191 struct xdp_desc *desc)
192{
193 while (q->cons_tail != q->cons_head) {
194 struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
195 unsigned int idx = q->cons_tail & q->ring_mask;
196
197 *desc = READ_ONCE(ring->desc[idx]);
198 if (xskq_is_valid_desc(q, desc))
199 return desc;
200
201 q->cons_tail++;
202 }
203
204 return NULL;
205}
206
207static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
208 struct xdp_desc *desc)
209{
210 if (q->cons_tail == q->cons_head) {
211 WRITE_ONCE(q->ring->consumer, q->cons_tail);
212 q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
213
214 /* Order consumer and data */
215 smp_rmb();
216 }
217
218 return xskq_validate_desc(q, desc);
219}
220
221static inline void xskq_discard_desc(struct xsk_queue *q)
222{
223 q->cons_tail++;
224}
225
226static inline int xskq_produce_batch_desc(struct xsk_queue *q,
227 u64 addr, u32 len)
228{
229 struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
230 unsigned int idx;
231
232 if (xskq_nb_free(q, q->prod_head, 1) == 0)
233 return -ENOSPC;
234
235 idx = (q->prod_head++) & q->ring_mask;
236 ring->desc[idx].addr = addr;
237 ring->desc[idx].len = len;
238
239 return 0;
240}
241
242static inline void xskq_produce_flush_desc(struct xsk_queue *q)
243{
244 /* Order producer and data */
245 smp_wmb();
246
247 q->prod_tail = q->prod_head,
248 WRITE_ONCE(q->ring->producer, q->prod_tail);
249}
250
251static inline bool xskq_full_desc(struct xsk_queue *q)
252{
253 return xskq_nb_avail(q, q->nentries) == q->nentries;
254}
255
256static inline bool xskq_empty_desc(struct xsk_queue *q)
257{
258 return xskq_nb_free(q, q->prod_tail, 1) == q->nentries;
259}
260
261void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props);
262struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
263void xskq_destroy(struct xsk_queue *q_ops);
264
265#endif /* _LINUX_XSK_QUEUE_H */
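xskq_is_valid_desc() above rejects (and counts as invalid) any TX descriptor that falls outside the UMEM or crosses a chunk boundary, i.e. addr and addr + len must share the same chunk under chunk_mask. A hedged illustration of that rule, written as the pre-flight check a user-space producer could apply before posting to the TX ring (the helper name is illustrative, not part of the uAPI):

#include <stdbool.h>
#include <stdint.h>

static bool desc_fits_one_chunk(uint64_t addr, uint32_t len,
				uint64_t umem_size, uint64_t chunk_size)
{
	uint64_t chunk_mask = ~(chunk_size - 1);	/* as in umem->props.chunk_mask */

	if (addr >= umem_size)
		return false;				/* outside the UMEM */
	return ((addr + len) & chunk_mask) == (addr & chunk_mask);
}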
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 6c177ae7a6d9..8308281f3253 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -42,6 +42,7 @@ static void xfrm_state_gc_task(struct work_struct *work);
42 42
43static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024; 43static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
44static __read_mostly seqcount_t xfrm_state_hash_generation = SEQCNT_ZERO(xfrm_state_hash_generation); 44static __read_mostly seqcount_t xfrm_state_hash_generation = SEQCNT_ZERO(xfrm_state_hash_generation);
45static struct kmem_cache *xfrm_state_cache __ro_after_init;
45 46
46static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task); 47static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task);
47static HLIST_HEAD(xfrm_state_gc_list); 48static HLIST_HEAD(xfrm_state_gc_list);
@@ -451,7 +452,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
451 } 452 }
452 xfrm_dev_state_free(x); 453 xfrm_dev_state_free(x);
453 security_xfrm_state_free(x); 454 security_xfrm_state_free(x);
454 kfree(x); 455 kmem_cache_free(xfrm_state_cache, x);
455} 456}
456 457
457static void xfrm_state_gc_task(struct work_struct *work) 458static void xfrm_state_gc_task(struct work_struct *work)
@@ -563,7 +564,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
563{ 564{
564 struct xfrm_state *x; 565 struct xfrm_state *x;
565 566
566 x = kzalloc(sizeof(struct xfrm_state), GFP_ATOMIC); 567 x = kmem_cache_alloc(xfrm_state_cache, GFP_ATOMIC | __GFP_ZERO);
567 568
568 if (x) { 569 if (x) {
569 write_pnet(&x->xs_net, net); 570 write_pnet(&x->xs_net, net);
@@ -2313,6 +2314,10 @@ int __net_init xfrm_state_init(struct net *net)
2313{ 2314{
2314 unsigned int sz; 2315 unsigned int sz;
2315 2316
2317 if (net_eq(net, &init_net))
2318 xfrm_state_cache = KMEM_CACHE(xfrm_state,
2319 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
2320
2316 INIT_LIST_HEAD(&net->xfrm.state_all); 2321 INIT_LIST_HEAD(&net->xfrm.state_all);
2317 2322
2318 sz = sizeof(struct hlist_head) * 8; 2323 sz = sizeof(struct hlist_head) * 8;