author    Ingo Molnar <mingo@kernel.org>  2018-10-29 02:20:52 -0400
committer Ingo Molnar <mingo@kernel.org>  2018-10-29 02:20:52 -0400
commit    f0718d792b8a6d4b5ddc929e418ac57cc4897375 (patch)
tree      3dbaa824ce380e99709fae47c047383ca39c983a /net
parent    efe8eaf7b525f1be26fe20d723d2bfbfcd7455fd (diff)
parent    b59dfdaef173677b0b7e10f375226c0a1114fd20 (diff)

Merge branch 'linus' into perf/urgent, to pick up fixes

Signed-off-by: Ingo Molnar <mingo@kernel.org>

Diffstat (limited to 'net')
-rw-r--r--net/8021q/vlan_dev.c3
-rw-r--r--net/Kconfig11
-rw-r--r--net/atm/common.c2
-rw-r--r--net/batman-adv/Kconfig11
-rw-r--r--net/batman-adv/Makefile3
-rw-r--r--net/batman-adv/bat_iv_ogm.c330
-rw-r--r--net/batman-adv/debugfs.c37
-rw-r--r--net/batman-adv/debugfs.h6
-rw-r--r--net/batman-adv/hard-interface.c47
-rw-r--r--net/batman-adv/icmp_socket.c3
-rw-r--r--net/batman-adv/log.c20
-rw-r--r--net/batman-adv/main.h2
-rw-r--r--net/batman-adv/originator.c107
-rw-r--r--net/batman-adv/originator.h4
-rw-r--r--net/batman-adv/soft-interface.c2
-rw-r--r--net/batman-adv/trace.c22
-rw-r--r--net/batman-adv/trace.h78
-rw-r--r--net/batman-adv/types.h62
-rw-r--r--net/bluetooth/bnep/core.c7
-rw-r--r--net/bluetooth/bnep/sock.c19
-rw-r--r--net/bluetooth/cmtp/core.c14
-rw-r--r--net/bluetooth/cmtp/sock.c19
-rw-r--r--net/bluetooth/hci_core.c65
-rw-r--r--net/bluetooth/hci_event.c85
-rw-r--r--net/bluetooth/hidp/core.c23
-rw-r--r--net/bluetooth/hidp/hidp.h2
-rw-r--r--net/bluetooth/hidp/sock.c79
-rw-r--r--net/bluetooth/l2cap_core.c102
-rw-r--r--net/bluetooth/rfcomm/tty.c12
-rw-r--r--net/bluetooth/smp.c23
-rw-r--r--net/bpf/test_run.c35
-rw-r--r--net/bpfilter/bpfilter_kern.c1
-rw-r--r--net/bridge/Kconfig2
-rw-r--r--net/bridge/br.c20
-rw-r--r--net/bridge/br_arp_nd_proxy.c15
-rw-r--r--net/bridge/br_device.c8
-rw-r--r--net/bridge/br_fdb.c24
-rw-r--r--net/bridge/br_if.c9
-rw-r--r--net/bridge/br_input.c2
-rw-r--r--net/bridge/br_mdb.c36
-rw-r--r--net/bridge/br_multicast.c65
-rw-r--r--net/bridge/br_netfilter_hooks.c7
-rw-r--r--net/bridge/br_netlink.c43
-rw-r--r--net/bridge/br_private.h71
-rw-r--r--net/bridge/br_switchdev.c9
-rw-r--r--net/bridge/br_sysfs_br.c49
-rw-r--r--net/bridge/br_vlan.c88
-rw-r--r--net/caif/caif_socket.c2
-rw-r--r--net/caif/cfrfml.c3
-rw-r--r--net/ceph/crypto.c12
-rw-r--r--net/ceph/crypto.h2
-rw-r--r--net/compat.c10
-rw-r--r--net/core/Makefile2
-rw-r--r--net/core/datagram.c7
-rw-r--r--net/core/dev.c40
-rw-r--r--net/core/devlink.c25
-rw-r--r--net/core/ethtool.c181
-rw-r--r--net/core/fib_rules.c36
-rw-r--r--net/core/filter.c803
-rw-r--r--net/core/flow_dissector.c150
-rw-r--r--net/core/gen_stats.c73
-rw-r--r--net/core/link_watch.c2
-rw-r--r--net/core/neighbour.c210
-rw-r--r--net/core/net_namespace.c6
-rw-r--r--net/core/netclassid_cgroup.c1
-rw-r--r--net/core/netpoll.c21
-rw-r--r--net/core/pktgen.c2
-rw-r--r--net/core/rtnetlink.c381
-rw-r--r--net/core/skbuff.c58
-rw-r--r--net/core/skmsg.c802
-rw-r--r--net/core/sock.c74
-rw-r--r--net/core/sock_map.c1003
-rw-r--r--net/core/xdp.c53
-rw-r--r--net/dccp/proto.c2
-rw-r--r--net/decnet/dn_dev.c2
-rw-r--r--net/dns_resolver/dns_key.c67
-rw-r--r--net/dns_resolver/dns_query.c5
-rw-r--r--net/dsa/Kconfig3
-rw-r--r--net/dsa/Makefile1
-rw-r--r--net/dsa/dsa.c49
-rw-r--r--net/dsa/dsa_priv.h4
-rw-r--r--net/dsa/legacy.c9
-rw-r--r--net/dsa/slave.c31
-rw-r--r--net/dsa/tag_gswip.c109
-rw-r--r--net/ieee802154/6lowpan/reassembly.c3
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/ah4.c4
-rw-r--r--net/ipv4/arp.c2
-rw-r--r--net/ipv4/cipso_ipv4.c11
-rw-r--r--net/ipv4/datagram.c2
-rw-r--r--net/ipv4/devinet.c208
-rw-r--r--net/ipv4/esp4.c11
-rw-r--r--net/ipv4/fib_frontend.c146
-rw-r--r--net/ipv4/fib_semantics.c37
-rw-r--r--net/ipv4/fib_trie.c37
-rw-r--r--net/ipv4/gre_demux.c7
-rw-r--r--net/ipv4/icmp.c4
-rw-r--r--net/ipv4/ip_fragment.c27
-rw-r--r--net/ipv4/ip_gre.c15
-rw-r--r--net/ipv4/ip_input.c6
-rw-r--r--net/ipv4/ip_output.c4
-rw-r--r--net/ipv4/ip_vti.c4
-rw-r--r--net/ipv4/ipcomp.c4
-rw-r--r--net/ipv4/ipip.c5
-rw-r--r--net/ipv4/ipmr.c60
-rw-r--r--net/ipv4/ipmr_base.c121
-rw-r--r--net/ipv4/metrics.c30
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c17
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c1
-rw-r--r--net/ipv4/netfilter/nf_nat_masquerade_ipv4.c22
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic_main.c1
-rw-r--r--net/ipv4/netfilter/nft_fib_ipv4.c27
-rw-r--r--net/ipv4/ping.c2
-rw-r--r--net/ipv4/raw.c2
-rw-r--r--net/ipv4/route.c48
-rw-r--r--net/ipv4/syncookies.c2
-rw-r--r--net/ipv4/tcp.c56
-rw-r--r--net/ipv4/tcp_bbr.c90
-rw-r--r--net/ipv4/tcp_bpf.c668
-rw-r--r--net/ipv4/tcp_cdg.c2
-rw-r--r--net/ipv4/tcp_dctcp.c55
-rw-r--r--net/ipv4/tcp_dctcp.h40
-rw-r--r--net/ipv4/tcp_input.c59
-rw-r--r--net/ipv4/tcp_ipv4.c4
-rw-r--r--net/ipv4/tcp_output.c162
-rw-r--r--net/ipv4/tcp_rate.c15
-rw-r--r--net/ipv4/tcp_recovery.c5
-rw-r--r--net/ipv4/tcp_timer.c2
-rw-r--r--net/ipv4/tcp_ulp.c75
-rw-r--r--net/ipv4/udp.c28
-rw-r--r--net/ipv4/udp_offload.c2
-rw-r--r--net/ipv6/addrconf.c286
-rw-r--r--net/ipv6/addrlabel.c34
-rw-r--r--net/ipv6/af_inet6.c8
-rw-r--r--net/ipv6/esp6.c7
-rw-r--r--net/ipv6/ip6_checksum.c20
-rw-r--r--net/ipv6/ip6_fib.c71
-rw-r--r--net/ipv6/ip6_gre.c26
-rw-r--r--net/ipv6/ip6_input.c3
-rw-r--r--net/ipv6/ip6_output.c2
-rw-r--r--net/ipv6/ip6mr.c77
-rw-r--r--net/ipv6/ipv6_sockglue.c11
-rw-r--r--net/ipv6/mcast.c2
-rw-r--r--net/ipv6/ndisc.c7
-rw-r--r--net/ipv6/netfilter/ip6t_ipv6header.c5
-rw-r--r--net/ipv6/netfilter/ip6t_rt.c10
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c2
-rw-r--r--net/ipv6/netfilter/nf_nat_masquerade_ipv6.c19
-rw-r--r--net/ipv6/reassembly.c14
-rw-r--r--net/ipv6/route.c251
-rw-r--r--net/ipv6/sit.c6
-rw-r--r--net/ipv6/udp.c4
-rw-r--r--net/ipv6/udp_offload.c2
-rw-r--r--net/iucv/af_iucv.c46
-rw-r--r--net/llc/af_llc.c11
-rw-r--r--net/llc/llc_core.c4
-rw-r--r--net/mac80211/Kconfig17
-rw-r--r--net/mac80211/Makefile11
-rw-r--r--net/mac80211/cfg.c142
-rw-r--r--net/mac80211/debugfs.c4
-rw-r--r--net/mac80211/debugfs_sta.c364
-rw-r--r--net/mac80211/driver-ops.h26
-rw-r--r--net/mac80211/ibss.c4
-rw-r--r--net/mac80211/ieee80211_i.h11
-rw-r--r--net/mac80211/key.c111
-rw-r--r--net/mac80211/main.c78
-rw-r--r--net/mac80211/mesh.c5
-rw-r--r--net/mac80211/mlme.c130
-rw-r--r--net/mac80211/rate.h13
-rw-r--r--net/mac80211/rc80211_minstrel.c162
-rw-r--r--net/mac80211/rc80211_minstrel.h35
-rw-r--r--net/mac80211/rc80211_minstrel_debugfs.c68
-rw-r--r--net/mac80211/rc80211_minstrel_ht.c298
-rw-r--r--net/mac80211/rc80211_minstrel_ht.h20
-rw-r--r--net/mac80211/rc80211_minstrel_ht_debugfs.c58
-rw-r--r--net/mac80211/rx.c55
-rw-r--r--net/mac80211/spectmgmt.c5
-rw-r--r--net/mac80211/sta_info.c27
-rw-r--r--net/mac80211/status.c19
-rw-r--r--net/mac80211/trace.h23
-rw-r--r--net/mac80211/tx.c75
-rw-r--r--net/mac80211/util.c166
-rw-r--r--net/mac80211/vht.c20
-rw-r--r--net/mac802154/llsec.c16
-rw-r--r--net/mac802154/llsec.h2
-rw-r--r--net/mpls/af_mpls.c132
-rw-r--r--net/ncsi/Kconfig6
-rw-r--r--net/ncsi/internal.h21
-rw-r--r--net/ncsi/ncsi-cmd.c38
-rw-r--r--net/ncsi/ncsi-manage.c98
-rw-r--r--net/ncsi/ncsi-netlink.c205
-rw-r--r--net/ncsi/ncsi-netlink.h12
-rw-r--r--net/ncsi/ncsi-pkt.h22
-rw-r--r--net/ncsi/ncsi-rsp.c150
-rw-r--r--net/netfilter/Kconfig7
-rw-r--r--net/netfilter/Makefile1
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h2
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c2
-rw-r--r--net/netfilter/nf_conntrack_core.c105
-rw-r--r--net/netfilter/nf_conntrack_expect.c3
-rw-r--r--net/netfilter/nf_conntrack_netlink.c73
-rw-r--r--net/netfilter/nf_conntrack_proto.c117
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c155
-rw-r--r--net/netfilter/nf_conntrack_proto_generic.c28
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c44
-rw-r--r--net/netfilter/nf_conntrack_proto_icmp.c78
-rw-r--r--net/netfilter/nf_conntrack_proto_icmpv6.c80
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c253
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c251
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c236
-rw-r--r--net/netfilter/nf_conntrack_standalone.c9
-rw-r--r--net/netfilter/nf_flow_table_core.c52
-rw-r--r--net/netfilter/nf_flow_table_ip.c6
-rw-r--r--net/netfilter/nf_nat_helper.c4
-rw-r--r--net/netfilter/nf_nat_redirect.c4
-rw-r--r--net/netfilter/nf_tables_api.c123
-rw-r--r--net/netfilter/nf_tables_core.c28
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c61
-rw-r--r--net/netfilter/nfnetlink_osf.c46
-rw-r--r--net/netfilter/nfnetlink_queue.c2
-rw-r--r--net/netfilter/nft_cmp.c6
-rw-r--r--net/netfilter/nft_compat.c24
-rw-r--r--net/netfilter/nft_ct.c22
-rw-r--r--net/netfilter/nft_dup_netdev.c2
-rw-r--r--net/netfilter/nft_dynset.c21
-rw-r--r--net/netfilter/nft_flow_offload.c2
-rw-r--r--net/netfilter/nft_fwd_netdev.c4
-rw-r--r--net/netfilter/nft_lookup.c20
-rw-r--r--net/netfilter/nft_meta.c116
-rw-r--r--net/netfilter/nft_objref.c20
-rw-r--r--net/netfilter/nft_osf.c25
-rw-r--r--net/netfilter/nft_reject.c6
-rw-r--r--net/netfilter/nft_rt.c11
-rw-r--r--net/netfilter/nft_set_hash.c38
-rw-r--r--net/netfilter/nft_set_rbtree.c10
-rw-r--r--net/netfilter/nft_xfrm.c294
-rw-r--r--net/netfilter/xt_CT.c2
-rw-r--r--net/netfilter/xt_IDLETIMER.c4
-rw-r--r--net/netfilter/xt_SECMARK.c2
-rw-r--r--net/netfilter/xt_TEE.c76
-rw-r--r--net/netfilter/xt_cgroup.c72
-rw-r--r--net/netfilter/xt_nat.c2
-rw-r--r--net/netfilter/xt_osf.c8
-rw-r--r--net/netlink/af_netlink.c47
-rw-r--r--net/netlink/af_netlink.h1
-rw-r--r--net/nfc/llcp_sock.c2
-rw-r--r--net/nfc/nci/uart.c7
-rw-r--r--net/openvswitch/conntrack.c8
-rw-r--r--net/openvswitch/datapath.c20
-rw-r--r--net/openvswitch/flow.c22
-rw-r--r--net/openvswitch/vport-internal_dev.c5
-rw-r--r--net/packet/af_packet.c17
-rw-r--r--net/rds/rds.h2
-rw-r--r--net/rds/recv.c19
-rw-r--r--net/rfkill/core.c4
-rw-r--r--net/rxrpc/af_rxrpc.c19
-rw-r--r--net/rxrpc/ar-internal.h7
-rw-r--r--net/rxrpc/call_accept.c2
-rw-r--r--net/rxrpc/conn_object.c7
-rw-r--r--net/rxrpc/input.c2
-rw-r--r--net/rxrpc/local_event.c2
-rw-r--r--net/rxrpc/net_ns.c3
-rw-r--r--net/rxrpc/output.c10
-rw-r--r--net/rxrpc/peer_event.c12
-rw-r--r--net/rxrpc/proc.c126
-rw-r--r--net/rxrpc/recvmsg.c43
-rw-r--r--net/rxrpc/rxkad.c44
-rw-r--r--net/rxrpc/skbuff.c15
-rw-r--r--net/rxrpc/utils.c23
-rw-r--r--net/sched/Kconfig11
-rw-r--r--net/sched/Makefile1
-rw-r--r--net/sched/act_api.c80
-rw-r--r--net/sched/act_bpf.c3
-rw-r--r--net/sched/act_connmark.c14
-rw-r--r--net/sched/act_csum.c3
-rw-r--r--net/sched/act_gact.c14
-rw-r--r--net/sched/act_ife.c3
-rw-r--r--net/sched/act_ipt.c6
-rw-r--r--net/sched/act_mirred.c8
-rw-r--r--net/sched/act_nat.c18
-rw-r--r--net/sched/act_pedit.c3
-rw-r--r--net/sched/act_police.c199
-rw-r--r--net/sched/act_sample.c3
-rw-r--r--net/sched/act_simple.c3
-rw-r--r--net/sched/act_skbedit.c26
-rw-r--r--net/sched/act_skbmod.c3
-rw-r--r--net/sched/act_tunnel_key.c3
-rw-r--r--net/sched/act_vlan.c3
-rw-r--r--net/sched/cls_api.c249
-rw-r--r--net/sched/cls_flower.c7
-rw-r--r--net/sched/cls_u32.c121
-rw-r--r--net/sched/sch_api.c34
-rw-r--r--net/sched/sch_atm.c2
-rw-r--r--net/sched/sch_cake.c6
-rw-r--r--net/sched/sch_cbq.c2
-rw-r--r--net/sched/sch_cbs.c2
-rw-r--r--net/sched/sch_drr.c4
-rw-r--r--net/sched/sch_dsmark.c2
-rw-r--r--net/sched/sch_fifo.c2
-rw-r--r--net/sched/sch_fq.c103
-rw-r--r--net/sched/sch_fq_codel.c2
-rw-r--r--net/sched/sch_generic.c66
-rw-r--r--net/sched/sch_hfsc.c2
-rw-r--r--net/sched/sch_hhf.c2
-rw-r--r--net/sched/sch_htb.c116
-rw-r--r--net/sched/sch_mq.c4
-rw-r--r--net/sched/sch_mqprio.c4
-rw-r--r--net/sched/sch_multiq.c6
-rw-r--r--net/sched/sch_netem.c16
-rw-r--r--net/sched/sch_pie.c36
-rw-r--r--net/sched/sch_prio.c6
-rw-r--r--net/sched/sch_qfq.c4
-rw-r--r--net/sched/sch_red.c4
-rw-r--r--net/sched/sch_sfb.c4
-rw-r--r--net/sched/sch_taprio.c962
-rw-r--r--net/sched/sch_tbf.c6
-rw-r--r--net/sctp/outqueue.c8
-rw-r--r--net/sctp/socket.c59
-rw-r--r--net/sctp/ulpqueue.c2
-rw-r--r--net/smc/af_smc.c2
-rw-r--r--net/smc/smc_core.c25
-rw-r--r--net/socket.c20
-rw-r--r--net/strparser/Kconfig4
-rw-r--r--net/sunrpc/auth.c310
-rw-r--r--net/sunrpc/auth_generic.c2
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c45
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c87
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_keys.c9
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c53
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seal.c38
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seqnum.c18
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_wrap.c28
-rw-r--r--net/sunrpc/auth_gss/gss_mech_switch.c28
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.c1
-rw-r--r--net/sunrpc/auth_null.c6
-rw-r--r--net/sunrpc/auth_unix.c4
-rw-r--r--net/sunrpc/backchannel_rqst.c1
-rw-r--r--net/sunrpc/clnt.c174
-rw-r--r--net/sunrpc/sched.c178
-rw-r--r--net/sunrpc/socklib.c10
-rw-r--r--net/sunrpc/svc_xprt.c2
-rw-r--r--net/sunrpc/svcsock.c6
-rw-r--r--net/sunrpc/xdr.c34
-rw-r--r--net/sunrpc/xprt.c908
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c20
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c131
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c137
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c30
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c15
-rw-r--r--net/sunrpc/xprtrdma/transport.c120
-rw-r--r--net/sunrpc/xprtrdma/verbs.c178
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h18
-rw-r--r--net/sunrpc/xprtsock.c1107
-rw-r--r--net/tipc/bearer.c2
-rw-r--r--net/tipc/msg.c78
-rw-r--r--net/tipc/msg.h11
-rw-r--r--net/tipc/name_distr.c18
-rw-r--r--net/tipc/name_table.c1
-rw-r--r--net/tipc/name_table.h1
-rw-r--r--net/tipc/node.h12
-rw-r--r--net/tipc/socket.c209
-rw-r--r--net/tipc/topsrv.c12
-rw-r--r--net/tipc/udp_media.c18
-rw-r--r--net/tls/Kconfig1
-rw-r--r--net/tls/tls_device.c2
-rw-r--r--net/tls/tls_main.c67
-rw-r--r--net/tls/tls_sw.c1376
-rw-r--r--net/unix/af_unix.c6
-rw-r--r--net/wireless/core.c83
-rw-r--r--net/wireless/core.h14
-rw-r--r--net/wireless/lib80211_crypt_tkip.c59
-rw-r--r--net/wireless/lib80211_crypt_wep.c52
-rw-r--r--net/wireless/nl80211.c840
-rw-r--r--net/wireless/rdev-ops.h15
-rw-r--r--net/wireless/reg.c121
-rw-r--r--net/wireless/trace.h235
-rw-r--r--net/wireless/util.c160
-rw-r--r--net/xdp/xdp_umem.c106
-rw-r--r--net/xdp/xdp_umem.h12
-rw-r--r--net/xdp/xdp_umem_props.h14
-rw-r--r--net/xdp/xsk.c54
-rw-r--r--net/xdp/xsk_queue.c60
-rw-r--r--net/xdp/xsk_queue.h16
-rw-r--r--net/xfrm/xfrm_device.c8
-rw-r--r--net/xfrm/xfrm_hash.h5
-rw-r--r--net/xfrm/xfrm_input.c2
-rw-r--r--net/xfrm/xfrm_interface.c9
-rw-r--r--net/xfrm/xfrm_output.c2
-rw-r--r--net/xfrm/xfrm_user.c2
390 files changed, 16654 insertions, 7690 deletions
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 546af0e73ac3..ff720f1ebf73 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -756,8 +756,7 @@ static void vlan_dev_netpoll_cleanup(struct net_device *dev)
 		return;
 
 	vlan->netpoll = NULL;
-
-	__netpoll_free_async(netpoll);
+	__netpoll_free(netpoll);
 }
 #endif /* CONFIG_NET_POLL_CONTROLLER */
 
diff --git a/net/Kconfig b/net/Kconfig
index 228dfa382eec..f235edb593ba 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -300,8 +300,11 @@ config BPF_JIT
 
 config BPF_STREAM_PARSER
 	bool "enable BPF STREAM_PARSER"
+	depends on INET
 	depends on BPF_SYSCALL
+	depends on CGROUP_BPF
 	select STREAM_PARSER
+	select NET_SOCK_MSG
 	---help---
 	  Enabling this allows a stream parser to be used with
 	  BPF_MAP_TYPE_SOCKMAP.
@@ -413,6 +416,14 @@ config GRO_CELLS
 config SOCK_VALIDATE_XMIT
 	bool
 
+config NET_SOCK_MSG
+	bool
+	default n
+	help
+	  The NET_SOCK_MSG provides a framework for plain sockets (e.g. TCP) or
+	  ULPs (upper layer modules, e.g. TLS) to process L7 application data
+	  with the help of BPF programs.
+
 config NET_DEVLINK
 	tristate "Network physical/parent device Netlink interface"
 	help
diff --git a/net/atm/common.c b/net/atm/common.c
index 9f8cb0d2e71e..a38c174fc766 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -653,7 +653,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
 	struct atm_vcc *vcc;
 	__poll_t mask;
 
-	sock_poll_wait(file, wait);
+	sock_poll_wait(file, sock, wait);
 	mask = 0;
 
 	vcc = ATM_SD(sock);
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index 361116f77cb9..f75816f58107 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -106,3 +106,14 @@ config BATMAN_ADV_DEBUG
 	  say N here. This enables compilation of support for
 	  outputting debugging information to the kernel log. The
 	  output is controlled via the module parameter debug.
+
+config BATMAN_ADV_TRACING
+	bool "B.A.T.M.A.N. tracing support"
+	depends on BATMAN_ADV
+	depends on EVENT_TRACING
+	help
+	  This is an option for use by developers; most people should
+	  say N here. Select this option to gather traces like the debug
+	  messages using the generic tracing infrastructure of the kernel.
+	  BATMAN_ADV_DEBUG must also be selected to get trace events for
+	  batadv_dbg.
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index b97ba6fb8353..9b58160fe485 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -42,6 +42,9 @@ batman-adv-y += routing.o
 batman-adv-y += send.o
 batman-adv-y += soft-interface.o
 batman-adv-y += sysfs.o
+batman-adv-$(CONFIG_BATMAN_ADV_TRACING) += trace.o
 batman-adv-y += tp_meter.o
 batman-adv-y += translation-table.o
 batman-adv-y += tvlv.o
+
+CFLAGS_trace.o := -I$(src)
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 73bf6a93a3cf..d2227091029f 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -138,169 +138,6 @@ static u8 batadv_ring_buffer_avg(const u8 lq_recv[])
138} 138}
139 139
140/** 140/**
141 * batadv_iv_ogm_orig_free() - free the private resources allocated for this
142 * orig_node
143 * @orig_node: the orig_node for which the resources have to be free'd
144 */
145static void batadv_iv_ogm_orig_free(struct batadv_orig_node *orig_node)
146{
147 kfree(orig_node->bat_iv.bcast_own);
148 kfree(orig_node->bat_iv.bcast_own_sum);
149}
150
151/**
152 * batadv_iv_ogm_orig_add_if() - change the private structures of the orig_node
153 * to include the new hard-interface
154 * @orig_node: the orig_node that has to be changed
155 * @max_if_num: the current amount of interfaces
156 *
157 * Return: 0 on success, a negative error code otherwise.
158 */
159static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node,
160 unsigned int max_if_num)
161{
162 void *data_ptr;
163 size_t old_size;
164 int ret = -ENOMEM;
165
166 spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
167
168 old_size = (max_if_num - 1) * sizeof(unsigned long) * BATADV_NUM_WORDS;
169 data_ptr = kmalloc_array(max_if_num,
170 BATADV_NUM_WORDS * sizeof(unsigned long),
171 GFP_ATOMIC);
172 if (!data_ptr)
173 goto unlock;
174
175 memcpy(data_ptr, orig_node->bat_iv.bcast_own, old_size);
176 kfree(orig_node->bat_iv.bcast_own);
177 orig_node->bat_iv.bcast_own = data_ptr;
178
179 data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC);
180 if (!data_ptr)
181 goto unlock;
182
183 memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum,
184 (max_if_num - 1) * sizeof(u8));
185 kfree(orig_node->bat_iv.bcast_own_sum);
186 orig_node->bat_iv.bcast_own_sum = data_ptr;
187
188 ret = 0;
189
190unlock:
191 spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
192
193 return ret;
194}
195
196/**
197 * batadv_iv_ogm_drop_bcast_own_entry() - drop section of bcast_own
198 * @orig_node: the orig_node that has to be changed
199 * @max_if_num: the current amount of interfaces
200 * @del_if_num: the index of the interface being removed
201 */
202static void
203batadv_iv_ogm_drop_bcast_own_entry(struct batadv_orig_node *orig_node,
204 unsigned int max_if_num,
205 unsigned int del_if_num)
206{
207 size_t chunk_size;
208 size_t if_offset;
209 void *data_ptr;
210
211 lockdep_assert_held(&orig_node->bat_iv.ogm_cnt_lock);
212
213 chunk_size = sizeof(unsigned long) * BATADV_NUM_WORDS;
214 data_ptr = kmalloc_array(max_if_num, chunk_size, GFP_ATOMIC);
215 if (!data_ptr)
216 /* use old buffer when new one could not be allocated */
217 data_ptr = orig_node->bat_iv.bcast_own;
218
219 /* copy first part */
220 memmove(data_ptr, orig_node->bat_iv.bcast_own, del_if_num * chunk_size);
221
222 /* copy second part */
223 if_offset = (del_if_num + 1) * chunk_size;
224 memmove((char *)data_ptr + del_if_num * chunk_size,
225 (uint8_t *)orig_node->bat_iv.bcast_own + if_offset,
226 (max_if_num - del_if_num) * chunk_size);
227
228 /* bcast_own was shrunk down in new buffer; free old one */
229 if (orig_node->bat_iv.bcast_own != data_ptr) {
230 kfree(orig_node->bat_iv.bcast_own);
231 orig_node->bat_iv.bcast_own = data_ptr;
232 }
233}
234
235/**
236 * batadv_iv_ogm_drop_bcast_own_sum_entry() - drop section of bcast_own_sum
237 * @orig_node: the orig_node that has to be changed
238 * @max_if_num: the current amount of interfaces
239 * @del_if_num: the index of the interface being removed
240 */
241static void
242batadv_iv_ogm_drop_bcast_own_sum_entry(struct batadv_orig_node *orig_node,
243 unsigned int max_if_num,
244 unsigned int del_if_num)
245{
246 size_t if_offset;
247 void *data_ptr;
248
249 lockdep_assert_held(&orig_node->bat_iv.ogm_cnt_lock);
250
251 data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC);
252 if (!data_ptr)
253 /* use old buffer when new one could not be allocated */
254 data_ptr = orig_node->bat_iv.bcast_own_sum;
255
256 memmove(data_ptr, orig_node->bat_iv.bcast_own_sum,
257 del_if_num * sizeof(u8));
258
259 if_offset = (del_if_num + 1) * sizeof(u8);
260 memmove((char *)data_ptr + del_if_num * sizeof(u8),
261 orig_node->bat_iv.bcast_own_sum + if_offset,
262 (max_if_num - del_if_num) * sizeof(u8));
263
264 /* bcast_own_sum was shrunk down in new buffer; free old one */
265 if (orig_node->bat_iv.bcast_own_sum != data_ptr) {
266 kfree(orig_node->bat_iv.bcast_own_sum);
267 orig_node->bat_iv.bcast_own_sum = data_ptr;
268 }
269}
270
271/**
272 * batadv_iv_ogm_orig_del_if() - change the private structures of the orig_node
273 * to exclude the removed interface
274 * @orig_node: the orig_node that has to be changed
275 * @max_if_num: the current amount of interfaces
276 * @del_if_num: the index of the interface being removed
277 *
278 * Return: 0 on success, a negative error code otherwise.
279 */
280static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node,
281 unsigned int max_if_num,
282 unsigned int del_if_num)
283{
284 spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
285
286 if (max_if_num == 0) {
287 kfree(orig_node->bat_iv.bcast_own);
288 kfree(orig_node->bat_iv.bcast_own_sum);
289 orig_node->bat_iv.bcast_own = NULL;
290 orig_node->bat_iv.bcast_own_sum = NULL;
291 } else {
292 batadv_iv_ogm_drop_bcast_own_entry(orig_node, max_if_num,
293 del_if_num);
294 batadv_iv_ogm_drop_bcast_own_sum_entry(orig_node, max_if_num,
295 del_if_num);
296 }
297
298 spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
299
300 return 0;
301}
302
303/**
304 * batadv_iv_ogm_orig_get() - retrieve or create (if does not exist) an 141 * batadv_iv_ogm_orig_get() - retrieve or create (if does not exist) an
305 * originator 142 * originator
306 * @bat_priv: the bat priv with all the soft interface information 143 * @bat_priv: the bat priv with all the soft interface information
@@ -315,7 +152,6 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr)
 {
 	struct batadv_orig_node *orig_node;
 	int hash_added;
-	size_t size;
 
 	orig_node = batadv_orig_hash_find(bat_priv, addr);
 	if (orig_node)
@@ -327,16 +163,6 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr)
 
 	spin_lock_init(&orig_node->bat_iv.ogm_cnt_lock);
 
-	size = bat_priv->num_ifaces * sizeof(unsigned long) * BATADV_NUM_WORDS;
-	orig_node->bat_iv.bcast_own = kzalloc(size, GFP_ATOMIC);
-	if (!orig_node->bat_iv.bcast_own)
-		goto free_orig_node;
-
-	size = bat_priv->num_ifaces * sizeof(u8);
-	orig_node->bat_iv.bcast_own_sum = kzalloc(size, GFP_ATOMIC);
-	if (!orig_node->bat_iv.bcast_own_sum)
-		goto free_orig_node;
-
 	kref_get(&orig_node->refcount);
 	hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig,
 				     batadv_choose_orig, orig_node,
@@ -347,8 +173,9 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr)
 	return orig_node;
 
 free_orig_node_hash:
+	/* reference for batadv_hash_add */
 	batadv_orig_node_put(orig_node);
-free_orig_node:
+	/* reference from batadv_orig_node_new */
 	batadv_orig_node_put(orig_node);
 
 	return NULL;
@@ -893,26 +720,30 @@ batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface)
 	struct batadv_hashtable *hash = bat_priv->orig_hash;
 	struct hlist_head *head;
 	struct batadv_orig_node *orig_node;
+	struct batadv_orig_ifinfo *orig_ifinfo;
 	unsigned long *word;
 	u32 i;
-	size_t word_index;
 	u8 *w;
-	unsigned int if_num;
 
 	for (i = 0; i < hash->size; i++) {
 		head = &hash->table[i];
 
 		rcu_read_lock();
 		hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
-			spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
-			word_index = hard_iface->if_num * BATADV_NUM_WORDS;
-			word = &orig_node->bat_iv.bcast_own[word_index];
-
-			batadv_bit_get_packet(bat_priv, word, 1, 0);
-			if_num = hard_iface->if_num;
-			w = &orig_node->bat_iv.bcast_own_sum[if_num];
-			*w = bitmap_weight(word, BATADV_TQ_LOCAL_WINDOW_SIZE);
-			spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+			hlist_for_each_entry_rcu(orig_ifinfo,
+						 &orig_node->ifinfo_list,
+						 list) {
+				if (orig_ifinfo->if_outgoing != hard_iface)
+					continue;
+
+				spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+				word = orig_ifinfo->bat_iv.bcast_own;
+				batadv_bit_get_packet(bat_priv, word, 1, 0);
+				w = &orig_ifinfo->bat_iv.bcast_own_sum;
+				*w = bitmap_weight(word,
+						   BATADV_TQ_LOCAL_WINDOW_SIZE);
+				spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+			}
 		}
 		rcu_read_unlock();
 	}
@@ -1000,6 +831,35 @@ out:
 }
 
 /**
+ * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over iterface
+ * @orig_node: originator which reproadcasted the OGMs directly
+ * @if_outgoing: interface which transmitted the original OGM and received the
+ *  direct rebroadcast
+ *
+ * Return: Number of replied (rebroadcasted) OGMs which were transmitted by
+ * an originator and directly (without intermediate hop) received by a specific
+ * interface
+ */
+static u8 batadv_iv_orig_ifinfo_sum(struct batadv_orig_node *orig_node,
+				    struct batadv_hard_iface *if_outgoing)
+{
+	struct batadv_orig_ifinfo *orig_ifinfo;
+	u8 sum;
+
+	orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_outgoing);
+	if (!orig_ifinfo)
+		return 0;
+
+	spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+	sum = orig_ifinfo->bat_iv.bcast_own_sum;
+	spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+
+	batadv_orig_ifinfo_put(orig_ifinfo);
+
+	return sum;
+}
+
+/**
  * batadv_iv_ogm_orig_update() - use OGM to update corresponding data in an
  *  originator
  * @bat_priv: the bat priv with all the soft interface information
@@ -1026,8 +886,6 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
 	struct batadv_neigh_node *neigh_node = NULL;
 	struct batadv_neigh_node *tmp_neigh_node = NULL;
 	struct batadv_neigh_node *router = NULL;
-	struct batadv_orig_node *orig_node_tmp;
-	unsigned int if_num;
 	u8 sum_orig, sum_neigh;
 	u8 *neigh_addr;
 	u8 tq_avg;
@@ -1132,18 +990,10 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
 	 */
 	if (router_ifinfo &&
 	    neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg) {
-		orig_node_tmp = router->orig_node;
-		spin_lock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock);
-		if_num = router->if_incoming->if_num;
-		sum_orig = orig_node_tmp->bat_iv.bcast_own_sum[if_num];
-		spin_unlock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock);
-
-		orig_node_tmp = neigh_node->orig_node;
-		spin_lock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock);
-		if_num = neigh_node->if_incoming->if_num;
-		sum_neigh = orig_node_tmp->bat_iv.bcast_own_sum[if_num];
-		spin_unlock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock);
-
+		sum_orig = batadv_iv_orig_ifinfo_sum(router->orig_node,
+						     router->if_incoming);
+		sum_neigh = batadv_iv_orig_ifinfo_sum(neigh_node->orig_node,
+						      neigh_node->if_incoming);
 		if (sum_orig >= sum_neigh)
 			goto out;
 	}
@@ -1186,7 +1036,6 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
 	u8 total_count;
 	u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own;
 	unsigned int neigh_rq_inv_cube, neigh_rq_max_cube;
-	unsigned int if_num;
 	unsigned int tq_asym_penalty, inv_asym_penalty;
 	unsigned int combined_tq;
 	unsigned int tq_iface_penalty;
@@ -1227,9 +1076,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
 	orig_node->last_seen = jiffies;
 
 	/* find packet count of corresponding one hop neighbor */
-	spin_lock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock);
-	if_num = if_incoming->if_num;
-	orig_eq_count = orig_neigh_node->bat_iv.bcast_own_sum[if_num];
+	orig_eq_count = batadv_iv_orig_ifinfo_sum(orig_neigh_node, if_incoming);
 	neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing);
 	if (neigh_ifinfo) {
 		neigh_rq_count = neigh_ifinfo->bat_iv.real_packet_count;
@@ -1237,7 +1084,6 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
 	} else {
 		neigh_rq_count = 0;
 	}
-	spin_unlock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock);
 
 	/* pay attention to not get a value bigger than 100 % */
 	if (orig_eq_count > neigh_rq_count)
@@ -1622,6 +1468,49 @@ out:
 }
 
 /**
+ * batadv_iv_ogm_process_reply() - Check OGM for direct reply and process it
+ * @ogm_packet: rebroadcast OGM packet to process
+ * @if_incoming: the interface where this packet was received
+ * @orig_node: originator which reproadcasted the OGMs
+ * @if_incoming_seqno: OGM sequence number when rebroadcast was received
+ */
+static void batadv_iv_ogm_process_reply(struct batadv_ogm_packet *ogm_packet,
+					struct batadv_hard_iface *if_incoming,
+					struct batadv_orig_node *orig_node,
+					u32 if_incoming_seqno)
+{
+	struct batadv_orig_ifinfo *orig_ifinfo;
+	s32 bit_pos;
+	u8 *weight;
+
+	/* neighbor has to indicate direct link and it has to
+	 * come via the corresponding interface
+	 */
+	if (!(ogm_packet->flags & BATADV_DIRECTLINK))
+		return;
+
+	if (!batadv_compare_eth(if_incoming->net_dev->dev_addr,
+				ogm_packet->orig))
+		return;
+
+	orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_incoming);
+	if (!orig_ifinfo)
+		return;
+
+	/* save packet seqno for bidirectional check */
+	spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+	bit_pos = if_incoming_seqno - 2;
+	bit_pos -= ntohl(ogm_packet->seqno);
+	batadv_set_bit(orig_ifinfo->bat_iv.bcast_own, bit_pos);
+	weight = &orig_ifinfo->bat_iv.bcast_own_sum;
+	*weight = bitmap_weight(orig_ifinfo->bat_iv.bcast_own,
+				BATADV_TQ_LOCAL_WINDOW_SIZE);
+	spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+
+	batadv_orig_ifinfo_put(orig_ifinfo);
+}
+
+/**
  * batadv_iv_ogm_process() - process an incoming batman iv OGM
  * @skb: the skb containing the OGM
  * @ogm_offset: offset to the OGM which should be processed (for aggregates)
@@ -1705,37 +1594,13 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset,
 	}
 
 	if (is_my_orig) {
-		unsigned long *word;
-		size_t offset;
-		s32 bit_pos;
-		unsigned int if_num;
-		u8 *weight;
-
 		orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv,
 							 ethhdr->h_source);
 		if (!orig_neigh_node)
 			return;
 
-		/* neighbor has to indicate direct link and it has to
-		 * come via the corresponding interface
-		 * save packet seqno for bidirectional check
-		 */
-		if (has_directlink_flag &&
-		    batadv_compare_eth(if_incoming->net_dev->dev_addr,
-				       ogm_packet->orig)) {
-			if_num = if_incoming->if_num;
-			offset = if_num * BATADV_NUM_WORDS;
-
-			spin_lock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock);
-			word = &orig_neigh_node->bat_iv.bcast_own[offset];
-			bit_pos = if_incoming_seqno - 2;
-			bit_pos -= ntohl(ogm_packet->seqno);
-			batadv_set_bit(word, bit_pos);
-			weight = &orig_neigh_node->bat_iv.bcast_own_sum[if_num];
-			*weight = bitmap_weight(word,
-						BATADV_TQ_LOCAL_WINDOW_SIZE);
-			spin_unlock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock);
-		}
+		batadv_iv_ogm_process_reply(ogm_packet, if_incoming,
+					    orig_neigh_node, if_incoming_seqno);
 
 		batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
 			   "Drop packet: originator packet from myself (via neighbor)\n");
@@ -2844,9 +2709,6 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = {
 		.print = batadv_iv_ogm_orig_print,
 #endif
 		.dump = batadv_iv_ogm_orig_dump,
-		.free = batadv_iv_ogm_orig_free,
-		.add_if = batadv_iv_ogm_orig_add_if,
-		.del_if = batadv_iv_ogm_orig_del_if,
 	},
 	.gw = {
 		.init_sel_class = batadv_iv_init_sel_class,
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 3cb82378300b..8b608a2e2653 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -47,8 +47,24 @@
 
 static struct dentry *batadv_debugfs;
 
+/**
+ * batadv_debugfs_deprecated() - Log use of deprecated batadv debugfs access
+ * @file: file which was accessed
+ * @alt: explanation what can be used as alternative
+ */
+void batadv_debugfs_deprecated(struct file *file, const char *alt)
+{
+	struct dentry *dentry = file_dentry(file);
+	const char *name = dentry->d_name.name;
+
+	pr_warn_ratelimited(DEPRECATED "%s (pid %d) Use of debugfs file \"%s\".\n%s",
+			    current->comm, task_pid_nr(current), name, alt);
+}
+
 static int batadv_algorithms_open(struct inode *inode, struct file *file)
 {
+	batadv_debugfs_deprecated(file,
+				  "Use genl command BATADV_CMD_GET_ROUTING_ALGOS instead\n");
 	return single_open(file, batadv_algo_seq_print_text, NULL);
 }
 
@@ -56,6 +72,8 @@ static int neighbors_open(struct inode *inode, struct file *file)
 {
 	struct net_device *net_dev = (struct net_device *)inode->i_private;
 
+	batadv_debugfs_deprecated(file,
+				  "Use genl command BATADV_CMD_GET_NEIGHBORS instead\n");
 	return single_open(file, batadv_hardif_neigh_seq_print_text, net_dev);
 }
 
@@ -63,6 +81,8 @@ static int batadv_originators_open(struct inode *inode, struct file *file)
 {
 	struct net_device *net_dev = (struct net_device *)inode->i_private;
 
+	batadv_debugfs_deprecated(file,
+				  "Use genl command BATADV_CMD_GET_ORIGINATORS instead\n");
 	return single_open(file, batadv_orig_seq_print_text, net_dev);
 }
 
@@ -79,6 +99,8 @@ static int batadv_originators_hardif_open(struct inode *inode,
 {
 	struct net_device *net_dev = (struct net_device *)inode->i_private;
 
+	batadv_debugfs_deprecated(file,
+				  "Use genl command BATADV_CMD_GET_HARDIFS instead\n");
 	return single_open(file, batadv_orig_hardif_seq_print_text, net_dev);
 }
 
@@ -86,6 +108,8 @@ static int batadv_gateways_open(struct inode *inode, struct file *file)
 {
 	struct net_device *net_dev = (struct net_device *)inode->i_private;
 
+	batadv_debugfs_deprecated(file,
+				  "Use genl command BATADV_CMD_GET_GATEWAYS instead\n");
 	return single_open(file, batadv_gw_client_seq_print_text, net_dev);
 }
 
@@ -93,6 +117,8 @@ static int batadv_transtable_global_open(struct inode *inode, struct file *file)
 {
 	struct net_device *net_dev = (struct net_device *)inode->i_private;
 
+	batadv_debugfs_deprecated(file,
+				  "Use genl command BATADV_CMD_GET_TRANSTABLE_GLOBAL instead\n");
 	return single_open(file, batadv_tt_global_seq_print_text, net_dev);
 }
 
@@ -101,6 +127,8 @@ static int batadv_bla_claim_table_open(struct inode *inode, struct file *file)
 {
 	struct net_device *net_dev = (struct net_device *)inode->i_private;
 
+	batadv_debugfs_deprecated(file,
+				  "Use genl command BATADV_CMD_GET_BLA_CLAIM instead\n");
 	return single_open(file, batadv_bla_claim_table_seq_print_text,
 			   net_dev);
 }
@@ -110,6 +138,8 @@ static int batadv_bla_backbone_table_open(struct inode *inode,
 {
 	struct net_device *net_dev = (struct net_device *)inode->i_private;
 
+	batadv_debugfs_deprecated(file,
+				  "Use genl command BATADV_CMD_GET_BLA_BACKBONE instead\n");
 	return single_open(file, batadv_bla_backbone_table_seq_print_text,
 			   net_dev);
 }
@@ -128,6 +158,8 @@ static int batadv_dat_cache_open(struct inode *inode, struct file *file)
 {
 	struct net_device *net_dev = (struct net_device *)inode->i_private;
 
+	batadv_debugfs_deprecated(file,
+				  "Use genl command BATADV_CMD_GET_DAT_CACHE instead\n");
 	return single_open(file, batadv_dat_cache_seq_print_text, net_dev);
 }
 #endif
@@ -136,6 +168,8 @@ static int batadv_transtable_local_open(struct inode *inode, struct file *file)
 {
 	struct net_device *net_dev = (struct net_device *)inode->i_private;
 
+	batadv_debugfs_deprecated(file,
+				  "Use genl command BATADV_CMD_GET_TRANSTABLE_LOCAL instead\n");
 	return single_open(file, batadv_tt_local_seq_print_text, net_dev);
 }
 
@@ -149,6 +183,7 @@ static int batadv_nc_nodes_open(struct inode *inode, struct file *file)
 {
 	struct net_device *net_dev = (struct net_device *)inode->i_private;
 
+	batadv_debugfs_deprecated(file, "");
 	return single_open(file, batadv_nc_nodes_seq_print_text, net_dev);
 }
 #endif
@@ -165,6 +200,8 @@ static int batadv_mcast_flags_open(struct inode *inode, struct file *file)
 {
 	struct net_device *net_dev = (struct net_device *)inode->i_private;
 
+	batadv_debugfs_deprecated(file,
+				  "Use genl command BATADV_CMD_GET_MCAST_FLAGS instead\n");
 	return single_open(file, batadv_mcast_flags_seq_print_text, net_dev);
 }
 #endif
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index 08a592ffbee5..8de018e5c577 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -21,12 +21,14 @@
 
 #include "main.h"
 
+struct file;
 struct net_device;
 
 #define BATADV_DEBUGFS_SUBDIR "batman_adv"
 
 #if IS_ENABLED(CONFIG_BATMAN_ADV_DEBUGFS)
 
+void batadv_debugfs_deprecated(struct file *file, const char *alt);
 void batadv_debugfs_init(void);
 void batadv_debugfs_destroy(void);
 int batadv_debugfs_add_meshif(struct net_device *dev);
@@ -38,6 +40,10 @@ void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface);
 
 #else
 
+static inline void batadv_debugfs_deprecated(struct file *file, const char *alt)
+{
+}
+
 static inline void batadv_debugfs_init(void)
 {
 }
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 2f0d42f2f913..781c5b6e6e8e 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -763,11 +763,6 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
 	hard_iface->soft_iface = soft_iface;
 	bat_priv = netdev_priv(hard_iface->soft_iface);
 
-	if (bat_priv->num_ifaces >= UINT_MAX) {
-		ret = -ENOSPC;
-		goto err_dev;
-	}
-
 	ret = netdev_master_upper_dev_link(hard_iface->net_dev,
 					   soft_iface, NULL, NULL, NULL);
 	if (ret)
@@ -777,16 +772,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
 	if (ret < 0)
 		goto err_upper;
 
-	hard_iface->if_num = bat_priv->num_ifaces;
-	bat_priv->num_ifaces++;
 	hard_iface->if_status = BATADV_IF_INACTIVE;
-	ret = batadv_orig_hash_add_if(hard_iface, bat_priv->num_ifaces);
-	if (ret < 0) {
-		bat_priv->algo_ops->iface.disable(hard_iface);
-		bat_priv->num_ifaces--;
-		hard_iface->if_status = BATADV_IF_NOT_IN_USE;
-		goto err_upper;
-	}
 
 	kref_get(&hard_iface->refcount);
 	hard_iface->batman_adv_ptype.type = ethertype;
@@ -834,6 +820,33 @@ err:
 }
 
 /**
+ * batadv_hardif_cnt() - get number of interfaces enslaved to soft interface
+ * @soft_iface: soft interface to check
+ *
+ * This function is only using RCU for locking - the result can therefore be
+ * off when another functions is modifying the list at the same time. The
+ * caller can use the rtnl_lock to make sure that the count is accurate.
+ *
+ * Return: number of connected/enslaved hard interfaces
+ */
+static size_t batadv_hardif_cnt(const struct net_device *soft_iface)
+{
+	struct batadv_hard_iface *hard_iface;
+	size_t count = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+		if (hard_iface->soft_iface != soft_iface)
+			continue;
+
+		count++;
+	}
+	rcu_read_unlock();
+
+	return count;
+}
+
+/**
  * batadv_hardif_disable_interface() - Remove hard interface from soft interface
  * @hard_iface: hard interface to be removed
 * @autodel: whether to delete soft interface when it doesn't contain any other
@@ -855,9 +868,6 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
 	dev_remove_pack(&hard_iface->batman_adv_ptype);
 	batadv_hardif_put(hard_iface);
 
-	bat_priv->num_ifaces--;
-	batadv_orig_hash_del_if(hard_iface, bat_priv->num_ifaces);
-
 	primary_if = batadv_primary_if_get_selected(bat_priv);
 	if (hard_iface == primary_if) {
 		struct batadv_hard_iface *new_if;
@@ -881,7 +891,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
 	batadv_hardif_recalc_extra_skbroom(hard_iface->soft_iface);
 
 	/* nobody uses this interface anymore */
-	if (bat_priv->num_ifaces == 0) {
+	if (batadv_hardif_cnt(hard_iface->soft_iface) <= 1) {
 		batadv_gw_check_client_stop(bat_priv);
 
 		if (autodel == BATADV_IF_CLEANUP_AUTO)
@@ -917,7 +927,6 @@ batadv_hardif_add_interface(struct net_device *net_dev)
 	if (ret)
 		goto free_if;
 
-	hard_iface->if_num = 0;
 	hard_iface->net_dev = net_dev;
 	hard_iface->soft_iface = NULL;
 	hard_iface->if_status = BATADV_IF_NOT_IN_USE;
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index 55c358ad3331..d70f363c52ae 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -47,6 +47,7 @@
 #include <linux/wait.h>
 #include <uapi/linux/batadv_packet.h>
 
+#include "debugfs.h"
 #include "hard-interface.h"
 #include "log.h"
 #include "originator.h"
@@ -74,6 +75,8 @@ static int batadv_socket_open(struct inode *inode, struct file *file)
 	if (!try_module_get(THIS_MODULE))
 		return -EBUSY;
 
+	batadv_debugfs_deprecated(file, "");
+
 	nonseekable_open(inode, file);
 
 	socket_client = kmalloc(sizeof(*socket_client), GFP_KERNEL);
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index 853773e45f79..6beb5f067810 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -40,6 +40,9 @@
 #include <linux/wait.h>
 #include <stdarg.h>
 
+#include "debugfs.h"
+#include "trace.h"
+
 #define BATADV_LOG_BUFF_MASK (batadv_log_buff_len - 1)
 
 static const int batadv_log_buff_len = BATADV_LOG_BUF_LEN;
@@ -98,13 +101,19 @@ static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log,
  */
 int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
 {
+	struct va_format vaf;
 	va_list args;
-	char tmp_log_buf[256];
 
 	va_start(args, fmt);
-	vscnprintf(tmp_log_buf, sizeof(tmp_log_buf), fmt, args);
-	batadv_fdebug_log(bat_priv->debug_log, "[%10u] %s",
-			  jiffies_to_msecs(jiffies), tmp_log_buf);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	batadv_fdebug_log(bat_priv->debug_log, "[%10u] %pV",
+			  jiffies_to_msecs(jiffies), &vaf);
+
+	trace_batadv_dbg(bat_priv, &vaf);
+
 	va_end(args);
 
 	return 0;
@@ -115,6 +124,9 @@ static int batadv_log_open(struct inode *inode, struct file *file)
 	if (!try_module_get(THIS_MODULE))
 		return -EBUSY;
 
+	batadv_debugfs_deprecated(file,
+				  "Use tracepoint batadv:batadv_dbg instead\n");
+
 	nonseekable_open(inode, file);
 	file->private_data = inode->i_private;
 	return 0;
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 3ccc75ee719c..2002b70e18db 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -25,7 +25,7 @@
 #define BATADV_DRIVER_DEVICE "batman-adv"
 
 #ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2018.3"
+#define BATADV_SOURCE_VERSION "2018.4"
 #endif
 
 /* B.A.T.M.A.N. parameters */
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 1d295da3e342..56a981af5c92 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -904,9 +904,6 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu)
 
 	batadv_frag_purge_orig(orig_node, NULL);
 
-	if (orig_node->bat_priv->algo_ops->orig.free)
-		orig_node->bat_priv->algo_ops->orig.free(orig_node);
-
 	kfree(orig_node->tt_buff);
 	kfree(orig_node);
 }
@@ -1555,107 +1552,3 @@ int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb)
1555 1552
1556 return ret; 1553 return ret;
1557} 1554}
1558
1559/**
1560 * batadv_orig_hash_add_if() - Add interface to originators in orig_hash
1561 * @hard_iface: hard interface to add (already slave of the soft interface)
1562 * @max_if_num: new number of interfaces
1563 *
1564 * Return: 0 on success or negative error number in case of failure
1565 */
1566int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface,
1567 unsigned int max_if_num)
1568{
1569 struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
1570 struct batadv_algo_ops *bao = bat_priv->algo_ops;
1571 struct batadv_hashtable *hash = bat_priv->orig_hash;
1572 struct hlist_head *head;
1573 struct batadv_orig_node *orig_node;
1574 u32 i;
1575 int ret;
1576
1577 /* resize all orig nodes because orig_node->bcast_own(_sum) depend on
1578 * if_num
1579 */
1580 for (i = 0; i < hash->size; i++) {
1581 head = &hash->table[i];
1582
1583 rcu_read_lock();
1584 hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
1585 ret = 0;
1586 if (bao->orig.add_if)
1587 ret = bao->orig.add_if(orig_node, max_if_num);
1588 if (ret == -ENOMEM)
1589 goto err;
1590 }
1591 rcu_read_unlock();
1592 }
1593
1594 return 0;
1595
1596err:
1597 rcu_read_unlock();
1598 return -ENOMEM;
1599}
1600
1601/**
1602 * batadv_orig_hash_del_if() - Remove interface from originators in orig_hash
1603 * @hard_iface: hard interface to remove (still slave of the soft interface)
1604 * @max_if_num: new number of interfaces
1605 *
1606 * Return: 0 on success or negative error number in case of failure
1607 */
1608int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface,
1609 unsigned int max_if_num)
1610{
1611 struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
1612 struct batadv_hashtable *hash = bat_priv->orig_hash;
1613 struct hlist_head *head;
1614 struct batadv_hard_iface *hard_iface_tmp;
1615 struct batadv_orig_node *orig_node;
1616 struct batadv_algo_ops *bao = bat_priv->algo_ops;
1617 u32 i;
1618 int ret;
1619
1620 /* resize all orig nodes because orig_node->bcast_own(_sum) depend on
1621 * if_num
1622 */
1623 for (i = 0; i < hash->size; i++) {
1624 head = &hash->table[i];
1625
1626 rcu_read_lock();
1627 hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
1628 ret = 0;
1629 if (bao->orig.del_if)
1630 ret = bao->orig.del_if(orig_node, max_if_num,
1631 hard_iface->if_num);
1632 if (ret == -ENOMEM)
1633 goto err;
1634 }
1635 rcu_read_unlock();
1636 }
1637
1638 /* renumber remaining batman interfaces _inside_ of orig_hash_lock */
1639 rcu_read_lock();
1640 list_for_each_entry_rcu(hard_iface_tmp, &batadv_hardif_list, list) {
1641 if (hard_iface_tmp->if_status == BATADV_IF_NOT_IN_USE)
1642 continue;
1643
1644 if (hard_iface == hard_iface_tmp)
1645 continue;
1646
1647 if (hard_iface->soft_iface != hard_iface_tmp->soft_iface)
1648 continue;
1649
1650 if (hard_iface_tmp->if_num > hard_iface->if_num)
1651 hard_iface_tmp->if_num--;
1652 }
1653 rcu_read_unlock();
1654
1655 hard_iface->if_num = -1;
1656 return 0;
1657
1658err:
1659 rcu_read_unlock();
1660 return -ENOMEM;
1661}
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 3b3f59b881e1..a8b4c7b667ec 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -72,10 +72,6 @@ void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo);
 int batadv_orig_seq_print_text(struct seq_file *seq, void *offset);
 int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb);
 int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset);
-int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface,
-			    unsigned int max_if_num);
-int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface,
-			    unsigned int max_if_num);
 struct batadv_orig_node_vlan *
 batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node,
 			  unsigned short vid);
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 626ddca332db..5db5a0a4c959 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -844,7 +844,6 @@ static int batadv_softif_init_late(struct net_device *dev)
844 atomic_set(&bat_priv->frag_seqno, random_seqno); 844 atomic_set(&bat_priv->frag_seqno, random_seqno);
845 845
846 bat_priv->primary_if = NULL; 846 bat_priv->primary_if = NULL;
847 bat_priv->num_ifaces = 0;
848 847
849 batadv_nc_init_bat_priv(bat_priv); 848 batadv_nc_init_bat_priv(bat_priv);
850 849
@@ -1062,6 +1061,7 @@ static void batadv_softif_init_early(struct net_device *dev)
1062 dev->needs_free_netdev = true; 1061 dev->needs_free_netdev = true;
1063 dev->priv_destructor = batadv_softif_free; 1062 dev->priv_destructor = batadv_softif_free;
1064 dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_NETNS_LOCAL; 1063 dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_NETNS_LOCAL;
1064 dev->features |= NETIF_F_LLTX;
1065 dev->priv_flags |= IFF_NO_QUEUE; 1065 dev->priv_flags |= IFF_NO_QUEUE;
1066 1066
1067 /* can't call min_mtu, because the needed variables 1067 /* can't call min_mtu, because the needed variables
diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c
new file mode 100644
index 000000000000..3d57f9981f25
--- /dev/null
+++ b/net/batman-adv/trace.c
@@ -0,0 +1,22 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors:
3 *
4 * Sven Eckelmann
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of version 2 of the GNU General Public
8 * License as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, see <http://www.gnu.org/licenses/>.
17 */
18
19#include <linux/module.h>
20
21#define CREATE_TRACE_POINTS
22#include "trace.h"
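The new trace.c exists only to expand the tracepoint bodies: exactly one translation unit defines CREATE_TRACE_POINTS before including the trace header, which lets <trace/define_trace.h> emit the definitions; every other user includes the same header without the macro and only gets the declarations. A sketch of such a caller, with a hypothetical function name:

#include "trace.h"	/* no CREATE_TRACE_POINTS in ordinary users */

static void example_emit(struct batadv_priv *bat_priv, struct va_format *vaf)
{
	/* generated by the TRACE_EVENT(batadv_dbg, ...) in trace.h */
	trace_batadv_dbg(bat_priv, vaf);
}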
diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h
new file mode 100644
index 000000000000..3acda26a30ca
--- /dev/null
+++ b/net/batman-adv/trace.h
@@ -0,0 +1,78 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors:
3 *
4 * Sven Eckelmann
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of version 2 of the GNU General Public
8 * License as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, see <http://www.gnu.org/licenses/>.
17 */
18
19#if !defined(_NET_BATMAN_ADV_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ)
20#define _NET_BATMAN_ADV_TRACE_H_
21
22#include "main.h"
23
24#include <linux/tracepoint.h>
25
26#undef TRACE_SYSTEM
27#define TRACE_SYSTEM batadv
28
29/* provide dummy function when tracing is disabled */
30#if !defined(CONFIG_BATMAN_ADV_TRACING)
31
32#undef TRACE_EVENT
33#define TRACE_EVENT(name, proto, ...) \
34 static inline void trace_ ## name(proto) {}
35
36#endif /* CONFIG_BATMAN_ADV_TRACING */
37
38#define BATADV_MAX_MSG_LEN 256
39
40TRACE_EVENT(batadv_dbg,
41
42 TP_PROTO(struct batadv_priv *bat_priv,
43 struct va_format *vaf),
44
45 TP_ARGS(bat_priv, vaf),
46
47 TP_STRUCT__entry(
48 __string(device, bat_priv->soft_iface->name)
49 __string(driver, KBUILD_MODNAME)
50 __dynamic_array(char, msg, BATADV_MAX_MSG_LEN)
51 ),
52
53 TP_fast_assign(
54 __assign_str(device, bat_priv->soft_iface->name);
55 __assign_str(driver, KBUILD_MODNAME);
56 WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg),
57 BATADV_MAX_MSG_LEN,
58 vaf->fmt,
59 *vaf->va) >= BATADV_MAX_MSG_LEN);
60 ),
61
62 TP_printk(
63 "%s %s %s",
64 __get_str(driver),
65 __get_str(device),
66 __get_str(msg)
67 )
68);
69
70#endif /* _NET_BATMAN_ADV_TRACE_H_ || TRACE_HEADER_MULTI_READ */
71
72#undef TRACE_INCLUDE_PATH
73#define TRACE_INCLUDE_PATH .
74#undef TRACE_INCLUDE_FILE
75#define TRACE_INCLUDE_FILE trace
76
77/* This part must be outside protection */
78#include <trace/define_trace.h>
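trace_batadv_dbg() takes a struct va_format rather than raw varargs, so a caller wraps its format string and va_list before firing the event. A minimal sketch, assuming a hypothetical printf-style wrapper:

static void example_dbg(struct batadv_priv *bat_priv, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);

	vaf.fmt = fmt;
	vaf.va = &args;

	trace_batadv_dbg(bat_priv, &vaf);

	va_end(args);
}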
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 343d304851a5..45b5592de816 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -167,9 +167,6 @@ struct batadv_hard_iface {
167 /** @list: list node for batadv_hardif_list */ 167 /** @list: list node for batadv_hardif_list */
168 struct list_head list; 168 struct list_head list;
169 169
170 /** @if_num: identificator of the interface */
171 unsigned int if_num;
172
173 /** @if_status: status of the interface for batman-adv */ 170 /** @if_status: status of the interface for batman-adv */
174 char if_status; 171 char if_status;
175 172
@@ -233,6 +230,20 @@ struct batadv_hard_iface {
233}; 230};
234 231
235/** 232/**
 233 * struct batadv_orig_ifinfo_bat_iv - B.A.T.M.A.N. IV private orig_ifinfo members
234 */
235struct batadv_orig_ifinfo_bat_iv {
236 /**
237 * @bcast_own: bitfield which counts the number of our OGMs this
238 * orig_node rebroadcasted "back" to us (relative to last_real_seqno)
239 */
240 DECLARE_BITMAP(bcast_own, BATADV_TQ_LOCAL_WINDOW_SIZE);
241
242 /** @bcast_own_sum: sum of bcast_own */
243 u8 bcast_own_sum;
244};
245
246/**
236 * struct batadv_orig_ifinfo - originator info per outgoing interface 247 * struct batadv_orig_ifinfo - originator info per outgoing interface
237 */ 248 */
238struct batadv_orig_ifinfo { 249struct batadv_orig_ifinfo {
@@ -257,6 +268,9 @@ struct batadv_orig_ifinfo {
257 /** @batman_seqno_reset: time when the batman seqno window was reset */ 268 /** @batman_seqno_reset: time when the batman seqno window was reset */
258 unsigned long batman_seqno_reset; 269 unsigned long batman_seqno_reset;
259 270
271 /** @bat_iv: B.A.T.M.A.N. IV private structure */
272 struct batadv_orig_ifinfo_bat_iv bat_iv;
273
260 /** @refcount: number of contexts the object is used */ 274 /** @refcount: number of contexts the object is used */
261 struct kref refcount; 275 struct kref refcount;
262 276
@@ -339,19 +353,10 @@ struct batadv_orig_node_vlan {
339 */ 353 */
340struct batadv_orig_bat_iv { 354struct batadv_orig_bat_iv {
341 /** 355 /**
342 * @bcast_own: set of bitfields (one per hard-interface) where each one 356 * @ogm_cnt_lock: lock protecting &batadv_orig_ifinfo_bat_iv.bcast_own,
343 * counts the number of our OGMs this orig_node rebroadcasted "back" to 357 * &batadv_orig_ifinfo_bat_iv.bcast_own_sum,
 344 * us (relative to last_real_seqno). Every bitfield is 358 * &batadv_neigh_ifinfo_bat_iv.real_bits and
345 * BATADV_TQ_LOCAL_WINDOW_SIZE bits long. 359 * &batadv_neigh_ifinfo_bat_iv.real_packet_count
346 */
347 unsigned long *bcast_own;
348
349 /** @bcast_own_sum: sum of bcast_own */
350 u8 *bcast_own_sum;
351
352 /**
353 * @ogm_cnt_lock: lock protecting bcast_own, bcast_own_sum,
354 * neigh_node->bat_iv.real_bits & neigh_node->bat_iv.real_packet_count
355 */ 360 */
356 spinlock_t ogm_cnt_lock; 361 spinlock_t ogm_cnt_lock;
357}; 362};
@@ -1597,9 +1602,6 @@ struct batadv_priv {
1597 /** @batman_queue_left: number of remaining OGM packet slots */ 1602 /** @batman_queue_left: number of remaining OGM packet slots */
1598 atomic_t batman_queue_left; 1603 atomic_t batman_queue_left;
1599 1604
1600 /** @num_ifaces: number of interfaces assigned to this mesh interface */
1601 unsigned int num_ifaces;
1602
1603 /** @mesh_obj: kobject for sysfs mesh subdirectory */ 1605 /** @mesh_obj: kobject for sysfs mesh subdirectory */
1604 struct kobject *mesh_obj; 1606 struct kobject *mesh_obj;
1605 1607
@@ -2179,28 +2181,6 @@ struct batadv_algo_neigh_ops {
2179 * struct batadv_algo_orig_ops - mesh algorithm callbacks (originator specific) 2181 * struct batadv_algo_orig_ops - mesh algorithm callbacks (originator specific)
2180 */ 2182 */
2181struct batadv_algo_orig_ops { 2183struct batadv_algo_orig_ops {
2182 /**
2183 * @free: free the resources allocated by the routing algorithm for an
2184 * orig_node object (optional)
2185 */
2186 void (*free)(struct batadv_orig_node *orig_node);
2187
2188 /**
2189 * @add_if: ask the routing algorithm to apply the needed changes to the
2190 * orig_node due to a new hard-interface being added into the mesh
2191 * (optional)
2192 */
2193 int (*add_if)(struct batadv_orig_node *orig_node,
2194 unsigned int max_if_num);
2195
2196 /**
2197 * @del_if: ask the routing algorithm to apply the needed changes to the
 2198 * orig_node due to a hard-interface being removed from the mesh
2199 * (optional)
2200 */
2201 int (*del_if)(struct batadv_orig_node *orig_node,
2202 unsigned int max_if_num, unsigned int del_if_num);
2203
2204#ifdef CONFIG_BATMAN_ADV_DEBUGFS 2184#ifdef CONFIG_BATMAN_ADV_DEBUGFS
2205 /** @print: print the originator table (optional) */ 2185 /** @print: print the originator table (optional) */
2206 void (*print)(struct batadv_priv *priv, struct seq_file *seq, 2186 void (*print)(struct batadv_priv *priv, struct seq_file *seq,
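The net effect of these types.h changes is that the per-originator rebroadcast counters no longer live in dynamically sized per-interface arrays (the old bcast_own / bcast_own_sum pointers) but in a fixed BATADV_TQ_LOCAL_WINDOW_SIZE bitmap embedded in each orig_ifinfo, which is why the add_if/del_if resize hooks and the if_num / num_ifaces bookkeeping can be dropped. A rough sketch of updating the embedded bitmap while ogm_cnt_lock is held; the helper name is made up and a plain slot index stands in for the real sliding window:

static void example_update_bcast_own(struct batadv_orig_ifinfo *orig_ifinfo,
				     unsigned int slot, bool rebroadcast_seen)
{
	struct batadv_orig_ifinfo_bat_iv *iv = &orig_ifinfo->bat_iv;

	if (rebroadcast_seen)
		set_bit(slot, iv->bcast_own);
	else
		clear_bit(slot, iv->bcast_own);

	/* keep the cached sum in sync with the bitmap */
	iv->bcast_own_sum = bitmap_weight(iv->bcast_own,
					  BATADV_TQ_LOCAL_WINDOW_SIZE);
}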
diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c
index 7b3965861013..43c284158f63 100644
--- a/net/bluetooth/bnep/core.c
+++ b/net/bluetooth/bnep/core.c
@@ -489,9 +489,6 @@ static int bnep_session(void *arg)
489 489
490 add_wait_queue(sk_sleep(sk), &wait); 490 add_wait_queue(sk_sleep(sk), &wait);
491 while (1) { 491 while (1) {
492 /* Ensure session->terminate is updated */
493 smp_mb__before_atomic();
494
495 if (atomic_read(&s->terminate)) 492 if (atomic_read(&s->terminate))
496 break; 493 break;
497 /* RX */ 494 /* RX */
@@ -512,6 +509,10 @@ static int bnep_session(void *arg)
512 break; 509 break;
513 netif_wake_queue(dev); 510 netif_wake_queue(dev);
514 511
512 /*
513 * wait_woken() performs the necessary memory barriers
514 * for us; see the header comment for this primitive.
515 */
515 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 516 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
516 } 517 }
517 remove_wait_queue(sk_sleep(sk), &wait); 518 remove_wait_queue(sk_sleep(sk), &wait);
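The smp_mb__before_atomic() calls dropped from the BNEP session thread (and from CMTP and HIDP below) are redundant because wait_woken() and its wake-side counterpart already order the write of the terminate flag against the re-check in the loop. The loop shape the threads now rely on, sketched with placeholder names:

static void example_session_loop(struct sock *sk, atomic_t *terminate)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(sk_sleep(sk), &wait);
	while (!atomic_read(terminate)) {
		/* ... RX / TX work elided ... */

		/* wait_woken() pairs with the wake_up_interruptible() issued
		 * after the terminate flag is set and supplies the barriers
		 * the explicit smp_mb__*_atomic() calls used to provide.
		 */
		wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(sk_sleep(sk), &wait);
}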
diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c
index 00deacdcb51c..cfd83c5521ae 100644
--- a/net/bluetooth/bnep/sock.c
+++ b/net/bluetooth/bnep/sock.c
@@ -49,18 +49,17 @@ static int bnep_sock_release(struct socket *sock)
49 return 0; 49 return 0;
50} 50}
51 51
52static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 52static int do_bnep_sock_ioctl(struct socket *sock, unsigned int cmd, void __user *argp)
53{ 53{
54 struct bnep_connlist_req cl; 54 struct bnep_connlist_req cl;
55 struct bnep_connadd_req ca; 55 struct bnep_connadd_req ca;
56 struct bnep_conndel_req cd; 56 struct bnep_conndel_req cd;
57 struct bnep_conninfo ci; 57 struct bnep_conninfo ci;
58 struct socket *nsock; 58 struct socket *nsock;
59 void __user *argp = (void __user *)arg;
60 __u32 supp_feat = BIT(BNEP_SETUP_RESPONSE); 59 __u32 supp_feat = BIT(BNEP_SETUP_RESPONSE);
61 int err; 60 int err;
62 61
63 BT_DBG("cmd %x arg %lx", cmd, arg); 62 BT_DBG("cmd %x arg %p", cmd, argp);
64 63
65 switch (cmd) { 64 switch (cmd) {
66 case BNEPCONNADD: 65 case BNEPCONNADD:
@@ -134,16 +133,22 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
134 return 0; 133 return 0;
135} 134}
136 135
136static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
137{
138 return do_bnep_sock_ioctl(sock, cmd, (void __user *)arg);
139}
140
137#ifdef CONFIG_COMPAT 141#ifdef CONFIG_COMPAT
138static int bnep_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 142static int bnep_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
139{ 143{
144 void __user *argp = compat_ptr(arg);
140 if (cmd == BNEPGETCONNLIST) { 145 if (cmd == BNEPGETCONNLIST) {
141 struct bnep_connlist_req cl; 146 struct bnep_connlist_req cl;
147 unsigned __user *p = argp;
142 u32 uci; 148 u32 uci;
143 int err; 149 int err;
144 150
145 if (get_user(cl.cnum, (u32 __user *) arg) || 151 if (get_user(cl.cnum, p) || get_user(uci, p + 1))
146 get_user(uci, (u32 __user *) (arg + 4)))
147 return -EFAULT; 152 return -EFAULT;
148 153
149 cl.ci = compat_ptr(uci); 154 cl.ci = compat_ptr(uci);
@@ -153,13 +158,13 @@ static int bnep_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigne
153 158
154 err = bnep_get_connlist(&cl); 159 err = bnep_get_connlist(&cl);
155 160
156 if (!err && put_user(cl.cnum, (u32 __user *) arg)) 161 if (!err && put_user(cl.cnum, p))
157 err = -EFAULT; 162 err = -EFAULT;
158 163
159 return err; 164 return err;
160 } 165 }
161 166
162 return bnep_sock_ioctl(sock, cmd, arg); 167 return do_bnep_sock_ioctl(sock, cmd, argp);
163} 168}
164#endif 169#endif
165 170
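This refactor funnels the native and compat ioctl entry points into one helper that only ever sees a void __user pointer: the native path casts the long argument directly, while the compat path translates it with compat_ptr(). The same split is applied to the CMTP socket further down. A skeleton of the pattern with placeholder names:

static int do_example_ioctl(struct socket *sock, unsigned int cmd,
			    void __user *argp)
{
	/* copy_from_user()/copy_to_user() only ever against argp */
	return 0;
}

static int example_ioctl(struct socket *sock, unsigned int cmd,
			 unsigned long arg)
{
	return do_example_ioctl(sock, cmd, (void __user *)arg);
}

#ifdef CONFIG_COMPAT
static int example_compat_ioctl(struct socket *sock, unsigned int cmd,
				unsigned long arg)
{
	return do_example_ioctl(sock, cmd, compat_ptr(arg));
}
#endif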
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index 7f26a5a19ff6..07cfa3249f83 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -288,9 +288,6 @@ static int cmtp_session(void *arg)
288 288
289 add_wait_queue(sk_sleep(sk), &wait); 289 add_wait_queue(sk_sleep(sk), &wait);
290 while (1) { 290 while (1) {
291 /* Ensure session->terminate is updated */
292 smp_mb__before_atomic();
293
294 if (atomic_read(&session->terminate)) 291 if (atomic_read(&session->terminate))
295 break; 292 break;
296 if (sk->sk_state != BT_CONNECTED) 293 if (sk->sk_state != BT_CONNECTED)
@@ -306,6 +303,10 @@ static int cmtp_session(void *arg)
306 303
307 cmtp_process_transmit(session); 304 cmtp_process_transmit(session);
308 305
306 /*
307 * wait_woken() performs the necessary memory barriers
308 * for us; see the header comment for this primitive.
309 */
309 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 310 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
310 } 311 }
311 remove_wait_queue(sk_sleep(sk), &wait); 312 remove_wait_queue(sk_sleep(sk), &wait);
@@ -431,9 +432,10 @@ int cmtp_del_connection(struct cmtp_conndel_req *req)
431 /* Stop session thread */ 432 /* Stop session thread */
432 atomic_inc(&session->terminate); 433 atomic_inc(&session->terminate);
433 434
434 /* Ensure session->terminate is updated */ 435 /*
435 smp_mb__after_atomic(); 436 * See the comment preceding the call to wait_woken()
436 437 * in cmtp_session().
438 */
437 wake_up_interruptible(sk_sleep(session->sock->sk)); 439 wake_up_interruptible(sk_sleep(session->sock->sk));
438 } else 440 } else
439 err = -ENOENT; 441 err = -ENOENT;
diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c
index e08f28fadd65..defdd4871919 100644
--- a/net/bluetooth/cmtp/sock.c
+++ b/net/bluetooth/cmtp/sock.c
@@ -63,17 +63,16 @@ static int cmtp_sock_release(struct socket *sock)
63 return 0; 63 return 0;
64} 64}
65 65
66static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 66static int do_cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, void __user *argp)
67{ 67{
68 struct cmtp_connadd_req ca; 68 struct cmtp_connadd_req ca;
69 struct cmtp_conndel_req cd; 69 struct cmtp_conndel_req cd;
70 struct cmtp_connlist_req cl; 70 struct cmtp_connlist_req cl;
71 struct cmtp_conninfo ci; 71 struct cmtp_conninfo ci;
72 struct socket *nsock; 72 struct socket *nsock;
73 void __user *argp = (void __user *)arg;
74 int err; 73 int err;
75 74
76 BT_DBG("cmd %x arg %lx", cmd, arg); 75 BT_DBG("cmd %x arg %p", cmd, argp);
77 76
78 switch (cmd) { 77 switch (cmd) {
79 case CMTPCONNADD: 78 case CMTPCONNADD:
@@ -137,16 +136,22 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
137 return -EINVAL; 136 return -EINVAL;
138} 137}
139 138
139static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
140{
141 return do_cmtp_sock_ioctl(sock, cmd, (void __user *)arg);
142}
143
140#ifdef CONFIG_COMPAT 144#ifdef CONFIG_COMPAT
141static int cmtp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 145static int cmtp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
142{ 146{
147 void __user *argp = compat_ptr(arg);
143 if (cmd == CMTPGETCONNLIST) { 148 if (cmd == CMTPGETCONNLIST) {
144 struct cmtp_connlist_req cl; 149 struct cmtp_connlist_req cl;
150 u32 __user *p = argp;
145 u32 uci; 151 u32 uci;
146 int err; 152 int err;
147 153
148 if (get_user(cl.cnum, (u32 __user *) arg) || 154 if (get_user(cl.cnum, p) || get_user(uci, p + 1))
149 get_user(uci, (u32 __user *) (arg + 4)))
150 return -EFAULT; 155 return -EFAULT;
151 156
152 cl.ci = compat_ptr(uci); 157 cl.ci = compat_ptr(uci);
@@ -156,13 +161,13 @@ static int cmtp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigne
156 161
157 err = cmtp_get_connlist(&cl); 162 err = cmtp_get_connlist(&cl);
158 163
159 if (!err && put_user(cl.cnum, (u32 __user *) arg)) 164 if (!err && put_user(cl.cnum, p))
160 err = -EFAULT; 165 err = -EFAULT;
161 166
162 return err; 167 return err;
163 } 168 }
164 169
165 return cmtp_sock_ioctl(sock, cmd, arg); 170 return do_cmtp_sock_ioctl(sock, cmd, argp);
166} 171}
167#endif 172#endif
168 173
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 74b29c7d841c..7352fe85674b 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2839,6 +2839,20 @@ struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list,
2839 return NULL; 2839 return NULL;
2840} 2840}
2841 2841
2842struct bdaddr_list_with_irk *hci_bdaddr_list_lookup_with_irk(
2843 struct list_head *bdaddr_list, bdaddr_t *bdaddr,
2844 u8 type)
2845{
2846 struct bdaddr_list_with_irk *b;
2847
2848 list_for_each_entry(b, bdaddr_list, list) {
2849 if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type)
2850 return b;
2851 }
2852
2853 return NULL;
2854}
2855
2842void hci_bdaddr_list_clear(struct list_head *bdaddr_list) 2856void hci_bdaddr_list_clear(struct list_head *bdaddr_list)
2843{ 2857{
2844 struct bdaddr_list *b, *n; 2858 struct bdaddr_list *b, *n;
@@ -2871,6 +2885,35 @@ int hci_bdaddr_list_add(struct list_head *list, bdaddr_t *bdaddr, u8 type)
2871 return 0; 2885 return 0;
2872} 2886}
2873 2887
2888int hci_bdaddr_list_add_with_irk(struct list_head *list, bdaddr_t *bdaddr,
2889 u8 type, u8 *peer_irk, u8 *local_irk)
2890{
2891 struct bdaddr_list_with_irk *entry;
2892
2893 if (!bacmp(bdaddr, BDADDR_ANY))
2894 return -EBADF;
2895
2896 if (hci_bdaddr_list_lookup(list, bdaddr, type))
2897 return -EEXIST;
2898
2899 entry = kzalloc(sizeof(*entry), GFP_KERNEL);
2900 if (!entry)
2901 return -ENOMEM;
2902
2903 bacpy(&entry->bdaddr, bdaddr);
2904 entry->bdaddr_type = type;
2905
2906 if (peer_irk)
2907 memcpy(entry->peer_irk, peer_irk, 16);
2908
2909 if (local_irk)
2910 memcpy(entry->local_irk, local_irk, 16);
2911
2912 list_add(&entry->list, list);
2913
2914 return 0;
2915}
2916
2874int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type) 2917int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type)
2875{ 2918{
2876 struct bdaddr_list *entry; 2919 struct bdaddr_list *entry;
@@ -2890,6 +2933,26 @@ int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type)
2890 return 0; 2933 return 0;
2891} 2934}
2892 2935
2936int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr,
2937 u8 type)
2938{
2939 struct bdaddr_list_with_irk *entry;
2940
2941 if (!bacmp(bdaddr, BDADDR_ANY)) {
2942 hci_bdaddr_list_clear(list);
2943 return 0;
2944 }
2945
2946 entry = hci_bdaddr_list_lookup_with_irk(list, bdaddr, type);
2947 if (!entry)
2948 return -ENOENT;
2949
2950 list_del(&entry->list);
2951 kfree(entry);
2952
2953 return 0;
2954}
2955
2893/* This function requires the caller holds hdev->lock */ 2956/* This function requires the caller holds hdev->lock */
2894struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, 2957struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev,
2895 bdaddr_t *addr, u8 addr_type) 2958 bdaddr_t *addr, u8 addr_type)
@@ -3084,6 +3147,8 @@ struct hci_dev *hci_alloc_dev(void)
3084 hdev->le_max_tx_time = 0x0148; 3147 hdev->le_max_tx_time = 0x0148;
3085 hdev->le_max_rx_len = 0x001b; 3148 hdev->le_max_rx_len = 0x001b;
3086 hdev->le_max_rx_time = 0x0148; 3149 hdev->le_max_rx_time = 0x0148;
3150 hdev->le_max_key_size = SMP_MAX_ENC_KEY_SIZE;
3151 hdev->le_min_key_size = SMP_MIN_ENC_KEY_SIZE;
3087 hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M; 3152 hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M;
3088 hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M; 3153 hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M;
3089 3154
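The three new helpers give hci_core a bdaddr-list flavour that also carries the peer and local IRKs, which the LE resolving-list handling below relies on. A brief usage sketch; the wrapper names are invented and error handling is kept minimal:

static int example_track_peer(struct hci_dev *hdev, bdaddr_t *bdaddr,
			      u8 bdaddr_type, u8 peer_irk[16], u8 local_irk[16])
{
	int err;

	err = hci_bdaddr_list_add_with_irk(&hdev->le_resolv_list, bdaddr,
					   bdaddr_type, peer_irk, local_irk);
	if (err == -EEXIST)
		err = 0;	/* entry already mirrored */

	return err;
}

static int example_untrack_peer(struct hci_dev *hdev, bdaddr_t *bdaddr,
				u8 bdaddr_type)
{
	return hci_bdaddr_list_del_with_irk(&hdev->le_resolv_list, bdaddr,
					    bdaddr_type);
}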
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index f12555f23a49..ef9928d7b4fb 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -1454,6 +1454,45 @@ static void hci_cc_le_write_def_data_len(struct hci_dev *hdev,
1454 hdev->le_def_tx_time = le16_to_cpu(sent->tx_time); 1454 hdev->le_def_tx_time = le16_to_cpu(sent->tx_time);
1455} 1455}
1456 1456
1457static void hci_cc_le_add_to_resolv_list(struct hci_dev *hdev,
1458 struct sk_buff *skb)
1459{
1460 struct hci_cp_le_add_to_resolv_list *sent;
1461 __u8 status = *((__u8 *) skb->data);
1462
1463 BT_DBG("%s status 0x%2.2x", hdev->name, status);
1464
1465 if (status)
1466 return;
1467
1468 sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_RESOLV_LIST);
1469 if (!sent)
1470 return;
1471
1472 hci_bdaddr_list_add_with_irk(&hdev->le_resolv_list, &sent->bdaddr,
1473 sent->bdaddr_type, sent->peer_irk,
1474 sent->local_irk);
1475}
1476
1477static void hci_cc_le_del_from_resolv_list(struct hci_dev *hdev,
1478 struct sk_buff *skb)
1479{
1480 struct hci_cp_le_del_from_resolv_list *sent;
1481 __u8 status = *((__u8 *) skb->data);
1482
1483 BT_DBG("%s status 0x%2.2x", hdev->name, status);
1484
1485 if (status)
1486 return;
1487
1488 sent = hci_sent_cmd_data(hdev, HCI_OP_LE_DEL_FROM_RESOLV_LIST);
1489 if (!sent)
1490 return;
1491
1492 hci_bdaddr_list_del_with_irk(&hdev->le_resolv_list, &sent->bdaddr,
1493 sent->bdaddr_type);
1494}
1495
1457static void hci_cc_le_clear_resolv_list(struct hci_dev *hdev, 1496static void hci_cc_le_clear_resolv_list(struct hci_dev *hdev,
1458 struct sk_buff *skb) 1497 struct sk_buff *skb)
1459{ 1498{
@@ -3279,6 +3318,14 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb,
3279 hci_cc_le_write_def_data_len(hdev, skb); 3318 hci_cc_le_write_def_data_len(hdev, skb);
3280 break; 3319 break;
3281 3320
3321 case HCI_OP_LE_ADD_TO_RESOLV_LIST:
3322 hci_cc_le_add_to_resolv_list(hdev, skb);
3323 break;
3324
3325 case HCI_OP_LE_DEL_FROM_RESOLV_LIST:
3326 hci_cc_le_del_from_resolv_list(hdev, skb);
3327 break;
3328
3282 case HCI_OP_LE_CLEAR_RESOLV_LIST: 3329 case HCI_OP_LE_CLEAR_RESOLV_LIST:
3283 hci_cc_le_clear_resolv_list(hdev, skb); 3330 hci_cc_le_clear_resolv_list(hdev, skb);
3284 break; 3331 break;
@@ -4890,31 +4937,27 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
4890 hci_debugfs_create_conn(conn); 4937 hci_debugfs_create_conn(conn);
4891 hci_conn_add_sysfs(conn); 4938 hci_conn_add_sysfs(conn);
4892 4939
4893 if (!status) { 4940 /* The remote features procedure is defined for master
4894 /* The remote features procedure is defined for master 4941 * role only. So only in case of an initiated connection
4895 * role only. So only in case of an initiated connection 4942 * request the remote features.
4896 * request the remote features. 4943 *
4897 * 4944 * If the local controller supports slave-initiated features
4898 * If the local controller supports slave-initiated features 4945 * exchange, then requesting the remote features in slave
4899 * exchange, then requesting the remote features in slave 4946 * role is possible. Otherwise just transition into the
4900 * role is possible. Otherwise just transition into the 4947 * connected state without requesting the remote features.
4901 * connected state without requesting the remote features. 4948 */
4902 */ 4949 if (conn->out ||
4903 if (conn->out || 4950 (hdev->le_features[0] & HCI_LE_SLAVE_FEATURES)) {
4904 (hdev->le_features[0] & HCI_LE_SLAVE_FEATURES)) { 4951 struct hci_cp_le_read_remote_features cp;
4905 struct hci_cp_le_read_remote_features cp;
4906 4952
4907 cp.handle = __cpu_to_le16(conn->handle); 4953 cp.handle = __cpu_to_le16(conn->handle);
4908 4954
4909 hci_send_cmd(hdev, HCI_OP_LE_READ_REMOTE_FEATURES, 4955 hci_send_cmd(hdev, HCI_OP_LE_READ_REMOTE_FEATURES,
4910 sizeof(cp), &cp); 4956 sizeof(cp), &cp);
4911 4957
4912 hci_conn_hold(conn); 4958 hci_conn_hold(conn);
4913 } else {
4914 conn->state = BT_CONNECTED;
4915 hci_connect_cfm(conn, status);
4916 }
4917 } else { 4959 } else {
4960 conn->state = BT_CONNECTED;
4918 hci_connect_cfm(conn, status); 4961 hci_connect_cfm(conn, status);
4919 } 4962 }
4920 4963
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 253975cce943..a442e21f3894 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -649,7 +649,7 @@ static void hidp_process_transmit(struct hidp_session *session,
649} 649}
650 650
651static int hidp_setup_input(struct hidp_session *session, 651static int hidp_setup_input(struct hidp_session *session,
652 struct hidp_connadd_req *req) 652 const struct hidp_connadd_req *req)
653{ 653{
654 struct input_dev *input; 654 struct input_dev *input;
655 int i; 655 int i;
@@ -748,7 +748,7 @@ EXPORT_SYMBOL_GPL(hidp_hid_driver);
748/* This function sets up the hid device. It does not add it 748/* This function sets up the hid device. It does not add it
749 to the HID system. That is done in hidp_add_connection(). */ 749 to the HID system. That is done in hidp_add_connection(). */
750static int hidp_setup_hid(struct hidp_session *session, 750static int hidp_setup_hid(struct hidp_session *session,
751 struct hidp_connadd_req *req) 751 const struct hidp_connadd_req *req)
752{ 752{
753 struct hid_device *hid; 753 struct hid_device *hid;
754 int err; 754 int err;
@@ -807,7 +807,7 @@ fault:
807 807
808/* initialize session devices */ 808/* initialize session devices */
809static int hidp_session_dev_init(struct hidp_session *session, 809static int hidp_session_dev_init(struct hidp_session *session,
810 struct hidp_connadd_req *req) 810 const struct hidp_connadd_req *req)
811{ 811{
812 int ret; 812 int ret;
813 813
@@ -906,7 +906,7 @@ static void hidp_session_dev_work(struct work_struct *work)
906static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr, 906static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr,
907 struct socket *ctrl_sock, 907 struct socket *ctrl_sock,
908 struct socket *intr_sock, 908 struct socket *intr_sock,
909 struct hidp_connadd_req *req, 909 const struct hidp_connadd_req *req,
910 struct l2cap_conn *conn) 910 struct l2cap_conn *conn)
911{ 911{
912 struct hidp_session *session; 912 struct hidp_session *session;
@@ -1074,6 +1074,10 @@ static int hidp_session_start_sync(struct hidp_session *session)
1074static void hidp_session_terminate(struct hidp_session *session) 1074static void hidp_session_terminate(struct hidp_session *session)
1075{ 1075{
1076 atomic_inc(&session->terminate); 1076 atomic_inc(&session->terminate);
1077 /*
1078 * See the comment preceding the call to wait_woken()
1079 * in hidp_session_run().
1080 */
1077 wake_up_interruptible(&hidp_session_wq); 1081 wake_up_interruptible(&hidp_session_wq);
1078} 1082}
1079 1083
@@ -1193,8 +1197,6 @@ static void hidp_session_run(struct hidp_session *session)
1193 * thread is woken up by ->sk_state_changed(). 1197 * thread is woken up by ->sk_state_changed().
1194 */ 1198 */
1195 1199
1196 /* Ensure session->terminate is updated */
1197 smp_mb__before_atomic();
1198 if (atomic_read(&session->terminate)) 1200 if (atomic_read(&session->terminate))
1199 break; 1201 break;
1200 1202
@@ -1228,14 +1230,15 @@ static void hidp_session_run(struct hidp_session *session)
1228 hidp_process_transmit(session, &session->ctrl_transmit, 1230 hidp_process_transmit(session, &session->ctrl_transmit,
1229 session->ctrl_sock); 1231 session->ctrl_sock);
1230 1232
1233 /*
1234 * wait_woken() performs the necessary memory barriers
1235 * for us; see the header comment for this primitive.
1236 */
1231 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 1237 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
1232 } 1238 }
1233 remove_wait_queue(&hidp_session_wq, &wait); 1239 remove_wait_queue(&hidp_session_wq, &wait);
1234 1240
1235 atomic_inc(&session->terminate); 1241 atomic_inc(&session->terminate);
1236
1237 /* Ensure session->terminate is updated */
1238 smp_mb__after_atomic();
1239} 1242}
1240 1243
1241static int hidp_session_wake_function(wait_queue_entry_t *wait, 1244static int hidp_session_wake_function(wait_queue_entry_t *wait,
@@ -1335,7 +1338,7 @@ static int hidp_verify_sockets(struct socket *ctrl_sock,
1335 return 0; 1338 return 0;
1336} 1339}
1337 1340
1338int hidp_connection_add(struct hidp_connadd_req *req, 1341int hidp_connection_add(const struct hidp_connadd_req *req,
1339 struct socket *ctrl_sock, 1342 struct socket *ctrl_sock,
1340 struct socket *intr_sock) 1343 struct socket *intr_sock)
1341{ 1344{
diff --git a/net/bluetooth/hidp/hidp.h b/net/bluetooth/hidp/hidp.h
index 8798492a6e99..6ef88d0a1919 100644
--- a/net/bluetooth/hidp/hidp.h
+++ b/net/bluetooth/hidp/hidp.h
@@ -122,7 +122,7 @@ struct hidp_connlist_req {
122 struct hidp_conninfo __user *ci; 122 struct hidp_conninfo __user *ci;
123}; 123};
124 124
125int hidp_connection_add(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock); 125int hidp_connection_add(const struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock);
126int hidp_connection_del(struct hidp_conndel_req *req); 126int hidp_connection_del(struct hidp_conndel_req *req);
127int hidp_get_connlist(struct hidp_connlist_req *req); 127int hidp_get_connlist(struct hidp_connlist_req *req);
128int hidp_get_conninfo(struct hidp_conninfo *ci); 128int hidp_get_conninfo(struct hidp_conninfo *ci);
diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c
index 1eaac01f85de..9f85a1943be9 100644
--- a/net/bluetooth/hidp/sock.c
+++ b/net/bluetooth/hidp/sock.c
@@ -46,9 +46,8 @@ static int hidp_sock_release(struct socket *sock)
46 return 0; 46 return 0;
47} 47}
48 48
49static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 49static int do_hidp_sock_ioctl(struct socket *sock, unsigned int cmd, void __user *argp)
50{ 50{
51 void __user *argp = (void __user *) arg;
52 struct hidp_connadd_req ca; 51 struct hidp_connadd_req ca;
53 struct hidp_conndel_req cd; 52 struct hidp_conndel_req cd;
54 struct hidp_connlist_req cl; 53 struct hidp_connlist_req cl;
@@ -57,7 +56,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
57 struct socket *isock; 56 struct socket *isock;
58 int err; 57 int err;
59 58
60 BT_DBG("cmd %x arg %lx", cmd, arg); 59 BT_DBG("cmd %x arg %p", cmd, argp);
61 60
62 switch (cmd) { 61 switch (cmd) {
63 case HIDPCONNADD: 62 case HIDPCONNADD:
@@ -122,6 +121,11 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
122 return -EINVAL; 121 return -EINVAL;
123} 122}
124 123
124static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
125{
126 return do_hidp_sock_ioctl(sock, cmd, (void __user *)arg);
127}
128
125#ifdef CONFIG_COMPAT 129#ifdef CONFIG_COMPAT
126struct compat_hidp_connadd_req { 130struct compat_hidp_connadd_req {
127 int ctrl_sock; /* Connected control socket */ 131 int ctrl_sock; /* Connected control socket */
@@ -141,13 +145,15 @@ struct compat_hidp_connadd_req {
141 145
142static int hidp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 146static int hidp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
143{ 147{
148 void __user *argp = compat_ptr(arg);
149 int err;
150
144 if (cmd == HIDPGETCONNLIST) { 151 if (cmd == HIDPGETCONNLIST) {
145 struct hidp_connlist_req cl; 152 struct hidp_connlist_req cl;
153 u32 __user *p = argp;
146 u32 uci; 154 u32 uci;
147 int err;
148 155
149 if (get_user(cl.cnum, (u32 __user *) arg) || 156 if (get_user(cl.cnum, p) || get_user(uci, p + 1))
150 get_user(uci, (u32 __user *) (arg + 4)))
151 return -EFAULT; 157 return -EFAULT;
152 158
153 cl.ci = compat_ptr(uci); 159 cl.ci = compat_ptr(uci);
@@ -157,39 +163,54 @@ static int hidp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigne
157 163
158 err = hidp_get_connlist(&cl); 164 err = hidp_get_connlist(&cl);
159 165
160 if (!err && put_user(cl.cnum, (u32 __user *) arg)) 166 if (!err && put_user(cl.cnum, p))
161 err = -EFAULT; 167 err = -EFAULT;
162 168
163 return err; 169 return err;
164 } else if (cmd == HIDPCONNADD) { 170 } else if (cmd == HIDPCONNADD) {
165 struct compat_hidp_connadd_req ca; 171 struct compat_hidp_connadd_req ca32;
166 struct hidp_connadd_req __user *uca; 172 struct hidp_connadd_req ca;
173 struct socket *csock;
174 struct socket *isock;
167 175
168 uca = compat_alloc_user_space(sizeof(*uca)); 176 if (!capable(CAP_NET_ADMIN))
177 return -EPERM;
169 178
170 if (copy_from_user(&ca, (void __user *) arg, sizeof(ca))) 179 if (copy_from_user(&ca32, (void __user *) arg, sizeof(ca32)))
171 return -EFAULT; 180 return -EFAULT;
172 181
173 if (put_user(ca.ctrl_sock, &uca->ctrl_sock) || 182 ca.ctrl_sock = ca32.ctrl_sock;
174 put_user(ca.intr_sock, &uca->intr_sock) || 183 ca.intr_sock = ca32.intr_sock;
175 put_user(ca.parser, &uca->parser) || 184 ca.parser = ca32.parser;
176 put_user(ca.rd_size, &uca->rd_size) || 185 ca.rd_size = ca32.rd_size;
177 put_user(compat_ptr(ca.rd_data), &uca->rd_data) || 186 ca.rd_data = compat_ptr(ca32.rd_data);
178 put_user(ca.country, &uca->country) || 187 ca.country = ca32.country;
179 put_user(ca.subclass, &uca->subclass) || 188 ca.subclass = ca32.subclass;
180 put_user(ca.vendor, &uca->vendor) || 189 ca.vendor = ca32.vendor;
181 put_user(ca.product, &uca->product) || 190 ca.product = ca32.product;
182 put_user(ca.version, &uca->version) || 191 ca.version = ca32.version;
183 put_user(ca.flags, &uca->flags) || 192 ca.flags = ca32.flags;
184 put_user(ca.idle_to, &uca->idle_to) || 193 ca.idle_to = ca32.idle_to;
185 copy_to_user(&uca->name[0], &ca.name[0], 128)) 194 memcpy(ca.name, ca32.name, 128);
186 return -EFAULT; 195
196 csock = sockfd_lookup(ca.ctrl_sock, &err);
197 if (!csock)
198 return err;
187 199
188 arg = (unsigned long) uca; 200 isock = sockfd_lookup(ca.intr_sock, &err);
201 if (!isock) {
202 sockfd_put(csock);
203 return err;
204 }
189 205
190 /* Fall through. We don't actually write back any _changes_ 206 err = hidp_connection_add(&ca, csock, isock);
191 to the structure anyway, so there's no need to copy back 207 if (!err && copy_to_user(argp, &ca32, sizeof(ca32)))
192 into the original compat version */ 208 err = -EFAULT;
209
210 sockfd_put(csock);
211 sockfd_put(isock);
212
213 return err;
193 } 214 }
194 215
195 return hidp_sock_ioctl(sock, cmd, arg); 216 return hidp_sock_ioctl(sock, cmd, arg);
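For HIDPCONNADD the compat path no longer builds a native request in user memory through compat_alloc_user_space() and replays the ioctl; it widens the 32-bit layout into a kernel-space struct hidp_connadd_req, resolves both socket fds itself and calls hidp_connection_add() directly, which is also why that helper and its callees now take a const request. The general conversion pattern, sketched with made-up structure and field names:

struct example_req32 { u32 flags; u32 data_ptr; };
struct example_req   { u32 flags; void __user *data; };

static int example_compat_convert(void __user *argp, struct example_req *req)
{
	struct example_req32 req32;

	if (copy_from_user(&req32, argp, sizeof(req32)))
		return -EFAULT;

	/* widen field by field in kernel memory, no user-space scratch area */
	req->flags = req32.flags;
	req->data = compat_ptr(req32.data_ptr);

	return 0;
}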
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index d17a4736e47c..2146e0f3b6f8 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -51,9 +51,6 @@ static u32 l2cap_feat_mask = L2CAP_FEAT_FIXED_CHAN | L2CAP_FEAT_UCD;
51static LIST_HEAD(chan_list); 51static LIST_HEAD(chan_list);
52static DEFINE_RWLOCK(chan_list_lock); 52static DEFINE_RWLOCK(chan_list_lock);
53 53
54static u16 le_max_credits = L2CAP_LE_MAX_CREDITS;
55static u16 le_default_mps = L2CAP_LE_DEFAULT_MPS;
56
57static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn, 54static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn,
58 u8 code, u8 ident, u16 dlen, void *data); 55 u8 code, u8 ident, u16 dlen, void *data);
59static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, 56static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len,
@@ -519,8 +516,10 @@ static void l2cap_le_flowctl_init(struct l2cap_chan *chan)
519 chan->sdu_last_frag = NULL; 516 chan->sdu_last_frag = NULL;
520 chan->sdu_len = 0; 517 chan->sdu_len = 0;
521 chan->tx_credits = 0; 518 chan->tx_credits = 0;
522 chan->rx_credits = le_max_credits; 519 /* Derive MPS from connection MTU to stop HCI fragmentation */
523 chan->mps = min_t(u16, chan->imtu, le_default_mps); 520 chan->mps = min_t(u16, chan->imtu, chan->conn->mtu - L2CAP_HDR_SIZE);
521 /* Give enough credits for a full packet */
522 chan->rx_credits = (chan->imtu / chan->mps) + 1;
524 523
525 skb_queue_head_init(&chan->tx_q); 524 skb_queue_head_init(&chan->tx_q);
526} 525}
@@ -681,9 +680,9 @@ static void l2cap_chan_le_connect_reject(struct l2cap_chan *chan)
681 u16 result; 680 u16 result;
682 681
683 if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) 682 if (test_bit(FLAG_DEFER_SETUP, &chan->flags))
684 result = L2CAP_CR_AUTHORIZATION; 683 result = L2CAP_CR_LE_AUTHORIZATION;
685 else 684 else
686 result = L2CAP_CR_BAD_PSM; 685 result = L2CAP_CR_LE_BAD_PSM;
687 686
688 l2cap_state_change(chan, BT_DISCONN); 687 l2cap_state_change(chan, BT_DISCONN);
689 688
@@ -1282,6 +1281,8 @@ static void l2cap_le_connect(struct l2cap_chan *chan)
1282 if (test_and_set_bit(FLAG_LE_CONN_REQ_SENT, &chan->flags)) 1281 if (test_and_set_bit(FLAG_LE_CONN_REQ_SENT, &chan->flags))
1283 return; 1282 return;
1284 1283
1284 l2cap_le_flowctl_init(chan);
1285
1285 req.psm = chan->psm; 1286 req.psm = chan->psm;
1286 req.scid = cpu_to_le16(chan->scid); 1287 req.scid = cpu_to_le16(chan->scid);
1287 req.mtu = cpu_to_le16(chan->imtu); 1288 req.mtu = cpu_to_le16(chan->imtu);
@@ -3669,7 +3670,7 @@ void __l2cap_le_connect_rsp_defer(struct l2cap_chan *chan)
3669 rsp.mtu = cpu_to_le16(chan->imtu); 3670 rsp.mtu = cpu_to_le16(chan->imtu);
3670 rsp.mps = cpu_to_le16(chan->mps); 3671 rsp.mps = cpu_to_le16(chan->mps);
3671 rsp.credits = cpu_to_le16(chan->rx_credits); 3672 rsp.credits = cpu_to_le16(chan->rx_credits);
3672 rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS); 3673 rsp.result = cpu_to_le16(L2CAP_CR_LE_SUCCESS);
3673 3674
3674 l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CONN_RSP, sizeof(rsp), 3675 l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CONN_RSP, sizeof(rsp),
3675 &rsp); 3676 &rsp);
@@ -3815,9 +3816,17 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn,
3815 3816
3816 result = L2CAP_CR_NO_MEM; 3817 result = L2CAP_CR_NO_MEM;
3817 3818
3819 /* Check for valid dynamic CID range (as per Erratum 3253) */
3820 if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_DYN_END) {
3821 result = L2CAP_CR_INVALID_SCID;
3822 goto response;
3823 }
3824
3818 /* Check if we already have channel with that dcid */ 3825 /* Check if we already have channel with that dcid */
3819 if (__l2cap_get_chan_by_dcid(conn, scid)) 3826 if (__l2cap_get_chan_by_dcid(conn, scid)) {
3827 result = L2CAP_CR_SCID_IN_USE;
3820 goto response; 3828 goto response;
3829 }
3821 3830
3822 chan = pchan->ops->new_connection(pchan); 3831 chan = pchan->ops->new_connection(pchan);
3823 if (!chan) 3832 if (!chan)
@@ -5279,7 +5288,7 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
5279 credits = __le16_to_cpu(rsp->credits); 5288 credits = __le16_to_cpu(rsp->credits);
5280 result = __le16_to_cpu(rsp->result); 5289 result = __le16_to_cpu(rsp->result);
5281 5290
5282 if (result == L2CAP_CR_SUCCESS && (mtu < 23 || mps < 23 || 5291 if (result == L2CAP_CR_LE_SUCCESS && (mtu < 23 || mps < 23 ||
5283 dcid < L2CAP_CID_DYN_START || 5292 dcid < L2CAP_CID_DYN_START ||
5284 dcid > L2CAP_CID_LE_DYN_END)) 5293 dcid > L2CAP_CID_LE_DYN_END))
5285 return -EPROTO; 5294 return -EPROTO;
@@ -5300,7 +5309,7 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
5300 l2cap_chan_lock(chan); 5309 l2cap_chan_lock(chan);
5301 5310
5302 switch (result) { 5311 switch (result) {
5303 case L2CAP_CR_SUCCESS: 5312 case L2CAP_CR_LE_SUCCESS:
5304 if (__l2cap_get_chan_by_dcid(conn, dcid)) { 5313 if (__l2cap_get_chan_by_dcid(conn, dcid)) {
5305 err = -EBADSLT; 5314 err = -EBADSLT;
5306 break; 5315 break;
@@ -5314,8 +5323,8 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
5314 l2cap_chan_ready(chan); 5323 l2cap_chan_ready(chan);
5315 break; 5324 break;
5316 5325
5317 case L2CAP_CR_AUTHENTICATION: 5326 case L2CAP_CR_LE_AUTHENTICATION:
5318 case L2CAP_CR_ENCRYPTION: 5327 case L2CAP_CR_LE_ENCRYPTION:
5319 /* If we already have MITM protection we can't do 5328 /* If we already have MITM protection we can't do
5320 * anything. 5329 * anything.
5321 */ 5330 */
@@ -5458,7 +5467,7 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
5458 pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src, 5467 pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src,
5459 &conn->hcon->dst, LE_LINK); 5468 &conn->hcon->dst, LE_LINK);
5460 if (!pchan) { 5469 if (!pchan) {
5461 result = L2CAP_CR_BAD_PSM; 5470 result = L2CAP_CR_LE_BAD_PSM;
5462 chan = NULL; 5471 chan = NULL;
5463 goto response; 5472 goto response;
5464 } 5473 }
@@ -5468,33 +5477,31 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
5468 5477
5469 if (!smp_sufficient_security(conn->hcon, pchan->sec_level, 5478 if (!smp_sufficient_security(conn->hcon, pchan->sec_level,
5470 SMP_ALLOW_STK)) { 5479 SMP_ALLOW_STK)) {
5471 result = L2CAP_CR_AUTHENTICATION; 5480 result = L2CAP_CR_LE_AUTHENTICATION;
5472 chan = NULL; 5481 chan = NULL;
5473 goto response_unlock; 5482 goto response_unlock;
5474 } 5483 }
5475 5484
5476 /* Check for valid dynamic CID range */ 5485 /* Check for valid dynamic CID range */
5477 if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_LE_DYN_END) { 5486 if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_LE_DYN_END) {
5478 result = L2CAP_CR_INVALID_SCID; 5487 result = L2CAP_CR_LE_INVALID_SCID;
5479 chan = NULL; 5488 chan = NULL;
5480 goto response_unlock; 5489 goto response_unlock;
5481 } 5490 }
5482 5491
5483 /* Check if we already have channel with that dcid */ 5492 /* Check if we already have channel with that dcid */
5484 if (__l2cap_get_chan_by_dcid(conn, scid)) { 5493 if (__l2cap_get_chan_by_dcid(conn, scid)) {
5485 result = L2CAP_CR_SCID_IN_USE; 5494 result = L2CAP_CR_LE_SCID_IN_USE;
5486 chan = NULL; 5495 chan = NULL;
5487 goto response_unlock; 5496 goto response_unlock;
5488 } 5497 }
5489 5498
5490 chan = pchan->ops->new_connection(pchan); 5499 chan = pchan->ops->new_connection(pchan);
5491 if (!chan) { 5500 if (!chan) {
5492 result = L2CAP_CR_NO_MEM; 5501 result = L2CAP_CR_LE_NO_MEM;
5493 goto response_unlock; 5502 goto response_unlock;
5494 } 5503 }
5495 5504
5496 l2cap_le_flowctl_init(chan);
5497
5498 bacpy(&chan->src, &conn->hcon->src); 5505 bacpy(&chan->src, &conn->hcon->src);
5499 bacpy(&chan->dst, &conn->hcon->dst); 5506 bacpy(&chan->dst, &conn->hcon->dst);
5500 chan->src_type = bdaddr_src_type(conn->hcon); 5507 chan->src_type = bdaddr_src_type(conn->hcon);
@@ -5506,6 +5513,9 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
5506 chan->tx_credits = __le16_to_cpu(req->credits); 5513 chan->tx_credits = __le16_to_cpu(req->credits);
5507 5514
5508 __l2cap_chan_add(conn, chan); 5515 __l2cap_chan_add(conn, chan);
5516
5517 l2cap_le_flowctl_init(chan);
5518
5509 dcid = chan->scid; 5519 dcid = chan->scid;
5510 credits = chan->rx_credits; 5520 credits = chan->rx_credits;
5511 5521
@@ -5524,7 +5534,7 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
5524 chan->ops->defer(chan); 5534 chan->ops->defer(chan);
5525 } else { 5535 } else {
5526 l2cap_chan_ready(chan); 5536 l2cap_chan_ready(chan);
5527 result = L2CAP_CR_SUCCESS; 5537 result = L2CAP_CR_LE_SUCCESS;
5528 } 5538 }
5529 5539
5530response_unlock: 5540response_unlock:
@@ -6699,13 +6709,10 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan)
6699 struct l2cap_le_credits pkt; 6709 struct l2cap_le_credits pkt;
6700 u16 return_credits; 6710 u16 return_credits;
6701 6711
6702 /* We return more credits to the sender only after the amount of 6712 return_credits = ((chan->imtu / chan->mps) + 1) - chan->rx_credits;
6703 * credits falls below half of the initial amount.
6704 */
6705 if (chan->rx_credits >= (le_max_credits + 1) / 2)
6706 return;
6707 6713
6708 return_credits = le_max_credits - chan->rx_credits; 6714 if (!return_credits)
6715 return;
6709 6716
6710 BT_DBG("chan %p returning %u credits to sender", chan, return_credits); 6717 BT_DBG("chan %p returning %u credits to sender", chan, return_credits);
6711 6718
@@ -6719,6 +6726,21 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan)
6719 l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CREDITS, sizeof(pkt), &pkt); 6726 l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CREDITS, sizeof(pkt), &pkt);
6720} 6727}
6721 6728
6729static int l2cap_le_recv(struct l2cap_chan *chan, struct sk_buff *skb)
6730{
6731 int err;
6732
6733 BT_DBG("SDU reassemble complete: chan %p skb->len %u", chan, skb->len);
6734
 6735 /* Wait for recv to confirm reception before updating the credits */
6736 err = chan->ops->recv(chan, skb);
6737
6738 /* Update credits whenever an SDU is received */
6739 l2cap_chan_le_send_credits(chan);
6740
6741 return err;
6742}
6743
6722static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) 6744static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
6723{ 6745{
6724 int err; 6746 int err;
@@ -6737,7 +6759,11 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
6737 chan->rx_credits--; 6759 chan->rx_credits--;
6738 BT_DBG("rx_credits %u -> %u", chan->rx_credits + 1, chan->rx_credits); 6760 BT_DBG("rx_credits %u -> %u", chan->rx_credits + 1, chan->rx_credits);
6739 6761
 6740 l2cap_chan_le_send_credits(chan); 6762 /* Update if remote had run out of credits, this should only happen
6763 * if the remote is not using the entire MPS.
6764 */
6765 if (!chan->rx_credits)
6766 l2cap_chan_le_send_credits(chan);
6741 6767
6742 err = 0; 6768 err = 0;
6743 6769
@@ -6763,12 +6789,22 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
6763 } 6789 }
6764 6790
6765 if (skb->len == sdu_len) 6791 if (skb->len == sdu_len)
6766 return chan->ops->recv(chan, skb); 6792 return l2cap_le_recv(chan, skb);
6767 6793
6768 chan->sdu = skb; 6794 chan->sdu = skb;
6769 chan->sdu_len = sdu_len; 6795 chan->sdu_len = sdu_len;
6770 chan->sdu_last_frag = skb; 6796 chan->sdu_last_frag = skb;
6771 6797
6798 /* Detect if remote is not able to use the selected MPS */
6799 if (skb->len + L2CAP_SDULEN_SIZE < chan->mps) {
6800 u16 mps_len = skb->len + L2CAP_SDULEN_SIZE;
6801
6802 /* Adjust the number of credits */
6803 BT_DBG("chan->mps %u -> %u", chan->mps, mps_len);
6804 chan->mps = mps_len;
6805 l2cap_chan_le_send_credits(chan);
6806 }
6807
6772 return 0; 6808 return 0;
6773 } 6809 }
6774 6810
@@ -6785,7 +6821,7 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
6785 skb = NULL; 6821 skb = NULL;
6786 6822
6787 if (chan->sdu->len == chan->sdu_len) { 6823 if (chan->sdu->len == chan->sdu_len) {
6788 err = chan->ops->recv(chan, chan->sdu); 6824 err = l2cap_le_recv(chan, chan->sdu);
6789 if (!err) { 6825 if (!err) {
6790 chan->sdu = NULL; 6826 chan->sdu = NULL;
6791 chan->sdu_last_frag = NULL; 6827 chan->sdu_last_frag = NULL;
@@ -7102,7 +7138,6 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
7102 case L2CAP_MODE_BASIC: 7138 case L2CAP_MODE_BASIC:
7103 break; 7139 break;
7104 case L2CAP_MODE_LE_FLOWCTL: 7140 case L2CAP_MODE_LE_FLOWCTL:
7105 l2cap_le_flowctl_init(chan);
7106 break; 7141 break;
7107 case L2CAP_MODE_ERTM: 7142 case L2CAP_MODE_ERTM:
7108 case L2CAP_MODE_STREAMING: 7143 case L2CAP_MODE_STREAMING:
@@ -7645,11 +7680,6 @@ int __init l2cap_init(void)
7645 l2cap_debugfs = debugfs_create_file("l2cap", 0444, bt_debugfs, 7680 l2cap_debugfs = debugfs_create_file("l2cap", 0444, bt_debugfs,
7646 NULL, &l2cap_debugfs_fops); 7681 NULL, &l2cap_debugfs_fops);
7647 7682
7648 debugfs_create_u16("l2cap_le_max_credits", 0644, bt_debugfs,
7649 &le_max_credits);
7650 debugfs_create_u16("l2cap_le_default_mps", 0644, bt_debugfs,
7651 &le_default_mps);
7652
7653 return 0; 7683 return 0;
7654} 7684}
7655 7685
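The LE flow-control rework replaces the two global debugfs knobs with per-channel values: the MPS is capped by the connection MTU minus the L2CAP header so the controller never has to fragment, the initial credit budget is just enough for one full SDU, and credits are topped back up to that budget once a complete SDU has been handed to ->recv() (or immediately, should the sender run dry because it uses a smaller MPS than offered). The arithmetic as a plain sketch, helper names hypothetical:

static u16 example_initial_credits(u16 imtu, u16 mps)
{
	/* enough credits for one maximum-size SDU */
	return (imtu / mps) + 1;
}

static u16 example_credits_to_return(u16 imtu, u16 mps, u16 rx_credits)
{
	/* top the remote back up to the full budget */
	return example_initial_credits(imtu, mps) - rx_credits;
}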
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 5e44d842cc5d..0c7d31c6c18c 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -839,18 +839,6 @@ static int rfcomm_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned l
839 BT_DBG("TIOCMIWAIT"); 839 BT_DBG("TIOCMIWAIT");
840 break; 840 break;
841 841
842 case TIOCGSERIAL:
843 BT_ERR("TIOCGSERIAL is not supported");
844 return -ENOIOCTLCMD;
845
846 case TIOCSSERIAL:
847 BT_ERR("TIOCSSERIAL is not supported");
848 return -ENOIOCTLCMD;
849
850 case TIOCSERGSTRUCT:
851 BT_ERR("TIOCSERGSTRUCT is not supported");
852 return -ENOIOCTLCMD;
853
854 case TIOCSERGETLSR: 842 case TIOCSERGETLSR:
855 BT_ERR("TIOCSERGETLSR is not supported"); 843 BT_ERR("TIOCSERGETLSR is not supported");
856 return -ENOIOCTLCMD; 844 return -ENOIOCTLCMD;
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 73f7211d0431..a1c1b7e8a45c 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -88,9 +88,6 @@ struct smp_dev {
88 u8 local_rand[16]; 88 u8 local_rand[16];
89 bool debug_key; 89 bool debug_key;
90 90
91 u8 min_key_size;
92 u8 max_key_size;
93
94 struct crypto_cipher *tfm_aes; 91 struct crypto_cipher *tfm_aes;
95 struct crypto_shash *tfm_cmac; 92 struct crypto_shash *tfm_cmac;
96 struct crypto_kpp *tfm_ecdh; 93 struct crypto_kpp *tfm_ecdh;
@@ -720,7 +717,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn,
720 if (rsp == NULL) { 717 if (rsp == NULL) {
721 req->io_capability = conn->hcon->io_capability; 718 req->io_capability = conn->hcon->io_capability;
722 req->oob_flag = oob_flag; 719 req->oob_flag = oob_flag;
723 req->max_key_size = SMP_DEV(hdev)->max_key_size; 720 req->max_key_size = hdev->le_max_key_size;
724 req->init_key_dist = local_dist; 721 req->init_key_dist = local_dist;
725 req->resp_key_dist = remote_dist; 722 req->resp_key_dist = remote_dist;
726 req->auth_req = (authreq & AUTH_REQ_MASK(hdev)); 723 req->auth_req = (authreq & AUTH_REQ_MASK(hdev));
@@ -731,7 +728,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn,
731 728
732 rsp->io_capability = conn->hcon->io_capability; 729 rsp->io_capability = conn->hcon->io_capability;
733 rsp->oob_flag = oob_flag; 730 rsp->oob_flag = oob_flag;
734 rsp->max_key_size = SMP_DEV(hdev)->max_key_size; 731 rsp->max_key_size = hdev->le_max_key_size;
735 rsp->init_key_dist = req->init_key_dist & remote_dist; 732 rsp->init_key_dist = req->init_key_dist & remote_dist;
736 rsp->resp_key_dist = req->resp_key_dist & local_dist; 733 rsp->resp_key_dist = req->resp_key_dist & local_dist;
737 rsp->auth_req = (authreq & AUTH_REQ_MASK(hdev)); 734 rsp->auth_req = (authreq & AUTH_REQ_MASK(hdev));
@@ -745,7 +742,7 @@ static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size)
745 struct hci_dev *hdev = conn->hcon->hdev; 742 struct hci_dev *hdev = conn->hcon->hdev;
746 struct smp_chan *smp = chan->data; 743 struct smp_chan *smp = chan->data;
747 744
748 if (max_key_size > SMP_DEV(hdev)->max_key_size || 745 if (max_key_size > hdev->le_max_key_size ||
749 max_key_size < SMP_MIN_ENC_KEY_SIZE) 746 max_key_size < SMP_MIN_ENC_KEY_SIZE)
750 return SMP_ENC_KEY_SIZE; 747 return SMP_ENC_KEY_SIZE;
751 748
@@ -3264,8 +3261,6 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
3264 smp->tfm_aes = tfm_aes; 3261 smp->tfm_aes = tfm_aes;
3265 smp->tfm_cmac = tfm_cmac; 3262 smp->tfm_cmac = tfm_cmac;
3266 smp->tfm_ecdh = tfm_ecdh; 3263 smp->tfm_ecdh = tfm_ecdh;
3267 smp->min_key_size = SMP_MIN_ENC_KEY_SIZE;
3268 smp->max_key_size = SMP_MAX_ENC_KEY_SIZE;
3269 3264
3270create_chan: 3265create_chan:
3271 chan = l2cap_chan_create(); 3266 chan = l2cap_chan_create();
@@ -3391,7 +3386,7 @@ static ssize_t le_min_key_size_read(struct file *file,
3391 struct hci_dev *hdev = file->private_data; 3386 struct hci_dev *hdev = file->private_data;
3392 char buf[4]; 3387 char buf[4];
3393 3388
3394 snprintf(buf, sizeof(buf), "%2u\n", SMP_DEV(hdev)->min_key_size); 3389 snprintf(buf, sizeof(buf), "%2u\n", hdev->le_min_key_size);
3395 3390
3396 return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); 3391 return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
3397} 3392}
@@ -3412,11 +3407,11 @@ static ssize_t le_min_key_size_write(struct file *file,
3412 3407
3413 sscanf(buf, "%hhu", &key_size); 3408 sscanf(buf, "%hhu", &key_size);
3414 3409
3415 if (key_size > SMP_DEV(hdev)->max_key_size || 3410 if (key_size > hdev->le_max_key_size ||
3416 key_size < SMP_MIN_ENC_KEY_SIZE) 3411 key_size < SMP_MIN_ENC_KEY_SIZE)
3417 return -EINVAL; 3412 return -EINVAL;
3418 3413
3419 SMP_DEV(hdev)->min_key_size = key_size; 3414 hdev->le_min_key_size = key_size;
3420 3415
3421 return count; 3416 return count;
3422} 3417}
@@ -3435,7 +3430,7 @@ static ssize_t le_max_key_size_read(struct file *file,
3435 struct hci_dev *hdev = file->private_data; 3430 struct hci_dev *hdev = file->private_data;
3436 char buf[4]; 3431 char buf[4];
3437 3432
3438 snprintf(buf, sizeof(buf), "%2u\n", SMP_DEV(hdev)->max_key_size); 3433 snprintf(buf, sizeof(buf), "%2u\n", hdev->le_max_key_size);
3439 3434
3440 return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); 3435 return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
3441} 3436}
@@ -3457,10 +3452,10 @@ static ssize_t le_max_key_size_write(struct file *file,
3457 sscanf(buf, "%hhu", &key_size); 3452 sscanf(buf, "%hhu", &key_size);
3458 3453
3459 if (key_size > SMP_MAX_ENC_KEY_SIZE || 3454 if (key_size > SMP_MAX_ENC_KEY_SIZE ||
3460 key_size < SMP_DEV(hdev)->min_key_size) 3455 key_size < hdev->le_min_key_size)
3461 return -EINVAL; 3456 return -EINVAL;
3462 3457
3463 SMP_DEV(hdev)->max_key_size = key_size; 3458 hdev->le_max_key_size = key_size;
3464 3459
3465 return count; 3460 return count;
3466} 3461}
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index f4078830ea50..c89c22c49015 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -10,9 +10,11 @@
10#include <linux/etherdevice.h> 10#include <linux/etherdevice.h>
11#include <linux/filter.h> 11#include <linux/filter.h>
12#include <linux/sched/signal.h> 12#include <linux/sched/signal.h>
13#include <net/sock.h>
14#include <net/tcp.h>
13 15
14static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, 16static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx,
15 struct bpf_cgroup_storage *storage) 17 struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
16{ 18{
17 u32 ret; 19 u32 ret;
18 20
@@ -28,13 +30,20 @@ static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx,
28 30
29static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) 31static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time)
30{ 32{
31 struct bpf_cgroup_storage *storage = NULL; 33 struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 };
34 enum bpf_cgroup_storage_type stype;
32 u64 time_start, time_spent = 0; 35 u64 time_start, time_spent = 0;
33 u32 ret = 0, i; 36 u32 ret = 0, i;
34 37
35 storage = bpf_cgroup_storage_alloc(prog); 38 for_each_cgroup_storage_type(stype) {
36 if (IS_ERR(storage)) 39 storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
37 return PTR_ERR(storage); 40 if (IS_ERR(storage[stype])) {
41 storage[stype] = NULL;
42 for_each_cgroup_storage_type(stype)
43 bpf_cgroup_storage_free(storage[stype]);
44 return -ENOMEM;
45 }
46 }
38 47
39 if (!repeat) 48 if (!repeat)
40 repeat = 1; 49 repeat = 1;
@@ -53,7 +62,8 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time)
53 do_div(time_spent, repeat); 62 do_div(time_spent, repeat);
54 *time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent; 63 *time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent;
55 64
56 bpf_cgroup_storage_free(storage); 65 for_each_cgroup_storage_type(stype)
66 bpf_cgroup_storage_free(storage[stype]);
57 67
58 return ret; 68 return ret;
59} 69}
@@ -107,6 +117,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
107 u32 retval, duration; 117 u32 retval, duration;
108 int hh_len = ETH_HLEN; 118 int hh_len = ETH_HLEN;
109 struct sk_buff *skb; 119 struct sk_buff *skb;
120 struct sock *sk;
110 void *data; 121 void *data;
111 int ret; 122 int ret;
112 123
@@ -129,11 +140,21 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
129 break; 140 break;
130 } 141 }
131 142
143 sk = kzalloc(sizeof(struct sock), GFP_USER);
144 if (!sk) {
145 kfree(data);
146 return -ENOMEM;
147 }
148 sock_net_set(sk, current->nsproxy->net_ns);
149 sock_init_data(NULL, sk);
150
132 skb = build_skb(data, 0); 151 skb = build_skb(data, 0);
133 if (!skb) { 152 if (!skb) {
134 kfree(data); 153 kfree(data);
154 kfree(sk);
135 return -ENOMEM; 155 return -ENOMEM;
136 } 156 }
157 skb->sk = sk;
137 158
138 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); 159 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
139 __skb_put(skb, size); 160 __skb_put(skb, size);
@@ -151,6 +172,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
151 172
152 if (pskb_expand_head(skb, nhead, 0, GFP_USER)) { 173 if (pskb_expand_head(skb, nhead, 0, GFP_USER)) {
153 kfree_skb(skb); 174 kfree_skb(skb);
175 kfree(sk);
154 return -ENOMEM; 176 return -ENOMEM;
155 } 177 }
156 } 178 }
@@ -163,6 +185,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
163 size = skb_headlen(skb); 185 size = skb_headlen(skb);
164 ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration); 186 ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration);
165 kfree_skb(skb); 187 kfree_skb(skb);
188 kfree(sk);
166 return ret; 189 return ret;
167} 190}
168 191
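The test_run.c change replaces the single cgroup storage pointer with one allocation per storage type: each type is allocated in a loop, and if any allocation fails the failed slot is reset to NULL and every slot is then passed to bpf_cgroup_storage_free() before returning -ENOMEM. It also attaches a zeroed dummy socket to the test skb so programs can dereference skb->sk. A self-contained sketch of the allocate-all-or-roll-back pattern, with illustrative names rather than the kernel helpers:

/* Sketch: allocate one buffer per storage type; on failure free the
 * slots already filled and report an error, as the loop above does. */
#include <stdlib.h>

enum storage_type { STORAGE_SHARED, STORAGE_PERCPU, __STORAGE_MAX };

static int storage_alloc_all(void *storage[__STORAGE_MAX], size_t size)
{
	int i;

	for (i = 0; i < __STORAGE_MAX; i++) {
		storage[i] = calloc(1, size);
		if (!storage[i]) {
			while (--i >= 0)	/* roll back earlier slots */
				free(storage[i]);
			return -1;		/* -ENOMEM in the kernel */
		}
	}
	return 0;
}

static void storage_free_all(void *storage[__STORAGE_MAX])
{
	int i;

	for (i = 0; i < __STORAGE_MAX; i++)
		free(storage[i]);
}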
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index 94e88f510c5b..7acfc83087d5 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -92,6 +92,7 @@ static int __init load_umh(void)
92 int err; 92 int err;
93 93
94 /* fork usermode process */ 94 /* fork usermode process */
95 info.cmdline = "bpfilter_umh";
95 err = fork_usermode_blob(&bpfilter_umh_start, 96 err = fork_usermode_blob(&bpfilter_umh_start,
96 &bpfilter_umh_end - &bpfilter_umh_start, 97 &bpfilter_umh_end - &bpfilter_umh_start,
97 &info); 98 &info);
diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig
index aa0d3b2f1bb7..3625d6ade45c 100644
--- a/net/bridge/Kconfig
+++ b/net/bridge/Kconfig
@@ -17,7 +17,7 @@ config BRIDGE
17 other third party bridge products. 17 other third party bridge products.
18 18
19 In order to use the Ethernet bridge, you'll need the bridge 19 In order to use the Ethernet bridge, you'll need the bridge
20 configuration tools; see <file:Documentation/networking/bridge.txt> 20 configuration tools; see <file:Documentation/networking/bridge.rst>
21 for location. Please read the Bridge mini-HOWTO for more 21 for location. Please read the Bridge mini-HOWTO for more
22 information. 22 information.
23 23
diff --git a/net/bridge/br.c b/net/bridge/br.c
index b0a0b82e2d91..360ad66c21e9 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -151,7 +151,7 @@ static int br_switchdev_event(struct notifier_block *unused,
151 break; 151 break;
152 } 152 }
153 br_fdb_offloaded_set(br, p, fdb_info->addr, 153 br_fdb_offloaded_set(br, p, fdb_info->addr,
154 fdb_info->vid); 154 fdb_info->vid, true);
155 break; 155 break;
156 case SWITCHDEV_FDB_DEL_TO_BRIDGE: 156 case SWITCHDEV_FDB_DEL_TO_BRIDGE:
157 fdb_info = ptr; 157 fdb_info = ptr;
@@ -163,7 +163,7 @@ static int br_switchdev_event(struct notifier_block *unused,
163 case SWITCHDEV_FDB_OFFLOADED: 163 case SWITCHDEV_FDB_OFFLOADED:
164 fdb_info = ptr; 164 fdb_info = ptr;
165 br_fdb_offloaded_set(br, p, fdb_info->addr, 165 br_fdb_offloaded_set(br, p, fdb_info->addr,
166 fdb_info->vid); 166 fdb_info->vid, fdb_info->offloaded);
167 break; 167 break;
168 } 168 }
169 169
@@ -175,6 +175,22 @@ static struct notifier_block br_switchdev_notifier = {
175 .notifier_call = br_switchdev_event, 175 .notifier_call = br_switchdev_event,
176}; 176};
177 177
178void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on)
179{
180 bool cur = !!br_opt_get(br, opt);
181
182 br_debug(br, "toggle option: %d state: %d -> %d\n",
183 opt, cur, on);
184
185 if (cur == on)
186 return;
187
188 if (on)
189 set_bit(opt, &br->options);
190 else
191 clear_bit(opt, &br->options);
192}
193
178static void __net_exit br_net_exit(struct net *net) 194static void __net_exit br_net_exit(struct net *net)
179{ 195{
180 struct net_device *dev; 196 struct net_device *dev;
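The new br_opt_toggle() (paired with br_opt_get() added to br_private.h later in this diff) converts the bridge's scattered boolean fields into a single options bitmap driven by set_bit()/clear_bit()/test_bit(), with an early return when the requested state already matches. A compact sketch of the same pattern using plain C bit operations; the names are illustrative:

/* Sketch: per-bridge boolean options packed into one bitmap word. */
#include <stdbool.h>

enum bridge_opt { OPT_VLAN_ENABLED, OPT_MULTICAST_ENABLED, OPT_MAX };

struct bridge {
	unsigned long options;		/* one bit per enum bridge_opt */
};

static bool opt_get(const struct bridge *br, enum bridge_opt opt)
{
	return br->options & (1UL << opt);
}

static void opt_toggle(struct bridge *br, enum bridge_opt opt, bool on)
{
	if (opt_get(br, opt) == on)
		return;			/* nothing to change */
	if (on)
		br->options |= 1UL << opt;
	else
		br->options &= ~(1UL << opt);
}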
diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
index 2cf7716254be..6b78e6351719 100644
--- a/net/bridge/br_arp_nd_proxy.c
+++ b/net/bridge/br_arp_nd_proxy.c
@@ -39,7 +39,7 @@ void br_recalculate_neigh_suppress_enabled(struct net_bridge *br)
39 } 39 }
40 } 40 }
41 41
42 br->neigh_suppress_enabled = neigh_suppress; 42 br_opt_toggle(br, BROPT_NEIGH_SUPPRESS_ENABLED, neigh_suppress);
43} 43}
44 44
45#if IS_ENABLED(CONFIG_INET) 45#if IS_ENABLED(CONFIG_INET)
@@ -155,7 +155,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
155 ipv4_is_multicast(tip)) 155 ipv4_is_multicast(tip))
156 return; 156 return;
157 157
158 if (br->neigh_suppress_enabled) { 158 if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) {
159 if (p && (p->flags & BR_NEIGH_SUPPRESS)) 159 if (p && (p->flags & BR_NEIGH_SUPPRESS))
160 return; 160 return;
161 if (ipv4_is_zeronet(sip) || sip == tip) { 161 if (ipv4_is_zeronet(sip) || sip == tip) {
@@ -175,7 +175,8 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
175 return; 175 return;
176 } 176 }
177 177
178 if (br->neigh_suppress_enabled && br_is_local_ip(vlandev, tip)) { 178 if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) &&
179 br_is_local_ip(vlandev, tip)) {
179 /* its our local ip, so don't proxy reply 180 /* its our local ip, so don't proxy reply
180 * and don't forward to neigh suppress ports 181 * and don't forward to neigh suppress ports
181 */ 182 */
@@ -213,7 +214,8 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
213 /* If we have replied or as long as we know the 214 /* If we have replied or as long as we know the
214 * mac, indicate to arp replied 215 * mac, indicate to arp replied
215 */ 216 */
216 if (replied || br->neigh_suppress_enabled) 217 if (replied ||
218 br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED))
217 BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; 219 BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
218 } 220 }
219 221
@@ -311,7 +313,7 @@ static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p,
311 /* Neighbor Advertisement */ 313 /* Neighbor Advertisement */
312 memset(na, 0, sizeof(*na) + na_olen); 314 memset(na, 0, sizeof(*na) + na_olen);
313 na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; 315 na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
314 na->icmph.icmp6_router = 0; /* XXX: should be 1 ? */ 316 na->icmph.icmp6_router = (n->flags & NTF_ROUTER) ? 1 : 0;
315 na->icmph.icmp6_override = 1; 317 na->icmph.icmp6_override = 1;
316 na->icmph.icmp6_solicited = 1; 318 na->icmph.icmp6_solicited = 1;
317 na->target = ns->target; 319 na->target = ns->target;
@@ -460,7 +462,8 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
460 * mac, indicate to NEIGH_SUPPRESS ports that we 462 * mac, indicate to NEIGH_SUPPRESS ports that we
461 * have replied 463 * have replied
462 */ 464 */
463 if (replied || br->neigh_suppress_enabled) 465 if (replied ||
466 br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED))
464 BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; 467 BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
465 } 468 }
466 neigh_release(n); 469 neigh_release(n);
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index e682a668ce57..c6abf927f0c9 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -67,11 +67,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
67 if (IS_ENABLED(CONFIG_INET) && 67 if (IS_ENABLED(CONFIG_INET) &&
68 (eth->h_proto == htons(ETH_P_ARP) || 68 (eth->h_proto == htons(ETH_P_ARP) ||
69 eth->h_proto == htons(ETH_P_RARP)) && 69 eth->h_proto == htons(ETH_P_RARP)) &&
70 br->neigh_suppress_enabled) { 70 br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) {
71 br_do_proxy_suppress_arp(skb, br, vid, NULL); 71 br_do_proxy_suppress_arp(skb, br, vid, NULL);
72 } else if (IS_ENABLED(CONFIG_IPV6) && 72 } else if (IS_ENABLED(CONFIG_IPV6) &&
73 skb->protocol == htons(ETH_P_IPV6) && 73 skb->protocol == htons(ETH_P_IPV6) &&
74 br->neigh_suppress_enabled && 74 br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) &&
75 pskb_may_pull(skb, sizeof(struct ipv6hdr) + 75 pskb_may_pull(skb, sizeof(struct ipv6hdr) +
76 sizeof(struct nd_msg)) && 76 sizeof(struct nd_msg)) &&
77 ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { 77 ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
@@ -228,7 +228,7 @@ static int br_change_mtu(struct net_device *dev, int new_mtu)
228 dev->mtu = new_mtu; 228 dev->mtu = new_mtu;
229 229
230 /* this flag will be cleared if the MTU was automatically adjusted */ 230 /* this flag will be cleared if the MTU was automatically adjusted */
231 br->mtu_set_by_user = true; 231 br_opt_toggle(br, BROPT_MTU_SET_BY_USER, true);
232#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 232#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
233 /* remember the MTU in the rtable for PMTU */ 233 /* remember the MTU in the rtable for PMTU */
234 dst_metric_set(&br->fake_rtable.dst, RTAX_MTU, new_mtu); 234 dst_metric_set(&br->fake_rtable.dst, RTAX_MTU, new_mtu);
@@ -344,7 +344,7 @@ void br_netpoll_disable(struct net_bridge_port *p)
344 344
345 p->np = NULL; 345 p->np = NULL;
346 346
347 __netpoll_free_async(np); 347 __netpoll_free(np);
348} 348}
349 349
350#endif 350#endif
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 502f66349530..e56ba3912a90 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -504,6 +504,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br,
504 fdb->added_by_user = 0; 504 fdb->added_by_user = 0;
505 fdb->added_by_external_learn = 0; 505 fdb->added_by_external_learn = 0;
506 fdb->offloaded = 0; 506 fdb->offloaded = 0;
507 fdb->is_sticky = 0;
507 fdb->updated = fdb->used = jiffies; 508 fdb->updated = fdb->used = jiffies;
508 if (rhashtable_lookup_insert_fast(&br->fdb_hash_tbl, 509 if (rhashtable_lookup_insert_fast(&br->fdb_hash_tbl,
509 &fdb->rhnode, 510 &fdb->rhnode,
@@ -584,7 +585,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
584 unsigned long now = jiffies; 585 unsigned long now = jiffies;
585 586
586 /* fastpath: update of existing entry */ 587 /* fastpath: update of existing entry */
587 if (unlikely(source != fdb->dst)) { 588 if (unlikely(source != fdb->dst && !fdb->is_sticky)) {
588 fdb->dst = source; 589 fdb->dst = source;
589 fdb_modified = true; 590 fdb_modified = true;
590 /* Take over HW learned entry */ 591 /* Take over HW learned entry */
@@ -656,6 +657,8 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
656 ndm->ndm_flags |= NTF_OFFLOADED; 657 ndm->ndm_flags |= NTF_OFFLOADED;
657 if (fdb->added_by_external_learn) 658 if (fdb->added_by_external_learn)
658 ndm->ndm_flags |= NTF_EXT_LEARNED; 659 ndm->ndm_flags |= NTF_EXT_LEARNED;
660 if (fdb->is_sticky)
661 ndm->ndm_flags |= NTF_STICKY;
659 662
660 if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr)) 663 if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr))
661 goto nla_put_failure; 664 goto nla_put_failure;
@@ -772,8 +775,10 @@ skip:
772 775
773/* Update (create or replace) forwarding database entry */ 776/* Update (create or replace) forwarding database entry */
774static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, 777static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
775 const __u8 *addr, __u16 state, __u16 flags, __u16 vid) 778 const u8 *addr, u16 state, u16 flags, u16 vid,
779 u8 ndm_flags)
776{ 780{
781 u8 is_sticky = !!(ndm_flags & NTF_STICKY);
777 struct net_bridge_fdb_entry *fdb; 782 struct net_bridge_fdb_entry *fdb;
778 bool modified = false; 783 bool modified = false;
779 784
@@ -789,6 +794,9 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
789 return -EINVAL; 794 return -EINVAL;
790 } 795 }
791 796
797 if (is_sticky && (state & NUD_PERMANENT))
798 return -EINVAL;
799
792 fdb = br_fdb_find(br, addr, vid); 800 fdb = br_fdb_find(br, addr, vid);
793 if (fdb == NULL) { 801 if (fdb == NULL) {
794 if (!(flags & NLM_F_CREATE)) 802 if (!(flags & NLM_F_CREATE))
@@ -832,6 +840,12 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
832 840
833 modified = true; 841 modified = true;
834 } 842 }
843
844 if (is_sticky != fdb->is_sticky) {
845 fdb->is_sticky = is_sticky;
846 modified = true;
847 }
848
835 fdb->added_by_user = 1; 849 fdb->added_by_user = 1;
836 850
837 fdb->used = jiffies; 851 fdb->used = jiffies;
@@ -865,7 +879,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
865 } else { 879 } else {
866 spin_lock_bh(&br->hash_lock); 880 spin_lock_bh(&br->hash_lock);
867 err = fdb_add_entry(br, p, addr, ndm->ndm_state, 881 err = fdb_add_entry(br, p, addr, ndm->ndm_state,
868 nlh_flags, vid); 882 nlh_flags, vid, ndm->ndm_flags);
869 spin_unlock_bh(&br->hash_lock); 883 spin_unlock_bh(&br->hash_lock);
870 } 884 }
871 885
@@ -1138,7 +1152,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
1138} 1152}
1139 1153
1140void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, 1154void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
1141 const unsigned char *addr, u16 vid) 1155 const unsigned char *addr, u16 vid, bool offloaded)
1142{ 1156{
1143 struct net_bridge_fdb_entry *fdb; 1157 struct net_bridge_fdb_entry *fdb;
1144 1158
@@ -1146,7 +1160,7 @@ void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
1146 1160
1147 fdb = br_fdb_find(br, addr, vid); 1161 fdb = br_fdb_find(br, addr, vid);
1148 if (fdb) 1162 if (fdb)
1149 fdb->offloaded = 1; 1163 fdb->offloaded = offloaded;
1150 1164
1151 spin_unlock_bh(&br->hash_lock); 1165 spin_unlock_bh(&br->hash_lock);
1152} 1166}
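The br_fdb.c hunks add NTF_STICKY support: a sticky entry keeps its current port even when the address is later seen arriving on a different one, the flag is reported back via ndm_flags, and combining sticky with NUD_PERMANENT is rejected. A small sketch of the learning rule, under assumed names:

/* Sketch: FDB learning that honours a "sticky" flag - the entry does
 * not migrate to the port the address was last seen on. */
#include <stdbool.h>

struct fdb_entry {
	int port_no;
	bool is_sticky;
};

/* Returns true if the entry moved to the ingress port. */
static bool fdb_learn(struct fdb_entry *f, int ingress_port)
{
	if (f->port_no != ingress_port && !f->is_sticky) {
		f->port_no = ingress_port;
		return true;
	}
	return false;
}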
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 0363f1bdc401..9b46d2dc4c22 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -394,8 +394,7 @@ static int find_portno(struct net_bridge *br)
394 struct net_bridge_port *p; 394 struct net_bridge_port *p;
395 unsigned long *inuse; 395 unsigned long *inuse;
396 396
397 inuse = kcalloc(BITS_TO_LONGS(BR_MAX_PORTS), sizeof(unsigned long), 397 inuse = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL);
398 GFP_KERNEL);
399 if (!inuse) 398 if (!inuse)
400 return -ENOMEM; 399 return -ENOMEM;
401 400
@@ -404,7 +403,7 @@ static int find_portno(struct net_bridge *br)
404 set_bit(p->port_no, inuse); 403 set_bit(p->port_no, inuse);
405 } 404 }
406 index = find_first_zero_bit(inuse, BR_MAX_PORTS); 405 index = find_first_zero_bit(inuse, BR_MAX_PORTS);
407 kfree(inuse); 406 bitmap_free(inuse);
408 407
409 return (index >= BR_MAX_PORTS) ? -EXFULL : index; 408 return (index >= BR_MAX_PORTS) ? -EXFULL : index;
410} 409}
@@ -509,14 +508,14 @@ void br_mtu_auto_adjust(struct net_bridge *br)
509 ASSERT_RTNL(); 508 ASSERT_RTNL();
510 509
511 /* if the bridge MTU was manually configured don't mess with it */ 510 /* if the bridge MTU was manually configured don't mess with it */
512 if (br->mtu_set_by_user) 511 if (br_opt_get(br, BROPT_MTU_SET_BY_USER))
513 return; 512 return;
514 513
515 /* change to the minimum MTU and clear the flag which was set by 514 /* change to the minimum MTU and clear the flag which was set by
516 * the bridge ndo_change_mtu callback 515 * the bridge ndo_change_mtu callback
517 */ 516 */
518 dev_set_mtu(br->dev, br_mtu_min(br)); 517 dev_set_mtu(br->dev, br_mtu_min(br));
519 br->mtu_set_by_user = false; 518 br_opt_toggle(br, BROPT_MTU_SET_BY_USER, false);
520} 519}
521 520
522static void br_set_gso_limits(struct net_bridge *br) 521static void br_set_gso_limits(struct net_bridge *br)
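find_portno() now uses bitmap_zalloc()/bitmap_free(), but the allocation strategy is unchanged: mark every port number currently in use in a bitmap and hand out the first clear bit. A tiny fixed-size sketch of that allocator, illustrative only; the kernel uses find_first_zero_bit() and returns -EXFULL when full:

/* Sketch: pick the lowest free port number from a usage bitmap.
 * "inuse" must provide at least MAX_PORTS bits. */
#include <limits.h>

#define MAX_PORTS	1024
#define LONG_BITS	(sizeof(unsigned long) * CHAR_BIT)

static int find_free_portno(const unsigned long *inuse)
{
	int i;

	for (i = 0; i < MAX_PORTS; i++)
		if (!(inuse[i / LONG_BITS] & (1UL << (i % LONG_BITS))))
			return i;
	return -1;	/* all port numbers taken */
}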
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 72074276c088..3ddca11f44c2 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -122,7 +122,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
122 br_do_proxy_suppress_arp(skb, br, vid, p); 122 br_do_proxy_suppress_arp(skb, br, vid, p);
123 } else if (IS_ENABLED(CONFIG_IPV6) && 123 } else if (IS_ENABLED(CONFIG_IPV6) &&
124 skb->protocol == htons(ETH_P_IPV6) && 124 skb->protocol == htons(ETH_P_IPV6) &&
125 br->neigh_suppress_enabled && 125 br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) &&
126 pskb_may_pull(skb, sizeof(struct ipv6hdr) + 126 pskb_may_pull(skb, sizeof(struct ipv6hdr) +
127 sizeof(struct nd_msg)) && 127 sizeof(struct nd_msg)) &&
128 ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { 128 ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 6d9f48bd374a..a7ea2d431714 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -84,7 +84,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
84 int i, err = 0; 84 int i, err = 0;
85 int idx = 0, s_idx = cb->args[1]; 85 int idx = 0, s_idx = cb->args[1];
86 86
87 if (br->multicast_disabled) 87 if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
88 return 0; 88 return 0;
89 89
90 mdb = rcu_dereference(br->mdb); 90 mdb = rcu_dereference(br->mdb);
@@ -162,6 +162,29 @@ out:
162 return err; 162 return err;
163} 163}
164 164
165static int br_mdb_valid_dump_req(const struct nlmsghdr *nlh,
166 struct netlink_ext_ack *extack)
167{
168 struct br_port_msg *bpm;
169
170 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*bpm))) {
171 NL_SET_ERR_MSG_MOD(extack, "Invalid header for mdb dump request");
172 return -EINVAL;
173 }
174
175 bpm = nlmsg_data(nlh);
176 if (bpm->ifindex) {
177 NL_SET_ERR_MSG_MOD(extack, "Filtering by device index is not supported for mdb dump request");
178 return -EINVAL;
179 }
180 if (nlmsg_attrlen(nlh, sizeof(*bpm))) {
181 NL_SET_ERR_MSG(extack, "Invalid data after header in mdb dump request");
182 return -EINVAL;
183 }
184
185 return 0;
186}
187
165static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) 188static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
166{ 189{
167 struct net_device *dev; 190 struct net_device *dev;
@@ -169,6 +192,13 @@ static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
169 struct nlmsghdr *nlh = NULL; 192 struct nlmsghdr *nlh = NULL;
170 int idx = 0, s_idx; 193 int idx = 0, s_idx;
171 194
195 if (cb->strict_check) {
196 int err = br_mdb_valid_dump_req(cb->nlh, cb->extack);
197
198 if (err < 0)
199 return err;
200 }
201
172 s_idx = cb->args[0]; 202 s_idx = cb->args[0];
173 203
174 rcu_read_lock(); 204 rcu_read_lock();
@@ -598,7 +628,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
598 struct net_bridge_port *p; 628 struct net_bridge_port *p;
599 int ret; 629 int ret;
600 630
601 if (!netif_running(br->dev) || br->multicast_disabled) 631 if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED))
602 return -EINVAL; 632 return -EINVAL;
603 633
604 dev = __dev_get_by_index(net, entry->ifindex); 634 dev = __dev_get_by_index(net, entry->ifindex);
@@ -673,7 +703,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
673 struct br_ip ip; 703 struct br_ip ip;
674 int err = -EINVAL; 704 int err = -EINVAL;
675 705
676 if (!netif_running(br->dev) || br->multicast_disabled) 706 if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED))
677 return -EINVAL; 707 return -EINVAL;
678 708
679 __mdb_entry_to_br_ip(entry, &ip); 709 __mdb_entry_to_br_ip(entry, &ip);
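br_mdb_valid_dump_req() implements strict checking for MDB dump requests: the netlink header must carry exactly a struct br_port_msg, filtering by ifindex is not supported, and trailing attributes are rejected. A rough userspace-side sketch of the same checks on a raw request, using the uapi netlink macros; the kernel uses nlmsg_msg_size()/nlmsg_attrlen() and extack messages instead:

/* Sketch: validate an RTM_GETMDB dump request the way strict
 * checking does - right-sized header, no filters, no extra attrs. */
#include <linux/netlink.h>
#include <linux/if_bridge.h>	/* struct br_port_msg */

static int mdb_dump_req_valid(const struct nlmsghdr *nlh)
{
	const struct br_port_msg *bpm;

	if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*bpm)))
		return -1;	/* header too short */

	bpm = NLMSG_DATA(nlh);
	if (bpm->ifindex)
		return -1;	/* per-device filtering unsupported */

	if (nlh->nlmsg_len > NLMSG_LENGTH(sizeof(*bpm)))
		return -1;	/* unexpected data after the header */

	return 0;
}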
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 20ed7adcf1cc..41cdafbf2ebe 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -158,7 +158,7 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
158 struct net_bridge_mdb_htable *mdb = rcu_dereference(br->mdb); 158 struct net_bridge_mdb_htable *mdb = rcu_dereference(br->mdb);
159 struct br_ip ip; 159 struct br_ip ip;
160 160
161 if (br->multicast_disabled) 161 if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
162 return NULL; 162 return NULL;
163 163
164 if (BR_INPUT_SKB_CB(skb)->igmp) 164 if (BR_INPUT_SKB_CB(skb)->igmp)
@@ -411,7 +411,7 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
411 iph->frag_off = htons(IP_DF); 411 iph->frag_off = htons(IP_DF);
412 iph->ttl = 1; 412 iph->ttl = 1;
413 iph->protocol = IPPROTO_IGMP; 413 iph->protocol = IPPROTO_IGMP;
414 iph->saddr = br->multicast_query_use_ifaddr ? 414 iph->saddr = br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR) ?
415 inet_select_addr(br->dev, 0, RT_SCOPE_LINK) : 0; 415 inet_select_addr(br->dev, 0, RT_SCOPE_LINK) : 0;
416 iph->daddr = htonl(INADDR_ALLHOSTS_GROUP); 416 iph->daddr = htonl(INADDR_ALLHOSTS_GROUP);
417 ((u8 *)&iph[1])[0] = IPOPT_RA; 417 ((u8 *)&iph[1])[0] = IPOPT_RA;
@@ -503,11 +503,11 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
503 if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0, 503 if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0,
504 &ip6h->saddr)) { 504 &ip6h->saddr)) {
505 kfree_skb(skb); 505 kfree_skb(skb);
506 br->has_ipv6_addr = 0; 506 br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, false);
507 return NULL; 507 return NULL;
508 } 508 }
509 509
510 br->has_ipv6_addr = 1; 510 br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true);
511 ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest); 511 ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest);
512 512
513 hopopt = (u8 *)(ip6h + 1); 513 hopopt = (u8 *)(ip6h + 1);
@@ -628,7 +628,7 @@ static struct net_bridge_mdb_entry *br_multicast_get_group(
628 port ? port->dev->name : br->dev->name); 628 port ? port->dev->name : br->dev->name);
629 err = -E2BIG; 629 err = -E2BIG;
630disable: 630disable:
631 br->multicast_disabled = 1; 631 br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false);
632 goto err; 632 goto err;
633 } 633 }
634 } 634 }
@@ -894,7 +894,7 @@ static void br_multicast_querier_expired(struct net_bridge *br,
894 struct bridge_mcast_own_query *query) 894 struct bridge_mcast_own_query *query)
895{ 895{
896 spin_lock(&br->multicast_lock); 896 spin_lock(&br->multicast_lock);
897 if (!netif_running(br->dev) || br->multicast_disabled) 897 if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED))
898 goto out; 898 goto out;
899 899
900 br_multicast_start_querier(br, query); 900 br_multicast_start_querier(br, query);
@@ -965,8 +965,9 @@ static void br_multicast_send_query(struct net_bridge *br,
965 struct br_ip br_group; 965 struct br_ip br_group;
966 unsigned long time; 966 unsigned long time;
967 967
968 if (!netif_running(br->dev) || br->multicast_disabled || 968 if (!netif_running(br->dev) ||
969 !br->multicast_querier) 969 !br_opt_get(br, BROPT_MULTICAST_ENABLED) ||
970 !br_opt_get(br, BROPT_MULTICAST_QUERIER))
970 return; 971 return;
971 972
972 memset(&br_group.u, 0, sizeof(br_group.u)); 973 memset(&br_group.u, 0, sizeof(br_group.u));
@@ -1036,7 +1037,7 @@ static void br_mc_disabled_update(struct net_device *dev, bool value)
1036 .orig_dev = dev, 1037 .orig_dev = dev,
1037 .id = SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED, 1038 .id = SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
1038 .flags = SWITCHDEV_F_DEFER, 1039 .flags = SWITCHDEV_F_DEFER,
1039 .u.mc_disabled = value, 1040 .u.mc_disabled = !value,
1040 }; 1041 };
1041 1042
1042 switchdev_port_attr_set(dev, &attr); 1043 switchdev_port_attr_set(dev, &attr);
@@ -1054,7 +1055,8 @@ int br_multicast_add_port(struct net_bridge_port *port)
1054 timer_setup(&port->ip6_own_query.timer, 1055 timer_setup(&port->ip6_own_query.timer,
1055 br_ip6_multicast_port_query_expired, 0); 1056 br_ip6_multicast_port_query_expired, 0);
1056#endif 1057#endif
1057 br_mc_disabled_update(port->dev, port->br->multicast_disabled); 1058 br_mc_disabled_update(port->dev,
1059 br_opt_get(port->br, BROPT_MULTICAST_ENABLED));
1058 1060
1059 port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats); 1061 port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats);
1060 if (!port->mcast_stats) 1062 if (!port->mcast_stats)
@@ -1091,7 +1093,7 @@ static void __br_multicast_enable_port(struct net_bridge_port *port)
1091{ 1093{
1092 struct net_bridge *br = port->br; 1094 struct net_bridge *br = port->br;
1093 1095
1094 if (br->multicast_disabled || !netif_running(br->dev)) 1096 if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) || !netif_running(br->dev))
1095 return; 1097 return;
1096 1098
1097 br_multicast_enable(&port->ip4_own_query); 1099 br_multicast_enable(&port->ip4_own_query);
@@ -1420,7 +1422,15 @@ static void br_multicast_query_received(struct net_bridge *br,
1420 return; 1422 return;
1421 1423
1422 br_multicast_update_query_timer(br, query, max_delay); 1424 br_multicast_update_query_timer(br, query, max_delay);
1423 br_multicast_mark_router(br, port); 1425
1426 /* Based on RFC4541, section 2.1.1 IGMP Forwarding Rules,
1427 * the arrival port for IGMP Queries where the source address
1428 * is 0.0.0.0 should not be added to router port list.
1429 */
1430 if ((saddr->proto == htons(ETH_P_IP) && saddr->u.ip4) ||
1431 (saddr->proto == htons(ETH_P_IPV6) &&
1432 !ipv6_addr_any(&saddr->u.ip6)))
1433 br_multicast_mark_router(br, port);
1424} 1434}
1425 1435
1426static void br_ip4_multicast_query(struct net_bridge *br, 1436static void br_ip4_multicast_query(struct net_bridge *br,
@@ -1634,7 +1644,7 @@ br_multicast_leave_group(struct net_bridge *br,
1634 if (timer_pending(&other_query->timer)) 1644 if (timer_pending(&other_query->timer))
1635 goto out; 1645 goto out;
1636 1646
1637 if (br->multicast_querier) { 1647 if (br_opt_get(br, BROPT_MULTICAST_QUERIER)) {
1638 __br_multicast_send_query(br, port, &mp->addr); 1648 __br_multicast_send_query(br, port, &mp->addr);
1639 1649
1640 time = jiffies + br->multicast_last_member_count * 1650 time = jiffies + br->multicast_last_member_count *
@@ -1746,7 +1756,7 @@ static void br_multicast_err_count(const struct net_bridge *br,
1746 struct bridge_mcast_stats __percpu *stats; 1756 struct bridge_mcast_stats __percpu *stats;
1747 struct bridge_mcast_stats *pstats; 1757 struct bridge_mcast_stats *pstats;
1748 1758
1749 if (!br->multicast_stats_enabled) 1759 if (!br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED))
1750 return; 1760 return;
1751 1761
1752 if (p) 1762 if (p)
@@ -1904,7 +1914,7 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
1904 BR_INPUT_SKB_CB(skb)->igmp = 0; 1914 BR_INPUT_SKB_CB(skb)->igmp = 0;
1905 BR_INPUT_SKB_CB(skb)->mrouters_only = 0; 1915 BR_INPUT_SKB_CB(skb)->mrouters_only = 0;
1906 1916
1907 if (br->multicast_disabled) 1917 if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
1908 return 0; 1918 return 0;
1909 1919
1910 switch (skb->protocol) { 1920 switch (skb->protocol) {
@@ -1956,8 +1966,6 @@ void br_multicast_init(struct net_bridge *br)
1956 br->hash_max = 512; 1966 br->hash_max = 512;
1957 1967
1958 br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; 1968 br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
1959 br->multicast_querier = 0;
1960 br->multicast_query_use_ifaddr = 0;
1961 br->multicast_last_member_count = 2; 1969 br->multicast_last_member_count = 2;
1962 br->multicast_startup_query_count = 2; 1970 br->multicast_startup_query_count = 2;
1963 1971
@@ -1976,7 +1984,8 @@ void br_multicast_init(struct net_bridge *br)
1976 br->ip6_other_query.delay_time = 0; 1984 br->ip6_other_query.delay_time = 0;
1977 br->ip6_querier.port = NULL; 1985 br->ip6_querier.port = NULL;
1978#endif 1986#endif
1979 br->has_ipv6_addr = 1; 1987 br_opt_toggle(br, BROPT_MULTICAST_ENABLED, true);
1988 br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true);
1980 1989
1981 spin_lock_init(&br->multicast_lock); 1990 spin_lock_init(&br->multicast_lock);
1982 timer_setup(&br->multicast_router_timer, 1991 timer_setup(&br->multicast_router_timer,
@@ -1998,7 +2007,7 @@ static void __br_multicast_open(struct net_bridge *br,
1998{ 2007{
1999 query->startup_sent = 0; 2008 query->startup_sent = 0;
2000 2009
2001 if (br->multicast_disabled) 2010 if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
2002 return; 2011 return;
2003 2012
2004 mod_timer(&query->timer, jiffies); 2013 mod_timer(&query->timer, jiffies);
@@ -2173,12 +2182,12 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val)
2173 int err = 0; 2182 int err = 0;
2174 2183
2175 spin_lock_bh(&br->multicast_lock); 2184 spin_lock_bh(&br->multicast_lock);
2176 if (br->multicast_disabled == !val) 2185 if (!!br_opt_get(br, BROPT_MULTICAST_ENABLED) == !!val)
2177 goto unlock; 2186 goto unlock;
2178 2187
2179 br_mc_disabled_update(br->dev, !val); 2188 br_mc_disabled_update(br->dev, val);
2180 br->multicast_disabled = !val; 2189 br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val);
2181 if (br->multicast_disabled) 2190 if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
2182 goto unlock; 2191 goto unlock;
2183 2192
2184 if (!netif_running(br->dev)) 2193 if (!netif_running(br->dev))
@@ -2189,7 +2198,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val)
2189 if (mdb->old) { 2198 if (mdb->old) {
2190 err = -EEXIST; 2199 err = -EEXIST;
2191rollback: 2200rollback:
2192 br->multicast_disabled = !!val; 2201 br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false);
2193 goto unlock; 2202 goto unlock;
2194 } 2203 }
2195 2204
@@ -2213,7 +2222,7 @@ bool br_multicast_enabled(const struct net_device *dev)
2213{ 2222{
2214 struct net_bridge *br = netdev_priv(dev); 2223 struct net_bridge *br = netdev_priv(dev);
2215 2224
2216 return !br->multicast_disabled; 2225 return !!br_opt_get(br, BROPT_MULTICAST_ENABLED);
2217} 2226}
2218EXPORT_SYMBOL_GPL(br_multicast_enabled); 2227EXPORT_SYMBOL_GPL(br_multicast_enabled);
2219 2228
@@ -2236,10 +2245,10 @@ int br_multicast_set_querier(struct net_bridge *br, unsigned long val)
2236 val = !!val; 2245 val = !!val;
2237 2246
2238 spin_lock_bh(&br->multicast_lock); 2247 spin_lock_bh(&br->multicast_lock);
2239 if (br->multicast_querier == val) 2248 if (br_opt_get(br, BROPT_MULTICAST_QUERIER) == val)
2240 goto unlock; 2249 goto unlock;
2241 2250
2242 br->multicast_querier = val; 2251 br_opt_toggle(br, BROPT_MULTICAST_QUERIER, !!val);
2243 if (!val) 2252 if (!val)
2244 goto unlock; 2253 goto unlock;
2245 2254
@@ -2560,7 +2569,7 @@ void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p,
2560 struct bridge_mcast_stats __percpu *stats; 2569 struct bridge_mcast_stats __percpu *stats;
2561 2570
2562 /* if multicast_disabled is true then igmp type can't be set */ 2571 /* if multicast_disabled is true then igmp type can't be set */
2563 if (!type || !br->multicast_stats_enabled) 2572 if (!type || !br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED))
2564 return; 2573 return;
2565 2574
2566 if (p) 2575 if (p)
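Among the br_multicast.c changes, the query handling hunk encodes RFC 4541, section 2.1.1: a query whose source address is 0.0.0.0 (or :: on the MLD side) must not cause the arrival port to be added to the router port list. A minimal sketch of that decision using standard socket address types; the helper name is assumed:

/* Sketch: may a multicast query's source address elect the arrival
 * port as a router port?  Unspecified sources (0.0.0.0 / ::) may not. */
#include <stdbool.h>
#include <sys/socket.h>
#include <netinet/in.h>

static bool query_source_can_mark_router(int family, const void *saddr)
{
	if (family == AF_INET)
		return ((const struct in_addr *)saddr)->s_addr != 0;
	if (family == AF_INET6)
		return !IN6_IS_ADDR_UNSPECIFIED((const struct in6_addr *)saddr);
	return false;
}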
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 37278dc280eb..b1b5e8516724 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -487,14 +487,15 @@ static unsigned int br_nf_pre_routing(void *priv,
487 br = p->br; 487 br = p->br;
488 488
489 if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) { 489 if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) {
490 if (!brnf_call_ip6tables && !br->nf_call_ip6tables) 490 if (!brnf_call_ip6tables &&
491 !br_opt_get(br, BROPT_NF_CALL_IP6TABLES))
491 return NF_ACCEPT; 492 return NF_ACCEPT;
492 493
493 nf_bridge_pull_encap_header_rcsum(skb); 494 nf_bridge_pull_encap_header_rcsum(skb);
494 return br_nf_pre_routing_ipv6(priv, skb, state); 495 return br_nf_pre_routing_ipv6(priv, skb, state);
495 } 496 }
496 497
497 if (!brnf_call_iptables && !br->nf_call_iptables) 498 if (!brnf_call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
498 return NF_ACCEPT; 499 return NF_ACCEPT;
499 500
500 if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb)) 501 if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb))
@@ -636,7 +637,7 @@ static unsigned int br_nf_forward_arp(void *priv,
636 return NF_ACCEPT; 637 return NF_ACCEPT;
637 br = p->br; 638 br = p->br;
638 639
639 if (!brnf_call_arptables && !br->nf_call_arptables) 640 if (!brnf_call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES))
640 return NF_ACCEPT; 641 return NF_ACCEPT;
641 642
642 if (!IS_ARP(skb)) { 643 if (!IS_ARP(skb)) {
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index ec2b58a09f76..3345f1984542 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1034,6 +1034,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
1034 [IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 }, 1034 [IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 },
1035 [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, 1035 [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 },
1036 [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 }, 1036 [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 },
1037 [IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 },
1037}; 1038};
1038 1039
1039static int br_changelink(struct net_device *brdev, struct nlattr *tb[], 1040static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
@@ -1114,6 +1115,14 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
1114 if (err) 1115 if (err)
1115 return err; 1116 return err;
1116 } 1117 }
1118
1119 if (data[IFLA_BR_VLAN_STATS_PER_PORT]) {
1120 __u8 per_port = nla_get_u8(data[IFLA_BR_VLAN_STATS_PER_PORT]);
1121
1122 err = br_vlan_set_stats_per_port(br, per_port);
1123 if (err)
1124 return err;
1125 }
1117#endif 1126#endif
1118 1127
1119 if (data[IFLA_BR_GROUP_FWD_MASK]) { 1128 if (data[IFLA_BR_GROUP_FWD_MASK]) {
@@ -1139,7 +1148,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
1139 spin_lock_bh(&br->lock); 1148 spin_lock_bh(&br->lock);
1140 memcpy(br->group_addr, new_addr, sizeof(br->group_addr)); 1149 memcpy(br->group_addr, new_addr, sizeof(br->group_addr));
1141 spin_unlock_bh(&br->lock); 1150 spin_unlock_bh(&br->lock);
1142 br->group_addr_set = true; 1151 br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true);
1143 br_recalculate_fwd_mask(br); 1152 br_recalculate_fwd_mask(br);
1144 } 1153 }
1145 1154
@@ -1167,7 +1176,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
1167 u8 val; 1176 u8 val;
1168 1177
1169 val = nla_get_u8(data[IFLA_BR_MCAST_QUERY_USE_IFADDR]); 1178 val = nla_get_u8(data[IFLA_BR_MCAST_QUERY_USE_IFADDR]);
1170 br->multicast_query_use_ifaddr = !!val; 1179 br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val);
1171 } 1180 }
1172 1181
1173 if (data[IFLA_BR_MCAST_QUERIER]) { 1182 if (data[IFLA_BR_MCAST_QUERIER]) {
@@ -1244,7 +1253,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
1244 __u8 mcast_stats; 1253 __u8 mcast_stats;
1245 1254
1246 mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]); 1255 mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]);
1247 br->multicast_stats_enabled = !!mcast_stats; 1256 br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!mcast_stats);
1248 } 1257 }
1249 1258
1250 if (data[IFLA_BR_MCAST_IGMP_VERSION]) { 1259 if (data[IFLA_BR_MCAST_IGMP_VERSION]) {
@@ -1271,19 +1280,19 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
1271 if (data[IFLA_BR_NF_CALL_IPTABLES]) { 1280 if (data[IFLA_BR_NF_CALL_IPTABLES]) {
1272 u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IPTABLES]); 1281 u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IPTABLES]);
1273 1282
1274 br->nf_call_iptables = val ? true : false; 1283 br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val);
1275 } 1284 }
1276 1285
1277 if (data[IFLA_BR_NF_CALL_IP6TABLES]) { 1286 if (data[IFLA_BR_NF_CALL_IP6TABLES]) {
1278 u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IP6TABLES]); 1287 u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IP6TABLES]);
1279 1288
1280 br->nf_call_ip6tables = val ? true : false; 1289 br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val);
1281 } 1290 }
1282 1291
1283 if (data[IFLA_BR_NF_CALL_ARPTABLES]) { 1292 if (data[IFLA_BR_NF_CALL_ARPTABLES]) {
1284 u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_ARPTABLES]); 1293 u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_ARPTABLES]);
1285 1294
1286 br->nf_call_arptables = val ? true : false; 1295 br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val);
1287 } 1296 }
1288#endif 1297#endif
1289 1298
@@ -1327,6 +1336,7 @@ static size_t br_get_size(const struct net_device *brdev)
1327 nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */ 1336 nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */
1328 nla_total_size(sizeof(u16)) + /* IFLA_BR_VLAN_DEFAULT_PVID */ 1337 nla_total_size(sizeof(u16)) + /* IFLA_BR_VLAN_DEFAULT_PVID */
1329 nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_ENABLED */ 1338 nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_ENABLED */
1339 nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_PER_PORT */
1330#endif 1340#endif
1331 nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */ 1341 nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */
1332 nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */ 1342 nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */
@@ -1416,17 +1426,22 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
1416#ifdef CONFIG_BRIDGE_VLAN_FILTERING 1426#ifdef CONFIG_BRIDGE_VLAN_FILTERING
1417 if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) || 1427 if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) ||
1418 nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid) || 1428 nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid) ||
1419 nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED, br->vlan_stats_enabled)) 1429 nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED,
1430 br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) ||
1431 nla_put_u8(skb, IFLA_BR_VLAN_STATS_PER_PORT,
1432 br_opt_get(br, IFLA_BR_VLAN_STATS_PER_PORT)))
1420 return -EMSGSIZE; 1433 return -EMSGSIZE;
1421#endif 1434#endif
1422#ifdef CONFIG_BRIDGE_IGMP_SNOOPING 1435#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
1423 if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_router) || 1436 if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_router) ||
1424 nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, !br->multicast_disabled) || 1437 nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING,
1438 br_opt_get(br, BROPT_MULTICAST_ENABLED)) ||
1425 nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR, 1439 nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR,
1426 br->multicast_query_use_ifaddr) || 1440 br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)) ||
1427 nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_querier) || 1441 nla_put_u8(skb, IFLA_BR_MCAST_QUERIER,
1442 br_opt_get(br, BROPT_MULTICAST_QUERIER)) ||
1428 nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED, 1443 nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED,
1429 br->multicast_stats_enabled) || 1444 br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) ||
1430 nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, 1445 nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY,
1431 br->hash_elasticity) || 1446 br->hash_elasticity) ||
1432 nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) || 1447 nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) ||
@@ -1469,11 +1484,11 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
1469#endif 1484#endif
1470#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 1485#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
1471 if (nla_put_u8(skb, IFLA_BR_NF_CALL_IPTABLES, 1486 if (nla_put_u8(skb, IFLA_BR_NF_CALL_IPTABLES,
1472 br->nf_call_iptables ? 1 : 0) || 1487 br_opt_get(br, BROPT_NF_CALL_IPTABLES) ? 1 : 0) ||
1473 nla_put_u8(skb, IFLA_BR_NF_CALL_IP6TABLES, 1488 nla_put_u8(skb, IFLA_BR_NF_CALL_IP6TABLES,
1474 br->nf_call_ip6tables ? 1 : 0) || 1489 br_opt_get(br, BROPT_NF_CALL_IP6TABLES) ? 1 : 0) ||
1475 nla_put_u8(skb, IFLA_BR_NF_CALL_ARPTABLES, 1490 nla_put_u8(skb, IFLA_BR_NF_CALL_ARPTABLES,
1476 br->nf_call_arptables ? 1 : 0)) 1491 br_opt_get(br, BROPT_NF_CALL_ARPTABLES) ? 1 : 0))
1477 return -EMSGSIZE; 1492 return -EMSGSIZE;
1478#endif 1493#endif
1479 1494
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 11ed2029985f..2920e06a5403 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -54,14 +54,12 @@ typedef struct bridge_id bridge_id;
54typedef struct mac_addr mac_addr; 54typedef struct mac_addr mac_addr;
55typedef __u16 port_id; 55typedef __u16 port_id;
56 56
57struct bridge_id 57struct bridge_id {
58{
59 unsigned char prio[2]; 58 unsigned char prio[2];
60 unsigned char addr[ETH_ALEN]; 59 unsigned char addr[ETH_ALEN];
61}; 60};
62 61
63struct mac_addr 62struct mac_addr {
64{
65 unsigned char addr[ETH_ALEN]; 63 unsigned char addr[ETH_ALEN];
66}; 64};
67 65
@@ -181,6 +179,7 @@ struct net_bridge_fdb_entry {
181 struct hlist_node fdb_node; 179 struct hlist_node fdb_node;
182 unsigned char is_local:1, 180 unsigned char is_local:1,
183 is_static:1, 181 is_static:1,
182 is_sticky:1,
184 added_by_user:1, 183 added_by_user:1,
185 added_by_external_learn:1, 184 added_by_external_learn:1,
186 offloaded:1; 185 offloaded:1;
@@ -206,8 +205,7 @@ struct net_bridge_port_group {
206 unsigned char eth_addr[ETH_ALEN]; 205 unsigned char eth_addr[ETH_ALEN];
207}; 206};
208 207
209struct net_bridge_mdb_entry 208struct net_bridge_mdb_entry {
210{
211 struct hlist_node hlist[2]; 209 struct hlist_node hlist[2];
212 struct net_bridge *br; 210 struct net_bridge *br;
213 struct net_bridge_port_group __rcu *ports; 211 struct net_bridge_port_group __rcu *ports;
@@ -217,8 +215,7 @@ struct net_bridge_mdb_entry
217 bool host_joined; 215 bool host_joined;
218}; 216};
219 217
220struct net_bridge_mdb_htable 218struct net_bridge_mdb_htable {
221{
222 struct hlist_head *mhash; 219 struct hlist_head *mhash;
223 struct rcu_head rcu; 220 struct rcu_head rcu;
224 struct net_bridge_mdb_htable *old; 221 struct net_bridge_mdb_htable *old;
@@ -309,16 +306,32 @@ static inline struct net_bridge_port *br_port_get_rtnl_rcu(const struct net_devi
309 rcu_dereference_rtnl(dev->rx_handler_data) : NULL; 306 rcu_dereference_rtnl(dev->rx_handler_data) : NULL;
310} 307}
311 308
309enum net_bridge_opts {
310 BROPT_VLAN_ENABLED,
311 BROPT_VLAN_STATS_ENABLED,
312 BROPT_NF_CALL_IPTABLES,
313 BROPT_NF_CALL_IP6TABLES,
314 BROPT_NF_CALL_ARPTABLES,
315 BROPT_GROUP_ADDR_SET,
316 BROPT_MULTICAST_ENABLED,
317 BROPT_MULTICAST_QUERIER,
318 BROPT_MULTICAST_QUERY_USE_IFADDR,
319 BROPT_MULTICAST_STATS_ENABLED,
320 BROPT_HAS_IPV6_ADDR,
321 BROPT_NEIGH_SUPPRESS_ENABLED,
322 BROPT_MTU_SET_BY_USER,
323 BROPT_VLAN_STATS_PER_PORT,
324};
325
312struct net_bridge { 326struct net_bridge {
313 spinlock_t lock; 327 spinlock_t lock;
314 spinlock_t hash_lock; 328 spinlock_t hash_lock;
315 struct list_head port_list; 329 struct list_head port_list;
316 struct net_device *dev; 330 struct net_device *dev;
317 struct pcpu_sw_netstats __percpu *stats; 331 struct pcpu_sw_netstats __percpu *stats;
332 unsigned long options;
318 /* These fields are accessed on each packet */ 333 /* These fields are accessed on each packet */
319#ifdef CONFIG_BRIDGE_VLAN_FILTERING 334#ifdef CONFIG_BRIDGE_VLAN_FILTERING
320 u8 vlan_enabled;
321 u8 vlan_stats_enabled;
322 __be16 vlan_proto; 335 __be16 vlan_proto;
323 u16 default_pvid; 336 u16 default_pvid;
324 struct net_bridge_vlan_group __rcu *vlgrp; 337 struct net_bridge_vlan_group __rcu *vlgrp;
@@ -330,9 +343,6 @@ struct net_bridge {
330 struct rtable fake_rtable; 343 struct rtable fake_rtable;
331 struct rt6_info fake_rt6_info; 344 struct rt6_info fake_rt6_info;
332 }; 345 };
333 bool nf_call_iptables;
334 bool nf_call_ip6tables;
335 bool nf_call_arptables;
336#endif 346#endif
337 u16 group_fwd_mask; 347 u16 group_fwd_mask;
338 u16 group_fwd_mask_required; 348 u16 group_fwd_mask_required;
@@ -340,7 +350,6 @@ struct net_bridge {
340 /* STP */ 350 /* STP */
341 bridge_id designated_root; 351 bridge_id designated_root;
342 bridge_id bridge_id; 352 bridge_id bridge_id;
343 u32 root_path_cost;
344 unsigned char topology_change; 353 unsigned char topology_change;
345 unsigned char topology_change_detected; 354 unsigned char topology_change_detected;
346 u16 root_port; 355 u16 root_port;
@@ -352,9 +361,9 @@ struct net_bridge {
352 unsigned long bridge_hello_time; 361 unsigned long bridge_hello_time;
353 unsigned long bridge_forward_delay; 362 unsigned long bridge_forward_delay;
354 unsigned long bridge_ageing_time; 363 unsigned long bridge_ageing_time;
364 u32 root_path_cost;
355 365
356 u8 group_addr[ETH_ALEN]; 366 u8 group_addr[ETH_ALEN];
357 bool group_addr_set;
358 367
359 enum { 368 enum {
360 BR_NO_STP, /* no spanning tree */ 369 BR_NO_STP, /* no spanning tree */
@@ -363,13 +372,6 @@ struct net_bridge {
363 } stp_enabled; 372 } stp_enabled;
364 373
365#ifdef CONFIG_BRIDGE_IGMP_SNOOPING 374#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
366 unsigned char multicast_router;
367
368 u8 multicast_disabled:1;
369 u8 multicast_querier:1;
370 u8 multicast_query_use_ifaddr:1;
371 u8 has_ipv6_addr:1;
372 u8 multicast_stats_enabled:1;
373 375
374 u32 hash_elasticity; 376 u32 hash_elasticity;
375 u32 hash_max; 377 u32 hash_max;
@@ -378,7 +380,11 @@ struct net_bridge {
378 u32 multicast_startup_query_count; 380 u32 multicast_startup_query_count;
379 381
380 u8 multicast_igmp_version; 382 u8 multicast_igmp_version;
381 383 u8 multicast_router;
384#if IS_ENABLED(CONFIG_IPV6)
385 u8 multicast_mld_version;
386#endif
387 spinlock_t multicast_lock;
382 unsigned long multicast_last_member_interval; 388 unsigned long multicast_last_member_interval;
383 unsigned long multicast_membership_interval; 389 unsigned long multicast_membership_interval;
384 unsigned long multicast_querier_interval; 390 unsigned long multicast_querier_interval;
@@ -386,7 +392,6 @@ struct net_bridge {
386 unsigned long multicast_query_response_interval; 392 unsigned long multicast_query_response_interval;
387 unsigned long multicast_startup_query_interval; 393 unsigned long multicast_startup_query_interval;
388 394
389 spinlock_t multicast_lock;
390 struct net_bridge_mdb_htable __rcu *mdb; 395 struct net_bridge_mdb_htable __rcu *mdb;
391 struct hlist_head router_list; 396 struct hlist_head router_list;
392 397
@@ -399,7 +404,6 @@ struct net_bridge {
399 struct bridge_mcast_other_query ip6_other_query; 404 struct bridge_mcast_other_query ip6_other_query;
400 struct bridge_mcast_own_query ip6_own_query; 405 struct bridge_mcast_own_query ip6_own_query;
401 struct bridge_mcast_querier ip6_querier; 406 struct bridge_mcast_querier ip6_querier;
402 u8 multicast_mld_version;
403#endif /* IS_ENABLED(CONFIG_IPV6) */ 407#endif /* IS_ENABLED(CONFIG_IPV6) */
404#endif 408#endif
405 409
@@ -413,8 +417,6 @@ struct net_bridge {
413#ifdef CONFIG_NET_SWITCHDEV 417#ifdef CONFIG_NET_SWITCHDEV
414 int offload_fwd_mark; 418 int offload_fwd_mark;
415#endif 419#endif
416 bool neigh_suppress_enabled;
417 bool mtu_set_by_user;
418 struct hlist_head fdb_list; 420 struct hlist_head fdb_list;
419}; 421};
420 422
@@ -492,6 +494,14 @@ static inline bool br_vlan_should_use(const struct net_bridge_vlan *v)
492 return true; 494 return true;
493} 495}
494 496
497static inline int br_opt_get(const struct net_bridge *br,
498 enum net_bridge_opts opt)
499{
500 return test_bit(opt, &br->options);
501}
502
503void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on);
504
495/* br_device.c */ 505/* br_device.c */
496void br_dev_setup(struct net_device *dev); 506void br_dev_setup(struct net_device *dev);
497void br_dev_delete(struct net_device *dev, struct list_head *list); 507void br_dev_delete(struct net_device *dev, struct list_head *list);
@@ -564,7 +574,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
564 const unsigned char *addr, u16 vid, 574 const unsigned char *addr, u16 vid,
565 bool swdev_notify); 575 bool swdev_notify);
566void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, 576void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
567 const unsigned char *addr, u16 vid); 577 const unsigned char *addr, u16 vid, bool offloaded);
568 578
569/* br_forward.c */ 579/* br_forward.c */
570enum br_pkt_type { 580enum br_pkt_type {
@@ -698,8 +708,8 @@ __br_multicast_querier_exists(struct net_bridge *br,
698{ 708{
699 bool own_querier_enabled; 709 bool own_querier_enabled;
700 710
701 if (br->multicast_querier) { 711 if (br_opt_get(br, BROPT_MULTICAST_QUERIER)) {
702 if (is_ipv6 && !br->has_ipv6_addr) 712 if (is_ipv6 && !br_opt_get(br, BROPT_HAS_IPV6_ADDR))
703 own_querier_enabled = false; 713 own_querier_enabled = false;
704 else 714 else
705 own_querier_enabled = true; 715 own_querier_enabled = true;
@@ -850,6 +860,7 @@ int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val);
850int __br_vlan_set_proto(struct net_bridge *br, __be16 proto); 860int __br_vlan_set_proto(struct net_bridge *br, __be16 proto);
851int br_vlan_set_proto(struct net_bridge *br, unsigned long val); 861int br_vlan_set_proto(struct net_bridge *br, unsigned long val);
852int br_vlan_set_stats(struct net_bridge *br, unsigned long val); 862int br_vlan_set_stats(struct net_bridge *br, unsigned long val);
863int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val);
853int br_vlan_init(struct net_bridge *br); 864int br_vlan_init(struct net_bridge *br);
854int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val); 865int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val);
855int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid); 866int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid);
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index d77f807420c4..b993df770675 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -103,7 +103,7 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p,
103static void 103static void
104br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac, 104br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
105 u16 vid, struct net_device *dev, 105 u16 vid, struct net_device *dev,
106 bool added_by_user) 106 bool added_by_user, bool offloaded)
107{ 107{
108 struct switchdev_notifier_fdb_info info; 108 struct switchdev_notifier_fdb_info info;
109 unsigned long notifier_type; 109 unsigned long notifier_type;
@@ -111,6 +111,7 @@ br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
111 info.addr = mac; 111 info.addr = mac;
112 info.vid = vid; 112 info.vid = vid;
113 info.added_by_user = added_by_user; 113 info.added_by_user = added_by_user;
114 info.offloaded = offloaded;
114 notifier_type = adding ? SWITCHDEV_FDB_ADD_TO_DEVICE : SWITCHDEV_FDB_DEL_TO_DEVICE; 115 notifier_type = adding ? SWITCHDEV_FDB_ADD_TO_DEVICE : SWITCHDEV_FDB_DEL_TO_DEVICE;
115 call_switchdev_notifiers(notifier_type, dev, &info.info); 116 call_switchdev_notifiers(notifier_type, dev, &info.info);
116} 117}
@@ -126,13 +127,15 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
126 br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr, 127 br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr,
127 fdb->key.vlan_id, 128 fdb->key.vlan_id,
128 fdb->dst->dev, 129 fdb->dst->dev,
129 fdb->added_by_user); 130 fdb->added_by_user,
131 fdb->offloaded);
130 break; 132 break;
131 case RTM_NEWNEIGH: 133 case RTM_NEWNEIGH:
132 br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr, 134 br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr,
133 fdb->key.vlan_id, 135 fdb->key.vlan_id,
134 fdb->dst->dev, 136 fdb->dst->dev,
135 fdb->added_by_user); 137 fdb->added_by_user,
138 fdb->offloaded);
136 break; 139 break;
137 } 140 }
138} 141}
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 0318a69888d4..60182bef6341 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -303,7 +303,7 @@ static ssize_t group_addr_store(struct device *d,
303 ether_addr_copy(br->group_addr, new_addr); 303 ether_addr_copy(br->group_addr, new_addr);
304 spin_unlock_bh(&br->lock); 304 spin_unlock_bh(&br->lock);
305 305
306 br->group_addr_set = true; 306 br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true);
307 br_recalculate_fwd_mask(br); 307 br_recalculate_fwd_mask(br);
308 netdev_state_change(br->dev); 308 netdev_state_change(br->dev);
309 309
@@ -349,7 +349,7 @@ static ssize_t multicast_snooping_show(struct device *d,
349 char *buf) 349 char *buf)
350{ 350{
351 struct net_bridge *br = to_bridge(d); 351 struct net_bridge *br = to_bridge(d);
352 return sprintf(buf, "%d\n", !br->multicast_disabled); 352 return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED));
353} 353}
354 354
355static ssize_t multicast_snooping_store(struct device *d, 355static ssize_t multicast_snooping_store(struct device *d,
@@ -365,12 +365,13 @@ static ssize_t multicast_query_use_ifaddr_show(struct device *d,
365 char *buf) 365 char *buf)
366{ 366{
367 struct net_bridge *br = to_bridge(d); 367 struct net_bridge *br = to_bridge(d);
368 return sprintf(buf, "%d\n", br->multicast_query_use_ifaddr); 368 return sprintf(buf, "%d\n",
369 br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR));
369} 370}
370 371
371static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val) 372static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val)
372{ 373{
373 br->multicast_query_use_ifaddr = !!val; 374 br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val);
374 return 0; 375 return 0;
375} 376}
376 377
@@ -388,7 +389,7 @@ static ssize_t multicast_querier_show(struct device *d,
388 char *buf) 389 char *buf)
389{ 390{
390 struct net_bridge *br = to_bridge(d); 391 struct net_bridge *br = to_bridge(d);
391 return sprintf(buf, "%d\n", br->multicast_querier); 392 return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_QUERIER));
392} 393}
393 394
394static ssize_t multicast_querier_store(struct device *d, 395static ssize_t multicast_querier_store(struct device *d,
@@ -636,12 +637,13 @@ static ssize_t multicast_stats_enabled_show(struct device *d,
636{ 637{
637 struct net_bridge *br = to_bridge(d); 638 struct net_bridge *br = to_bridge(d);
638 639
639 return sprintf(buf, "%u\n", br->multicast_stats_enabled); 640 return sprintf(buf, "%d\n",
641 br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED));
640} 642}
641 643
642static int set_stats_enabled(struct net_bridge *br, unsigned long val) 644static int set_stats_enabled(struct net_bridge *br, unsigned long val)
643{ 645{
644 br->multicast_stats_enabled = !!val; 646 br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!val);
645 return 0; 647 return 0;
646} 648}
647 649
@@ -678,12 +680,12 @@ static ssize_t nf_call_iptables_show(
678 struct device *d, struct device_attribute *attr, char *buf) 680 struct device *d, struct device_attribute *attr, char *buf)
679{ 681{
680 struct net_bridge *br = to_bridge(d); 682 struct net_bridge *br = to_bridge(d);
681 return sprintf(buf, "%u\n", br->nf_call_iptables); 683 return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IPTABLES));
682} 684}
683 685
684static int set_nf_call_iptables(struct net_bridge *br, unsigned long val) 686static int set_nf_call_iptables(struct net_bridge *br, unsigned long val)
685{ 687{
686 br->nf_call_iptables = val ? true : false; 688 br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val);
687 return 0; 689 return 0;
688} 690}
689 691
@@ -699,12 +701,12 @@ static ssize_t nf_call_ip6tables_show(
699 struct device *d, struct device_attribute *attr, char *buf) 701 struct device *d, struct device_attribute *attr, char *buf)
700{ 702{
701 struct net_bridge *br = to_bridge(d); 703 struct net_bridge *br = to_bridge(d);
702 return sprintf(buf, "%u\n", br->nf_call_ip6tables); 704 return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IP6TABLES));
703} 705}
704 706
705static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val) 707static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val)
706{ 708{
707 br->nf_call_ip6tables = val ? true : false; 709 br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val);
708 return 0; 710 return 0;
709} 711}
710 712
@@ -720,12 +722,12 @@ static ssize_t nf_call_arptables_show(
720 struct device *d, struct device_attribute *attr, char *buf) 722 struct device *d, struct device_attribute *attr, char *buf)
721{ 723{
722 struct net_bridge *br = to_bridge(d); 724 struct net_bridge *br = to_bridge(d);
723 return sprintf(buf, "%u\n", br->nf_call_arptables); 725 return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_ARPTABLES));
724} 726}
725 727
726static int set_nf_call_arptables(struct net_bridge *br, unsigned long val) 728static int set_nf_call_arptables(struct net_bridge *br, unsigned long val)
727{ 729{
728 br->nf_call_arptables = val ? true : false; 730 br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val);
729 return 0; 731 return 0;
730} 732}
731 733
@@ -743,7 +745,7 @@ static ssize_t vlan_filtering_show(struct device *d,
743 char *buf) 745 char *buf)
744{ 746{
745 struct net_bridge *br = to_bridge(d); 747 struct net_bridge *br = to_bridge(d);
746 return sprintf(buf, "%d\n", br->vlan_enabled); 748 return sprintf(buf, "%d\n", br_opt_get(br, BROPT_VLAN_ENABLED));
747} 749}
748 750
749static ssize_t vlan_filtering_store(struct device *d, 751static ssize_t vlan_filtering_store(struct device *d,
@@ -791,7 +793,7 @@ static ssize_t vlan_stats_enabled_show(struct device *d,
791 char *buf) 793 char *buf)
792{ 794{
793 struct net_bridge *br = to_bridge(d); 795 struct net_bridge *br = to_bridge(d);
794 return sprintf(buf, "%u\n", br->vlan_stats_enabled); 796 return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_ENABLED));
795} 797}
796 798
797static ssize_t vlan_stats_enabled_store(struct device *d, 799static ssize_t vlan_stats_enabled_store(struct device *d,
@@ -801,6 +803,22 @@ static ssize_t vlan_stats_enabled_store(struct device *d,
801 return store_bridge_parm(d, buf, len, br_vlan_set_stats); 803 return store_bridge_parm(d, buf, len, br_vlan_set_stats);
802} 804}
803static DEVICE_ATTR_RW(vlan_stats_enabled); 805static DEVICE_ATTR_RW(vlan_stats_enabled);
806
807static ssize_t vlan_stats_per_port_show(struct device *d,
808 struct device_attribute *attr,
809 char *buf)
810{
811 struct net_bridge *br = to_bridge(d);
812 return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT));
813}
814
815static ssize_t vlan_stats_per_port_store(struct device *d,
816 struct device_attribute *attr,
817 const char *buf, size_t len)
818{
819 return store_bridge_parm(d, buf, len, br_vlan_set_stats_per_port);
820}
821static DEVICE_ATTR_RW(vlan_stats_per_port);
804#endif 822#endif
805 823
806static struct attribute *bridge_attrs[] = { 824static struct attribute *bridge_attrs[] = {
@@ -854,6 +872,7 @@ static struct attribute *bridge_attrs[] = {
854 &dev_attr_vlan_protocol.attr, 872 &dev_attr_vlan_protocol.attr,
855 &dev_attr_default_pvid.attr, 873 &dev_attr_default_pvid.attr,
856 &dev_attr_vlan_stats_enabled.attr, 874 &dev_attr_vlan_stats_enabled.attr,
875 &dev_attr_vlan_stats_per_port.attr,
857#endif 876#endif
858 NULL 877 NULL
859}; 878};
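
The br_sysfs_br.c hunks above are one mechanical conversion: the per-feature bool fields of struct net_bridge (multicast_querier, nf_call_iptables, vlan_enabled and friends) are folded into a single options word read via br_opt_get() and written via br_opt_toggle(). The helpers themselves are added in net/bridge/br_private.h, which is outside this excerpt; the sketch below is only a plausible shape for them, assuming the BROPT_* enum values are bit indices into an unsigned long bitmap.

	static inline bool br_opt_get(const struct net_bridge *br,
				      enum net_bridge_opts opt)
	{
		return test_bit(opt, &br->options);
	}

	static inline void br_opt_toggle(struct net_bridge *br,
					 enum net_bridge_opts opt, bool on)
	{
		if (on)
			set_bit(opt, &br->options);
		else
			clear_bit(opt, &br->options);
	}

With accessors of this shape, every sysfs show/store path and fast-path test reduces to a single call, which is what the replacements above show.
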
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 7df269092103..8c9297a01947 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -190,6 +190,19 @@ static void br_vlan_put_master(struct net_bridge_vlan *masterv)
190 } 190 }
191} 191}
192 192
193static void nbp_vlan_rcu_free(struct rcu_head *rcu)
194{
195 struct net_bridge_vlan *v;
196
197 v = container_of(rcu, struct net_bridge_vlan, rcu);
198 WARN_ON(br_vlan_is_master(v));
199 /* if we had per-port stats configured then free them here */
200 if (v->brvlan->stats != v->stats)
201 free_percpu(v->stats);
202 v->stats = NULL;
203 kfree(v);
204}
205
193/* This is the shared VLAN add function which works for both ports and bridge 206/* This is the shared VLAN add function which works for both ports and bridge
194 * devices. There are four possible calls to this function in terms of the 207 * devices. There are four possible calls to this function in terms of the
195 * vlan entry type: 208 * vlan entry type:
@@ -245,7 +258,15 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags)
245 if (!masterv) 258 if (!masterv)
246 goto out_filt; 259 goto out_filt;
247 v->brvlan = masterv; 260 v->brvlan = masterv;
248 v->stats = masterv->stats; 261 if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) {
262 v->stats = netdev_alloc_pcpu_stats(struct br_vlan_stats);
263 if (!v->stats) {
264 err = -ENOMEM;
265 goto out_filt;
266 }
267 } else {
268 v->stats = masterv->stats;
269 }
249 } else { 270 } else {
250 err = br_switchdev_port_vlan_add(dev, v->vid, flags); 271 err = br_switchdev_port_vlan_add(dev, v->vid, flags);
251 if (err && err != -EOPNOTSUPP) 272 if (err && err != -EOPNOTSUPP)
@@ -282,6 +303,10 @@ out_filt:
282 if (p) { 303 if (p) {
283 __vlan_vid_del(dev, br, v->vid); 304 __vlan_vid_del(dev, br, v->vid);
284 if (masterv) { 305 if (masterv) {
306 if (v->stats && masterv->stats != v->stats)
307 free_percpu(v->stats);
308 v->stats = NULL;
309
285 br_vlan_put_master(masterv); 310 br_vlan_put_master(masterv);
286 v->brvlan = NULL; 311 v->brvlan = NULL;
287 } 312 }
@@ -329,7 +354,7 @@ static int __vlan_del(struct net_bridge_vlan *v)
329 rhashtable_remove_fast(&vg->vlan_hash, &v->vnode, 354 rhashtable_remove_fast(&vg->vlan_hash, &v->vnode,
330 br_vlan_rht_params); 355 br_vlan_rht_params);
331 __vlan_del_list(v); 356 __vlan_del_list(v);
332 kfree_rcu(v, rcu); 357 call_rcu(&v->rcu, nbp_vlan_rcu_free);
333 } 358 }
334 359
335 br_vlan_put_master(masterv); 360 br_vlan_put_master(masterv);
@@ -386,7 +411,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br,
386 return NULL; 411 return NULL;
387 } 412 }
388 } 413 }
389 if (br->vlan_stats_enabled) { 414 if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
390 stats = this_cpu_ptr(v->stats); 415 stats = this_cpu_ptr(v->stats);
391 u64_stats_update_begin(&stats->syncp); 416 u64_stats_update_begin(&stats->syncp);
392 stats->tx_bytes += skb->len; 417 stats->tx_bytes += skb->len;
@@ -475,14 +500,14 @@ static bool __allowed_ingress(const struct net_bridge *br,
475 skb->vlan_tci |= pvid; 500 skb->vlan_tci |= pvid;
476 501
477 /* if stats are disabled we can avoid the lookup */ 502 /* if stats are disabled we can avoid the lookup */
478 if (!br->vlan_stats_enabled) 503 if (!br_opt_get(br, BROPT_VLAN_STATS_ENABLED))
479 return true; 504 return true;
480 } 505 }
481 v = br_vlan_find(vg, *vid); 506 v = br_vlan_find(vg, *vid);
482 if (!v || !br_vlan_should_use(v)) 507 if (!v || !br_vlan_should_use(v))
483 goto drop; 508 goto drop;
484 509
485 if (br->vlan_stats_enabled) { 510 if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
486 stats = this_cpu_ptr(v->stats); 511 stats = this_cpu_ptr(v->stats);
487 u64_stats_update_begin(&stats->syncp); 512 u64_stats_update_begin(&stats->syncp);
488 stats->rx_bytes += skb->len; 513 stats->rx_bytes += skb->len;
@@ -504,7 +529,7 @@ bool br_allowed_ingress(const struct net_bridge *br,
504 /* If VLAN filtering is disabled on the bridge, all packets are 529 /* If VLAN filtering is disabled on the bridge, all packets are
505 * permitted. 530 * permitted.
506 */ 531 */
507 if (!br->vlan_enabled) { 532 if (!br_opt_get(br, BROPT_VLAN_ENABLED)) {
508 BR_INPUT_SKB_CB(skb)->vlan_filtered = false; 533 BR_INPUT_SKB_CB(skb)->vlan_filtered = false;
509 return true; 534 return true;
510 } 535 }
@@ -538,7 +563,7 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid)
538 struct net_bridge *br = p->br; 563 struct net_bridge *br = p->br;
539 564
540 /* If filtering was disabled at input, let it pass. */ 565 /* If filtering was disabled at input, let it pass. */
541 if (!br->vlan_enabled) 566 if (!br_opt_get(br, BROPT_VLAN_ENABLED))
542 return true; 567 return true;
543 568
544 vg = nbp_vlan_group_rcu(p); 569 vg = nbp_vlan_group_rcu(p);
@@ -695,11 +720,12 @@ struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid)
695/* Must be protected by RTNL. */ 720/* Must be protected by RTNL. */
696static void recalculate_group_addr(struct net_bridge *br) 721static void recalculate_group_addr(struct net_bridge *br)
697{ 722{
698 if (br->group_addr_set) 723 if (br_opt_get(br, BROPT_GROUP_ADDR_SET))
699 return; 724 return;
700 725
701 spin_lock_bh(&br->lock); 726 spin_lock_bh(&br->lock);
702 if (!br->vlan_enabled || br->vlan_proto == htons(ETH_P_8021Q)) { 727 if (!br_opt_get(br, BROPT_VLAN_ENABLED) ||
728 br->vlan_proto == htons(ETH_P_8021Q)) {
703 /* Bridge Group Address */ 729 /* Bridge Group Address */
704 br->group_addr[5] = 0x00; 730 br->group_addr[5] = 0x00;
705 } else { /* vlan_enabled && ETH_P_8021AD */ 731 } else { /* vlan_enabled && ETH_P_8021AD */
@@ -712,7 +738,8 @@ static void recalculate_group_addr(struct net_bridge *br)
712/* Must be protected by RTNL. */ 738/* Must be protected by RTNL. */
713void br_recalculate_fwd_mask(struct net_bridge *br) 739void br_recalculate_fwd_mask(struct net_bridge *br)
714{ 740{
715 if (!br->vlan_enabled || br->vlan_proto == htons(ETH_P_8021Q)) 741 if (!br_opt_get(br, BROPT_VLAN_ENABLED) ||
742 br->vlan_proto == htons(ETH_P_8021Q))
716 br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT; 743 br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT;
717 else /* vlan_enabled && ETH_P_8021AD */ 744 else /* vlan_enabled && ETH_P_8021AD */
718 br->group_fwd_mask_required = BR_GROUPFWD_8021AD & 745 br->group_fwd_mask_required = BR_GROUPFWD_8021AD &
@@ -729,14 +756,14 @@ int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
729 }; 756 };
730 int err; 757 int err;
731 758
732 if (br->vlan_enabled == val) 759 if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val)
733 return 0; 760 return 0;
734 761
735 err = switchdev_port_attr_set(br->dev, &attr); 762 err = switchdev_port_attr_set(br->dev, &attr);
736 if (err && err != -EOPNOTSUPP) 763 if (err && err != -EOPNOTSUPP)
737 return err; 764 return err;
738 765
739 br->vlan_enabled = val; 766 br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val);
740 br_manage_promisc(br); 767 br_manage_promisc(br);
741 recalculate_group_addr(br); 768 recalculate_group_addr(br);
742 br_recalculate_fwd_mask(br); 769 br_recalculate_fwd_mask(br);
@@ -753,7 +780,7 @@ bool br_vlan_enabled(const struct net_device *dev)
753{ 780{
754 struct net_bridge *br = netdev_priv(dev); 781 struct net_bridge *br = netdev_priv(dev);
755 782
756 return !!br->vlan_enabled; 783 return br_opt_get(br, BROPT_VLAN_ENABLED);
757} 784}
758EXPORT_SYMBOL_GPL(br_vlan_enabled); 785EXPORT_SYMBOL_GPL(br_vlan_enabled);
759 786
@@ -819,7 +846,31 @@ int br_vlan_set_stats(struct net_bridge *br, unsigned long val)
819 switch (val) { 846 switch (val) {
820 case 0: 847 case 0:
821 case 1: 848 case 1:
822 br->vlan_stats_enabled = val; 849 br_opt_toggle(br, BROPT_VLAN_STATS_ENABLED, !!val);
850 break;
851 default:
852 return -EINVAL;
853 }
854
855 return 0;
856}
857
858int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val)
859{
860 struct net_bridge_port *p;
861
 862	/* only allow changing the option when there are no port vlans configured */

863 list_for_each_entry(p, &br->port_list, list) {
864 struct net_bridge_vlan_group *vg = nbp_vlan_group(p);
865
866 if (vg->num_vlans)
867 return -EBUSY;
868 }
869
870 switch (val) {
871 case 0:
872 case 1:
873 br_opt_toggle(br, BROPT_VLAN_STATS_PER_PORT, !!val);
823 break; 874 break;
824 default: 875 default:
825 return -EINVAL; 876 return -EINVAL;
@@ -877,8 +928,7 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid)
877 return 0; 928 return 0;
878 } 929 }
879 930
880 changed = kcalloc(BITS_TO_LONGS(BR_MAX_PORTS), sizeof(unsigned long), 931 changed = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL);
881 GFP_KERNEL);
882 if (!changed) 932 if (!changed)
883 return -ENOMEM; 933 return -ENOMEM;
884 934
@@ -925,7 +975,7 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid)
925 br->default_pvid = pvid; 975 br->default_pvid = pvid;
926 976
927out: 977out:
928 kfree(changed); 978 bitmap_free(changed);
929 return err; 979 return err;
930 980
931err_port: 981err_port:
@@ -965,7 +1015,7 @@ int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val)
965 goto out; 1015 goto out;
966 1016
967 /* Only allow default pvid change when filtering is disabled */ 1017 /* Only allow default pvid change when filtering is disabled */
968 if (br->vlan_enabled) { 1018 if (br_opt_get(br, BROPT_VLAN_ENABLED)) {
969 pr_info_once("Please disable vlan filtering to change default_pvid\n"); 1019 pr_info_once("Please disable vlan filtering to change default_pvid\n");
970 err = -EPERM; 1020 err = -EPERM;
971 goto out; 1021 goto out;
@@ -1019,7 +1069,7 @@ int nbp_vlan_init(struct net_bridge_port *p)
1019 .orig_dev = p->br->dev, 1069 .orig_dev = p->br->dev,
1020 .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, 1070 .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
1021 .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, 1071 .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
1022 .u.vlan_filtering = p->br->vlan_enabled, 1072 .u.vlan_filtering = br_opt_get(p->br, BROPT_VLAN_ENABLED),
1023 }; 1073 };
1024 struct net_bridge_vlan_group *vg; 1074 struct net_bridge_vlan_group *vg;
1025 int ret = -ENOMEM; 1075 int ret = -ENOMEM;
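
The br_vlan.c changes above add per-port VLAN statistics: when BROPT_VLAN_STATS_PER_PORT is set, __vlan_add() gives each port VLAN its own percpu counters via netdev_alloc_pcpu_stats(); otherwise the port VLAN keeps borrowing the master (bridge-global) VLAN's counters. The new nbp_vlan_rcu_free() callback and the __vlan_add() error path therefore follow a single ownership rule; a compressed sketch of that rule (the helper name is illustrative, not part of the patch):

	static void port_vlan_put_stats(struct net_bridge_vlan *v)
	{
		/* A port vlan either borrows its master vlan's percpu stats
		 * or, with per-port stats enabled, owns a private copy.
		 * Only the private copy may be freed; freeing the borrowed
		 * pointer would release the bridge-wide counters.
		 */
		if (v->stats && v->stats != v->brvlan->stats)
			free_percpu(v->stats);
		v->stats = NULL;
	}

The option itself can only be flipped while no port VLANs exist (br_vlan_set_stats_per_port() returns -EBUSY otherwise), so mixed ownership within one bridge cannot occur.
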
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index d18965f3291f..416717c57cd1 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -941,7 +941,7 @@ static __poll_t caif_poll(struct file *file,
941 __poll_t mask; 941 __poll_t mask;
942 struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); 942 struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
943 943
944 sock_poll_wait(file, wait); 944 sock_poll_wait(file, sock, wait);
945 mask = 0; 945 mask = 0;
946 946
947 /* exceptional events? */ 947 /* exceptional events? */
diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c
index b82440e1fcb4..a931a71ef6df 100644
--- a/net/caif/cfrfml.c
+++ b/net/caif/cfrfml.c
@@ -264,9 +264,6 @@ static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt)
264 frontpkt = rearpkt; 264 frontpkt = rearpkt;
265 rearpkt = NULL; 265 rearpkt = NULL;
266 266
267 err = -ENOMEM;
268 if (frontpkt == NULL)
269 goto out;
270 err = -EPROTO; 267 err = -EPROTO;
271 if (cfpkt_add_head(frontpkt, head, 6) < 0) 268 if (cfpkt_add_head(frontpkt, head, 6) < 0)
272 goto out; 269 goto out;
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index 02172c408ff2..5d6724cee38f 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -46,9 +46,9 @@ static int set_secret(struct ceph_crypto_key *key, void *buf)
46 goto fail; 46 goto fail;
47 } 47 }
48 48
49 /* crypto_alloc_skcipher() allocates with GFP_KERNEL */ 49 /* crypto_alloc_sync_skcipher() allocates with GFP_KERNEL */
50 noio_flag = memalloc_noio_save(); 50 noio_flag = memalloc_noio_save();
51 key->tfm = crypto_alloc_skcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); 51 key->tfm = crypto_alloc_sync_skcipher("cbc(aes)", 0, 0);
52 memalloc_noio_restore(noio_flag); 52 memalloc_noio_restore(noio_flag);
53 if (IS_ERR(key->tfm)) { 53 if (IS_ERR(key->tfm)) {
54 ret = PTR_ERR(key->tfm); 54 ret = PTR_ERR(key->tfm);
@@ -56,7 +56,7 @@ static int set_secret(struct ceph_crypto_key *key, void *buf)
56 goto fail; 56 goto fail;
57 } 57 }
58 58
59 ret = crypto_skcipher_setkey(key->tfm, key->key, key->len); 59 ret = crypto_sync_skcipher_setkey(key->tfm, key->key, key->len);
60 if (ret) 60 if (ret)
61 goto fail; 61 goto fail;
62 62
@@ -136,7 +136,7 @@ void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
136 if (key) { 136 if (key) {
137 kfree(key->key); 137 kfree(key->key);
138 key->key = NULL; 138 key->key = NULL;
139 crypto_free_skcipher(key->tfm); 139 crypto_free_sync_skcipher(key->tfm);
140 key->tfm = NULL; 140 key->tfm = NULL;
141 } 141 }
142} 142}
@@ -216,7 +216,7 @@ static void teardown_sgtable(struct sg_table *sgt)
216static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt, 216static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt,
217 void *buf, int buf_len, int in_len, int *pout_len) 217 void *buf, int buf_len, int in_len, int *pout_len)
218{ 218{
219 SKCIPHER_REQUEST_ON_STACK(req, key->tfm); 219 SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->tfm);
220 struct sg_table sgt; 220 struct sg_table sgt;
221 struct scatterlist prealloc_sg; 221 struct scatterlist prealloc_sg;
222 char iv[AES_BLOCK_SIZE] __aligned(8); 222 char iv[AES_BLOCK_SIZE] __aligned(8);
@@ -232,7 +232,7 @@ static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt,
232 return ret; 232 return ret;
233 233
234 memcpy(iv, aes_iv, AES_BLOCK_SIZE); 234 memcpy(iv, aes_iv, AES_BLOCK_SIZE);
235 skcipher_request_set_tfm(req, key->tfm); 235 skcipher_request_set_sync_tfm(req, key->tfm);
236 skcipher_request_set_callback(req, 0, NULL, NULL); 236 skcipher_request_set_callback(req, 0, NULL, NULL);
237 skcipher_request_set_crypt(req, sgt.sgl, sgt.sgl, crypt_len, iv); 237 skcipher_request_set_crypt(req, sgt.sgl, sgt.sgl, crypt_len, iv);
238 238
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
index bb45c7d43739..96ef4d860bc9 100644
--- a/net/ceph/crypto.h
+++ b/net/ceph/crypto.h
@@ -13,7 +13,7 @@ struct ceph_crypto_key {
13 struct ceph_timespec created; 13 struct ceph_timespec created;
14 int len; 14 int len;
15 void *key; 15 void *key;
16 struct crypto_skcipher *tfm; 16 struct crypto_sync_skcipher *tfm;
17}; 17};
18 18
19int ceph_crypto_key_clone(struct ceph_crypto_key *dst, 19int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
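
The two ceph files above are a straight conversion to the synchronous skcipher API introduced in this cycle: the tfm becomes a struct crypto_sync_skcipher, allocation and freeing use the _sync_ variants, and the on-stack request macro changes name. A hedged sketch of the resulting call pattern (error handling and key/IV setup trimmed; sgl, len and iv stand in for the caller's buffers):

	struct crypto_sync_skcipher *tfm;

	tfm = crypto_alloc_sync_skcipher("cbc(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);
	crypto_sync_skcipher_setkey(tfm, key, keylen);

	{
		SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);

		skcipher_request_set_sync_tfm(req, tfm);
		skcipher_request_set_callback(req, 0, NULL, NULL);
		skcipher_request_set_crypt(req, sgl, sgl, len, iv);
		crypto_skcipher_encrypt(req);	/* or crypto_skcipher_decrypt() */
		skcipher_request_zero(req);
	}

	crypto_free_sync_skcipher(tfm);
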
diff --git a/net/compat.c b/net/compat.c
index 3b2105f6549d..47a614b370cd 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -812,21 +812,21 @@ COMPAT_SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, buf, compat_size_t, len
812 812
813static int __compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, 813static int __compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg,
814 unsigned int vlen, unsigned int flags, 814 unsigned int vlen, unsigned int flags,
815 struct compat_timespec __user *timeout) 815 struct old_timespec32 __user *timeout)
816{ 816{
817 int datagrams; 817 int datagrams;
818 struct timespec ktspec; 818 struct timespec64 ktspec;
819 819
820 if (timeout == NULL) 820 if (timeout == NULL)
821 return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, 821 return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
822 flags | MSG_CMSG_COMPAT, NULL); 822 flags | MSG_CMSG_COMPAT, NULL);
823 823
824 if (compat_get_timespec(&ktspec, timeout)) 824 if (compat_get_timespec64(&ktspec, timeout))
825 return -EFAULT; 825 return -EFAULT;
826 826
827 datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, 827 datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
828 flags | MSG_CMSG_COMPAT, &ktspec); 828 flags | MSG_CMSG_COMPAT, &ktspec);
829 if (datagrams > 0 && compat_put_timespec(&ktspec, timeout)) 829 if (datagrams > 0 && compat_put_timespec64(&ktspec, timeout))
830 datagrams = -EFAULT; 830 datagrams = -EFAULT;
831 831
832 return datagrams; 832 return datagrams;
@@ -834,7 +834,7 @@ static int __compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg,
834 834
835COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, 835COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
836 unsigned int, vlen, unsigned int, flags, 836 unsigned int, vlen, unsigned int, flags,
837 struct compat_timespec __user *, timeout) 837 struct old_timespec32 __user *, timeout)
838{ 838{
839 return __compat_sys_recvmmsg(fd, mmsg, vlen, flags, timeout); 839 return __compat_sys_recvmmsg(fd, mmsg, vlen, flags, timeout);
840} 840}
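
The compat.c hunk is part of the y2038 cleanup: the 32-bit userspace timeout keeps its ABI but is now spelled struct old_timespec32, while the kernel computes with struct timespec64 and converts at the boundary via compat_get_timespec64()/compat_put_timespec64(). Roughly, the two layouts look like this (see include/linux/time32.h and time64.h for the authoritative definitions; shown here only to make the type swap readable):

	struct old_timespec32 {		/* what 32-bit userspace passes */
		old_time32_t	tv_sec;	/* 32-bit seconds, overflows in 2038 */
		s32		tv_nsec;
	};

	struct timespec64 {		/* what the kernel works with */
		time64_t	tv_sec;	/* 64-bit seconds */
		long		tv_nsec;
	};
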
diff --git a/net/core/Makefile b/net/core/Makefile
index 80175e6a2eb8..fccd31e0e7f7 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,6 +16,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
16obj-y += net-sysfs.o 16obj-y += net-sysfs.o
17obj-$(CONFIG_PAGE_POOL) += page_pool.o 17obj-$(CONFIG_PAGE_POOL) += page_pool.o
18obj-$(CONFIG_PROC_FS) += net-procfs.o 18obj-$(CONFIG_PROC_FS) += net-procfs.o
19obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
19obj-$(CONFIG_NET_PKTGEN) += pktgen.o 20obj-$(CONFIG_NET_PKTGEN) += pktgen.o
20obj-$(CONFIG_NETPOLL) += netpoll.o 21obj-$(CONFIG_NETPOLL) += netpoll.o
21obj-$(CONFIG_FIB_RULES) += fib_rules.o 22obj-$(CONFIG_FIB_RULES) += fib_rules.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
27obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o 28obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
28obj-$(CONFIG_LWTUNNEL) += lwtunnel.o 29obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
29obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o 30obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
31obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o
30obj-$(CONFIG_DST_CACHE) += dst_cache.o 32obj-$(CONFIG_DST_CACHE) += dst_cache.o
31obj-$(CONFIG_HWBM) += hwbm.o 33obj-$(CONFIG_HWBM) += hwbm.o
32obj-$(CONFIG_NET_DEVLINK) += devlink.o 34obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 9aac0d63d53e..57f3a6fcfc1e 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -808,8 +808,9 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
808 return -EINVAL; 808 return -EINVAL;
809 } 809 }
810 810
811 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) 811 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
812 netdev_rx_csum_fault(skb->dev); 812 !skb->csum_complete_sw)
813 netdev_rx_csum_fault(NULL);
813 } 814 }
814 return 0; 815 return 0;
815fault: 816fault:
@@ -837,7 +838,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,
837 struct sock *sk = sock->sk; 838 struct sock *sk = sock->sk;
838 __poll_t mask; 839 __poll_t mask;
839 840
840 sock_poll_wait(file, wait); 841 sock_poll_wait(file, sock, wait);
841 mask = 0; 842 mask = 0;
842 843
843 /* exceptional events? */ 844 /* exceptional events? */
diff --git a/net/core/dev.c b/net/core/dev.c
index 93243479085f..022ad73d6253 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1976,6 +1976,17 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1976 return false; 1976 return false;
1977} 1977}
1978 1978
1979/**
1980 * dev_nit_active - return true if any network interface taps are in use
1981 *
1982 * @dev: network device to check for the presence of taps
1983 */
1984bool dev_nit_active(struct net_device *dev)
1985{
1986 return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
1987}
1988EXPORT_SYMBOL_GPL(dev_nit_active);
1989
1979/* 1990/*
1980 * Support routine. Sends outgoing frames to any network 1991 * Support routine. Sends outgoing frames to any network
1981 * taps currently in use. 1992 * taps currently in use.
@@ -1991,6 +2002,9 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1991 rcu_read_lock(); 2002 rcu_read_lock();
1992again: 2003again:
1993 list_for_each_entry_rcu(ptype, ptype_list, list) { 2004 list_for_each_entry_rcu(ptype, ptype_list, list) {
2005 if (ptype->ignore_outgoing)
2006 continue;
2007
1994 /* Never send packets back to the socket 2008 /* Never send packets back to the socket
1995 * they originated from - MvS (miquels@drinkel.ow.org) 2009 * they originated from - MvS (miquels@drinkel.ow.org)
1996 */ 2010 */
@@ -3230,7 +3244,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3230 unsigned int len; 3244 unsigned int len;
3231 int rc; 3245 int rc;
3232 3246
3233 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) 3247 if (dev_nit_active(dev))
3234 dev_queue_xmit_nit(skb, dev); 3248 dev_queue_xmit_nit(skb, dev);
3235 3249
3236 len = skb->len; 3250 len = skb->len;
@@ -3250,7 +3264,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *de
3250 while (skb) { 3264 while (skb) {
3251 struct sk_buff *next = skb->next; 3265 struct sk_buff *next = skb->next;
3252 3266
3253 skb->next = NULL; 3267 skb_mark_not_on_list(skb);
3254 rc = xmit_one(skb, dev, txq, next != NULL); 3268 rc = xmit_one(skb, dev, txq, next != NULL);
3255 if (unlikely(!dev_xmit_complete(rc))) { 3269 if (unlikely(!dev_xmit_complete(rc))) {
3256 skb->next = next; 3270 skb->next = next;
@@ -3350,7 +3364,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d
3350 3364
3351 for (; skb != NULL; skb = next) { 3365 for (; skb != NULL; skb = next) {
3352 next = skb->next; 3366 next = skb->next;
3353 skb->next = NULL; 3367 skb_mark_not_on_list(skb);
3354 3368
3355 /* in case skb won't be segmented, point to itself */ 3369 /* in case skb won't be segmented, point to itself */
3356 skb->prev = skb; 3370 skb->prev = skb;
@@ -4277,6 +4291,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4277 struct netdev_rx_queue *rxqueue; 4291 struct netdev_rx_queue *rxqueue;
4278 void *orig_data, *orig_data_end; 4292 void *orig_data, *orig_data_end;
4279 u32 metalen, act = XDP_DROP; 4293 u32 metalen, act = XDP_DROP;
4294 __be16 orig_eth_type;
4295 struct ethhdr *eth;
4296 bool orig_bcast;
4280 int hlen, off; 4297 int hlen, off;
4281 u32 mac_len; 4298 u32 mac_len;
4282 4299
@@ -4317,6 +4334,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4317 xdp->data_hard_start = skb->data - skb_headroom(skb); 4334 xdp->data_hard_start = skb->data - skb_headroom(skb);
4318 orig_data_end = xdp->data_end; 4335 orig_data_end = xdp->data_end;
4319 orig_data = xdp->data; 4336 orig_data = xdp->data;
4337 eth = (struct ethhdr *)xdp->data;
4338 orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
4339 orig_eth_type = eth->h_proto;
4320 4340
4321 rxqueue = netif_get_rxqueue(skb); 4341 rxqueue = netif_get_rxqueue(skb);
4322 xdp->rxq = &rxqueue->xdp_rxq; 4342 xdp->rxq = &rxqueue->xdp_rxq;
@@ -4340,6 +4360,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4340 4360
4341 } 4361 }
4342 4362
 4363	/* check if XDP changed the eth hdr such that the SKB needs an update */
4364 eth = (struct ethhdr *)xdp->data;
4365 if ((orig_eth_type != eth->h_proto) ||
4366 (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
4367 __skb_push(skb, ETH_HLEN);
4368 skb->protocol = eth_type_trans(skb, skb->dev);
4369 }
4370
4343 switch (act) { 4371 switch (act) {
4344 case XDP_REDIRECT: 4372 case XDP_REDIRECT:
4345 case XDP_TX: 4373 case XDP_TX:
@@ -5314,8 +5342,7 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
5314 list_for_each_entry_safe_reverse(skb, p, head, list) { 5342 list_for_each_entry_safe_reverse(skb, p, head, list) {
5315 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) 5343 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
5316 return; 5344 return;
5317 list_del(&skb->list); 5345 skb_list_del_init(skb);
5318 skb->next = NULL;
5319 napi_gro_complete(skb); 5346 napi_gro_complete(skb);
5320 napi->gro_hash[index].count--; 5347 napi->gro_hash[index].count--;
5321 } 5348 }
@@ -5500,8 +5527,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
5500 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; 5527 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
5501 5528
5502 if (pp) { 5529 if (pp) {
5503 list_del(&pp->list); 5530 skb_list_del_init(pp);
5504 pp->next = NULL;
5505 napi_gro_complete(pp); 5531 napi_gro_complete(pp);
5506 napi->gro_hash[hash].count--; 5532 napi->gro_hash[hash].count--;
5507 } 5533 }
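
Several dev.c hunks replace open-coded "skb->next = NULL" and list_del() pairs with skb_mark_not_on_list() and skb_list_del_init(). Those helpers live in include/linux/skbuff.h and are not shown in this diff; the sketch below is an assumption of their likely definitions, included only so the mechanical replacements above read naturally:

	static inline void skb_mark_not_on_list(struct sk_buff *skb)
	{
		skb->next = NULL;
	}

	static inline void skb_list_del_init(struct sk_buff *skb)
	{
		__list_del_entry(&skb->list);
		skb_mark_not_on_list(skb);
	}
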
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 6bc42933be4a..3a4b29a13d31 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1626,7 +1626,7 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
1626 if (!ops->eswitch_mode_set) 1626 if (!ops->eswitch_mode_set)
1627 return -EOPNOTSUPP; 1627 return -EOPNOTSUPP;
1628 mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); 1628 mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
1629 err = ops->eswitch_mode_set(devlink, mode); 1629 err = ops->eswitch_mode_set(devlink, mode, info->extack);
1630 if (err) 1630 if (err)
1631 return err; 1631 return err;
1632 } 1632 }
@@ -1636,7 +1636,8 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
1636 return -EOPNOTSUPP; 1636 return -EOPNOTSUPP;
1637 inline_mode = nla_get_u8( 1637 inline_mode = nla_get_u8(
1638 info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]); 1638 info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]);
1639 err = ops->eswitch_inline_mode_set(devlink, inline_mode); 1639 err = ops->eswitch_inline_mode_set(devlink, inline_mode,
1640 info->extack);
1640 if (err) 1641 if (err)
1641 return err; 1642 return err;
1642 } 1643 }
@@ -1645,7 +1646,8 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
1645 if (!ops->eswitch_encap_mode_set) 1646 if (!ops->eswitch_encap_mode_set)
1646 return -EOPNOTSUPP; 1647 return -EOPNOTSUPP;
1647 encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]); 1648 encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]);
1648 err = ops->eswitch_encap_mode_set(devlink, encap_mode); 1649 err = ops->eswitch_encap_mode_set(devlink, encap_mode,
1650 info->extack);
1649 if (err) 1651 if (err)
1650 return err; 1652 return err;
1651 } 1653 }
@@ -2675,6 +2677,21 @@ static const struct devlink_param devlink_param_generic[] = {
2675 .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME, 2677 .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME,
2676 .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE, 2678 .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE,
2677 }, 2679 },
2680 {
2681 .id = DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI,
2682 .name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME,
2683 .type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE,
2684 },
2685 {
2686 .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX,
2687 .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME,
2688 .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE,
2689 },
2690 {
2691 .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN,
2692 .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME,
2693 .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE,
2694 },
2678}; 2695};
2679 2696
2680static int devlink_param_generic_verify(const struct devlink_param *param) 2697static int devlink_param_generic_verify(const struct devlink_param *param)
@@ -3495,7 +3512,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
3495 start_offset = *((u64 *)&cb->args[0]); 3512 start_offset = *((u64 *)&cb->args[0]);
3496 3513
3497 err = nlmsg_parse(cb->nlh, GENL_HDRLEN + devlink_nl_family.hdrsize, 3514 err = nlmsg_parse(cb->nlh, GENL_HDRLEN + devlink_nl_family.hdrsize,
3498 attrs, DEVLINK_ATTR_MAX, ops->policy, NULL); 3515 attrs, DEVLINK_ATTR_MAX, ops->policy, cb->extack);
3499 if (err) 3516 if (err)
3500 goto out; 3517 goto out;
3501 3518
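
Besides threading extack through the eswitch ops, devlink.c gains three generic parameters (ignore_ari, msix_vec_per_pf_max, msix_vec_per_pf_min). Drivers opt in per parameter; a hedged driver-side sketch of how one of them would typically be exposed (the callback names, the chosen configuration mode and the probe-time call are placeholders, not taken from this patch):

	static const struct devlink_param my_params[] = {
		DEVLINK_PARAM_GENERIC(IGNORE_ARI,
				      BIT(DEVLINK_PARAM_CMODE_PERMANENT),
				      my_param_get, my_param_set, NULL),
	};

	/* during probe */
	err = devlink_params_register(devlink, my_params,
				      ARRAY_SIZE(my_params));
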
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index aeabc4831fca..d05402868575 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -27,6 +27,7 @@
27#include <linux/rtnetlink.h> 27#include <linux/rtnetlink.h>
28#include <linux/sched/signal.h> 28#include <linux/sched/signal.h>
29#include <linux/net.h> 29#include <linux/net.h>
30#include <net/xdp_sock.h>
30 31
31/* 32/*
32 * Some useful ethtool_ops methods that're device independent. 33 * Some useful ethtool_ops methods that're device independent.
@@ -539,47 +540,17 @@ struct ethtool_link_usettings {
539 } link_modes; 540 } link_modes;
540}; 541};
541 542
542/* Internal kernel helper to query a device ethtool_link_settings. 543/* Internal kernel helper to query a device ethtool_link_settings. */
543 *
544 * Backward compatibility note: for compatibility with legacy drivers
545 * that implement only the ethtool_cmd API, this has to work with both
546 * drivers implementing get_link_ksettings API and drivers
547 * implementing get_settings API. When drivers implement get_settings
548 * and report ethtool_cmd deprecated fields
549 * (transceiver/maxrxpkt/maxtxpkt), these fields are silently ignored
550 * because the resulting struct ethtool_link_settings does not report them.
551 */
552int __ethtool_get_link_ksettings(struct net_device *dev, 544int __ethtool_get_link_ksettings(struct net_device *dev,
553 struct ethtool_link_ksettings *link_ksettings) 545 struct ethtool_link_ksettings *link_ksettings)
554{ 546{
555 int err;
556 struct ethtool_cmd cmd;
557
558 ASSERT_RTNL(); 547 ASSERT_RTNL();
559 548
560 if (dev->ethtool_ops->get_link_ksettings) { 549 if (!dev->ethtool_ops->get_link_ksettings)
561 memset(link_ksettings, 0, sizeof(*link_ksettings));
562 return dev->ethtool_ops->get_link_ksettings(dev,
563 link_ksettings);
564 }
565
566 /* driver doesn't support %ethtool_link_ksettings API. revert to
567 * legacy %ethtool_cmd API, unless it's not supported either.
568 * TODO: remove when ethtool_ops::get_settings disappears internally
569 */
570 if (!dev->ethtool_ops->get_settings)
571 return -EOPNOTSUPP; 550 return -EOPNOTSUPP;
572 551
573 memset(&cmd, 0, sizeof(cmd)); 552 memset(link_ksettings, 0, sizeof(*link_ksettings));
574 cmd.cmd = ETHTOOL_GSET; 553 return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings);
575 err = dev->ethtool_ops->get_settings(dev, &cmd);
576 if (err < 0)
577 return err;
578
579 /* we ignore deprecated fields transceiver/maxrxpkt/maxtxpkt
580 */
581 convert_legacy_settings_to_link_ksettings(link_ksettings, &cmd);
582 return err;
583} 554}
584EXPORT_SYMBOL(__ethtool_get_link_ksettings); 555EXPORT_SYMBOL(__ethtool_get_link_ksettings);
585 556
@@ -635,16 +606,7 @@ store_link_ksettings_for_user(void __user *to,
635 return 0; 606 return 0;
636} 607}
637 608
638/* Query device for its ethtool_link_settings. 609/* Query device for its ethtool_link_settings. */
639 *
640 * Backward compatibility note: this function must fail when driver
641 * does not implement ethtool::get_link_ksettings, even if legacy
642 * ethtool_ops::get_settings is implemented. This tells new versions
643 * of ethtool that they should use the legacy API %ETHTOOL_GSET for
644 * this driver, so that they can correctly access the ethtool_cmd
645 * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
646 * implements ethtool_ops::get_settings anymore.
647 */
648static int ethtool_get_link_ksettings(struct net_device *dev, 610static int ethtool_get_link_ksettings(struct net_device *dev,
649 void __user *useraddr) 611 void __user *useraddr)
650{ 612{
@@ -652,7 +614,6 @@ static int ethtool_get_link_ksettings(struct net_device *dev,
652 struct ethtool_link_ksettings link_ksettings; 614 struct ethtool_link_ksettings link_ksettings;
653 615
654 ASSERT_RTNL(); 616 ASSERT_RTNL();
655
656 if (!dev->ethtool_ops->get_link_ksettings) 617 if (!dev->ethtool_ops->get_link_ksettings)
657 return -EOPNOTSUPP; 618 return -EOPNOTSUPP;
658 619
@@ -699,16 +660,7 @@ static int ethtool_get_link_ksettings(struct net_device *dev,
699 return store_link_ksettings_for_user(useraddr, &link_ksettings); 660 return store_link_ksettings_for_user(useraddr, &link_ksettings);
700} 661}
701 662
702/* Update device ethtool_link_settings. 663/* Update device ethtool_link_settings. */
703 *
704 * Backward compatibility note: this function must fail when driver
705 * does not implement ethtool::set_link_ksettings, even if legacy
706 * ethtool_ops::set_settings is implemented. This tells new versions
707 * of ethtool that they should use the legacy API %ETHTOOL_SSET for
708 * this driver, so that they can correctly update the ethtool_cmd
709 * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
710 * implements ethtool_ops::get_settings anymore.
711 */
712static int ethtool_set_link_ksettings(struct net_device *dev, 664static int ethtool_set_link_ksettings(struct net_device *dev,
713 void __user *useraddr) 665 void __user *useraddr)
714{ 666{
@@ -746,51 +698,31 @@ static int ethtool_set_link_ksettings(struct net_device *dev,
746 698
747/* Query device for its ethtool_cmd settings. 699/* Query device for its ethtool_cmd settings.
748 * 700 *
749 * Backward compatibility note: for compatibility with legacy ethtool, 701 * Backward compatibility note: for compatibility with legacy ethtool, this is
750 * this has to work with both drivers implementing get_link_ksettings 702 * now implemented via get_link_ksettings. When driver reports higher link mode
751 * API and drivers implementing get_settings API. When drivers 703 * bits, a kernel warning is logged once (with name of 1st driver/device) to
752 * implement get_link_ksettings and report higher link mode bits, a 704 * recommend user to upgrade ethtool, but the command is successful (only the
753 * kernel warning is logged once (with name of 1st driver/device) to 705 * lower link mode bits reported back to user). Deprecated fields from
754 * recommend user to upgrade ethtool, but the command is successful 706 * ethtool_cmd (transceiver/maxrxpkt/maxtxpkt) are always set to zero.
755 * (only the lower link mode bits reported back to user).
756 */ 707 */
757static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) 708static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
758{ 709{
710 struct ethtool_link_ksettings link_ksettings;
759 struct ethtool_cmd cmd; 711 struct ethtool_cmd cmd;
712 int err;
760 713
761 ASSERT_RTNL(); 714 ASSERT_RTNL();
715 if (!dev->ethtool_ops->get_link_ksettings)
716 return -EOPNOTSUPP;
762 717
763 if (dev->ethtool_ops->get_link_ksettings) { 718 memset(&link_ksettings, 0, sizeof(link_ksettings));
764 /* First, use link_ksettings API if it is supported */ 719 err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings);
765 int err; 720 if (err < 0)
766 struct ethtool_link_ksettings link_ksettings; 721 return err;
767 722 convert_link_ksettings_to_legacy_settings(&cmd, &link_ksettings);
768 memset(&link_ksettings, 0, sizeof(link_ksettings));
769 err = dev->ethtool_ops->get_link_ksettings(dev,
770 &link_ksettings);
771 if (err < 0)
772 return err;
773 convert_link_ksettings_to_legacy_settings(&cmd,
774 &link_ksettings);
775
776 /* send a sensible cmd tag back to user */
777 cmd.cmd = ETHTOOL_GSET;
778 } else {
779 /* driver doesn't support %ethtool_link_ksettings
780 * API. revert to legacy %ethtool_cmd API, unless it's
781 * not supported either.
782 */
783 int err;
784
785 if (!dev->ethtool_ops->get_settings)
786 return -EOPNOTSUPP;
787 723
788 memset(&cmd, 0, sizeof(cmd)); 724 /* send a sensible cmd tag back to user */
789 cmd.cmd = ETHTOOL_GSET; 725 cmd.cmd = ETHTOOL_GSET;
790 err = dev->ethtool_ops->get_settings(dev, &cmd);
791 if (err < 0)
792 return err;
793 }
794 726
795 if (copy_to_user(useraddr, &cmd, sizeof(cmd))) 727 if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
796 return -EFAULT; 728 return -EFAULT;
@@ -800,48 +732,29 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
800 732
801/* Update device link settings with given ethtool_cmd. 733/* Update device link settings with given ethtool_cmd.
802 * 734 *
803 * Backward compatibility note: for compatibility with legacy ethtool, 735 * Backward compatibility note: for compatibility with legacy ethtool, this is
804 * this has to work with both drivers implementing set_link_ksettings 736 * now always implemented via set_link_settings. When user's request updates
805 * API and drivers implementing set_settings API. When drivers 737 * deprecated ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel
806 * implement set_link_ksettings and user's request updates deprecated 738 * warning is logged once (with name of 1st driver/device) to recommend user to
807 * ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel 739 * upgrade ethtool, and the request is rejected.
808 * warning is logged once (with name of 1st driver/device) to
809 * recommend user to upgrade ethtool, and the request is rejected.
810 */ 740 */
811static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) 741static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
812{ 742{
743 struct ethtool_link_ksettings link_ksettings;
813 struct ethtool_cmd cmd; 744 struct ethtool_cmd cmd;
814 745
815 ASSERT_RTNL(); 746 ASSERT_RTNL();
816 747
817 if (copy_from_user(&cmd, useraddr, sizeof(cmd))) 748 if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
818 return -EFAULT; 749 return -EFAULT;
819 750 if (!dev->ethtool_ops->set_link_ksettings)
820 /* first, try new %ethtool_link_ksettings API. */
821 if (dev->ethtool_ops->set_link_ksettings) {
822 struct ethtool_link_ksettings link_ksettings;
823
824 if (!convert_legacy_settings_to_link_ksettings(&link_ksettings,
825 &cmd))
826 return -EINVAL;
827
828 link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS;
829 link_ksettings.base.link_mode_masks_nwords
830 = __ETHTOOL_LINK_MODE_MASK_NU32;
831 return dev->ethtool_ops->set_link_ksettings(dev,
832 &link_ksettings);
833 }
834
835 /* legacy %ethtool_cmd API */
836
837 /* TODO: return -EOPNOTSUPP when ethtool_ops::get_settings
838 * disappears internally
839 */
840
841 if (!dev->ethtool_ops->set_settings)
842 return -EOPNOTSUPP; 751 return -EOPNOTSUPP;
843 752
844 return dev->ethtool_ops->set_settings(dev, &cmd); 753 if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, &cmd))
754 return -EINVAL;
755 link_ksettings.base.link_mode_masks_nwords =
756 __ETHTOOL_LINK_MODE_MASK_NU32;
757 return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
845} 758}
846 759
847static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, 760static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
@@ -1753,8 +1666,10 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
1753static noinline_for_stack int ethtool_set_channels(struct net_device *dev, 1666static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
1754 void __user *useraddr) 1667 void __user *useraddr)
1755{ 1668{
1756 struct ethtool_channels channels, max = { .cmd = ETHTOOL_GCHANNELS }; 1669 struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS };
1670 u16 from_channel, to_channel;
1757 u32 max_rx_in_use = 0; 1671 u32 max_rx_in_use = 0;
1672 unsigned int i;
1758 1673
1759 if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) 1674 if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels)
1760 return -EOPNOTSUPP; 1675 return -EOPNOTSUPP;
@@ -1762,13 +1677,13 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
1762 if (copy_from_user(&channels, useraddr, sizeof(channels))) 1677 if (copy_from_user(&channels, useraddr, sizeof(channels)))
1763 return -EFAULT; 1678 return -EFAULT;
1764 1679
1765 dev->ethtool_ops->get_channels(dev, &max); 1680 dev->ethtool_ops->get_channels(dev, &curr);
1766 1681
1767 /* ensure new counts are within the maximums */ 1682 /* ensure new counts are within the maximums */
1768 if ((channels.rx_count > max.max_rx) || 1683 if (channels.rx_count > curr.max_rx ||
1769 (channels.tx_count > max.max_tx) || 1684 channels.tx_count > curr.max_tx ||
1770 (channels.combined_count > max.max_combined) || 1685 channels.combined_count > curr.max_combined ||
1771 (channels.other_count > max.max_other)) 1686 channels.other_count > curr.max_other)
1772 return -EINVAL; 1687 return -EINVAL;
1773 1688
1774 /* ensure the new Rx count fits within the configured Rx flow 1689 /* ensure the new Rx count fits within the configured Rx flow
@@ -1778,6 +1693,14 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
1778 (channels.combined_count + channels.rx_count) <= max_rx_in_use) 1693 (channels.combined_count + channels.rx_count) <= max_rx_in_use)
1779 return -EINVAL; 1694 return -EINVAL;
1780 1695
1696 /* Disabling channels, query zero-copy AF_XDP sockets */
1697 from_channel = channels.combined_count +
1698 min(channels.rx_count, channels.tx_count);
1699 to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count);
1700 for (i = from_channel; i < to_channel; i++)
1701 if (xdp_get_umem_from_qid(dev, i))
1702 return -EINVAL;
1703
1781 return dev->ethtool_ops->set_channels(dev, &channels); 1704 return dev->ethtool_ops->set_channels(dev, &channels);
1782} 1705}
1783 1706
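
The new block in ethtool_set_channels() protects zero-copy AF_XDP sockets: any queue id that would disappear under the requested channel counts is probed with xdp_get_umem_from_qid() first. A worked example of the range computed above, with hypothetical numbers:

	/*   current:  combined = 8, rx = 0, tx = 0  ->  to_channel   = 8
	 *   request:  combined = 4, rx = 0, tx = 0  ->  from_channel = 4
	 *
	 * Queue ids 4..7 would vanish, so each is checked with
	 * xdp_get_umem_from_qid(dev, i); if any still has a zero-copy umem
	 * bound, the request is rejected with -EINVAL before ->set_channels()
	 * is called.
	 */
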
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 0ff3953f64aa..ffbb827723a2 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -1063,13 +1063,47 @@ skip:
1063 return err; 1063 return err;
1064} 1064}
1065 1065
1066static int fib_valid_dumprule_req(const struct nlmsghdr *nlh,
1067 struct netlink_ext_ack *extack)
1068{
1069 struct fib_rule_hdr *frh;
1070
1071 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
1072 NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request");
1073 return -EINVAL;
1074 }
1075
1076 frh = nlmsg_data(nlh);
1077 if (frh->dst_len || frh->src_len || frh->tos || frh->table ||
1078 frh->res1 || frh->res2 || frh->action || frh->flags) {
1079 NL_SET_ERR_MSG(extack,
1080 "Invalid values in header for fib rule dump request");
1081 return -EINVAL;
1082 }
1083
1084 if (nlmsg_attrlen(nlh, sizeof(*frh))) {
1085 NL_SET_ERR_MSG(extack, "Invalid data after header in fib rule dump request");
1086 return -EINVAL;
1087 }
1088
1089 return 0;
1090}
1091
1066static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) 1092static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
1067{ 1093{
1094 const struct nlmsghdr *nlh = cb->nlh;
1068 struct net *net = sock_net(skb->sk); 1095 struct net *net = sock_net(skb->sk);
1069 struct fib_rules_ops *ops; 1096 struct fib_rules_ops *ops;
1070 int idx = 0, family; 1097 int idx = 0, family;
1071 1098
1072 family = rtnl_msg_family(cb->nlh); 1099 if (cb->strict_check) {
1100 int err = fib_valid_dumprule_req(nlh, cb->extack);
1101
1102 if (err < 0)
1103 return err;
1104 }
1105
1106 family = rtnl_msg_family(nlh);
1073 if (family != AF_UNSPEC) { 1107 if (family != AF_UNSPEC) {
1074 /* Protocol specific dump request */ 1108 /* Protocol specific dump request */
1075 ops = lookup_rules_ops(net, family); 1109 ops = lookup_rules_ops(net, family);
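
With strict checking enabled on the netlink socket, a fib rule dump request must now carry a fib_rule_hdr whose fields (other than the family) are all zero, and no attributes after the header. A hedged userspace sketch of a request that passes the new validation (the socket setup and nl_fd are assumed, not part of the patch):

	struct {
		struct nlmsghdr nlh;
		struct fib_rule_hdr frh;
	} req = {
		.nlh = {
			.nlmsg_len   = NLMSG_LENGTH(sizeof(struct fib_rule_hdr)),
			.nlmsg_type  = RTM_GETRULE,
			.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
		},
		.frh = {
			.family = AF_INET,	/* or AF_UNSPEC for every family */
		},
	};

	send(nl_fd, &req, req.nlh.nlmsg_len, 0);
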
diff --git a/net/core/filter.c b/net/core/filter.c
index 5e00f2b85a56..35c6933c2622 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -38,6 +38,7 @@
38#include <net/protocol.h> 38#include <net/protocol.h>
39#include <net/netlink.h> 39#include <net/netlink.h>
40#include <linux/skbuff.h> 40#include <linux/skbuff.h>
41#include <linux/skmsg.h>
41#include <net/sock.h> 42#include <net/sock.h>
42#include <net/flow_dissector.h> 43#include <net/flow_dissector.h>
43#include <linux/errno.h> 44#include <linux/errno.h>
@@ -58,13 +59,17 @@
58#include <net/busy_poll.h> 59#include <net/busy_poll.h>
59#include <net/tcp.h> 60#include <net/tcp.h>
60#include <net/xfrm.h> 61#include <net/xfrm.h>
62#include <net/udp.h>
61#include <linux/bpf_trace.h> 63#include <linux/bpf_trace.h>
62#include <net/xdp_sock.h> 64#include <net/xdp_sock.h>
63#include <linux/inetdevice.h> 65#include <linux/inetdevice.h>
66#include <net/inet_hashtables.h>
67#include <net/inet6_hashtables.h>
64#include <net/ip_fib.h> 68#include <net/ip_fib.h>
65#include <net/flow.h> 69#include <net/flow.h>
66#include <net/arp.h> 70#include <net/arp.h>
67#include <net/ipv6.h> 71#include <net/ipv6.h>
72#include <net/net_namespace.h>
68#include <linux/seg6_local.h> 73#include <linux/seg6_local.h>
69#include <net/seg6.h> 74#include <net/seg6.h>
70#include <net/seg6_local.h> 75#include <net/seg6_local.h>
@@ -2138,123 +2143,7 @@ static const struct bpf_func_proto bpf_redirect_proto = {
2138 .arg2_type = ARG_ANYTHING, 2143 .arg2_type = ARG_ANYTHING,
2139}; 2144};
2140 2145
2141BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, 2146BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
2142 struct bpf_map *, map, void *, key, u64, flags)
2143{
2144 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
2145
2146 /* If user passes invalid input drop the packet. */
2147 if (unlikely(flags & ~(BPF_F_INGRESS)))
2148 return SK_DROP;
2149
2150 tcb->bpf.flags = flags;
2151 tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
2152 if (!tcb->bpf.sk_redir)
2153 return SK_DROP;
2154
2155 return SK_PASS;
2156}
2157
2158static const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
2159 .func = bpf_sk_redirect_hash,
2160 .gpl_only = false,
2161 .ret_type = RET_INTEGER,
2162 .arg1_type = ARG_PTR_TO_CTX,
2163 .arg2_type = ARG_CONST_MAP_PTR,
2164 .arg3_type = ARG_PTR_TO_MAP_KEY,
2165 .arg4_type = ARG_ANYTHING,
2166};
2167
2168BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
2169 struct bpf_map *, map, u32, key, u64, flags)
2170{
2171 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
2172
2173 /* If user passes invalid input drop the packet. */
2174 if (unlikely(flags & ~(BPF_F_INGRESS)))
2175 return SK_DROP;
2176
2177 tcb->bpf.flags = flags;
2178 tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
2179 if (!tcb->bpf.sk_redir)
2180 return SK_DROP;
2181
2182 return SK_PASS;
2183}
2184
2185struct sock *do_sk_redirect_map(struct sk_buff *skb)
2186{
2187 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
2188
2189 return tcb->bpf.sk_redir;
2190}
2191
2192static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
2193 .func = bpf_sk_redirect_map,
2194 .gpl_only = false,
2195 .ret_type = RET_INTEGER,
2196 .arg1_type = ARG_PTR_TO_CTX,
2197 .arg2_type = ARG_CONST_MAP_PTR,
2198 .arg3_type = ARG_ANYTHING,
2199 .arg4_type = ARG_ANYTHING,
2200};
2201
2202BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
2203 struct bpf_map *, map, void *, key, u64, flags)
2204{
2205 /* If user passes invalid input drop the packet. */
2206 if (unlikely(flags & ~(BPF_F_INGRESS)))
2207 return SK_DROP;
2208
2209 msg->flags = flags;
2210 msg->sk_redir = __sock_hash_lookup_elem(map, key);
2211 if (!msg->sk_redir)
2212 return SK_DROP;
2213
2214 return SK_PASS;
2215}
2216
2217static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
2218 .func = bpf_msg_redirect_hash,
2219 .gpl_only = false,
2220 .ret_type = RET_INTEGER,
2221 .arg1_type = ARG_PTR_TO_CTX,
2222 .arg2_type = ARG_CONST_MAP_PTR,
2223 .arg3_type = ARG_PTR_TO_MAP_KEY,
2224 .arg4_type = ARG_ANYTHING,
2225};
2226
2227BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
2228 struct bpf_map *, map, u32, key, u64, flags)
2229{
2230 /* If user passes invalid input drop the packet. */
2231 if (unlikely(flags & ~(BPF_F_INGRESS)))
2232 return SK_DROP;
2233
2234 msg->flags = flags;
2235 msg->sk_redir = __sock_map_lookup_elem(map, key);
2236 if (!msg->sk_redir)
2237 return SK_DROP;
2238
2239 return SK_PASS;
2240}
2241
2242struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
2243{
2244 return msg->sk_redir;
2245}
2246
2247static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
2248 .func = bpf_msg_redirect_map,
2249 .gpl_only = false,
2250 .ret_type = RET_INTEGER,
2251 .arg1_type = ARG_PTR_TO_CTX,
2252 .arg2_type = ARG_CONST_MAP_PTR,
2253 .arg3_type = ARG_ANYTHING,
2254 .arg4_type = ARG_ANYTHING,
2255};
2256
2257BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes)
2258{ 2147{
2259 msg->apply_bytes = bytes; 2148 msg->apply_bytes = bytes;
2260 return 0; 2149 return 0;
@@ -2268,7 +2157,7 @@ static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
2268 .arg2_type = ARG_ANYTHING, 2157 .arg2_type = ARG_ANYTHING,
2269}; 2158};
2270 2159
2271BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes) 2160BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
2272{ 2161{
2273 msg->cork_bytes = bytes; 2162 msg->cork_bytes = bytes;
2274 return 0; 2163 return 0;
@@ -2282,45 +2171,37 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
2282 .arg2_type = ARG_ANYTHING, 2171 .arg2_type = ARG_ANYTHING,
2283}; 2172};
2284 2173
2285#define sk_msg_iter_var(var) \ 2174BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
2286 do { \ 2175 u32, end, u64, flags)
2287 var++; \
2288 if (var == MAX_SKB_FRAGS) \
2289 var = 0; \
2290 } while (0)
2291
2292BPF_CALL_4(bpf_msg_pull_data,
2293 struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
2294{ 2176{
2295 unsigned int len = 0, offset = 0, copy = 0, poffset = 0; 2177 u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start;
2296 int bytes = end - start, bytes_sg_total; 2178 u32 first_sge, last_sge, i, shift, bytes_sg_total;
2297 struct scatterlist *sg = msg->sg_data; 2179 struct scatterlist *sge;
2298 int first_sg, last_sg, i, shift; 2180 u8 *raw, *to, *from;
2299 unsigned char *p, *to, *from;
2300 struct page *page; 2181 struct page *page;
2301 2182
2302 if (unlikely(flags || end <= start)) 2183 if (unlikely(flags || end <= start))
2303 return -EINVAL; 2184 return -EINVAL;
2304 2185
2305 /* First find the starting scatterlist element */ 2186 /* First find the starting scatterlist element */
2306 i = msg->sg_start; 2187 i = msg->sg.start;
2307 do { 2188 do {
2308 len = sg[i].length; 2189 len = sk_msg_elem(msg, i)->length;
2309 if (start < offset + len) 2190 if (start < offset + len)
2310 break; 2191 break;
2311 offset += len; 2192 offset += len;
2312 sk_msg_iter_var(i); 2193 sk_msg_iter_var_next(i);
2313 } while (i != msg->sg_end); 2194 } while (i != msg->sg.end);
2314 2195
2315 if (unlikely(start >= offset + len)) 2196 if (unlikely(start >= offset + len))
2316 return -EINVAL; 2197 return -EINVAL;
2317 2198
2318 first_sg = i; 2199 first_sge = i;
2319 /* The start may point into the sg element so we need to also 2200 /* The start may point into the sg element so we need to also
2320 * account for the headroom. 2201 * account for the headroom.
2321 */ 2202 */
2322 bytes_sg_total = start - offset + bytes; 2203 bytes_sg_total = start - offset + bytes;
2323 if (!msg->sg_copy[i] && bytes_sg_total <= len) 2204 if (!msg->sg.copy[i] && bytes_sg_total <= len)
2324 goto out; 2205 goto out;
2325 2206
2326 /* At this point we need to linearize multiple scatterlist 2207 /* At this point we need to linearize multiple scatterlist
@@ -2334,12 +2215,12 @@ BPF_CALL_4(bpf_msg_pull_data,
2334 * will copy the entire sg entry. 2215 * will copy the entire sg entry.
2335 */ 2216 */
2336 do { 2217 do {
2337 copy += sg[i].length; 2218 copy += sk_msg_elem(msg, i)->length;
2338 sk_msg_iter_var(i); 2219 sk_msg_iter_var_next(i);
2339 if (bytes_sg_total <= copy) 2220 if (bytes_sg_total <= copy)
2340 break; 2221 break;
2341 } while (i != msg->sg_end); 2222 } while (i != msg->sg.end);
2342 last_sg = i; 2223 last_sge = i;
2343 2224
2344 if (unlikely(bytes_sg_total > copy)) 2225 if (unlikely(bytes_sg_total > copy))
2345 return -EINVAL; 2226 return -EINVAL;
@@ -2348,63 +2229,61 @@ BPF_CALL_4(bpf_msg_pull_data,
2348 get_order(copy)); 2229 get_order(copy));
2349 if (unlikely(!page)) 2230 if (unlikely(!page))
2350 return -ENOMEM; 2231 return -ENOMEM;
2351 p = page_address(page);
2352 2232
2353 i = first_sg; 2233 raw = page_address(page);
2234 i = first_sge;
2354 do { 2235 do {
2355 from = sg_virt(&sg[i]); 2236 sge = sk_msg_elem(msg, i);
2356 len = sg[i].length; 2237 from = sg_virt(sge);
2357 to = p + poffset; 2238 len = sge->length;
2239 to = raw + poffset;
2358 2240
2359 memcpy(to, from, len); 2241 memcpy(to, from, len);
2360 poffset += len; 2242 poffset += len;
2361 sg[i].length = 0; 2243 sge->length = 0;
2362 put_page(sg_page(&sg[i])); 2244 put_page(sg_page(sge));
2363 2245
2364 sk_msg_iter_var(i); 2246 sk_msg_iter_var_next(i);
2365 } while (i != last_sg); 2247 } while (i != last_sge);
2366 2248
2367 sg[first_sg].length = copy; 2249 sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
2368 sg_set_page(&sg[first_sg], page, copy, 0);
2369 2250
2370 /* To repair sg ring we need to shift entries. If we only 2251 /* To repair sg ring we need to shift entries. If we only
2371 * had a single entry though we can just replace it and 2252 * had a single entry though we can just replace it and
2372 * be done. Otherwise walk the ring and shift the entries. 2253 * be done. Otherwise walk the ring and shift the entries.
2373 */ 2254 */
2374 WARN_ON_ONCE(last_sg == first_sg); 2255 WARN_ON_ONCE(last_sge == first_sge);
2375 shift = last_sg > first_sg ? 2256 shift = last_sge > first_sge ?
2376 last_sg - first_sg - 1 : 2257 last_sge - first_sge - 1 :
2377 MAX_SKB_FRAGS - first_sg + last_sg - 1; 2258 MAX_SKB_FRAGS - first_sge + last_sge - 1;
2378 if (!shift) 2259 if (!shift)
2379 goto out; 2260 goto out;
2380 2261
2381 i = first_sg; 2262 i = first_sge;
2382 sk_msg_iter_var(i); 2263 sk_msg_iter_var_next(i);
2383 do { 2264 do {
2384 int move_from; 2265 u32 move_from;
2385 2266
2386 if (i + shift >= MAX_SKB_FRAGS) 2267 if (i + shift >= MAX_MSG_FRAGS)
2387 move_from = i + shift - MAX_SKB_FRAGS; 2268 move_from = i + shift - MAX_MSG_FRAGS;
2388 else 2269 else
2389 move_from = i + shift; 2270 move_from = i + shift;
2390 2271 if (move_from == msg->sg.end)
2391 if (move_from == msg->sg_end)
2392 break; 2272 break;
2393 2273
2394 sg[i] = sg[move_from]; 2274 msg->sg.data[i] = msg->sg.data[move_from];
2395 sg[move_from].length = 0; 2275 msg->sg.data[move_from].length = 0;
2396 sg[move_from].page_link = 0; 2276 msg->sg.data[move_from].page_link = 0;
2397 sg[move_from].offset = 0; 2277 msg->sg.data[move_from].offset = 0;
2398 2278 sk_msg_iter_var_next(i);
2399 sk_msg_iter_var(i);
2400 } while (1); 2279 } while (1);
2401 msg->sg_end -= shift; 2280
2402 if (msg->sg_end < 0) 2281 msg->sg.end = msg->sg.end - shift > msg->sg.end ?
2403 msg->sg_end += MAX_SKB_FRAGS; 2282 msg->sg.end - shift + MAX_MSG_FRAGS :
2283 msg->sg.end - shift;
2404out: 2284out:
2405 msg->data = sg_virt(&sg[first_sg]) + start - offset; 2285 msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
2406 msg->data_end = msg->data + bytes; 2286 msg->data_end = msg->data + bytes;
2407
2408 return 0; 2287 return 0;
2409} 2288}
2410 2289
@@ -2418,6 +2297,137 @@ static const struct bpf_func_proto bpf_msg_pull_data_proto = {
2418 .arg4_type = ARG_ANYTHING, 2297 .arg4_type = ARG_ANYTHING,
2419}; 2298};
2420 2299
2300BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
2301 u32, len, u64, flags)
2302{
2303 struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
2304 u32 new, i = 0, l, space, copy = 0, offset = 0;
2305 u8 *raw, *to, *from;
2306 struct page *page;
2307
2308 if (unlikely(flags))
2309 return -EINVAL;
2310
2311 /* First find the starting scatterlist element */
2312 i = msg->sg.start;
2313 do {
2314 l = sk_msg_elem(msg, i)->length;
2315
2316 if (start < offset + l)
2317 break;
2318 offset += l;
2319 sk_msg_iter_var_next(i);
2320 } while (i != msg->sg.end);
2321
2322 if (start >= offset + l)
2323 return -EINVAL;
2324
2325 space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
2326
2327 /* If no space available will fallback to copy, we need at
2328 * least one scatterlist elem available to push data into
2329 * when start aligns to the beginning of an element or two
2330 * when it falls inside an element. We handle the start equals
2331 * offset case because its the common case for inserting a
2332 * header.
2333 */
2334 if (!space || (space == 1 && start != offset))
2335 copy = msg->sg.data[i].length;
2336
2337 page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
2338 get_order(copy + len));
2339 if (unlikely(!page))
2340 return -ENOMEM;
2341
2342 if (copy) {
2343 int front, back;
2344
2345 raw = page_address(page);
2346
2347 psge = sk_msg_elem(msg, i);
2348 front = start - offset;
2349 back = psge->length - front;
2350 from = sg_virt(psge);
2351
2352 if (front)
2353 memcpy(raw, from, front);
2354
2355 if (back) {
2356 from += front;
2357 to = raw + front + len;
2358
2359 memcpy(to, from, back);
2360 }
2361
2362 put_page(sg_page(psge));
2363 } else if (start - offset) {
2364 psge = sk_msg_elem(msg, i);
2365 rsge = sk_msg_elem_cpy(msg, i);
2366
2367 psge->length = start - offset;
2368 rsge.length -= psge->length;
2369 rsge.offset += start;
2370
2371 sk_msg_iter_var_next(i);
2372 sg_unmark_end(psge);
2373 sk_msg_iter_next(msg, end);
2374 }
2375
2376 /* Slot(s) to place newly allocated data */
2377 new = i;
2378
2379 /* Shift one or two slots as needed */
2380 if (!copy) {
2381 sge = sk_msg_elem_cpy(msg, i);
2382
2383 sk_msg_iter_var_next(i);
2384 sg_unmark_end(&sge);
2385 sk_msg_iter_next(msg, end);
2386
2387 nsge = sk_msg_elem_cpy(msg, i);
2388 if (rsge.length) {
2389 sk_msg_iter_var_next(i);
2390 nnsge = sk_msg_elem_cpy(msg, i);
2391 }
2392
2393 while (i != msg->sg.end) {
2394 msg->sg.data[i] = sge;
2395 sge = nsge;
2396 sk_msg_iter_var_next(i);
2397 if (rsge.length) {
2398 nsge = nnsge;
2399 nnsge = sk_msg_elem_cpy(msg, i);
2400 } else {
2401 nsge = sk_msg_elem_cpy(msg, i);
2402 }
2403 }
2404 }
2405
2406 /* Place newly allocated data buffer */
2407 sk_mem_charge(msg->sk, len);
2408 msg->sg.size += len;
2409 msg->sg.copy[new] = false;
2410 sg_set_page(&msg->sg.data[new], page, len + copy, 0);
2411 if (rsge.length) {
2412 get_page(sg_page(&rsge));
2413 sk_msg_iter_var_next(new);
2414 msg->sg.data[new] = rsge;
2415 }
2416
2417 sk_msg_compute_data_pointers(msg);
2418 return 0;
2419}
2420
2421static const struct bpf_func_proto bpf_msg_push_data_proto = {
2422 .func = bpf_msg_push_data,
2423 .gpl_only = false,
2424 .ret_type = RET_INTEGER,
2425 .arg1_type = ARG_PTR_TO_CTX,
2426 .arg2_type = ARG_ANYTHING,
2427 .arg3_type = ARG_ANYTHING,
2428 .arg4_type = ARG_ANYTHING,
2429};
2430
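[Usage sketch, not part of the diff] bpf_msg_push_data() above grows the message by len bytes starting at start and leaves the new bytes for the program to fill; flags must be zero. A minimal SK_MSG sketch under the same libbpf-style assumptions as the previous one; the 4-byte header value is purely illustrative:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sk_msg")
int push_hdr(struct sk_msg_md *msg)
{
	__u32 *hdr;

	/* Open a 4-byte hole at offset 0 of the message. */
	if (bpf_msg_push_data(msg, 0, 4, 0))
		return SK_DROP;

	/* Data pointers are recomputed by the helper; re-check bounds. */
	hdr = msg->data;
	if ((void *)(hdr + 1) > msg->data_end)
		return SK_DROP;

	*hdr = 0xfeedcafe;	/* illustrative application header */
	return SK_PASS;
}

char _license[] SEC("license") = "GPL";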
2421BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) 2431BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
2422{ 2432{
2423 return task_get_classid(skb); 2433 return task_get_classid(skb);
@@ -3176,6 +3186,32 @@ static int __bpf_tx_xdp(struct net_device *dev,
3176 return 0; 3186 return 0;
3177} 3187}
3178 3188
3189static noinline int
3190xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp,
3191 struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri)
3192{
3193 struct net_device *fwd;
3194 u32 index = ri->ifindex;
3195 int err;
3196
3197 fwd = dev_get_by_index_rcu(dev_net(dev), index);
3198 ri->ifindex = 0;
3199 if (unlikely(!fwd)) {
3200 err = -EINVAL;
3201 goto err;
3202 }
3203
3204 err = __bpf_tx_xdp(fwd, NULL, xdp, 0);
3205 if (unlikely(err))
3206 goto err;
3207
3208 _trace_xdp_redirect(dev, xdp_prog, index);
3209 return 0;
3210err:
3211 _trace_xdp_redirect_err(dev, xdp_prog, index, err);
3212 return err;
3213}
3214
3179static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, 3215static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
3180 struct bpf_map *map, 3216 struct bpf_map *map,
3181 struct xdp_buff *xdp, 3217 struct xdp_buff *xdp,
@@ -3188,7 +3224,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
3188 struct bpf_dtab_netdev *dst = fwd; 3224 struct bpf_dtab_netdev *dst = fwd;
3189 3225
3190 err = dev_map_enqueue(dst, xdp, dev_rx); 3226 err = dev_map_enqueue(dst, xdp, dev_rx);
3191 if (err) 3227 if (unlikely(err))
3192 return err; 3228 return err;
3193 __dev_map_insert_ctx(map, index); 3229 __dev_map_insert_ctx(map, index);
3194 break; 3230 break;
@@ -3197,7 +3233,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
3197 struct bpf_cpu_map_entry *rcpu = fwd; 3233 struct bpf_cpu_map_entry *rcpu = fwd;
3198 3234
3199 err = cpu_map_enqueue(rcpu, xdp, dev_rx); 3235 err = cpu_map_enqueue(rcpu, xdp, dev_rx);
3200 if (err) 3236 if (unlikely(err))
3201 return err; 3237 return err;
3202 __cpu_map_insert_ctx(map, index); 3238 __cpu_map_insert_ctx(map, index);
3203 break; 3239 break;
@@ -3238,7 +3274,7 @@ void xdp_do_flush_map(void)
3238} 3274}
3239EXPORT_SYMBOL_GPL(xdp_do_flush_map); 3275EXPORT_SYMBOL_GPL(xdp_do_flush_map);
3240 3276
3241static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) 3277static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
3242{ 3278{
3243 switch (map->map_type) { 3279 switch (map->map_type) {
3244 case BPF_MAP_TYPE_DEVMAP: 3280 case BPF_MAP_TYPE_DEVMAP:
@@ -3270,9 +3306,9 @@ void bpf_clear_redirect_map(struct bpf_map *map)
3270} 3306}
3271 3307
3272static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, 3308static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
3273 struct bpf_prog *xdp_prog, struct bpf_map *map) 3309 struct bpf_prog *xdp_prog, struct bpf_map *map,
3310 struct bpf_redirect_info *ri)
3274{ 3311{
3275 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
3276 u32 index = ri->ifindex; 3312 u32 index = ri->ifindex;
3277 void *fwd = NULL; 3313 void *fwd = NULL;
3278 int err; 3314 int err;
@@ -3281,11 +3317,11 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
3281 WRITE_ONCE(ri->map, NULL); 3317 WRITE_ONCE(ri->map, NULL);
3282 3318
3283 fwd = __xdp_map_lookup_elem(map, index); 3319 fwd = __xdp_map_lookup_elem(map, index);
3284 if (!fwd) { 3320 if (unlikely(!fwd)) {
3285 err = -EINVAL; 3321 err = -EINVAL;
3286 goto err; 3322 goto err;
3287 } 3323 }
3288 if (ri->map_to_flush && ri->map_to_flush != map) 3324 if (ri->map_to_flush && unlikely(ri->map_to_flush != map))
3289 xdp_do_flush_map(); 3325 xdp_do_flush_map();
3290 3326
3291 err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index); 3327 err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index);
@@ -3305,29 +3341,11 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
3305{ 3341{
3306 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3342 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
3307 struct bpf_map *map = READ_ONCE(ri->map); 3343 struct bpf_map *map = READ_ONCE(ri->map);
3308 struct net_device *fwd;
3309 u32 index = ri->ifindex;
3310 int err;
3311 3344
3312 if (map) 3345 if (likely(map))
3313 return xdp_do_redirect_map(dev, xdp, xdp_prog, map); 3346 return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri);
3314
3315 fwd = dev_get_by_index_rcu(dev_net(dev), index);
3316 ri->ifindex = 0;
3317 if (unlikely(!fwd)) {
3318 err = -EINVAL;
3319 goto err;
3320 }
3321 3347
3322 err = __bpf_tx_xdp(fwd, NULL, xdp, 0); 3348 return xdp_do_redirect_slow(dev, xdp, xdp_prog, ri);
3323 if (unlikely(err))
3324 goto err;
3325
3326 _trace_xdp_redirect(dev, xdp_prog, index);
3327 return 0;
3328err:
3329 _trace_xdp_redirect_err(dev, xdp_prog, index, err);
3330 return err;
3331} 3349}
3332EXPORT_SYMBOL_GPL(xdp_do_redirect); 3350EXPORT_SYMBOL_GPL(xdp_do_redirect);
3333 3351
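[Usage sketch, not part of the diff] The rewrite above keeps the map-based redirect on the hot path and moves the by-ifindex case into the noinline xdp_do_redirect_slow(). From the program side the two kernel paths correspond to the two redirect helpers; a minimal XDP sketch (BTF-style map definition and the ifindex value 3 are assumptions):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_DEVMAP);
	__uint(max_entries, 8);
	__type(key, __u32);
	__type(value, __u32);
} tx_ports SEC(".maps");

SEC("xdp")
int redirect_example(struct xdp_md *ctx)
{
	/* Fast path: ri->map is set, handled by xdp_do_redirect_map(). */
	return bpf_redirect_map(&tx_ports, 0, 0);

	/* The by-ifindex form, served by xdp_do_redirect_slow(), would be:
	 *	return bpf_redirect(3, 0);
	 */
}

char _license[] SEC("license") = "GPL";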
@@ -3915,8 +3933,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
3915 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 3933 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
3916 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); 3934 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
3917 break; 3935 break;
3918 case SO_MAX_PACING_RATE: 3936 case SO_MAX_PACING_RATE: /* 32bit version */
3919 sk->sk_max_pacing_rate = val; 3937 sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
3920 sk->sk_pacing_rate = min(sk->sk_pacing_rate, 3938 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
3921 sk->sk_max_pacing_rate); 3939 sk->sk_max_pacing_rate);
3922 break; 3940 break;
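[Usage sketch, not part of the diff] sk_max_pacing_rate is now an unsigned long, so a 32-bit ~0U coming through this BPF path is widened to mean "unlimited" instead of roughly 4 GB/s. A minimal sock_ops sketch; the rate value and the fallback #defines are assumptions for a standalone build:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef SOL_SOCKET
#define SOL_SOCKET		1
#endif
#ifndef SO_MAX_PACING_RATE
#define SO_MAX_PACING_RATE	47
#endif

SEC("sockops")
int set_pacing(struct bpf_sock_ops *skops)
{
	int rate = 125000000;	/* about 1 Gbit/s, in bytes per second */

	if (skops->op == BPF_SOCK_OPS_TCP_CONNECT_CB)
		bpf_setsockopt(skops, SOL_SOCKET, SO_MAX_PACING_RATE,
			       &rate, sizeof(rate));
	return 1;
}

char _license[] SEC("license") = "GPL";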
@@ -4013,6 +4031,12 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
4013 tp->snd_ssthresh = val; 4031 tp->snd_ssthresh = val;
4014 } 4032 }
4015 break; 4033 break;
4034 case TCP_SAVE_SYN:
4035 if (val < 0 || val > 1)
4036 ret = -EINVAL;
4037 else
4038 tp->save_syn = val;
4039 break;
4016 default: 4040 default:
4017 ret = -EINVAL; 4041 ret = -EINVAL;
4018 } 4042 }
@@ -4042,17 +4066,29 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
4042 4066
4043 if (!sk_fullsock(sk)) 4067 if (!sk_fullsock(sk))
4044 goto err_clear; 4068 goto err_clear;
4045
4046#ifdef CONFIG_INET 4069#ifdef CONFIG_INET
4047 if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { 4070 if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) {
4048 if (optname == TCP_CONGESTION) { 4071 struct inet_connection_sock *icsk;
4049 struct inet_connection_sock *icsk = inet_csk(sk); 4072 struct tcp_sock *tp;
4073
4074 switch (optname) {
4075 case TCP_CONGESTION:
4076 icsk = inet_csk(sk);
4050 4077
4051 if (!icsk->icsk_ca_ops || optlen <= 1) 4078 if (!icsk->icsk_ca_ops || optlen <= 1)
4052 goto err_clear; 4079 goto err_clear;
4053 strncpy(optval, icsk->icsk_ca_ops->name, optlen); 4080 strncpy(optval, icsk->icsk_ca_ops->name, optlen);
4054 optval[optlen - 1] = 0; 4081 optval[optlen - 1] = 0;
4055 } else { 4082 break;
4083 case TCP_SAVED_SYN:
4084 tp = tcp_sk(sk);
4085
4086 if (optlen <= 0 || !tp->saved_syn ||
4087 optlen > tp->saved_syn[0])
4088 goto err_clear;
4089 memcpy(optval, tp->saved_syn + 1, optlen);
4090 break;
4091 default:
4056 goto err_clear; 4092 goto err_clear;
4057 } 4093 }
4058 } else if (level == SOL_IP) { 4094 } else if (level == SOL_IP) {
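[Usage sketch, not part of the diff] The two hunks above let a sock_ops program turn on SYN saving and later read the saved headers back; the TCP_SAVED_SYN read fails unless optlen is positive and no larger than the stored SYN. A minimal sketch, with the 40-byte buffer and the fallback #defines as assumptions:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef SOL_TCP
#define SOL_TCP		6
#endif
#ifndef TCP_SAVE_SYN
#define TCP_SAVE_SYN	27
#define TCP_SAVED_SYN	28
#endif

SEC("sockops")
int save_syn(struct bpf_sock_ops *skops)
{
	char syn[40];	/* must not exceed the saved header length */
	int one = 1;

	switch (skops->op) {
	case BPF_SOCK_OPS_TCP_LISTEN_CB:
		/* New with this series: TCP_SAVE_SYN via bpf_setsockopt(). */
		bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN, &one, sizeof(one));
		break;
	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
		/* Copies the saved network + TCP headers of the SYN. */
		bpf_getsockopt(skops, SOL_TCP, TCP_SAVED_SYN, syn, sizeof(syn));
		break;
	}
	return 1;
}

char _license[] SEC("license") = "GPL";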
@@ -4787,6 +4823,149 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
4787}; 4823};
4788#endif /* CONFIG_IPV6_SEG6_BPF */ 4824#endif /* CONFIG_IPV6_SEG6_BPF */
4789 4825
4826#ifdef CONFIG_INET
4827static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
4828 struct sk_buff *skb, u8 family, u8 proto)
4829{
4830 bool refcounted = false;
4831 struct sock *sk = NULL;
4832 int dif = 0;
4833
4834 if (skb->dev)
4835 dif = skb->dev->ifindex;
4836
4837 if (family == AF_INET) {
4838 __be32 src4 = tuple->ipv4.saddr;
4839 __be32 dst4 = tuple->ipv4.daddr;
4840 int sdif = inet_sdif(skb);
4841
4842 if (proto == IPPROTO_TCP)
4843 sk = __inet_lookup(net, &tcp_hashinfo, skb, 0,
4844 src4, tuple->ipv4.sport,
4845 dst4, tuple->ipv4.dport,
4846 dif, sdif, &refcounted);
4847 else
4848 sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
4849 dst4, tuple->ipv4.dport,
4850 dif, sdif, &udp_table, skb);
4851#if IS_ENABLED(CONFIG_IPV6)
4852 } else {
4853 struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
4854 struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
4855 u16 hnum = ntohs(tuple->ipv6.dport);
4856 int sdif = inet6_sdif(skb);
4857
4858 if (proto == IPPROTO_TCP)
4859 sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0,
4860 src6, tuple->ipv6.sport,
4861 dst6, hnum,
4862 dif, sdif, &refcounted);
4863 else if (likely(ipv6_bpf_stub))
4864 sk = ipv6_bpf_stub->udp6_lib_lookup(net,
4865 src6, tuple->ipv6.sport,
4866 dst6, hnum,
4867 dif, sdif,
4868 &udp_table, skb);
4869#endif
4870 }
4871
4872 if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) {
4873 WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
4874 sk = NULL;
4875 }
4876 return sk;
4877}
4878
4879/* bpf_sk_lookup performs the core lookup for different types of sockets,
4880 * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
4881 * Returns the socket as an 'unsigned long' to simplify the casting in the
4882 * callers to satisfy BPF_CALL declarations.
4883 */
4884static unsigned long
4885bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
4886 u8 proto, u64 netns_id, u64 flags)
4887{
4888 struct net *caller_net;
4889 struct sock *sk = NULL;
4890 u8 family = AF_UNSPEC;
4891 struct net *net;
4892
4893 family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6;
4894 if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags))
4895 goto out;
4896
4897 if (skb->dev)
4898 caller_net = dev_net(skb->dev);
4899 else
4900 caller_net = sock_net(skb->sk);
4901 if (netns_id) {
4902 net = get_net_ns_by_id(caller_net, netns_id);
4903 if (unlikely(!net))
4904 goto out;
4905 sk = sk_lookup(net, tuple, skb, family, proto);
4906 put_net(net);
4907 } else {
4908 net = caller_net;
4909 sk = sk_lookup(net, tuple, skb, family, proto);
4910 }
4911
4912 if (sk)
4913 sk = sk_to_full_sk(sk);
4914out:
4915 return (unsigned long) sk;
4916}
4917
4918BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
4919 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
4920{
4921 return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);
4922}
4923
4924static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
4925 .func = bpf_sk_lookup_tcp,
4926 .gpl_only = false,
4927 .pkt_access = true,
4928 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
4929 .arg1_type = ARG_PTR_TO_CTX,
4930 .arg2_type = ARG_PTR_TO_MEM,
4931 .arg3_type = ARG_CONST_SIZE,
4932 .arg4_type = ARG_ANYTHING,
4933 .arg5_type = ARG_ANYTHING,
4934};
4935
4936BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
4937 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
4938{
4939 return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags);
4940}
4941
4942static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
4943 .func = bpf_sk_lookup_udp,
4944 .gpl_only = false,
4945 .pkt_access = true,
4946 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
4947 .arg1_type = ARG_PTR_TO_CTX,
4948 .arg2_type = ARG_PTR_TO_MEM,
4949 .arg3_type = ARG_CONST_SIZE,
4950 .arg4_type = ARG_ANYTHING,
4951 .arg5_type = ARG_ANYTHING,
4952};
4953
4954BPF_CALL_1(bpf_sk_release, struct sock *, sk)
4955{
4956 if (!sock_flag(sk, SOCK_RCU_FREE))
4957 sock_gen_put(sk);
4958 return 0;
4959}
4960
4961static const struct bpf_func_proto bpf_sk_release_proto = {
4962 .func = bpf_sk_release,
4963 .gpl_only = false,
4964 .ret_type = RET_INTEGER,
4965 .arg1_type = ARG_PTR_TO_SOCKET,
4966};
4967#endif /* CONFIG_INET */
4968
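[Usage sketch, not part of the diff] The sk_lookup helpers above return a referenced (or RCU-protected) socket that the verifier forces the program to release. A minimal tc (cls_act) sketch; section naming follows current libbpf conventions and the tuple values are placeholders, since real programs would extract them from the packet:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("tc")
int sk_lookup_example(struct __sk_buff *skb)
{
	struct bpf_sock_tuple tuple = {};
	struct bpf_sock *sk;

	tuple.ipv4.saddr = bpf_htonl(0xc0a80001);	/* 192.168.0.1 */
	tuple.ipv4.daddr = bpf_htonl(0xc0a80002);	/* 192.168.0.2 */
	tuple.ipv4.sport = bpf_htons(12345);
	tuple.ipv4.dport = bpf_htons(80);

	/* netns_id 0 means the caller's netns here; flags must be 0. */
	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4), 0, 0);
	if (sk)
		bpf_sk_release(sk);	/* mandatory: drop the reference */

	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";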
4790bool bpf_helper_changes_pkt_data(void *func) 4969bool bpf_helper_changes_pkt_data(void *func)
4791{ 4970{
4792 if (func == bpf_skb_vlan_push || 4971 if (func == bpf_skb_vlan_push ||
@@ -4806,6 +4985,7 @@ bool bpf_helper_changes_pkt_data(void *func)
4806 func == bpf_xdp_adjust_head || 4985 func == bpf_xdp_adjust_head ||
4807 func == bpf_xdp_adjust_meta || 4986 func == bpf_xdp_adjust_meta ||
4808 func == bpf_msg_pull_data || 4987 func == bpf_msg_pull_data ||
4988 func == bpf_msg_push_data ||
4809 func == bpf_xdp_adjust_tail || 4989 func == bpf_xdp_adjust_tail ||
4810#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) 4990#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
4811 func == bpf_lwt_seg6_store_bytes || 4991 func == bpf_lwt_seg6_store_bytes ||
@@ -4828,6 +5008,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
4828 return &bpf_map_update_elem_proto; 5008 return &bpf_map_update_elem_proto;
4829 case BPF_FUNC_map_delete_elem: 5009 case BPF_FUNC_map_delete_elem:
4830 return &bpf_map_delete_elem_proto; 5010 return &bpf_map_delete_elem_proto;
5011 case BPF_FUNC_map_push_elem:
5012 return &bpf_map_push_elem_proto;
5013 case BPF_FUNC_map_pop_elem:
5014 return &bpf_map_pop_elem_proto;
5015 case BPF_FUNC_map_peek_elem:
5016 return &bpf_map_peek_elem_proto;
4831 case BPF_FUNC_get_prandom_u32: 5017 case BPF_FUNC_get_prandom_u32:
4832 return &bpf_get_prandom_u32_proto; 5018 return &bpf_get_prandom_u32_proto;
4833 case BPF_FUNC_get_smp_processor_id: 5019 case BPF_FUNC_get_smp_processor_id:
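[Usage sketch, not part of the diff] The map_push/pop/peek protos added to bpf_base_func_proto() expose the new queue/stack map helpers to every networking program type. A minimal sketch; the BTF-defined keyless map form assumes a libbpf version that accepts it:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 64);
	__type(value, __u32);
} events SEC(".maps");

SEC("xdp")
int producer(struct xdp_md *ctx)
{
	__u32 v = 42, head;

	bpf_map_push_elem(&events, &v, 0);		/* enqueue */
	if (bpf_map_peek_elem(&events, &head) == 0)	/* inspect the head */
		bpf_map_pop_elem(&events, &head);	/* dequeue */
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";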
@@ -4993,6 +5179,14 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
4993 case BPF_FUNC_skb_ancestor_cgroup_id: 5179 case BPF_FUNC_skb_ancestor_cgroup_id:
4994 return &bpf_skb_ancestor_cgroup_id_proto; 5180 return &bpf_skb_ancestor_cgroup_id_proto;
4995#endif 5181#endif
5182#ifdef CONFIG_INET
5183 case BPF_FUNC_sk_lookup_tcp:
5184 return &bpf_sk_lookup_tcp_proto;
5185 case BPF_FUNC_sk_lookup_udp:
5186 return &bpf_sk_lookup_udp_proto;
5187 case BPF_FUNC_sk_release:
5188 return &bpf_sk_release_proto;
5189#endif
4996 default: 5190 default:
4997 return bpf_base_func_proto(func_id); 5191 return bpf_base_func_proto(func_id);
4998 } 5192 }
@@ -5025,6 +5219,9 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5025 } 5219 }
5026} 5220}
5027 5221
5222const struct bpf_func_proto bpf_sock_map_update_proto __weak;
5223const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
5224
5028static const struct bpf_func_proto * 5225static const struct bpf_func_proto *
5029sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 5226sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5030{ 5227{
@@ -5048,6 +5245,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5048 } 5245 }
5049} 5246}
5050 5247
5248const struct bpf_func_proto bpf_msg_redirect_map_proto __weak;
5249const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak;
5250
5051static const struct bpf_func_proto * 5251static const struct bpf_func_proto *
5052sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 5252sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5053{ 5253{
@@ -5062,6 +5262,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5062 return &bpf_msg_cork_bytes_proto; 5262 return &bpf_msg_cork_bytes_proto;
5063 case BPF_FUNC_msg_pull_data: 5263 case BPF_FUNC_msg_pull_data:
5064 return &bpf_msg_pull_data_proto; 5264 return &bpf_msg_pull_data_proto;
5265 case BPF_FUNC_msg_push_data:
5266 return &bpf_msg_push_data_proto;
5065 case BPF_FUNC_get_local_storage: 5267 case BPF_FUNC_get_local_storage:
5066 return &bpf_get_local_storage_proto; 5268 return &bpf_get_local_storage_proto;
5067 default: 5269 default:
@@ -5069,6 +5271,9 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5069 } 5271 }
5070} 5272}
5071 5273
5274const struct bpf_func_proto bpf_sk_redirect_map_proto __weak;
5275const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak;
5276
5072static const struct bpf_func_proto * 5277static const struct bpf_func_proto *
5073sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 5278sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5074{ 5279{
@@ -5093,6 +5298,25 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5093 return &bpf_sk_redirect_hash_proto; 5298 return &bpf_sk_redirect_hash_proto;
5094 case BPF_FUNC_get_local_storage: 5299 case BPF_FUNC_get_local_storage:
5095 return &bpf_get_local_storage_proto; 5300 return &bpf_get_local_storage_proto;
5301#ifdef CONFIG_INET
5302 case BPF_FUNC_sk_lookup_tcp:
5303 return &bpf_sk_lookup_tcp_proto;
5304 case BPF_FUNC_sk_lookup_udp:
5305 return &bpf_sk_lookup_udp_proto;
5306 case BPF_FUNC_sk_release:
5307 return &bpf_sk_release_proto;
5308#endif
5309 default:
5310 return bpf_base_func_proto(func_id);
5311 }
5312}
5313
5314static const struct bpf_func_proto *
5315flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5316{
5317 switch (func_id) {
5318 case BPF_FUNC_skb_load_bytes:
5319 return &bpf_skb_load_bytes_proto;
5096 default: 5320 default:
5097 return bpf_base_func_proto(func_id); 5321 return bpf_base_func_proto(func_id);
5098 } 5322 }
@@ -5216,6 +5440,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
5216 if (size != size_default) 5440 if (size != size_default)
5217 return false; 5441 return false;
5218 break; 5442 break;
5443 case bpf_ctx_range(struct __sk_buff, flow_keys):
5444 if (size != sizeof(struct bpf_flow_keys *))
5445 return false;
5446 break;
5219 default: 5447 default:
5220 /* Only narrow read access allowed for now. */ 5448 /* Only narrow read access allowed for now. */
5221 if (type == BPF_WRITE) { 5449 if (type == BPF_WRITE) {
@@ -5241,6 +5469,7 @@ static bool sk_filter_is_valid_access(int off, int size,
5241 case bpf_ctx_range(struct __sk_buff, data): 5469 case bpf_ctx_range(struct __sk_buff, data):
5242 case bpf_ctx_range(struct __sk_buff, data_meta): 5470 case bpf_ctx_range(struct __sk_buff, data_meta):
5243 case bpf_ctx_range(struct __sk_buff, data_end): 5471 case bpf_ctx_range(struct __sk_buff, data_end):
5472 case bpf_ctx_range(struct __sk_buff, flow_keys):
5244 case bpf_ctx_range_till(struct __sk_buff, family, local_port): 5473 case bpf_ctx_range_till(struct __sk_buff, family, local_port):
5245 return false; 5474 return false;
5246 } 5475 }
@@ -5257,6 +5486,40 @@ static bool sk_filter_is_valid_access(int off, int size,
5257 return bpf_skb_is_valid_access(off, size, type, prog, info); 5486 return bpf_skb_is_valid_access(off, size, type, prog, info);
5258} 5487}
5259 5488
5489static bool cg_skb_is_valid_access(int off, int size,
5490 enum bpf_access_type type,
5491 const struct bpf_prog *prog,
5492 struct bpf_insn_access_aux *info)
5493{
5494 switch (off) {
5495 case bpf_ctx_range(struct __sk_buff, tc_classid):
5496 case bpf_ctx_range(struct __sk_buff, data_meta):
5497 case bpf_ctx_range(struct __sk_buff, flow_keys):
5498 return false;
5499 }
5500 if (type == BPF_WRITE) {
5501 switch (off) {
5502 case bpf_ctx_range(struct __sk_buff, mark):
5503 case bpf_ctx_range(struct __sk_buff, priority):
5504 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
5505 break;
5506 default:
5507 return false;
5508 }
5509 }
5510
5511 switch (off) {
5512 case bpf_ctx_range(struct __sk_buff, data):
5513 info->reg_type = PTR_TO_PACKET;
5514 break;
5515 case bpf_ctx_range(struct __sk_buff, data_end):
5516 info->reg_type = PTR_TO_PACKET_END;
5517 break;
5518 }
5519
5520 return bpf_skb_is_valid_access(off, size, type, prog, info);
5521}
5522
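[Usage sketch, not part of the diff] With cg_skb_is_valid_access() a cgroup/skb program can use direct packet access (data/data_end become PTR_TO_PACKET and PTR_TO_PACKET_END) and may write mark, priority and cb[]. A minimal egress sketch, assuming libbpf section naming; for these programs data starts at the network header:

#include <linux/bpf.h>
#include <linux/ip.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup_skb/egress")
int inspect_egress(struct __sk_buff *skb)
{
	void *data = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;
	struct iphdr *iph = data;

	if ((void *)(iph + 1) > data_end)
		return 1;			/* allow */

	/* Drop anything that is not plain IPv4, purely as an example. */
	return iph->version == 4 ? 1 : 0;
}

char _license[] SEC("license") = "GPL";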
5260static bool lwt_is_valid_access(int off, int size, 5523static bool lwt_is_valid_access(int off, int size,
5261 enum bpf_access_type type, 5524 enum bpf_access_type type,
5262 const struct bpf_prog *prog, 5525 const struct bpf_prog *prog,
@@ -5266,6 +5529,7 @@ static bool lwt_is_valid_access(int off, int size,
5266 case bpf_ctx_range(struct __sk_buff, tc_classid): 5529 case bpf_ctx_range(struct __sk_buff, tc_classid):
5267 case bpf_ctx_range_till(struct __sk_buff, family, local_port): 5530 case bpf_ctx_range_till(struct __sk_buff, family, local_port):
5268 case bpf_ctx_range(struct __sk_buff, data_meta): 5531 case bpf_ctx_range(struct __sk_buff, data_meta):
5532 case bpf_ctx_range(struct __sk_buff, flow_keys):
5269 return false; 5533 return false;
5270 } 5534 }
5271 5535
@@ -5351,23 +5615,29 @@ static bool __sock_filter_check_size(int off, int size,
5351 return size == size_default; 5615 return size == size_default;
5352} 5616}
5353 5617
5354static bool sock_filter_is_valid_access(int off, int size, 5618bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
5355 enum bpf_access_type type, 5619 struct bpf_insn_access_aux *info)
5356 const struct bpf_prog *prog,
5357 struct bpf_insn_access_aux *info)
5358{ 5620{
5359 if (off < 0 || off >= sizeof(struct bpf_sock)) 5621 if (off < 0 || off >= sizeof(struct bpf_sock))
5360 return false; 5622 return false;
5361 if (off % size != 0) 5623 if (off % size != 0)
5362 return false; 5624 return false;
5363 if (!__sock_filter_check_attach_type(off, type,
5364 prog->expected_attach_type))
5365 return false;
5366 if (!__sock_filter_check_size(off, size, info)) 5625 if (!__sock_filter_check_size(off, size, info))
5367 return false; 5626 return false;
5368 return true; 5627 return true;
5369} 5628}
5370 5629
5630static bool sock_filter_is_valid_access(int off, int size,
5631 enum bpf_access_type type,
5632 const struct bpf_prog *prog,
5633 struct bpf_insn_access_aux *info)
5634{
5635 if (!bpf_sock_is_valid_access(off, size, type, info))
5636 return false;
5637 return __sock_filter_check_attach_type(off, type,
5638 prog->expected_attach_type);
5639}
5640
5371static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, 5641static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
5372 const struct bpf_prog *prog, int drop_verdict) 5642 const struct bpf_prog *prog, int drop_verdict)
5373{ 5643{
@@ -5476,6 +5746,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
5476 case bpf_ctx_range(struct __sk_buff, data_end): 5746 case bpf_ctx_range(struct __sk_buff, data_end):
5477 info->reg_type = PTR_TO_PACKET_END; 5747 info->reg_type = PTR_TO_PACKET_END;
5478 break; 5748 break;
5749 case bpf_ctx_range(struct __sk_buff, flow_keys):
5479 case bpf_ctx_range_till(struct __sk_buff, family, local_port): 5750 case bpf_ctx_range_till(struct __sk_buff, family, local_port):
5480 return false; 5751 return false;
5481 } 5752 }
@@ -5677,6 +5948,7 @@ static bool sk_skb_is_valid_access(int off, int size,
5677 switch (off) { 5948 switch (off) {
5678 case bpf_ctx_range(struct __sk_buff, tc_classid): 5949 case bpf_ctx_range(struct __sk_buff, tc_classid):
5679 case bpf_ctx_range(struct __sk_buff, data_meta): 5950 case bpf_ctx_range(struct __sk_buff, data_meta):
5951 case bpf_ctx_range(struct __sk_buff, flow_keys):
5680 return false; 5952 return false;
5681 } 5953 }
5682 5954
@@ -5736,6 +6008,39 @@ static bool sk_msg_is_valid_access(int off, int size,
5736 return true; 6008 return true;
5737} 6009}
5738 6010
6011static bool flow_dissector_is_valid_access(int off, int size,
6012 enum bpf_access_type type,
6013 const struct bpf_prog *prog,
6014 struct bpf_insn_access_aux *info)
6015{
6016 if (type == BPF_WRITE) {
6017 switch (off) {
6018 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
6019 break;
6020 default:
6021 return false;
6022 }
6023 }
6024
6025 switch (off) {
6026 case bpf_ctx_range(struct __sk_buff, data):
6027 info->reg_type = PTR_TO_PACKET;
6028 break;
6029 case bpf_ctx_range(struct __sk_buff, data_end):
6030 info->reg_type = PTR_TO_PACKET_END;
6031 break;
6032 case bpf_ctx_range(struct __sk_buff, flow_keys):
6033 info->reg_type = PTR_TO_FLOW_KEYS;
6034 break;
6035 case bpf_ctx_range(struct __sk_buff, tc_classid):
6036 case bpf_ctx_range(struct __sk_buff, data_meta):
6037 case bpf_ctx_range_till(struct __sk_buff, family, local_port):
6038 return false;
6039 }
6040
6041 return bpf_skb_is_valid_access(off, size, type, prog, info);
6042}
6043
5739static u32 bpf_convert_ctx_access(enum bpf_access_type type, 6044static u32 bpf_convert_ctx_access(enum bpf_access_type type,
5740 const struct bpf_insn *si, 6045 const struct bpf_insn *si,
5741 struct bpf_insn *insn_buf, 6046 struct bpf_insn *insn_buf,
@@ -6030,15 +6335,24 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
6030 bpf_target_off(struct sock_common, 6335 bpf_target_off(struct sock_common,
6031 skc_num, 2, target_size)); 6336 skc_num, 2, target_size));
6032 break; 6337 break;
6338
6339 case offsetof(struct __sk_buff, flow_keys):
6340 off = si->off;
6341 off -= offsetof(struct __sk_buff, flow_keys);
6342 off += offsetof(struct sk_buff, cb);
6343 off += offsetof(struct qdisc_skb_cb, flow_keys);
6344 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
6345 si->src_reg, off);
6346 break;
6033 } 6347 }
6034 6348
6035 return insn - insn_buf; 6349 return insn - insn_buf;
6036} 6350}
6037 6351
6038static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, 6352u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
6039 const struct bpf_insn *si, 6353 const struct bpf_insn *si,
6040 struct bpf_insn *insn_buf, 6354 struct bpf_insn *insn_buf,
6041 struct bpf_prog *prog, u32 *target_size) 6355 struct bpf_prog *prog, u32 *target_size)
6042{ 6356{
6043 struct bpf_insn *insn = insn_buf; 6357 struct bpf_insn *insn = insn_buf;
6044 int off; 6358 int off;
@@ -6748,22 +7062,22 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
6748 7062
6749 switch (si->off) { 7063 switch (si->off) {
6750 case offsetof(struct sk_msg_md, data): 7064 case offsetof(struct sk_msg_md, data):
6751 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data), 7065 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
6752 si->dst_reg, si->src_reg, 7066 si->dst_reg, si->src_reg,
6753 offsetof(struct sk_msg_buff, data)); 7067 offsetof(struct sk_msg, data));
6754 break; 7068 break;
6755 case offsetof(struct sk_msg_md, data_end): 7069 case offsetof(struct sk_msg_md, data_end):
6756 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end), 7070 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end),
6757 si->dst_reg, si->src_reg, 7071 si->dst_reg, si->src_reg,
6758 offsetof(struct sk_msg_buff, data_end)); 7072 offsetof(struct sk_msg, data_end));
6759 break; 7073 break;
6760 case offsetof(struct sk_msg_md, family): 7074 case offsetof(struct sk_msg_md, family):
6761 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); 7075 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
6762 7076
6763 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 7077 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
6764 struct sk_msg_buff, sk), 7078 struct sk_msg, sk),
6765 si->dst_reg, si->src_reg, 7079 si->dst_reg, si->src_reg,
6766 offsetof(struct sk_msg_buff, sk)); 7080 offsetof(struct sk_msg, sk));
6767 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, 7081 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
6768 offsetof(struct sock_common, skc_family)); 7082 offsetof(struct sock_common, skc_family));
6769 break; 7083 break;
@@ -6772,9 +7086,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
6772 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); 7086 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
6773 7087
6774 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 7088 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
6775 struct sk_msg_buff, sk), 7089 struct sk_msg, sk),
6776 si->dst_reg, si->src_reg, 7090 si->dst_reg, si->src_reg,
6777 offsetof(struct sk_msg_buff, sk)); 7091 offsetof(struct sk_msg, sk));
6778 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 7092 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
6779 offsetof(struct sock_common, skc_daddr)); 7093 offsetof(struct sock_common, skc_daddr));
6780 break; 7094 break;
@@ -6784,9 +7098,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
6784 skc_rcv_saddr) != 4); 7098 skc_rcv_saddr) != 4);
6785 7099
6786 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 7100 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
6787 struct sk_msg_buff, sk), 7101 struct sk_msg, sk),
6788 si->dst_reg, si->src_reg, 7102 si->dst_reg, si->src_reg,
6789 offsetof(struct sk_msg_buff, sk)); 7103 offsetof(struct sk_msg, sk));
6790 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 7104 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
6791 offsetof(struct sock_common, 7105 offsetof(struct sock_common,
6792 skc_rcv_saddr)); 7106 skc_rcv_saddr));
@@ -6801,9 +7115,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
6801 off = si->off; 7115 off = si->off;
6802 off -= offsetof(struct sk_msg_md, remote_ip6[0]); 7116 off -= offsetof(struct sk_msg_md, remote_ip6[0]);
6803 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 7117 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
6804 struct sk_msg_buff, sk), 7118 struct sk_msg, sk),
6805 si->dst_reg, si->src_reg, 7119 si->dst_reg, si->src_reg,
6806 offsetof(struct sk_msg_buff, sk)); 7120 offsetof(struct sk_msg, sk));
6807 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 7121 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
6808 offsetof(struct sock_common, 7122 offsetof(struct sock_common,
6809 skc_v6_daddr.s6_addr32[0]) + 7123 skc_v6_daddr.s6_addr32[0]) +
@@ -6822,9 +7136,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
6822 off = si->off; 7136 off = si->off;
6823 off -= offsetof(struct sk_msg_md, local_ip6[0]); 7137 off -= offsetof(struct sk_msg_md, local_ip6[0]);
6824 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 7138 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
6825 struct sk_msg_buff, sk), 7139 struct sk_msg, sk),
6826 si->dst_reg, si->src_reg, 7140 si->dst_reg, si->src_reg,
6827 offsetof(struct sk_msg_buff, sk)); 7141 offsetof(struct sk_msg, sk));
6828 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 7142 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
6829 offsetof(struct sock_common, 7143 offsetof(struct sock_common,
6830 skc_v6_rcv_saddr.s6_addr32[0]) + 7144 skc_v6_rcv_saddr.s6_addr32[0]) +
@@ -6838,9 +7152,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
6838 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); 7152 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
6839 7153
6840 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 7154 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
6841 struct sk_msg_buff, sk), 7155 struct sk_msg, sk),
6842 si->dst_reg, si->src_reg, 7156 si->dst_reg, si->src_reg,
6843 offsetof(struct sk_msg_buff, sk)); 7157 offsetof(struct sk_msg, sk));
6844 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, 7158 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
6845 offsetof(struct sock_common, skc_dport)); 7159 offsetof(struct sock_common, skc_dport));
6846#ifndef __BIG_ENDIAN_BITFIELD 7160#ifndef __BIG_ENDIAN_BITFIELD
@@ -6852,9 +7166,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
6852 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); 7166 BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
6853 7167
6854 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 7168 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
6855 struct sk_msg_buff, sk), 7169 struct sk_msg, sk),
6856 si->dst_reg, si->src_reg, 7170 si->dst_reg, si->src_reg,
6857 offsetof(struct sk_msg_buff, sk)); 7171 offsetof(struct sk_msg, sk));
6858 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, 7172 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
6859 offsetof(struct sock_common, skc_num)); 7173 offsetof(struct sock_common, skc_num));
6860 break; 7174 break;
@@ -6898,7 +7212,7 @@ const struct bpf_prog_ops xdp_prog_ops = {
6898 7212
6899const struct bpf_verifier_ops cg_skb_verifier_ops = { 7213const struct bpf_verifier_ops cg_skb_verifier_ops = {
6900 .get_func_proto = cg_skb_func_proto, 7214 .get_func_proto = cg_skb_func_proto,
6901 .is_valid_access = sk_filter_is_valid_access, 7215 .is_valid_access = cg_skb_is_valid_access,
6902 .convert_ctx_access = bpf_convert_ctx_access, 7216 .convert_ctx_access = bpf_convert_ctx_access,
6903}; 7217};
6904 7218
@@ -6950,7 +7264,7 @@ const struct bpf_prog_ops lwt_seg6local_prog_ops = {
6950const struct bpf_verifier_ops cg_sock_verifier_ops = { 7264const struct bpf_verifier_ops cg_sock_verifier_ops = {
6951 .get_func_proto = sock_filter_func_proto, 7265 .get_func_proto = sock_filter_func_proto,
6952 .is_valid_access = sock_filter_is_valid_access, 7266 .is_valid_access = sock_filter_is_valid_access,
6953 .convert_ctx_access = sock_filter_convert_ctx_access, 7267 .convert_ctx_access = bpf_sock_convert_ctx_access,
6954}; 7268};
6955 7269
6956const struct bpf_prog_ops cg_sock_prog_ops = { 7270const struct bpf_prog_ops cg_sock_prog_ops = {
@@ -6993,6 +7307,15 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = {
6993const struct bpf_prog_ops sk_msg_prog_ops = { 7307const struct bpf_prog_ops sk_msg_prog_ops = {
6994}; 7308};
6995 7309
7310const struct bpf_verifier_ops flow_dissector_verifier_ops = {
7311 .get_func_proto = flow_dissector_func_proto,
7312 .is_valid_access = flow_dissector_is_valid_access,
7313 .convert_ctx_access = bpf_convert_ctx_access,
7314};
7315
7316const struct bpf_prog_ops flow_dissector_prog_ops = {
7317};
7318
6996int sk_detach_filter(struct sock *sk) 7319int sk_detach_filter(struct sock *sk)
6997{ 7320{
6998 int ret = -ENOENT; 7321 int ret = -ENOENT;
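[Usage sketch, not part of the diff] The new flow_dissector program type sees skb->flow_keys (PTR_TO_FLOW_KEYS) plus bpf_skb_load_bytes(), fills in the keys and returns BPF_OK, or BPF_DROP on a parse failure. A heavily simplified IPv4-only sketch; a real dissector would walk nhoff/thoff across VLANs, encapsulation and fragments:

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("flow_dissector")
int dissect(struct __sk_buff *skb)
{
	struct bpf_flow_keys *keys = skb->flow_keys;
	struct iphdr iph;

	/* nhoff is pre-set by the kernel to the network header offset. */
	if (bpf_skb_load_bytes(skb, keys->nhoff, &iph, sizeof(iph)))
		return BPF_DROP;
	if (iph.version != 4)
		return BPF_DROP;

	keys->addr_proto = ETH_P_IP;
	keys->ipv4_src = iph.saddr;
	keys->ipv4_dst = iph.daddr;
	keys->ip_proto = iph.protocol;
	keys->n_proto = bpf_htons(ETH_P_IP);
	keys->thoff = keys->nhoff + iph.ihl * 4;

	return BPF_OK;
}

char _license[] SEC("license") = "GPL";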
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index ce9eeeb7c024..676f3ad629f9 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -25,6 +25,9 @@
25#include <net/flow_dissector.h> 25#include <net/flow_dissector.h>
26#include <scsi/fc/fc_fcoe.h> 26#include <scsi/fc/fc_fcoe.h>
27#include <uapi/linux/batadv_packet.h> 27#include <uapi/linux/batadv_packet.h>
28#include <linux/bpf.h>
29
30static DEFINE_MUTEX(flow_dissector_mutex);
28 31
29static void dissector_set_key(struct flow_dissector *flow_dissector, 32static void dissector_set_key(struct flow_dissector *flow_dissector,
30 enum flow_dissector_key_id key_id) 33 enum flow_dissector_key_id key_id)
@@ -62,6 +65,44 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
62} 65}
63EXPORT_SYMBOL(skb_flow_dissector_init); 66EXPORT_SYMBOL(skb_flow_dissector_init);
64 67
68int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
69 struct bpf_prog *prog)
70{
71 struct bpf_prog *attached;
72 struct net *net;
73
74 net = current->nsproxy->net_ns;
75 mutex_lock(&flow_dissector_mutex);
76 attached = rcu_dereference_protected(net->flow_dissector_prog,
77 lockdep_is_held(&flow_dissector_mutex));
78 if (attached) {
79 /* Only one BPF program can be attached at a time */
80 mutex_unlock(&flow_dissector_mutex);
81 return -EEXIST;
82 }
83 rcu_assign_pointer(net->flow_dissector_prog, prog);
84 mutex_unlock(&flow_dissector_mutex);
85 return 0;
86}
87
88int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
89{
90 struct bpf_prog *attached;
91 struct net *net;
92
93 net = current->nsproxy->net_ns;
94 mutex_lock(&flow_dissector_mutex);
95 attached = rcu_dereference_protected(net->flow_dissector_prog,
96 lockdep_is_held(&flow_dissector_mutex));
97 if (!attached) {
98 mutex_unlock(&flow_dissector_mutex);
99 return -ENOENT;
100 }
101 bpf_prog_put(attached);
102 RCU_INIT_POINTER(net->flow_dissector_prog, NULL);
103 mutex_unlock(&flow_dissector_mutex);
104 return 0;
105}
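[Usage sketch, not part of the diff] Attachment is per network namespace and limited to a single program: a second attach fails with -EEXIST, and detaching with nothing attached returns -ENOENT. A minimal userspace sketch assuming libbpf's thin syscall wrappers:

#include <bpf/bpf.h>

int attach_dissector(int prog_fd)
{
	/* target_fd is unused for BPF_FLOW_DISSECTOR; pass 0. */
	return bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0);
}

int detach_dissector(void)
{
	return bpf_prog_detach(0, BPF_FLOW_DISSECTOR);
}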
65/** 106/**
66 * skb_flow_get_be16 - extract be16 entity 107 * skb_flow_get_be16 - extract be16 entity
67 * @skb: sk_buff to extract from 108 * @skb: sk_buff to extract from
@@ -382,8 +423,8 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
382 offset += sizeof(struct gre_base_hdr); 423 offset += sizeof(struct gre_base_hdr);
383 424
384 if (hdr->flags & GRE_CSUM) 425 if (hdr->flags & GRE_CSUM)
385 offset += sizeof(((struct gre_full_hdr *) 0)->csum) + 426 offset += FIELD_SIZEOF(struct gre_full_hdr, csum) +
386 sizeof(((struct gre_full_hdr *) 0)->reserved1); 427 FIELD_SIZEOF(struct gre_full_hdr, reserved1);
387 428
388 if (hdr->flags & GRE_KEY) { 429 if (hdr->flags & GRE_KEY) {
389 const __be32 *keyid; 430 const __be32 *keyid;
@@ -405,11 +446,11 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
405 else 446 else
406 key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; 447 key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK;
407 } 448 }
408 offset += sizeof(((struct gre_full_hdr *) 0)->key); 449 offset += FIELD_SIZEOF(struct gre_full_hdr, key);
409 } 450 }
410 451
411 if (hdr->flags & GRE_SEQ) 452 if (hdr->flags & GRE_SEQ)
412 offset += sizeof(((struct pptp_gre_header *) 0)->seq); 453 offset += FIELD_SIZEOF(struct pptp_gre_header, seq);
413 454
414 if (gre_ver == 0) { 455 if (gre_ver == 0) {
415 if (*p_proto == htons(ETH_P_TEB)) { 456 if (*p_proto == htons(ETH_P_TEB)) {
@@ -436,7 +477,7 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
436 u8 *ppp_hdr; 477 u8 *ppp_hdr;
437 478
438 if (hdr->flags & GRE_ACK) 479 if (hdr->flags & GRE_ACK)
439 offset += sizeof(((struct pptp_gre_header *) 0)->ack); 480 offset += FIELD_SIZEOF(struct pptp_gre_header, ack);
440 481
441 ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset, 482 ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset,
442 sizeof(_ppp_hdr), 483 sizeof(_ppp_hdr),
@@ -588,6 +629,60 @@ static bool skb_flow_dissect_allowed(int *num_hdrs)
588 return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS); 629 return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS);
589} 630}
590 631
632static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
633 struct flow_dissector *flow_dissector,
634 void *target_container)
635{
636 struct flow_dissector_key_control *key_control;
637 struct flow_dissector_key_basic *key_basic;
638 struct flow_dissector_key_addrs *key_addrs;
639 struct flow_dissector_key_ports *key_ports;
640
641 key_control = skb_flow_dissector_target(flow_dissector,
642 FLOW_DISSECTOR_KEY_CONTROL,
643 target_container);
644 key_control->thoff = flow_keys->thoff;
645 if (flow_keys->is_frag)
646 key_control->flags |= FLOW_DIS_IS_FRAGMENT;
647 if (flow_keys->is_first_frag)
648 key_control->flags |= FLOW_DIS_FIRST_FRAG;
649 if (flow_keys->is_encap)
650 key_control->flags |= FLOW_DIS_ENCAPSULATION;
651
652 key_basic = skb_flow_dissector_target(flow_dissector,
653 FLOW_DISSECTOR_KEY_BASIC,
654 target_container);
655 key_basic->n_proto = flow_keys->n_proto;
656 key_basic->ip_proto = flow_keys->ip_proto;
657
658 if (flow_keys->addr_proto == ETH_P_IP &&
659 dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
660 key_addrs = skb_flow_dissector_target(flow_dissector,
661 FLOW_DISSECTOR_KEY_IPV4_ADDRS,
662 target_container);
663 key_addrs->v4addrs.src = flow_keys->ipv4_src;
664 key_addrs->v4addrs.dst = flow_keys->ipv4_dst;
665 key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
666 } else if (flow_keys->addr_proto == ETH_P_IPV6 &&
667 dissector_uses_key(flow_dissector,
668 FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
669 key_addrs = skb_flow_dissector_target(flow_dissector,
670 FLOW_DISSECTOR_KEY_IPV6_ADDRS,
671 target_container);
672 memcpy(&key_addrs->v6addrs, &flow_keys->ipv6_src,
673 sizeof(key_addrs->v6addrs));
674 key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
675 }
676
677 if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) {
678 key_ports = skb_flow_dissector_target(flow_dissector,
679 FLOW_DISSECTOR_KEY_PORTS,
680 target_container);
681 key_ports->src = flow_keys->sport;
682 key_ports->dst = flow_keys->dport;
683 }
684}
685
591/** 686/**
592 * __skb_flow_dissect - extract the flow_keys struct and return it 687 * __skb_flow_dissect - extract the flow_keys struct and return it
593 * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified 688 * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
@@ -619,6 +714,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
619 struct flow_dissector_key_vlan *key_vlan; 714 struct flow_dissector_key_vlan *key_vlan;
620 enum flow_dissect_ret fdret; 715 enum flow_dissect_ret fdret;
621 enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; 716 enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
717 struct bpf_prog *attached = NULL;
622 int num_hdrs = 0; 718 int num_hdrs = 0;
623 u8 ip_proto = 0; 719 u8 ip_proto = 0;
624 bool ret; 720 bool ret;
@@ -658,6 +754,50 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
658 FLOW_DISSECTOR_KEY_BASIC, 754 FLOW_DISSECTOR_KEY_BASIC,
659 target_container); 755 target_container);
660 756
757 rcu_read_lock();
758 if (skb) {
759 if (skb->dev)
760 attached = rcu_dereference(dev_net(skb->dev)->flow_dissector_prog);
761 else if (skb->sk)
762 attached = rcu_dereference(sock_net(skb->sk)->flow_dissector_prog);
763 else
764 WARN_ON_ONCE(1);
765 }
766 if (attached) {
767 /* Note that even though the const qualifier is discarded
768 * throughout the execution of the BPF program, all changes(the
769 * control block) are reverted after the BPF program returns.
770 * Therefore, __skb_flow_dissect does not alter the skb.
771 */
772 struct bpf_flow_keys flow_keys = {};
773 struct bpf_skb_data_end cb_saved;
774 struct bpf_skb_data_end *cb;
775 u32 result;
776
777 cb = (struct bpf_skb_data_end *)skb->cb;
778
779 /* Save Control Block */
780 memcpy(&cb_saved, cb, sizeof(cb_saved));
781 memset(cb, 0, sizeof(cb_saved));
782
783 /* Pass parameters to the BPF program */
784 cb->qdisc_cb.flow_keys = &flow_keys;
785 flow_keys.nhoff = nhoff;
786
787 bpf_compute_data_pointers((struct sk_buff *)skb);
788 result = BPF_PROG_RUN(attached, skb);
789
790 /* Restore state */
791 memcpy(cb, &cb_saved, sizeof(cb_saved));
792
793 __skb_flow_bpf_to_target(&flow_keys, flow_dissector,
794 target_container);
795 key_control->thoff = min_t(u16, key_control->thoff, skb->len);
796 rcu_read_unlock();
797 return result == BPF_OK;
798 }
799 rcu_read_unlock();
800
661 if (dissector_uses_key(flow_dissector, 801 if (dissector_uses_key(flow_dissector,
662 FLOW_DISSECTOR_KEY_ETH_ADDRS)) { 802 FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
663 struct ethhdr *eth = eth_hdr(skb); 803 struct ethhdr *eth = eth_hdr(skb);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 188d693cb251..9bf1b9ad1780 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -162,6 +162,34 @@ __gnet_stats_copy_basic(const seqcount_t *running,
162} 162}
163EXPORT_SYMBOL(__gnet_stats_copy_basic); 163EXPORT_SYMBOL(__gnet_stats_copy_basic);
164 164
165static int
166___gnet_stats_copy_basic(const seqcount_t *running,
167 struct gnet_dump *d,
168 struct gnet_stats_basic_cpu __percpu *cpu,
169 struct gnet_stats_basic_packed *b,
170 int type)
171{
172 struct gnet_stats_basic_packed bstats = {0};
173
174 __gnet_stats_copy_basic(running, &bstats, cpu, b);
175
176 if (d->compat_tc_stats && type == TCA_STATS_BASIC) {
177 d->tc_stats.bytes = bstats.bytes;
178 d->tc_stats.packets = bstats.packets;
179 }
180
181 if (d->tail) {
182 struct gnet_stats_basic sb;
183
184 memset(&sb, 0, sizeof(sb));
185 sb.bytes = bstats.bytes;
186 sb.packets = bstats.packets;
187 return gnet_stats_copy(d, type, &sb, sizeof(sb),
188 TCA_STATS_PAD);
189 }
190 return 0;
191}
192
165/** 193/**
166 * gnet_stats_copy_basic - copy basic statistics into statistic TLV 194 * gnet_stats_copy_basic - copy basic statistics into statistic TLV
167 * @running: seqcount_t pointer 195 * @running: seqcount_t pointer
@@ -181,29 +209,36 @@ gnet_stats_copy_basic(const seqcount_t *running,
181 struct gnet_stats_basic_cpu __percpu *cpu, 209 struct gnet_stats_basic_cpu __percpu *cpu,
182 struct gnet_stats_basic_packed *b) 210 struct gnet_stats_basic_packed *b)
183{ 211{
184 struct gnet_stats_basic_packed bstats = {0}; 212 return ___gnet_stats_copy_basic(running, d, cpu, b,
185 213 TCA_STATS_BASIC);
186 __gnet_stats_copy_basic(running, &bstats, cpu, b);
187
188 if (d->compat_tc_stats) {
189 d->tc_stats.bytes = bstats.bytes;
190 d->tc_stats.packets = bstats.packets;
191 }
192
193 if (d->tail) {
194 struct gnet_stats_basic sb;
195
196 memset(&sb, 0, sizeof(sb));
197 sb.bytes = bstats.bytes;
198 sb.packets = bstats.packets;
199 return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb),
200 TCA_STATS_PAD);
201 }
202 return 0;
203} 214}
204EXPORT_SYMBOL(gnet_stats_copy_basic); 215EXPORT_SYMBOL(gnet_stats_copy_basic);
205 216
206/** 217/**
218 * gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV
219 * @running: seqcount_t pointer
220 * @d: dumping handle
221 * @cpu: copy statistic per cpu
222 * @b: basic statistics
223 *
224 * Appends the basic statistics to the top level TLV created by
225 * gnet_stats_start_copy().
226 *
227 * Returns 0 on success or -1 with the statistic lock released
228 * if the room in the socket buffer was not sufficient.
229 */
230int
231gnet_stats_copy_basic_hw(const seqcount_t *running,
232 struct gnet_dump *d,
233 struct gnet_stats_basic_cpu __percpu *cpu,
234 struct gnet_stats_basic_packed *b)
235{
236 return ___gnet_stats_copy_basic(running, d, cpu, b,
237 TCA_STATS_BASIC_HW);
238}
239EXPORT_SYMBOL(gnet_stats_copy_basic_hw);
240
241/**
207 * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV 242 * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
208 * @d: dumping handle 243 * @d: dumping handle
209 * @rate_est: rate estimator 244 * @rate_est: rate estimator
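[Usage sketch, not part of the diff] gnet_stats_copy_basic_hw() shares the internal ___gnet_stats_copy_basic() with gnet_stats_copy_basic() and only changes the TLV type to TCA_STATS_BASIC_HW, so a qdisc or classifier that keeps separate hardware-offloaded counters can dump both. A kernel-side sketch; the example structure and its fields are illustrative, not from this patch:

#include <net/gen_stats.h>

struct example_sched {
	struct gnet_stats_basic_packed bstats;
	struct gnet_stats_basic_packed bstats_hw;
	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
	struct gnet_stats_basic_cpu __percpu *cpu_bstats_hw;
};

static int example_dump_stats(struct example_sched *q, struct gnet_dump *d)
{
	/* Software counters under TCA_STATS_BASIC, offloaded ones under
	 * TCA_STATS_BASIC_HW; a NULL running seqcount is acceptable.
	 */
	if (gnet_stats_copy_basic(NULL, d, q->cpu_bstats, &q->bstats) < 0 ||
	    gnet_stats_copy_basic_hw(NULL, d, q->cpu_bstats_hw, &q->bstats_hw) < 0)
		return -1;
	return 0;
}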
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index e38e641e98d5..7f51efb2b3ab 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -155,7 +155,7 @@ static void linkwatch_do_dev(struct net_device *dev)
155 clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); 155 clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
156 156
157 rfc2863_policy(dev); 157 rfc2863_policy(dev);
158 if (dev->flags & IFF_UP) { 158 if (dev->flags & IFF_UP && netif_device_present(dev)) {
159 if (netif_carrier_ok(dev)) 159 if (netif_carrier_ok(dev))
160 dev_activate(dev); 160 dev_activate(dev);
161 else 161 else
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 4e07824eec5e..41954e42a2de 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -232,7 +232,8 @@ static void pneigh_queue_purge(struct sk_buff_head *list)
232 } 232 }
233} 233}
234 234
235static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) 235static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
236 bool skip_perm)
236{ 237{
237 int i; 238 int i;
238 struct neigh_hash_table *nht; 239 struct neigh_hash_table *nht;
@@ -250,6 +251,10 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
250 np = &n->next; 251 np = &n->next;
251 continue; 252 continue;
252 } 253 }
254 if (skip_perm && n->nud_state & NUD_PERMANENT) {
255 np = &n->next;
256 continue;
257 }
253 rcu_assign_pointer(*np, 258 rcu_assign_pointer(*np,
254 rcu_dereference_protected(n->next, 259 rcu_dereference_protected(n->next,
255 lockdep_is_held(&tbl->lock))); 260 lockdep_is_held(&tbl->lock)));
@@ -285,21 +290,35 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
285void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) 290void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
286{ 291{
287 write_lock_bh(&tbl->lock); 292 write_lock_bh(&tbl->lock);
288 neigh_flush_dev(tbl, dev); 293 neigh_flush_dev(tbl, dev, false);
289 write_unlock_bh(&tbl->lock); 294 write_unlock_bh(&tbl->lock);
290} 295}
291EXPORT_SYMBOL(neigh_changeaddr); 296EXPORT_SYMBOL(neigh_changeaddr);
292 297
293int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) 298static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
299 bool skip_perm)
294{ 300{
295 write_lock_bh(&tbl->lock); 301 write_lock_bh(&tbl->lock);
296 neigh_flush_dev(tbl, dev); 302 neigh_flush_dev(tbl, dev, skip_perm);
297 pneigh_ifdown_and_unlock(tbl, dev); 303 pneigh_ifdown_and_unlock(tbl, dev);
298 304
299 del_timer_sync(&tbl->proxy_timer); 305 del_timer_sync(&tbl->proxy_timer);
300 pneigh_queue_purge(&tbl->proxy_queue); 306 pneigh_queue_purge(&tbl->proxy_queue);
301 return 0; 307 return 0;
302} 308}
309
310int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev)
311{
312 __neigh_ifdown(tbl, dev, true);
313 return 0;
314}
315EXPORT_SYMBOL(neigh_carrier_down);
316
317int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
318{
319 __neigh_ifdown(tbl, dev, false);
320 return 0;
321}
303EXPORT_SYMBOL(neigh_ifdown); 322EXPORT_SYMBOL(neigh_ifdown);
304 323
305static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) 324static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
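[Usage sketch, not part of the diff] neigh_carrier_down() is the skip_perm variant of neigh_ifdown(): it evicts learned entries but keeps NUD_PERMANENT ones, which is the behaviour wanted when only the carrier goes away. A simplified, hypothetical caller; in this series the actual users are the ARP/ND netdev event handlers:

#include <linux/netdevice.h>
#include <net/arp.h>
#include <net/neighbour.h>

static int example_netdev_event(struct net_device *dev, unsigned long event)
{
	/* Flush learned IPv4 neighbours on carrier loss, keep static ones. */
	if (event == NETDEV_CHANGE && !netif_carrier_ok(dev))
		neigh_carrier_down(&arp_tbl, dev);
	return 0;
}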
@@ -1279,11 +1298,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
1279 neigh->arp_queue_len_bytes = 0; 1298 neigh->arp_queue_len_bytes = 0;
1280 } 1299 }
1281out: 1300out:
1282 if (update_isrouter) { 1301 if (update_isrouter)
1283 neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ? 1302 neigh_update_is_router(neigh, flags, &notify);
1284 (neigh->flags | NTF_ROUTER) :
1285 (neigh->flags & ~NTF_ROUTER);
1286 }
1287 write_unlock_bh(&neigh->lock); 1303 write_unlock_bh(&neigh->lock);
1288 1304
1289 if (notify) 1305 if (notify)
@@ -1711,7 +1727,8 @@ out:
1711static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, 1727static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
1712 struct netlink_ext_ack *extack) 1728 struct netlink_ext_ack *extack)
1713{ 1729{
1714 int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE; 1730 int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE |
1731 NEIGH_UPDATE_F_OVERRIDE_ISROUTER;
1715 struct net *net = sock_net(skb->sk); 1732 struct net *net = sock_net(skb->sk);
1716 struct ndmsg *ndm; 1733 struct ndmsg *ndm;
1717 struct nlattr *tb[NDA_MAX+1]; 1734 struct nlattr *tb[NDA_MAX+1];
@@ -1786,12 +1803,16 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
1786 } 1803 }
1787 1804
1788 if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) 1805 if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
1789 flags &= ~NEIGH_UPDATE_F_OVERRIDE; 1806 flags &= ~(NEIGH_UPDATE_F_OVERRIDE |
1807 NEIGH_UPDATE_F_OVERRIDE_ISROUTER);
1790 } 1808 }
1791 1809
1792 if (ndm->ndm_flags & NTF_EXT_LEARNED) 1810 if (ndm->ndm_flags & NTF_EXT_LEARNED)
1793 flags |= NEIGH_UPDATE_F_EXT_LEARNED; 1811 flags |= NEIGH_UPDATE_F_EXT_LEARNED;
1794 1812
1813 if (ndm->ndm_flags & NTF_ROUTER)
1814 flags |= NEIGH_UPDATE_F_ISROUTER;
1815
1795 if (ndm->ndm_flags & NTF_USE) { 1816 if (ndm->ndm_flags & NTF_USE) {
1796 neigh_event_send(neigh, NULL); 1817 neigh_event_send(neigh, NULL);
1797 err = 0; 1818 err = 0;
@@ -2161,15 +2182,47 @@ errout:
2161 return err; 2182 return err;
2162} 2183}
2163 2184
2185static int neightbl_valid_dump_info(const struct nlmsghdr *nlh,
2186 struct netlink_ext_ack *extack)
2187{
2188 struct ndtmsg *ndtm;
2189
2190 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndtm))) {
2191 NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request");
2192 return -EINVAL;
2193 }
2194
2195 ndtm = nlmsg_data(nlh);
2196 if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) {
2197 NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request");
2198 return -EINVAL;
2199 }
2200
2201 if (nlmsg_attrlen(nlh, sizeof(*ndtm))) {
2202 NL_SET_ERR_MSG(extack, "Invalid data after header in neighbor table dump request");
2203 return -EINVAL;
2204 }
2205
2206 return 0;
2207}
2208
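[Usage sketch, not part of the diff] neightbl_valid_dump_info() only runs when cb->strict_check is set, which a dump requester opts into per netlink socket; malformed RTM_GETNEIGHTBL dump headers are then rejected with the extack messages above. A minimal userspace sketch; the fallback #defines cover older uapi headers:

#include <sys/socket.h>
#include <linux/netlink.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK		270
#endif
#ifndef NETLINK_GET_STRICT_CHK
#define NETLINK_GET_STRICT_CHK	12
#endif

static int enable_strict(int nlfd)
{
	int one = 1;

	/* Ask the kernel to strictly validate dump request headers. */
	return setsockopt(nlfd, SOL_NETLINK, NETLINK_GET_STRICT_CHK,
			  &one, sizeof(one));
}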
2164static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) 2209static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
2165{ 2210{
2211 const struct nlmsghdr *nlh = cb->nlh;
2166 struct net *net = sock_net(skb->sk); 2212 struct net *net = sock_net(skb->sk);
2167 int family, tidx, nidx = 0; 2213 int family, tidx, nidx = 0;
2168 int tbl_skip = cb->args[0]; 2214 int tbl_skip = cb->args[0];
2169 int neigh_skip = cb->args[1]; 2215 int neigh_skip = cb->args[1];
2170 struct neigh_table *tbl; 2216 struct neigh_table *tbl;
2171 2217
2172 family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; 2218 if (cb->strict_check) {
2219 int err = neightbl_valid_dump_info(nlh, cb->extack);
2220
2221 if (err < 0)
2222 return err;
2223 }
2224
2225 family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
2173 2226
2174 for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { 2227 for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
2175 struct neigh_parms *p; 2228 struct neigh_parms *p;
@@ -2182,7 +2235,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
2182 continue; 2235 continue;
2183 2236
2184 if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid, 2237 if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid,
2185 cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL, 2238 nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
2186 NLM_F_MULTI) < 0) 2239 NLM_F_MULTI) < 0)
2187 break; 2240 break;
2188 2241
@@ -2197,7 +2250,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
2197 2250
2198 if (neightbl_fill_param_info(skb, tbl, p, 2251 if (neightbl_fill_param_info(skb, tbl, p,
2199 NETLINK_CB(cb->skb).portid, 2252 NETLINK_CB(cb->skb).portid,
2200 cb->nlh->nlmsg_seq, 2253 nlh->nlmsg_seq,
2201 RTM_NEWNEIGHTBL, 2254 RTM_NEWNEIGHTBL,
2202 NLM_F_MULTI) < 0) 2255 NLM_F_MULTI) < 0)
2203 goto out; 2256 goto out;
@@ -2311,7 +2364,7 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx)
2311 if (!master_idx) 2364 if (!master_idx)
2312 return false; 2365 return false;
2313 2366
2314 master = netdev_master_upper_dev_get(dev); 2367 master = dev ? netdev_master_upper_dev_get(dev) : NULL;
2315 if (!master || master->ifindex != master_idx) 2368 if (!master || master->ifindex != master_idx)
2316 return true; 2369 return true;
2317 2370
@@ -2320,41 +2373,30 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx)
2320 2373
2321static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx) 2374static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx)
2322{ 2375{
2323 if (filter_idx && dev->ifindex != filter_idx) 2376 if (filter_idx && (!dev || dev->ifindex != filter_idx))
2324 return true; 2377 return true;
2325 2378
2326 return false; 2379 return false;
2327} 2380}
2328 2381
2382struct neigh_dump_filter {
2383 int master_idx;
2384 int dev_idx;
2385};
2386
2329static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, 2387static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
2330 struct netlink_callback *cb) 2388 struct netlink_callback *cb,
2389 struct neigh_dump_filter *filter)
2331{ 2390{
2332 struct net *net = sock_net(skb->sk); 2391 struct net *net = sock_net(skb->sk);
2333 const struct nlmsghdr *nlh = cb->nlh;
2334 struct nlattr *tb[NDA_MAX + 1];
2335 struct neighbour *n; 2392 struct neighbour *n;
2336 int rc, h, s_h = cb->args[1]; 2393 int rc, h, s_h = cb->args[1];
2337 int idx, s_idx = idx = cb->args[2]; 2394 int idx, s_idx = idx = cb->args[2];
2338 struct neigh_hash_table *nht; 2395 struct neigh_hash_table *nht;
2339 int filter_master_idx = 0, filter_idx = 0;
2340 unsigned int flags = NLM_F_MULTI; 2396 unsigned int flags = NLM_F_MULTI;
2341 int err;
2342 2397
2343 err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, NULL); 2398 if (filter->dev_idx || filter->master_idx)
2344 if (!err) { 2399 flags |= NLM_F_DUMP_FILTERED;
2345 if (tb[NDA_IFINDEX]) {
2346 if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
2347 return -EINVAL;
2348 filter_idx = nla_get_u32(tb[NDA_IFINDEX]);
2349 }
2350 if (tb[NDA_MASTER]) {
2351 if (nla_len(tb[NDA_MASTER]) != sizeof(u32))
2352 return -EINVAL;
2353 filter_master_idx = nla_get_u32(tb[NDA_MASTER]);
2354 }
2355 if (filter_idx || filter_master_idx)
2356 flags |= NLM_F_DUMP_FILTERED;
2357 }
2358 2400
2359 rcu_read_lock_bh(); 2401 rcu_read_lock_bh();
2360 nht = rcu_dereference_bh(tbl->nht); 2402 nht = rcu_dereference_bh(tbl->nht);
@@ -2367,8 +2409,8 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
2367 n = rcu_dereference_bh(n->next)) { 2409 n = rcu_dereference_bh(n->next)) {
2368 if (idx < s_idx || !net_eq(dev_net(n->dev), net)) 2410 if (idx < s_idx || !net_eq(dev_net(n->dev), net))
2369 goto next; 2411 goto next;
2370 if (neigh_ifindex_filtered(n->dev, filter_idx) || 2412 if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||
2371 neigh_master_filtered(n->dev, filter_master_idx)) 2413 neigh_master_filtered(n->dev, filter->master_idx))
2372 goto next; 2414 goto next;
2373 if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, 2415 if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
2374 cb->nlh->nlmsg_seq, 2416 cb->nlh->nlmsg_seq,
@@ -2390,12 +2432,17 @@ out:
2390} 2432}
2391 2433
2392static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, 2434static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
2393 struct netlink_callback *cb) 2435 struct netlink_callback *cb,
2436 struct neigh_dump_filter *filter)
2394{ 2437{
2395 struct pneigh_entry *n; 2438 struct pneigh_entry *n;
2396 struct net *net = sock_net(skb->sk); 2439 struct net *net = sock_net(skb->sk);
2397 int rc, h, s_h = cb->args[3]; 2440 int rc, h, s_h = cb->args[3];
2398 int idx, s_idx = idx = cb->args[4]; 2441 int idx, s_idx = idx = cb->args[4];
2442 unsigned int flags = NLM_F_MULTI;
2443
2444 if (filter->dev_idx || filter->master_idx)
2445 flags |= NLM_F_DUMP_FILTERED;
2399 2446
2400 read_lock_bh(&tbl->lock); 2447 read_lock_bh(&tbl->lock);
2401 2448
@@ -2405,10 +2452,12 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
2405 for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { 2452 for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) {
2406 if (idx < s_idx || pneigh_net(n) != net) 2453 if (idx < s_idx || pneigh_net(n) != net)
2407 goto next; 2454 goto next;
2455 if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||
2456 neigh_master_filtered(n->dev, filter->master_idx))
2457 goto next;
2408 if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, 2458 if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
2409 cb->nlh->nlmsg_seq, 2459 cb->nlh->nlmsg_seq,
2410 RTM_NEWNEIGH, 2460 RTM_NEWNEIGH, flags, tbl) < 0) {
2411 NLM_F_MULTI, tbl) < 0) {
2412 read_unlock_bh(&tbl->lock); 2461 read_unlock_bh(&tbl->lock);
2413 rc = -1; 2462 rc = -1;
2414 goto out; 2463 goto out;
@@ -2427,22 +2476,91 @@ out:
2427 2476
2428} 2477}
2429 2478
2479static int neigh_valid_dump_req(const struct nlmsghdr *nlh,
2480 bool strict_check,
2481 struct neigh_dump_filter *filter,
2482 struct netlink_ext_ack *extack)
2483{
2484 struct nlattr *tb[NDA_MAX + 1];
2485 int err, i;
2486
2487 if (strict_check) {
2488 struct ndmsg *ndm;
2489
2490 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
2491 NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request");
2492 return -EINVAL;
2493 }
2494
2495 ndm = nlmsg_data(nlh);
2496 if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex ||
2497 ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) {
2498 NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request");
2499 return -EINVAL;
2500 }
2501
2502 err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX,
2503 NULL, extack);
2504 } else {
2505 err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX,
2506 NULL, extack);
2507 }
2508 if (err < 0)
2509 return err;
2510
2511 for (i = 0; i <= NDA_MAX; ++i) {
2512 if (!tb[i])
2513 continue;
2514
2515 /* all new attributes should require strict_check */
2516 switch (i) {
2517 case NDA_IFINDEX:
2518 if (nla_len(tb[i]) != sizeof(u32)) {
2519 NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in neighbor dump request");
2520 return -EINVAL;
2521 }
2522 filter->dev_idx = nla_get_u32(tb[i]);
2523 break;
2524 case NDA_MASTER:
2525 if (nla_len(tb[i]) != sizeof(u32)) {
2526 NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in neighbor dump request");
2527 return -EINVAL;
2528 }
2529 filter->master_idx = nla_get_u32(tb[i]);
2530 break;
2531 default:
2532 if (strict_check) {
2533 NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor dump request");
2534 return -EINVAL;
2535 }
2536 }
2537 }
2538
2539 return 0;
2540}
2541
2430static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) 2542static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
2431{ 2543{
2544 const struct nlmsghdr *nlh = cb->nlh;
2545 struct neigh_dump_filter filter = {};
2432 struct neigh_table *tbl; 2546 struct neigh_table *tbl;
2433 int t, family, s_t; 2547 int t, family, s_t;
2434 int proxy = 0; 2548 int proxy = 0;
2435 int err; 2549 int err;
2436 2550
2437 family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; 2551 family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
2438 2552
2439 /* check for full ndmsg structure presence, family member is 2553 /* check for full ndmsg structure presence, family member is
2440 * the same for both structures 2554 * the same for both structures
2441 */ 2555 */
2442 if (nlmsg_len(cb->nlh) >= sizeof(struct ndmsg) && 2556 if (nlmsg_len(nlh) >= sizeof(struct ndmsg) &&
2443 ((struct ndmsg *) nlmsg_data(cb->nlh))->ndm_flags == NTF_PROXY) 2557 ((struct ndmsg *)nlmsg_data(nlh))->ndm_flags == NTF_PROXY)
2444 proxy = 1; 2558 proxy = 1;
2445 2559
2560 err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack);
2561 if (err < 0 && cb->strict_check)
2562 return err;
2563
2446 s_t = cb->args[0]; 2564 s_t = cb->args[0];
2447 2565
2448 for (t = 0; t < NEIGH_NR_TABLES; t++) { 2566 for (t = 0; t < NEIGH_NR_TABLES; t++) {
@@ -2456,9 +2574,9 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
2456 memset(&cb->args[1], 0, sizeof(cb->args) - 2574 memset(&cb->args[1], 0, sizeof(cb->args) -
2457 sizeof(cb->args[0])); 2575 sizeof(cb->args[0]));
2458 if (proxy) 2576 if (proxy)
2459 err = pneigh_dump_table(tbl, skb, cb); 2577 err = pneigh_dump_table(tbl, skb, cb, &filter);
2460 else 2578 else
2461 err = neigh_dump_table(tbl, skb, cb); 2579 err = neigh_dump_table(tbl, skb, cb, &filter);
2462 if (err < 0) 2580 if (err < 0)
2463 break; 2581 break;
2464 } 2582 }
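
For context, a minimal userspace sketch (illustrative only, not part of this patch) of a dump request that exercises the new strict validation path above: it opens an rtnetlink socket, opts in to strict checking via the NETLINK_GET_STRICT_CHK socket option (assumed to be present in the matching uapi headers; this is what sets cb->strict_check for dumps), and asks for a neighbour dump filtered by NDA_MASTER, which neigh_valid_dump_req() now parses into the filter. Error handling is omitted for brevity.

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/neighbour.h>

static int neigh_dump_by_master(int master_ifindex)
{
	struct {
		struct nlmsghdr nlh;
		struct ndmsg ndm;
		struct nlattr nla;
		__u32 master;
	} req;
	int one = 1;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return -1;

	/* Opt in to strict checking; old binaries keep the legacy behaviour. */
	setsockopt(fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &one, sizeof(one));

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = sizeof(req);
	req.nlh.nlmsg_type = RTM_GETNEIGH;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	req.ndm.ndm_family = AF_UNSPEC;	/* all other ndmsg fields must stay zero under strict checking */
	req.nla.nla_len = NLA_HDRLEN + sizeof(__u32);
	req.nla.nla_type = NDA_MASTER;
	req.master = master_ifindex;

	send(fd, &req, sizeof(req), 0);
	/* ... read the NLM_F_MULTI replies with recv() here ... */
	close(fd);
	return 0;
}
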
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 670c84b1bfc2..fefe72774aeb 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -853,6 +853,12 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
853 .s_idx = cb->args[0], 853 .s_idx = cb->args[0],
854 }; 854 };
855 855
856 if (cb->strict_check &&
857 nlmsg_attrlen(cb->nlh, sizeof(struct rtgenmsg))) {
858 NL_SET_ERR_MSG(cb->extack, "Unknown data in network namespace id dump request");
859 return -EINVAL;
860 }
861
856 spin_lock_bh(&net->nsid_lock); 862 spin_lock_bh(&net->nsid_lock);
857 idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); 863 idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb);
858 spin_unlock_bh(&net->nsid_lock); 864 spin_unlock_bh(&net->nsid_lock);
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 5e4f04004a49..7bf833598615 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -106,6 +106,7 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
106 iterate_fd(p->files, 0, update_classid_sock, 106 iterate_fd(p->files, 0, update_classid_sock,
107 (void *)(unsigned long)cs->classid); 107 (void *)(unsigned long)cs->classid);
108 task_unlock(p); 108 task_unlock(p);
109 cond_resched();
109 } 110 }
110 css_task_iter_end(&it); 111 css_task_iter_end(&it);
111 112
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 3ae899805f8b..5da9552b186b 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -57,7 +57,6 @@ DEFINE_STATIC_SRCU(netpoll_srcu);
57 MAX_UDP_CHUNK) 57 MAX_UDP_CHUNK)
58 58
59static void zap_completion_queue(void); 59static void zap_completion_queue(void);
60static void netpoll_async_cleanup(struct work_struct *work);
61 60
62static unsigned int carrier_timeout = 4; 61static unsigned int carrier_timeout = 4;
63module_param(carrier_timeout, uint, 0644); 62module_param(carrier_timeout, uint, 0644);
@@ -589,7 +588,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
589 588
590 np->dev = ndev; 589 np->dev = ndev;
591 strlcpy(np->dev_name, ndev->name, IFNAMSIZ); 590 strlcpy(np->dev_name, ndev->name, IFNAMSIZ);
592 INIT_WORK(&np->cleanup_work, netpoll_async_cleanup);
593 591
594 if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { 592 if (ndev->priv_flags & IFF_DISABLE_NETPOLL) {
595 np_err(np, "%s doesn't support polling, aborting\n", 593 np_err(np, "%s doesn't support polling, aborting\n",
@@ -788,10 +786,6 @@ void __netpoll_cleanup(struct netpoll *np)
788{ 786{
789 struct netpoll_info *npinfo; 787 struct netpoll_info *npinfo;
790 788
791 /* rtnl_dereference would be preferable here but
792 * rcu_cleanup_netpoll path can put us in here safely without
793 * holding the rtnl, so plain rcu_dereference it is
794 */
795 npinfo = rtnl_dereference(np->dev->npinfo); 789 npinfo = rtnl_dereference(np->dev->npinfo);
796 if (!npinfo) 790 if (!npinfo)
797 return; 791 return;
@@ -812,21 +806,16 @@ void __netpoll_cleanup(struct netpoll *np)
812} 806}
813EXPORT_SYMBOL_GPL(__netpoll_cleanup); 807EXPORT_SYMBOL_GPL(__netpoll_cleanup);
814 808
815static void netpoll_async_cleanup(struct work_struct *work) 809void __netpoll_free(struct netpoll *np)
816{ 810{
817 struct netpoll *np = container_of(work, struct netpoll, cleanup_work); 811 ASSERT_RTNL();
818 812
819 rtnl_lock(); 813 /* Wait for transmitting packets to finish before freeing. */
814 synchronize_rcu_bh();
820 __netpoll_cleanup(np); 815 __netpoll_cleanup(np);
821 rtnl_unlock();
822 kfree(np); 816 kfree(np);
823} 817}
824 818EXPORT_SYMBOL_GPL(__netpoll_free);
825void __netpoll_free_async(struct netpoll *np)
826{
827 schedule_work(&np->cleanup_work);
828}
829EXPORT_SYMBOL_GPL(__netpoll_free_async);
830 819
831void netpoll_cleanup(struct netpoll *np) 820void netpoll_cleanup(struct netpoll *np)
832{ 821{
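
As a hedged illustration of the API change above (not taken from this patch): with __netpoll_free_async() removed, a caller now frees its struct netpoll synchronously while holding RTNL, and __netpoll_free() itself waits for in-flight transmits via synchronize_rcu_bh() before freeing. The names example_priv and example_ndo_netpoll_cleanup below are made up for the sketch.

/* Hypothetical ndo_netpoll_cleanup handler; the netpoll core invokes it
 * under RTNL, which satisfies the ASSERT_RTNL() in __netpoll_free().
 */
static void example_ndo_netpoll_cleanup(struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);	/* hypothetical private struct */
	struct netpoll *np = priv->np;

	if (!np)
		return;
	priv->np = NULL;
	__netpoll_free(np);	/* synchronous: waits for tx to drain, then kfree(np) */
}
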
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 7f6938405fa1..6ac919847ce6 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3426,7 +3426,7 @@ xmit_more:
3426 net_info_ratelimited("%s xmit error: %d\n", 3426 net_info_ratelimited("%s xmit error: %d\n",
3427 pkt_dev->odevname, ret); 3427 pkt_dev->odevname, ret);
3428 pkt_dev->errors++; 3428 pkt_dev->errors++;
3429 /* fallthru */ 3429 /* fall through */
3430 case NETDEV_TX_BUSY: 3430 case NETDEV_TX_BUSY:
3431 /* Retry it next time */ 3431 /* Retry it next time */
3432 refcount_dec(&(pkt_dev->skb->users)); 3432 refcount_dec(&(pkt_dev->skb->users));
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 37c7936124e6..f679c7a7d761 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -59,7 +59,7 @@
59#include <net/rtnetlink.h> 59#include <net/rtnetlink.h>
60#include <net/net_namespace.h> 60#include <net/net_namespace.h>
61 61
62#define RTNL_MAX_TYPE 48 62#define RTNL_MAX_TYPE 49
63#define RTNL_SLAVE_MAX_TYPE 36 63#define RTNL_SLAVE_MAX_TYPE 36
64 64
65struct rtnl_link { 65struct rtnl_link {
@@ -130,6 +130,12 @@ int rtnl_is_locked(void)
130} 130}
131EXPORT_SYMBOL(rtnl_is_locked); 131EXPORT_SYMBOL(rtnl_is_locked);
132 132
133bool refcount_dec_and_rtnl_lock(refcount_t *r)
134{
135 return refcount_dec_and_mutex_lock(r, &rtnl_mutex);
136}
137EXPORT_SYMBOL(refcount_dec_and_rtnl_lock);
138
133#ifdef CONFIG_PROVE_LOCKING 139#ifdef CONFIG_PROVE_LOCKING
134bool lockdep_rtnl_is_held(void) 140bool lockdep_rtnl_is_held(void)
135{ 141{
@@ -1016,7 +1022,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
1016 + nla_total_size(4) /* IFLA_NEW_NETNSID */ 1022 + nla_total_size(4) /* IFLA_NEW_NETNSID */
1017 + nla_total_size(4) /* IFLA_NEW_IFINDEX */ 1023 + nla_total_size(4) /* IFLA_NEW_IFINDEX */
1018 + nla_total_size(1) /* IFLA_PROTO_DOWN */ 1024 + nla_total_size(1) /* IFLA_PROTO_DOWN */
1019 + nla_total_size(4) /* IFLA_IF_NETNSID */ 1025 + nla_total_size(4) /* IFLA_TARGET_NETNSID */
1020 + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ 1026 + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */
1021 + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ 1027 + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */
1022 + nla_total_size(4) /* IFLA_MIN_MTU */ 1028 + nla_total_size(4) /* IFLA_MIN_MTU */
@@ -1598,7 +1604,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
1598 ifm->ifi_flags = dev_get_flags(dev); 1604 ifm->ifi_flags = dev_get_flags(dev);
1599 ifm->ifi_change = change; 1605 ifm->ifi_change = change;
1600 1606
1601 if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_IF_NETNSID, tgt_netnsid)) 1607 if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid))
1602 goto nla_put_failure; 1608 goto nla_put_failure;
1603 1609
1604 if (nla_put_string(skb, IFLA_IFNAME, dev->name) || 1610 if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
@@ -1737,7 +1743,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
1737 [IFLA_XDP] = { .type = NLA_NESTED }, 1743 [IFLA_XDP] = { .type = NLA_NESTED },
1738 [IFLA_EVENT] = { .type = NLA_U32 }, 1744 [IFLA_EVENT] = { .type = NLA_U32 },
1739 [IFLA_GROUP] = { .type = NLA_U32 }, 1745 [IFLA_GROUP] = { .type = NLA_U32 },
1740 [IFLA_IF_NETNSID] = { .type = NLA_S32 }, 1746 [IFLA_TARGET_NETNSID] = { .type = NLA_S32 },
1741 [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 }, 1747 [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 },
1742 [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, 1748 [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 },
1743 [IFLA_MIN_MTU] = { .type = NLA_U32 }, 1749 [IFLA_MIN_MTU] = { .type = NLA_U32 },
@@ -1845,7 +1851,15 @@ static bool link_dump_filtered(struct net_device *dev,
1845 return false; 1851 return false;
1846} 1852}
1847 1853
1848static struct net *get_target_net(struct sock *sk, int netnsid) 1854/**
1855 * rtnl_get_net_ns_capable - Get netns if sufficiently privileged.
1856 * @sk: netlink socket
1857 * @netnsid: network namespace identifier
1858 *
1859 * Returns the network namespace identified by netnsid on success or an error
1860 * pointer on failure.
1861 */
1862struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid)
1849{ 1863{
1850 struct net *net; 1864 struct net *net;
1851 1865
@@ -1862,9 +1876,54 @@ static struct net *get_target_net(struct sock *sk, int netnsid)
1862 } 1876 }
1863 return net; 1877 return net;
1864} 1878}
1879EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable);
1880
1881static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh,
1882 bool strict_check, struct nlattr **tb,
1883 struct netlink_ext_ack *extack)
1884{
1885 int hdrlen;
1886
1887 if (strict_check) {
1888 struct ifinfomsg *ifm;
1889
1890 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
1891 NL_SET_ERR_MSG(extack, "Invalid header for link dump");
1892 return -EINVAL;
1893 }
1894
1895 ifm = nlmsg_data(nlh);
1896 if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
1897 ifm->ifi_change) {
1898 NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request");
1899 return -EINVAL;
1900 }
1901 if (ifm->ifi_index) {
1902 NL_SET_ERR_MSG(extack, "Filter by device index not supported for link dumps");
1903 return -EINVAL;
1904 }
1905
1906 return nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFLA_MAX,
1907 ifla_policy, extack);
1908 }
1909
1910 /* A hack to preserve kernel<->userspace interface.
1911 * The correct header is ifinfomsg. It is consistent with rtnl_getlink.
1912 * However, before Linux v3.9 the code here assumed rtgenmsg and that's
1913 * what iproute2 < v3.9.0 used.
1914 * We can detect the old iproute2. Even including the IFLA_EXT_MASK
1915 * attribute, its netlink message is shorter than struct ifinfomsg.
1916 */
1917 hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ?
1918 sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
1919
1920 return nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, extack);
1921}
1865 1922
1866static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) 1923static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
1867{ 1924{
1925 struct netlink_ext_ack *extack = cb->extack;
1926 const struct nlmsghdr *nlh = cb->nlh;
1868 struct net *net = sock_net(skb->sk); 1927 struct net *net = sock_net(skb->sk);
1869 struct net *tgt_net = net; 1928 struct net *tgt_net = net;
1870 int h, s_h; 1929 int h, s_h;
@@ -1877,44 +1936,54 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
1877 unsigned int flags = NLM_F_MULTI; 1936 unsigned int flags = NLM_F_MULTI;
1878 int master_idx = 0; 1937 int master_idx = 0;
1879 int netnsid = -1; 1938 int netnsid = -1;
1880 int err; 1939 int err, i;
1881 int hdrlen;
1882 1940
1883 s_h = cb->args[0]; 1941 s_h = cb->args[0];
1884 s_idx = cb->args[1]; 1942 s_idx = cb->args[1];
1885 1943
1886 /* A hack to preserve kernel<->userspace interface. 1944 err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack);
1887 * The correct header is ifinfomsg. It is consistent with rtnl_getlink. 1945 if (err < 0) {
1888 * However, before Linux v3.9 the code here assumed rtgenmsg and that's 1946 if (cb->strict_check)
1889 * what iproute2 < v3.9.0 used. 1947 return err;
1890 * We can detect the old iproute2. Even including the IFLA_EXT_MASK
1891 * attribute, its netlink message is shorter than struct ifinfomsg.
1892 */
1893 hdrlen = nlmsg_len(cb->nlh) < sizeof(struct ifinfomsg) ?
1894 sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
1895
1896 if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX,
1897 ifla_policy, NULL) >= 0) {
1898 if (tb[IFLA_IF_NETNSID]) {
1899 netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
1900 tgt_net = get_target_net(skb->sk, netnsid);
1901 if (IS_ERR(tgt_net))
1902 return PTR_ERR(tgt_net);
1903 }
1904
1905 if (tb[IFLA_EXT_MASK])
1906 ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
1907 1948
1908 if (tb[IFLA_MASTER]) 1949 goto walk_entries;
1909 master_idx = nla_get_u32(tb[IFLA_MASTER]); 1950 }
1910 1951
1911 if (tb[IFLA_LINKINFO]) 1952 for (i = 0; i <= IFLA_MAX; ++i) {
1912 kind_ops = linkinfo_to_kind_ops(tb[IFLA_LINKINFO]); 1953 if (!tb[i])
1954 continue;
1913 1955
1914 if (master_idx || kind_ops) 1956 /* new attributes should only be added with strict checking */
1915 flags |= NLM_F_DUMP_FILTERED; 1957 switch (i) {
1958 case IFLA_TARGET_NETNSID:
1959 netnsid = nla_get_s32(tb[i]);
1960 tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid);
1961 if (IS_ERR(tgt_net)) {
1962 NL_SET_ERR_MSG(extack, "Invalid target network namespace id");
1963 return PTR_ERR(tgt_net);
1964 }
1965 break;
1966 case IFLA_EXT_MASK:
1967 ext_filter_mask = nla_get_u32(tb[i]);
1968 break;
1969 case IFLA_MASTER:
1970 master_idx = nla_get_u32(tb[i]);
1971 break;
1972 case IFLA_LINKINFO:
1973 kind_ops = linkinfo_to_kind_ops(tb[i]);
1974 break;
1975 default:
1976 if (cb->strict_check) {
1977 NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request");
1978 return -EINVAL;
1979 }
1980 }
1916 } 1981 }
1917 1982
1983 if (master_idx || kind_ops)
1984 flags |= NLM_F_DUMP_FILTERED;
1985
1986walk_entries:
1918 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { 1987 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
1919 idx = 0; 1988 idx = 0;
1920 head = &tgt_net->dev_index_head[h]; 1989 head = &tgt_net->dev_index_head[h];
@@ -1926,8 +1995,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
1926 err = rtnl_fill_ifinfo(skb, dev, net, 1995 err = rtnl_fill_ifinfo(skb, dev, net,
1927 RTM_NEWLINK, 1996 RTM_NEWLINK,
1928 NETLINK_CB(cb->skb).portid, 1997 NETLINK_CB(cb->skb).portid,
1929 cb->nlh->nlmsg_seq, 0, 1998 nlh->nlmsg_seq, 0, flags,
1930 flags,
1931 ext_filter_mask, 0, NULL, 0, 1999 ext_filter_mask, 0, NULL, 0,
1932 netnsid); 2000 netnsid);
1933 2001
@@ -1982,7 +2050,7 @@ EXPORT_SYMBOL(rtnl_link_get_net);
1982 * 2050 *
1983 * 1. IFLA_NET_NS_PID 2051 * 1. IFLA_NET_NS_PID
1984 * 2. IFLA_NET_NS_FD 2052 * 2. IFLA_NET_NS_FD
1985 * 3. IFLA_IF_NETNSID 2053 * 3. IFLA_TARGET_NETNSID
1986 */ 2054 */
1987static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, 2055static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net,
1988 struct nlattr *tb[]) 2056 struct nlattr *tb[])
@@ -1992,10 +2060,10 @@ static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net,
1992 if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) 2060 if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])
1993 return rtnl_link_get_net(src_net, tb); 2061 return rtnl_link_get_net(src_net, tb);
1994 2062
1995 if (!tb[IFLA_IF_NETNSID]) 2063 if (!tb[IFLA_TARGET_NETNSID])
1996 return get_net(src_net); 2064 return get_net(src_net);
1997 2065
1998 net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_IF_NETNSID])); 2066 net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_TARGET_NETNSID]));
1999 if (!net) 2067 if (!net)
2000 return ERR_PTR(-EINVAL); 2068 return ERR_PTR(-EINVAL);
2001 2069
@@ -2036,13 +2104,13 @@ static int rtnl_ensure_unique_netns(struct nlattr *tb[],
2036 return -EOPNOTSUPP; 2104 return -EOPNOTSUPP;
2037 } 2105 }
2038 2106
2039 if (tb[IFLA_IF_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])) 2107 if (tb[IFLA_TARGET_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]))
2040 goto invalid_attr; 2108 goto invalid_attr;
2041 2109
2042 if (tb[IFLA_NET_NS_PID] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_FD])) 2110 if (tb[IFLA_NET_NS_PID] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_FD]))
2043 goto invalid_attr; 2111 goto invalid_attr;
2044 2112
2045 if (tb[IFLA_NET_NS_FD] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_PID])) 2113 if (tb[IFLA_NET_NS_FD] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_PID]))
2046 goto invalid_attr; 2114 goto invalid_attr;
2047 2115
2048 return 0; 2116 return 0;
@@ -2318,7 +2386,7 @@ static int do_setlink(const struct sk_buff *skb,
2318 if (err < 0) 2386 if (err < 0)
2319 return err; 2387 return err;
2320 2388
2321 if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) { 2389 if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) {
2322 struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev), 2390 struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev),
2323 tb, CAP_NET_ADMIN); 2391 tb, CAP_NET_ADMIN);
2324 if (IS_ERR(net)) { 2392 if (IS_ERR(net)) {
@@ -2761,9 +2829,9 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
2761 if (tb[IFLA_IFNAME]) 2829 if (tb[IFLA_IFNAME])
2762 nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); 2830 nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
2763 2831
2764 if (tb[IFLA_IF_NETNSID]) { 2832 if (tb[IFLA_TARGET_NETNSID]) {
2765 netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); 2833 netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]);
2766 tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid); 2834 tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid);
2767 if (IS_ERR(tgt_net)) 2835 if (IS_ERR(tgt_net))
2768 return PTR_ERR(tgt_net); 2836 return PTR_ERR(tgt_net);
2769 } 2837 }
@@ -3177,9 +3245,9 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
3177 if (err < 0) 3245 if (err < 0)
3178 return err; 3246 return err;
3179 3247
3180 if (tb[IFLA_IF_NETNSID]) { 3248 if (tb[IFLA_TARGET_NETNSID]) {
3181 netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); 3249 netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]);
3182 tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid); 3250 tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid);
3183 if (IS_ERR(tgt_net)) 3251 if (IS_ERR(tgt_net))
3184 return PTR_ERR(tgt_net); 3252 return PTR_ERR(tgt_net);
3185 } 3253 }
@@ -3264,13 +3332,14 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
3264{ 3332{
3265 int idx; 3333 int idx;
3266 int s_idx = cb->family; 3334 int s_idx = cb->family;
3335 int type = cb->nlh->nlmsg_type - RTM_BASE;
3336 int ret = 0;
3267 3337
3268 if (s_idx == 0) 3338 if (s_idx == 0)
3269 s_idx = 1; 3339 s_idx = 1;
3270 3340
3271 for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { 3341 for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) {
3272 struct rtnl_link **tab; 3342 struct rtnl_link **tab;
3273 int type = cb->nlh->nlmsg_type-RTM_BASE;
3274 struct rtnl_link *link; 3343 struct rtnl_link *link;
3275 rtnl_dumpit_func dumpit; 3344 rtnl_dumpit_func dumpit;
3276 3345
@@ -3297,12 +3366,13 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
3297 cb->prev_seq = 0; 3366 cb->prev_seq = 0;
3298 cb->seq = 0; 3367 cb->seq = 0;
3299 } 3368 }
3300 if (dumpit(skb, cb)) 3369 ret = dumpit(skb, cb);
3370 if (ret < 0)
3301 break; 3371 break;
3302 } 3372 }
3303 cb->family = idx; 3373 cb->family = idx;
3304 3374
3305 return skb->len; 3375 return skb->len ? : ret;
3306} 3376}
3307 3377
3308struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, 3378struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
@@ -3731,22 +3801,66 @@ out:
3731} 3801}
3732EXPORT_SYMBOL(ndo_dflt_fdb_dump); 3802EXPORT_SYMBOL(ndo_dflt_fdb_dump);
3733 3803
3734static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) 3804static int valid_fdb_dump_strict(const struct nlmsghdr *nlh,
3805 int *br_idx, int *brport_idx,
3806 struct netlink_ext_ack *extack)
3807{
3808 struct nlattr *tb[NDA_MAX + 1];
3809 struct ndmsg *ndm;
3810 int err, i;
3811
3812 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
3813 NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request");
3814 return -EINVAL;
3815 }
3816
3817 ndm = nlmsg_data(nlh);
3818 if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
3819 ndm->ndm_flags || ndm->ndm_type) {
 3820	NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request");
3821 return -EINVAL;
3822 }
3823
3824 err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX,
3825 NULL, extack);
3826 if (err < 0)
3827 return err;
3828
3829 *brport_idx = ndm->ndm_ifindex;
3830 for (i = 0; i <= NDA_MAX; ++i) {
3831 if (!tb[i])
3832 continue;
3833
3834 switch (i) {
3835 case NDA_IFINDEX:
3836 if (nla_len(tb[i]) != sizeof(u32)) {
3837 NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in fdb dump request");
3838 return -EINVAL;
3839 }
3840 *brport_idx = nla_get_u32(tb[NDA_IFINDEX]);
3841 break;
3842 case NDA_MASTER:
3843 if (nla_len(tb[i]) != sizeof(u32)) {
3844 NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in fdb dump request");
3845 return -EINVAL;
3846 }
3847 *br_idx = nla_get_u32(tb[NDA_MASTER]);
3848 break;
3849 default:
3850 NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb dump request");
3851 return -EINVAL;
3852 }
3853 }
3854
3855 return 0;
3856}
3857
3858static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh,
3859 int *br_idx, int *brport_idx,
3860 struct netlink_ext_ack *extack)
3735{ 3861{
3736 struct net_device *dev;
3737 struct nlattr *tb[IFLA_MAX+1]; 3862 struct nlattr *tb[IFLA_MAX+1];
3738 struct net_device *br_dev = NULL; 3863 int err;
3739 const struct net_device_ops *ops = NULL;
3740 const struct net_device_ops *cops = NULL;
3741 struct ifinfomsg *ifm = nlmsg_data(cb->nlh);
3742 struct net *net = sock_net(skb->sk);
3743 struct hlist_head *head;
3744 int brport_idx = 0;
3745 int br_idx = 0;
3746 int h, s_h;
3747 int idx = 0, s_idx;
3748 int err = 0;
3749 int fidx = 0;
3750 3864
3751 /* A hack to preserve kernel<->userspace interface. 3865 /* A hack to preserve kernel<->userspace interface.
3752 * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0. 3866 * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0.
@@ -3755,20 +3869,49 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
3755 * Fortunately these sizes don't conflict with the size of ifinfomsg 3869 * Fortunately these sizes don't conflict with the size of ifinfomsg
3756 * with an optional attribute. 3870 * with an optional attribute.
3757 */ 3871 */
3758 if (nlmsg_len(cb->nlh) != sizeof(struct ndmsg) && 3872 if (nlmsg_len(nlh) != sizeof(struct ndmsg) &&
3759 (nlmsg_len(cb->nlh) != sizeof(struct ndmsg) + 3873 (nlmsg_len(nlh) != sizeof(struct ndmsg) +
3760 nla_attr_size(sizeof(u32)))) { 3874 nla_attr_size(sizeof(u32)))) {
3761 err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, 3875 struct ifinfomsg *ifm;
3762 IFLA_MAX, ifla_policy, NULL); 3876
3877 err = nlmsg_parse(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX,
3878 ifla_policy, extack);
3763 if (err < 0) { 3879 if (err < 0) {
3764 return -EINVAL; 3880 return -EINVAL;
3765 } else if (err == 0) { 3881 } else if (err == 0) {
3766 if (tb[IFLA_MASTER]) 3882 if (tb[IFLA_MASTER])
3767 br_idx = nla_get_u32(tb[IFLA_MASTER]); 3883 *br_idx = nla_get_u32(tb[IFLA_MASTER]);
3768 } 3884 }
3769 3885
3770 brport_idx = ifm->ifi_index; 3886 ifm = nlmsg_data(nlh);
3887 *brport_idx = ifm->ifi_index;
3771 } 3888 }
3889 return 0;
3890}
3891
3892static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
3893{
3894 struct net_device *dev;
3895 struct net_device *br_dev = NULL;
3896 const struct net_device_ops *ops = NULL;
3897 const struct net_device_ops *cops = NULL;
3898 struct net *net = sock_net(skb->sk);
3899 struct hlist_head *head;
3900 int brport_idx = 0;
3901 int br_idx = 0;
3902 int h, s_h;
3903 int idx = 0, s_idx;
3904 int err = 0;
3905 int fidx = 0;
3906
3907 if (cb->strict_check)
3908 err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx,
3909 cb->extack);
3910 else
3911 err = valid_fdb_dump_legacy(cb->nlh, &br_idx, &brport_idx,
3912 cb->extack);
3913 if (err < 0)
3914 return err;
3772 3915
3773 if (br_idx) { 3916 if (br_idx) {
3774 br_dev = __dev_get_by_index(net, br_idx); 3917 br_dev = __dev_get_by_index(net, br_idx);
@@ -3953,28 +4096,72 @@ nla_put_failure:
3953} 4096}
3954EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink); 4097EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink);
3955 4098
4099static int valid_bridge_getlink_req(const struct nlmsghdr *nlh,
4100 bool strict_check, u32 *filter_mask,
4101 struct netlink_ext_ack *extack)
4102{
4103 struct nlattr *tb[IFLA_MAX+1];
4104 int err, i;
4105
4106 if (strict_check) {
4107 struct ifinfomsg *ifm;
4108
4109 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
4110 NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump");
4111 return -EINVAL;
4112 }
4113
4114 ifm = nlmsg_data(nlh);
4115 if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
4116 ifm->ifi_change || ifm->ifi_index) {
4117 NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request");
4118 return -EINVAL;
4119 }
4120
4121 err = nlmsg_parse_strict(nlh, sizeof(struct ifinfomsg), tb,
4122 IFLA_MAX, ifla_policy, extack);
4123 } else {
4124 err = nlmsg_parse(nlh, sizeof(struct ifinfomsg), tb,
4125 IFLA_MAX, ifla_policy, extack);
4126 }
4127 if (err < 0)
4128 return err;
4129
4130 /* new attributes should only be added with strict checking */
4131 for (i = 0; i <= IFLA_MAX; ++i) {
4132 if (!tb[i])
4133 continue;
4134
4135 switch (i) {
4136 case IFLA_EXT_MASK:
4137 *filter_mask = nla_get_u32(tb[i]);
4138 break;
4139 default:
4140 if (strict_check) {
4141 NL_SET_ERR_MSG(extack, "Unsupported attribute in bridge link dump request");
4142 return -EINVAL;
4143 }
4144 }
4145 }
4146
4147 return 0;
4148}
4149
3956static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) 4150static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
3957{ 4151{
4152 const struct nlmsghdr *nlh = cb->nlh;
3958 struct net *net = sock_net(skb->sk); 4153 struct net *net = sock_net(skb->sk);
3959 struct net_device *dev; 4154 struct net_device *dev;
3960 int idx = 0; 4155 int idx = 0;
3961 u32 portid = NETLINK_CB(cb->skb).portid; 4156 u32 portid = NETLINK_CB(cb->skb).portid;
3962 u32 seq = cb->nlh->nlmsg_seq; 4157 u32 seq = nlh->nlmsg_seq;
3963 u32 filter_mask = 0; 4158 u32 filter_mask = 0;
3964 int err; 4159 int err;
3965 4160
3966 if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) { 4161 err = valid_bridge_getlink_req(nlh, cb->strict_check, &filter_mask,
3967 struct nlattr *extfilt; 4162 cb->extack);
3968 4163 if (err < 0 && cb->strict_check)
3969 extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg), 4164 return err;
3970 IFLA_EXT_MASK);
3971 if (extfilt) {
3972 if (nla_len(extfilt) < sizeof(filter_mask))
3973 return -EINVAL;
3974
3975 filter_mask = nla_get_u32(extfilt);
3976 }
3977 }
3978 4165
3979 rcu_read_lock(); 4166 rcu_read_lock();
3980 for_each_netdev_rcu(net, dev) { 4167 for_each_netdev_rcu(net, dev) {
@@ -4568,6 +4755,7 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,
4568 4755
4569static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) 4756static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
4570{ 4757{
4758 struct netlink_ext_ack *extack = cb->extack;
4571 int h, s_h, err, s_idx, s_idxattr, s_prividx; 4759 int h, s_h, err, s_idx, s_idxattr, s_prividx;
4572 struct net *net = sock_net(skb->sk); 4760 struct net *net = sock_net(skb->sk);
4573 unsigned int flags = NLM_F_MULTI; 4761 unsigned int flags = NLM_F_MULTI;
@@ -4584,13 +4772,32 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
4584 4772
4585 cb->seq = net->dev_base_seq; 4773 cb->seq = net->dev_base_seq;
4586 4774
4587 if (nlmsg_len(cb->nlh) < sizeof(*ifsm)) 4775 if (nlmsg_len(cb->nlh) < sizeof(*ifsm)) {
4776 NL_SET_ERR_MSG(extack, "Invalid header for stats dump");
4588 return -EINVAL; 4777 return -EINVAL;
4778 }
4589 4779
4590 ifsm = nlmsg_data(cb->nlh); 4780 ifsm = nlmsg_data(cb->nlh);
4781
4782 /* only requests using strict checks can pass data to influence
4783 * the dump. The legacy exception is filter_mask.
4784 */
4785 if (cb->strict_check) {
4786 if (ifsm->pad1 || ifsm->pad2 || ifsm->ifindex) {
4787 NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request");
4788 return -EINVAL;
4789 }
4790 if (nlmsg_attrlen(cb->nlh, sizeof(*ifsm))) {
4791 NL_SET_ERR_MSG(extack, "Invalid attributes after stats header");
4792 return -EINVAL;
4793 }
4794 }
4795
4591 filter_mask = ifsm->filter_mask; 4796 filter_mask = ifsm->filter_mask;
4592 if (!filter_mask) 4797 if (!filter_mask) {
4798 NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump");
4593 return -EINVAL; 4799 return -EINVAL;
4800 }
4594 4801
4595 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { 4802 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
4596 idx = 0; 4803 idx = 0;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f817f336595d..946de0e24c87 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3382,64 +3382,6 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
3382} 3382}
3383EXPORT_SYMBOL(skb_find_text); 3383EXPORT_SYMBOL(skb_find_text);
3384 3384
3385/**
3386 * skb_append_datato_frags - append the user data to a skb
3387 * @sk: sock structure
3388 * @skb: skb structure to be appended with user data.
3389 * @getfrag: call back function to be used for getting the user data
3390 * @from: pointer to user message iov
3391 * @length: length of the iov message
3392 *
3393 * Description: This procedure append the user data in the fragment part
3394 * of the skb if any page alloc fails user this procedure returns -ENOMEM
3395 */
3396int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
3397 int (*getfrag)(void *from, char *to, int offset,
3398 int len, int odd, struct sk_buff *skb),
3399 void *from, int length)
3400{
3401 int frg_cnt = skb_shinfo(skb)->nr_frags;
3402 int copy;
3403 int offset = 0;
3404 int ret;
3405 struct page_frag *pfrag = &current->task_frag;
3406
3407 do {
3408 /* Return error if we don't have space for new frag */
3409 if (frg_cnt >= MAX_SKB_FRAGS)
3410 return -EMSGSIZE;
3411
3412 if (!sk_page_frag_refill(sk, pfrag))
3413 return -ENOMEM;
3414
3415 /* copy the user data to page */
3416 copy = min_t(int, length, pfrag->size - pfrag->offset);
3417
3418 ret = getfrag(from, page_address(pfrag->page) + pfrag->offset,
3419 offset, copy, 0, skb);
3420 if (ret < 0)
3421 return -EFAULT;
3422
3423 /* copy was successful so update the size parameters */
3424 skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset,
3425 copy);
3426 frg_cnt++;
3427 pfrag->offset += copy;
3428 get_page(pfrag->page);
3429
3430 skb->truesize += copy;
3431 refcount_add(copy, &sk->sk_wmem_alloc);
3432 skb->len += copy;
3433 skb->data_len += copy;
3434 offset += copy;
3435 length -= copy;
3436
3437 } while (length > 0);
3438
3439 return 0;
3440}
3441EXPORT_SYMBOL(skb_append_datato_frags);
3442
3443int skb_append_pagefrags(struct sk_buff *skb, struct page *page, 3385int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
3444 int offset, size_t size) 3386 int offset, size_t size)
3445{ 3387{
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
new file mode 100644
index 000000000000..56a99d0c9aa0
--- /dev/null
+++ b/net/core/skmsg.c
@@ -0,0 +1,802 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
3
4#include <linux/skmsg.h>
5#include <linux/skbuff.h>
6#include <linux/scatterlist.h>
7
8#include <net/sock.h>
9#include <net/tcp.h>
10
11static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce)
12{
13 if (msg->sg.end > msg->sg.start &&
14 elem_first_coalesce < msg->sg.end)
15 return true;
16
17 if (msg->sg.end < msg->sg.start &&
18 (elem_first_coalesce > msg->sg.start ||
19 elem_first_coalesce < msg->sg.end))
20 return true;
21
22 return false;
23}
24
25int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
26 int elem_first_coalesce)
27{
28 struct page_frag *pfrag = sk_page_frag(sk);
29 int ret = 0;
30
31 len -= msg->sg.size;
32 while (len > 0) {
33 struct scatterlist *sge;
34 u32 orig_offset;
35 int use, i;
36
37 if (!sk_page_frag_refill(sk, pfrag))
38 return -ENOMEM;
39
40 orig_offset = pfrag->offset;
41 use = min_t(int, len, pfrag->size - orig_offset);
42 if (!sk_wmem_schedule(sk, use))
43 return -ENOMEM;
44
45 i = msg->sg.end;
46 sk_msg_iter_var_prev(i);
47 sge = &msg->sg.data[i];
48
49 if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) &&
50 sg_page(sge) == pfrag->page &&
51 sge->offset + sge->length == orig_offset) {
52 sge->length += use;
53 } else {
54 if (sk_msg_full(msg)) {
55 ret = -ENOSPC;
56 break;
57 }
58
59 sge = &msg->sg.data[msg->sg.end];
60 sg_unmark_end(sge);
61 sg_set_page(sge, pfrag->page, use, orig_offset);
62 get_page(pfrag->page);
63 sk_msg_iter_next(msg, end);
64 }
65
66 sk_mem_charge(sk, use);
67 msg->sg.size += use;
68 pfrag->offset += use;
69 len -= use;
70 }
71
72 return ret;
73}
74EXPORT_SYMBOL_GPL(sk_msg_alloc);
75
76int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src,
77 u32 off, u32 len)
78{
79 int i = src->sg.start;
80 struct scatterlist *sge = sk_msg_elem(src, i);
81 u32 sge_len, sge_off;
82
83 if (sk_msg_full(dst))
84 return -ENOSPC;
85
86 while (off) {
87 if (sge->length > off)
88 break;
89 off -= sge->length;
90 sk_msg_iter_var_next(i);
91 if (i == src->sg.end && off)
92 return -ENOSPC;
93 sge = sk_msg_elem(src, i);
94 }
95
96 while (len) {
97 sge_len = sge->length - off;
98 sge_off = sge->offset + off;
99 if (sge_len > len)
100 sge_len = len;
101 off = 0;
102 len -= sge_len;
103 sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off);
104 sk_mem_charge(sk, sge_len);
105 sk_msg_iter_var_next(i);
106 if (i == src->sg.end && len)
107 return -ENOSPC;
108 sge = sk_msg_elem(src, i);
109 }
110
111 return 0;
112}
113EXPORT_SYMBOL_GPL(sk_msg_clone);
114
115void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes)
116{
117 int i = msg->sg.start;
118
119 do {
120 struct scatterlist *sge = sk_msg_elem(msg, i);
121
122 if (bytes < sge->length) {
123 sge->length -= bytes;
124 sge->offset += bytes;
125 sk_mem_uncharge(sk, bytes);
126 break;
127 }
128
129 sk_mem_uncharge(sk, sge->length);
130 bytes -= sge->length;
131 sge->length = 0;
132 sge->offset = 0;
133 sk_msg_iter_var_next(i);
134 } while (bytes && i != msg->sg.end);
135 msg->sg.start = i;
136}
137EXPORT_SYMBOL_GPL(sk_msg_return_zero);
138
139void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes)
140{
141 int i = msg->sg.start;
142
143 do {
144 struct scatterlist *sge = &msg->sg.data[i];
145 int uncharge = (bytes < sge->length) ? bytes : sge->length;
146
147 sk_mem_uncharge(sk, uncharge);
148 bytes -= uncharge;
149 sk_msg_iter_var_next(i);
150 } while (i != msg->sg.end);
151}
152EXPORT_SYMBOL_GPL(sk_msg_return);
153
154static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i,
155 bool charge)
156{
157 struct scatterlist *sge = sk_msg_elem(msg, i);
158 u32 len = sge->length;
159
160 if (charge)
161 sk_mem_uncharge(sk, len);
162 if (!msg->skb)
163 put_page(sg_page(sge));
164 memset(sge, 0, sizeof(*sge));
165 return len;
166}
167
168static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i,
169 bool charge)
170{
171 struct scatterlist *sge = sk_msg_elem(msg, i);
172 int freed = 0;
173
174 while (msg->sg.size) {
175 msg->sg.size -= sge->length;
176 freed += sk_msg_free_elem(sk, msg, i, charge);
177 sk_msg_iter_var_next(i);
178 sk_msg_check_to_free(msg, i, msg->sg.size);
179 sge = sk_msg_elem(msg, i);
180 }
181 if (msg->skb)
182 consume_skb(msg->skb);
183 sk_msg_init(msg);
184 return freed;
185}
186
187int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg)
188{
189 return __sk_msg_free(sk, msg, msg->sg.start, false);
190}
191EXPORT_SYMBOL_GPL(sk_msg_free_nocharge);
192
193int sk_msg_free(struct sock *sk, struct sk_msg *msg)
194{
195 return __sk_msg_free(sk, msg, msg->sg.start, true);
196}
197EXPORT_SYMBOL_GPL(sk_msg_free);
198
199static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg,
200 u32 bytes, bool charge)
201{
202 struct scatterlist *sge;
203 u32 i = msg->sg.start;
204
205 while (bytes) {
206 sge = sk_msg_elem(msg, i);
207 if (!sge->length)
208 break;
209 if (bytes < sge->length) {
210 if (charge)
211 sk_mem_uncharge(sk, bytes);
212 sge->length -= bytes;
213 sge->offset += bytes;
214 msg->sg.size -= bytes;
215 break;
216 }
217
218 msg->sg.size -= sge->length;
219 bytes -= sge->length;
220 sk_msg_free_elem(sk, msg, i, charge);
221 sk_msg_iter_var_next(i);
222 sk_msg_check_to_free(msg, i, bytes);
223 }
224 msg->sg.start = i;
225}
226
227void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes)
228{
229 __sk_msg_free_partial(sk, msg, bytes, true);
230}
231EXPORT_SYMBOL_GPL(sk_msg_free_partial);
232
233void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
234 u32 bytes)
235{
236 __sk_msg_free_partial(sk, msg, bytes, false);
237}
238
239void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len)
240{
241 int trim = msg->sg.size - len;
242 u32 i = msg->sg.end;
243
244 if (trim <= 0) {
245 WARN_ON(trim < 0);
246 return;
247 }
248
249 sk_msg_iter_var_prev(i);
250 msg->sg.size = len;
251 while (msg->sg.data[i].length &&
252 trim >= msg->sg.data[i].length) {
253 trim -= msg->sg.data[i].length;
254 sk_msg_free_elem(sk, msg, i, true);
255 sk_msg_iter_var_prev(i);
256 if (!trim)
257 goto out;
258 }
259
260 msg->sg.data[i].length -= trim;
261 sk_mem_uncharge(sk, trim);
262out:
 263	/* If we trim data before the curr pointer, update copybreak and curr
 264	 * so that any future copy operations start at the new copy location.
 265	 * However, trimmed data that has not yet been used in a copy op
 266	 * does not require an update.
267 */
268 if (msg->sg.curr >= i) {
269 msg->sg.curr = i;
270 msg->sg.copybreak = msg->sg.data[i].length;
271 }
272 sk_msg_iter_var_next(i);
273 msg->sg.end = i;
274}
275EXPORT_SYMBOL_GPL(sk_msg_trim);
276
277int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
278 struct sk_msg *msg, u32 bytes)
279{
280 int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg);
281 const int to_max_pages = MAX_MSG_FRAGS;
282 struct page *pages[MAX_MSG_FRAGS];
283 ssize_t orig, copied, use, offset;
284
285 orig = msg->sg.size;
286 while (bytes > 0) {
287 i = 0;
288 maxpages = to_max_pages - num_elems;
289 if (maxpages == 0) {
290 ret = -EFAULT;
291 goto out;
292 }
293
294 copied = iov_iter_get_pages(from, pages, bytes, maxpages,
295 &offset);
296 if (copied <= 0) {
297 ret = -EFAULT;
298 goto out;
299 }
300
301 iov_iter_advance(from, copied);
302 bytes -= copied;
303 msg->sg.size += copied;
304
305 while (copied) {
306 use = min_t(int, copied, PAGE_SIZE - offset);
307 sg_set_page(&msg->sg.data[msg->sg.end],
308 pages[i], use, offset);
309 sg_unmark_end(&msg->sg.data[msg->sg.end]);
310 sk_mem_charge(sk, use);
311
312 offset = 0;
313 copied -= use;
314 sk_msg_iter_next(msg, end);
315 num_elems++;
316 i++;
317 }
 318		/* When zerocopy is mixed with sk_msg_*copy* operations we
 319		 * may have a copybreak set; in this case clear it and prefer
 320		 * the zerocopy remainder when possible.
321 */
322 msg->sg.copybreak = 0;
323 msg->sg.curr = msg->sg.end;
324 }
325out:
326 /* Revert iov_iter updates, msg will need to use 'trim' later if it
327 * also needs to be cleared.
328 */
329 if (ret)
330 iov_iter_revert(from, msg->sg.size - orig);
331 return ret;
332}
333EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter);
334
335int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
336 struct sk_msg *msg, u32 bytes)
337{
338 int ret = -ENOSPC, i = msg->sg.curr;
339 struct scatterlist *sge;
340 u32 copy, buf_size;
341 void *to;
342
343 do {
344 sge = sk_msg_elem(msg, i);
345 /* This is possible if a trim operation shrunk the buffer */
346 if (msg->sg.copybreak >= sge->length) {
347 msg->sg.copybreak = 0;
348 sk_msg_iter_var_next(i);
349 if (i == msg->sg.end)
350 break;
351 sge = sk_msg_elem(msg, i);
352 }
353
354 buf_size = sge->length - msg->sg.copybreak;
355 copy = (buf_size > bytes) ? bytes : buf_size;
356 to = sg_virt(sge) + msg->sg.copybreak;
357 msg->sg.copybreak += copy;
358 if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
359 ret = copy_from_iter_nocache(to, copy, from);
360 else
361 ret = copy_from_iter(to, copy, from);
362 if (ret != copy) {
363 ret = -EFAULT;
364 goto out;
365 }
366 bytes -= copy;
367 if (!bytes)
368 break;
369 msg->sg.copybreak = 0;
370 sk_msg_iter_var_next(i);
371 } while (i != msg->sg.end);
372out:
373 msg->sg.curr = i;
374 return ret;
375}
376EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
377
378static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
379{
380 struct sock *sk = psock->sk;
381 int copied = 0, num_sge;
382 struct sk_msg *msg;
383
384 msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
385 if (unlikely(!msg))
386 return -EAGAIN;
387 if (!sk_rmem_schedule(sk, skb, skb->len)) {
388 kfree(msg);
389 return -EAGAIN;
390 }
391
392 sk_msg_init(msg);
393 num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
394 if (unlikely(num_sge < 0)) {
395 kfree(msg);
396 return num_sge;
397 }
398
399 sk_mem_charge(sk, skb->len);
400 copied = skb->len;
401 msg->sg.start = 0;
402 msg->sg.end = num_sge == MAX_MSG_FRAGS ? 0 : num_sge;
403 msg->skb = skb;
404
405 sk_psock_queue_msg(psock, msg);
406 sk->sk_data_ready(sk);
407 return copied;
408}
409
410static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
411 u32 off, u32 len, bool ingress)
412{
413 if (ingress)
414 return sk_psock_skb_ingress(psock, skb);
415 else
416 return skb_send_sock_locked(psock->sk, skb, off, len);
417}
418
419static void sk_psock_backlog(struct work_struct *work)
420{
421 struct sk_psock *psock = container_of(work, struct sk_psock, work);
422 struct sk_psock_work_state *state = &psock->work_state;
423 struct sk_buff *skb;
424 bool ingress;
425 u32 len, off;
426 int ret;
427
428 /* Lock sock to avoid losing sk_socket during loop. */
429 lock_sock(psock->sk);
430 if (state->skb) {
431 skb = state->skb;
432 len = state->len;
433 off = state->off;
434 state->skb = NULL;
435 goto start;
436 }
437
438 while ((skb = skb_dequeue(&psock->ingress_skb))) {
439 len = skb->len;
440 off = 0;
441start:
442 ingress = tcp_skb_bpf_ingress(skb);
443 do {
444 ret = -EIO;
445 if (likely(psock->sk->sk_socket))
446 ret = sk_psock_handle_skb(psock, skb, off,
447 len, ingress);
448 if (ret <= 0) {
449 if (ret == -EAGAIN) {
450 state->skb = skb;
451 state->len = len;
452 state->off = off;
453 goto end;
454 }
455 /* Hard errors break pipe and stop xmit. */
456 sk_psock_report_error(psock, ret ? -ret : EPIPE);
457 sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
458 kfree_skb(skb);
459 goto end;
460 }
461 off += ret;
462 len -= ret;
463 } while (len);
464
465 if (!ingress)
466 kfree_skb(skb);
467 }
468end:
469 release_sock(psock->sk);
470}
471
472struct sk_psock *sk_psock_init(struct sock *sk, int node)
473{
474 struct sk_psock *psock = kzalloc_node(sizeof(*psock),
475 GFP_ATOMIC | __GFP_NOWARN,
476 node);
477 if (!psock)
478 return NULL;
479
480 psock->sk = sk;
481 psock->eval = __SK_NONE;
482
483 INIT_LIST_HEAD(&psock->link);
484 spin_lock_init(&psock->link_lock);
485
486 INIT_WORK(&psock->work, sk_psock_backlog);
487 INIT_LIST_HEAD(&psock->ingress_msg);
488 skb_queue_head_init(&psock->ingress_skb);
489
490 sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
491 refcount_set(&psock->refcnt, 1);
492
493 rcu_assign_sk_user_data(sk, psock);
494 sock_hold(sk);
495
496 return psock;
497}
498EXPORT_SYMBOL_GPL(sk_psock_init);
499
500struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
501{
502 struct sk_psock_link *link;
503
504 spin_lock_bh(&psock->link_lock);
505 link = list_first_entry_or_null(&psock->link, struct sk_psock_link,
506 list);
507 if (link)
508 list_del(&link->list);
509 spin_unlock_bh(&psock->link_lock);
510 return link;
511}
512
513void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
514{
515 struct sk_msg *msg, *tmp;
516
517 list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
518 list_del(&msg->list);
519 sk_msg_free(psock->sk, msg);
520 kfree(msg);
521 }
522}
523
524static void sk_psock_zap_ingress(struct sk_psock *psock)
525{
526 __skb_queue_purge(&psock->ingress_skb);
527 __sk_psock_purge_ingress_msg(psock);
528}
529
530static void sk_psock_link_destroy(struct sk_psock *psock)
531{
532 struct sk_psock_link *link, *tmp;
533
534 list_for_each_entry_safe(link, tmp, &psock->link, list) {
535 list_del(&link->list);
536 sk_psock_free_link(link);
537 }
538}
539
540static void sk_psock_destroy_deferred(struct work_struct *gc)
541{
542 struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
543
544 /* No sk_callback_lock since already detached. */
545 if (psock->parser.enabled)
546 strp_done(&psock->parser.strp);
547
548 cancel_work_sync(&psock->work);
549
550 psock_progs_drop(&psock->progs);
551
552 sk_psock_link_destroy(psock);
553 sk_psock_cork_free(psock);
554 sk_psock_zap_ingress(psock);
555
556 if (psock->sk_redir)
557 sock_put(psock->sk_redir);
558 sock_put(psock->sk);
559 kfree(psock);
560}
561
562void sk_psock_destroy(struct rcu_head *rcu)
563{
564 struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu);
565
566 INIT_WORK(&psock->gc, sk_psock_destroy_deferred);
567 schedule_work(&psock->gc);
568}
569EXPORT_SYMBOL_GPL(sk_psock_destroy);
570
571void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
572{
573 rcu_assign_sk_user_data(sk, NULL);
574 sk_psock_cork_free(psock);
575 sk_psock_restore_proto(sk, psock);
576
577 write_lock_bh(&sk->sk_callback_lock);
578 if (psock->progs.skb_parser)
579 sk_psock_stop_strp(sk, psock);
580 write_unlock_bh(&sk->sk_callback_lock);
581 sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
582
583 call_rcu_sched(&psock->rcu, sk_psock_destroy);
584}
585EXPORT_SYMBOL_GPL(sk_psock_drop);
586
587static int sk_psock_map_verd(int verdict, bool redir)
588{
589 switch (verdict) {
590 case SK_PASS:
591 return redir ? __SK_REDIRECT : __SK_PASS;
592 case SK_DROP:
593 default:
594 break;
595 }
596
597 return __SK_DROP;
598}
599
600int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
601 struct sk_msg *msg)
602{
603 struct bpf_prog *prog;
604 int ret;
605
606 preempt_disable();
607 rcu_read_lock();
608 prog = READ_ONCE(psock->progs.msg_parser);
609 if (unlikely(!prog)) {
610 ret = __SK_PASS;
611 goto out;
612 }
613
614 sk_msg_compute_data_pointers(msg);
615 msg->sk = sk;
616 ret = BPF_PROG_RUN(prog, msg);
617 ret = sk_psock_map_verd(ret, msg->sk_redir);
618 psock->apply_bytes = msg->apply_bytes;
619 if (ret == __SK_REDIRECT) {
620 if (psock->sk_redir)
621 sock_put(psock->sk_redir);
622 psock->sk_redir = msg->sk_redir;
623 if (!psock->sk_redir) {
624 ret = __SK_DROP;
625 goto out;
626 }
627 sock_hold(psock->sk_redir);
628 }
629out:
630 rcu_read_unlock();
631 preempt_enable();
632 return ret;
633}
634EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
635
636static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
637 struct sk_buff *skb)
638{
639 int ret;
640
641 skb->sk = psock->sk;
642 bpf_compute_data_end_sk_skb(skb);
643 preempt_disable();
644 ret = BPF_PROG_RUN(prog, skb);
645 preempt_enable();
 646	/* strparser clones the skb before handing it to an upper layer,
647 * meaning skb_orphan has been called. We NULL sk on the way out
648 * to ensure we don't trigger a BUG_ON() in skb/sk operations
649 * later and because we are not charging the memory of this skb
650 * to any socket yet.
651 */
652 skb->sk = NULL;
653 return ret;
654}
655
656static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
657{
658 struct sk_psock_parser *parser;
659
660 parser = container_of(strp, struct sk_psock_parser, strp);
661 return container_of(parser, struct sk_psock, parser);
662}
663
664static void sk_psock_verdict_apply(struct sk_psock *psock,
665 struct sk_buff *skb, int verdict)
666{
667 struct sk_psock *psock_other;
668 struct sock *sk_other;
669 bool ingress;
670
671 switch (verdict) {
672 case __SK_REDIRECT:
673 sk_other = tcp_skb_bpf_redirect_fetch(skb);
674 if (unlikely(!sk_other))
675 goto out_free;
676 psock_other = sk_psock(sk_other);
677 if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
678 !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED))
679 goto out_free;
680 ingress = tcp_skb_bpf_ingress(skb);
681 if ((!ingress && sock_writeable(sk_other)) ||
682 (ingress &&
683 atomic_read(&sk_other->sk_rmem_alloc) <=
684 sk_other->sk_rcvbuf)) {
685 if (!ingress)
686 skb_set_owner_w(skb, sk_other);
687 skb_queue_tail(&psock_other->ingress_skb, skb);
688 schedule_work(&psock_other->work);
689 break;
690 }
691 /* fall-through */
692 case __SK_DROP:
693 /* fall-through */
694 default:
695out_free:
696 kfree_skb(skb);
697 }
698}
699
700static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
701{
702 struct sk_psock *psock = sk_psock_from_strp(strp);
703 struct bpf_prog *prog;
704 int ret = __SK_DROP;
705
706 rcu_read_lock();
707 prog = READ_ONCE(psock->progs.skb_verdict);
708 if (likely(prog)) {
709 skb_orphan(skb);
710 tcp_skb_bpf_redirect_clear(skb);
711 ret = sk_psock_bpf_run(psock, prog, skb);
712 ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
713 }
714 rcu_read_unlock();
715 sk_psock_verdict_apply(psock, skb, ret);
716}
717
718static int sk_psock_strp_read_done(struct strparser *strp, int err)
719{
720 return err;
721}
722
723static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
724{
725 struct sk_psock *psock = sk_psock_from_strp(strp);
726 struct bpf_prog *prog;
727 int ret = skb->len;
728
729 rcu_read_lock();
730 prog = READ_ONCE(psock->progs.skb_parser);
731 if (likely(prog))
732 ret = sk_psock_bpf_run(psock, prog, skb);
733 rcu_read_unlock();
734 return ret;
735}
736
737/* Called with socket lock held. */
738static void sk_psock_data_ready(struct sock *sk)
739{
740 struct sk_psock *psock;
741
742 rcu_read_lock();
743 psock = sk_psock(sk);
744 if (likely(psock)) {
745 write_lock_bh(&sk->sk_callback_lock);
746 strp_data_ready(&psock->parser.strp);
747 write_unlock_bh(&sk->sk_callback_lock);
748 }
749 rcu_read_unlock();
750}
751
752static void sk_psock_write_space(struct sock *sk)
753{
754 struct sk_psock *psock;
755 void (*write_space)(struct sock *sk);
756
757 rcu_read_lock();
758 psock = sk_psock(sk);
759 if (likely(psock && sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)))
760 schedule_work(&psock->work);
761 write_space = psock->saved_write_space;
762 rcu_read_unlock();
763 write_space(sk);
764}
765
766int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
767{
768 static const struct strp_callbacks cb = {
769 .rcv_msg = sk_psock_strp_read,
770 .read_sock_done = sk_psock_strp_read_done,
771 .parse_msg = sk_psock_strp_parse,
772 };
773
774 psock->parser.enabled = false;
775 return strp_init(&psock->parser.strp, sk, &cb);
776}
777
778void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
779{
780 struct sk_psock_parser *parser = &psock->parser;
781
782 if (parser->enabled)
783 return;
784
785 parser->saved_data_ready = sk->sk_data_ready;
786 sk->sk_data_ready = sk_psock_data_ready;
787 sk->sk_write_space = sk_psock_write_space;
788 parser->enabled = true;
789}
790
791void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
792{
793 struct sk_psock_parser *parser = &psock->parser;
794
795 if (!parser->enabled)
796 return;
797
798 sk->sk_data_ready = parser->saved_data_ready;
799 parser->saved_data_ready = NULL;
800 strp_stop(&parser->strp);
801 parser->enabled = false;
802}
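
The strparser plumbing above runs two BPF programs per received stream: sk_psock_strp_parse() calls the skb parser program to learn the message length, and sk_psock_strp_read() calls the skb verdict program, whose SK_PASS/SK_DROP/redirect result is applied by sk_psock_verdict_apply(). A minimal BPF-side sketch of such a parser/verdict pair follows; the map name, section names and the samples-style bpf_helpers.h header are illustrative assumptions, not part of this patch.

/* Hedged sketch of an SK_SKB parser/verdict pair of the kind the
 * strparser path above expects. Section names are illustrative; the
 * loader attaches the two programs to the sockmap as
 * BPF_SK_SKB_STREAM_PARSER and BPF_SK_SKB_STREAM_VERDICT.
 */
#include <linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") sock_map = {
	.type		= BPF_MAP_TYPE_SOCKMAP,
	.key_size	= sizeof(int),
	.value_size	= sizeof(int),
	.max_entries	= 2,
};

SEC("sk_skb/stream_parser")
int prog_parser(struct __sk_buff *skb)
{
	/* Treat every skb as one complete message. */
	return skb->len;
}

SEC("sk_skb/stream_verdict")
int prog_verdict(struct __sk_buff *skb)
{
	/* Redirect everything to the socket stored at index 0;
	 * sk_psock_verdict_apply() turns this into a queue onto the
	 * peer psock's ingress_skb list or a send on its behalf.
	 */
	return bpf_sk_redirect_map(skb, &sock_map, 0, 0);
}

char _license[] SEC("license") = "GPL";
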
diff --git a/net/core/sock.c b/net/core/sock.c
index 3730eb855095..6fcc4bc07d19 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -998,7 +998,7 @@ set_rcvbuf:
998 cmpxchg(&sk->sk_pacing_status, 998 cmpxchg(&sk->sk_pacing_status,
999 SK_PACING_NONE, 999 SK_PACING_NONE,
1000 SK_PACING_NEEDED); 1000 SK_PACING_NEEDED);
1001 sk->sk_max_pacing_rate = val; 1001 sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
1002 sk->sk_pacing_rate = min(sk->sk_pacing_rate, 1002 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1003 sk->sk_max_pacing_rate); 1003 sk->sk_max_pacing_rate);
1004 break; 1004 break;
@@ -1336,7 +1336,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
1336#endif 1336#endif
1337 1337
1338 case SO_MAX_PACING_RATE: 1338 case SO_MAX_PACING_RATE:
1339 v.val = sk->sk_max_pacing_rate; 1339 /* 32bit version */
1340 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1340 break; 1341 break;
1341 1342
1342 case SO_INCOMING_CPU: 1343 case SO_INCOMING_CPU:
@@ -2238,67 +2239,6 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2238} 2239}
2239EXPORT_SYMBOL(sk_page_frag_refill); 2240EXPORT_SYMBOL(sk_page_frag_refill);
2240 2241
2241int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
2242 int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
2243 int first_coalesce)
2244{
2245 int sg_curr = *sg_curr_index, use = 0, rc = 0;
2246 unsigned int size = *sg_curr_size;
2247 struct page_frag *pfrag;
2248 struct scatterlist *sge;
2249
2250 len -= size;
2251 pfrag = sk_page_frag(sk);
2252
2253 while (len > 0) {
2254 unsigned int orig_offset;
2255
2256 if (!sk_page_frag_refill(sk, pfrag)) {
2257 rc = -ENOMEM;
2258 goto out;
2259 }
2260
2261 use = min_t(int, len, pfrag->size - pfrag->offset);
2262
2263 if (!sk_wmem_schedule(sk, use)) {
2264 rc = -ENOMEM;
2265 goto out;
2266 }
2267
2268 sk_mem_charge(sk, use);
2269 size += use;
2270 orig_offset = pfrag->offset;
2271 pfrag->offset += use;
2272
2273 sge = sg + sg_curr - 1;
2274 if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
2275 sge->offset + sge->length == orig_offset) {
2276 sge->length += use;
2277 } else {
2278 sge = sg + sg_curr;
2279 sg_unmark_end(sge);
2280 sg_set_page(sge, pfrag->page, use, orig_offset);
2281 get_page(pfrag->page);
2282 sg_curr++;
2283
2284 if (sg_curr == MAX_SKB_FRAGS)
2285 sg_curr = 0;
2286
2287 if (sg_curr == sg_start) {
2288 rc = -ENOSPC;
2289 break;
2290 }
2291 }
2292
2293 len -= use;
2294 }
2295out:
2296 *sg_curr_size = size;
2297 *sg_curr_index = sg_curr;
2298 return rc;
2299}
2300EXPORT_SYMBOL(sk_alloc_sg);
2301
2302static void __lock_sock(struct sock *sk) 2242static void __lock_sock(struct sock *sk)
2303 __releases(&sk->sk_lock.slock) 2243 __releases(&sk->sk_lock.slock)
2304 __acquires(&sk->sk_lock.slock) 2244 __acquires(&sk->sk_lock.slock)
@@ -2317,7 +2257,7 @@ static void __lock_sock(struct sock *sk)
2317 finish_wait(&sk->sk_lock.wq, &wait); 2257 finish_wait(&sk->sk_lock.wq, &wait);
2318} 2258}
2319 2259
2320static void __release_sock(struct sock *sk) 2260void __release_sock(struct sock *sk)
2321 __releases(&sk->sk_lock.slock) 2261 __releases(&sk->sk_lock.slock)
2322 __acquires(&sk->sk_lock.slock) 2262 __acquires(&sk->sk_lock.slock)
2323{ 2263{
@@ -2332,7 +2272,7 @@ static void __release_sock(struct sock *sk)
2332 next = skb->next; 2272 next = skb->next;
2333 prefetch(next); 2273 prefetch(next);
2334 WARN_ON_ONCE(skb_dst_is_noref(skb)); 2274 WARN_ON_ONCE(skb_dst_is_noref(skb));
2335 skb->next = NULL; 2275 skb_mark_not_on_list(skb);
2336 sk_backlog_rcv(sk, skb); 2276 sk_backlog_rcv(sk, skb);
2337 2277
2338 cond_resched(); 2278 cond_resched();
@@ -2810,8 +2750,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
2810 sk->sk_ll_usec = sysctl_net_busy_read; 2750 sk->sk_ll_usec = sysctl_net_busy_read;
2811#endif 2751#endif
2812 2752
2813 sk->sk_max_pacing_rate = ~0U; 2753 sk->sk_max_pacing_rate = ~0UL;
2814 sk->sk_pacing_rate = ~0U; 2754 sk->sk_pacing_rate = ~0UL;
2815 sk->sk_pacing_shift = 10; 2755 sk->sk_pacing_shift = 10;
2816 sk->sk_incoming_cpu = -1; 2756 sk->sk_incoming_cpu = -1;
2817 2757
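
The pacing-rate hunks above widen sk_max_pacing_rate and sk_pacing_rate to unsigned long: a 32-bit SO_MAX_PACING_RATE value of ~0U is promoted to the new unlimited sentinel ~0UL on set, and getsockopt() clamps the stored value back to 32 bits so old callers still read ~0U. A small illustrative user-space calculation of that round trip (names and values here are for demonstration only, not kernel code):

/* Illustrative only: the 32-bit compatibility behaviour of
 * SO_MAX_PACING_RATE after the pacing rate became unsigned long.
 */
#include <stdio.h>

int main(void)
{
	unsigned int user_val = ~0U;	/* old 32-bit "unlimited" sentinel */
	unsigned long max_pacing_rate;
	unsigned int reported;

	/* setsockopt path: ~0U is promoted to the unsigned long
	 * unlimited value.
	 */
	max_pacing_rate = (user_val == ~0U) ? ~0UL : user_val;

	/* getsockopt path: clamp back so 32-bit callers still read ~0U
	 * when the rate is unlimited.
	 */
	reported = max_pacing_rate > ~0U ? ~0U : (unsigned int)max_pacing_rate;

	printf("stored=%#lx reported=%#x\n", max_pacing_rate, reported);
	return 0;
}
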
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
new file mode 100644
index 000000000000..be6092ac69f8
--- /dev/null
+++ b/net/core/sock_map.c
@@ -0,0 +1,1003 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
3
4#include <linux/bpf.h>
5#include <linux/filter.h>
6#include <linux/errno.h>
7#include <linux/file.h>
8#include <linux/net.h>
9#include <linux/workqueue.h>
10#include <linux/skmsg.h>
11#include <linux/list.h>
12#include <linux/jhash.h>
13
14struct bpf_stab {
15 struct bpf_map map;
16 struct sock **sks;
17 struct sk_psock_progs progs;
18 raw_spinlock_t lock;
19};
20
21#define SOCK_CREATE_FLAG_MASK \
22 (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
23
24static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
25{
26 struct bpf_stab *stab;
27 u64 cost;
28 int err;
29
30 if (!capable(CAP_NET_ADMIN))
31 return ERR_PTR(-EPERM);
32 if (attr->max_entries == 0 ||
33 attr->key_size != 4 ||
34 attr->value_size != 4 ||
35 attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
36 return ERR_PTR(-EINVAL);
37
38 stab = kzalloc(sizeof(*stab), GFP_USER);
39 if (!stab)
40 return ERR_PTR(-ENOMEM);
41
42 bpf_map_init_from_attr(&stab->map, attr);
43 raw_spin_lock_init(&stab->lock);
44
45 /* Make sure page count doesn't overflow. */
46 cost = (u64) stab->map.max_entries * sizeof(struct sock *);
47 if (cost >= U32_MAX - PAGE_SIZE) {
48 err = -EINVAL;
49 goto free_stab;
50 }
51
52 stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
53 err = bpf_map_precharge_memlock(stab->map.pages);
54 if (err)
55 goto free_stab;
56
57 stab->sks = bpf_map_area_alloc(stab->map.max_entries *
58 sizeof(struct sock *),
59 stab->map.numa_node);
60 if (stab->sks)
61 return &stab->map;
62 err = -ENOMEM;
63free_stab:
64 kfree(stab);
65 return ERR_PTR(err);
66}
67
68int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
69{
70 u32 ufd = attr->target_fd;
71 struct bpf_map *map;
72 struct fd f;
73 int ret;
74
75 f = fdget(ufd);
76 map = __bpf_map_get(f);
77 if (IS_ERR(map))
78 return PTR_ERR(map);
79 ret = sock_map_prog_update(map, prog, attr->attach_type);
80 fdput(f);
81 return ret;
82}
83
84static void sock_map_sk_acquire(struct sock *sk)
85 __acquires(&sk->sk_lock.slock)
86{
87 lock_sock(sk);
88 preempt_disable();
89 rcu_read_lock();
90}
91
92static void sock_map_sk_release(struct sock *sk)
93 __releases(&sk->sk_lock.slock)
94{
95 rcu_read_unlock();
96 preempt_enable();
97 release_sock(sk);
98}
99
100static void sock_map_add_link(struct sk_psock *psock,
101 struct sk_psock_link *link,
102 struct bpf_map *map, void *link_raw)
103{
104 link->link_raw = link_raw;
105 link->map = map;
106 spin_lock_bh(&psock->link_lock);
107 list_add_tail(&link->list, &psock->link);
108 spin_unlock_bh(&psock->link_lock);
109}
110
111static void sock_map_del_link(struct sock *sk,
112 struct sk_psock *psock, void *link_raw)
113{
114 struct sk_psock_link *link, *tmp;
115 bool strp_stop = false;
116
117 spin_lock_bh(&psock->link_lock);
118 list_for_each_entry_safe(link, tmp, &psock->link, list) {
119 if (link->link_raw == link_raw) {
120 struct bpf_map *map = link->map;
121 struct bpf_stab *stab = container_of(map, struct bpf_stab,
122 map);
123 if (psock->parser.enabled && stab->progs.skb_parser)
124 strp_stop = true;
125 list_del(&link->list);
126 sk_psock_free_link(link);
127 }
128 }
129 spin_unlock_bh(&psock->link_lock);
130 if (strp_stop) {
131 write_lock_bh(&sk->sk_callback_lock);
132 sk_psock_stop_strp(sk, psock);
133 write_unlock_bh(&sk->sk_callback_lock);
134 }
135}
136
137static void sock_map_unref(struct sock *sk, void *link_raw)
138{
139 struct sk_psock *psock = sk_psock(sk);
140
141 if (likely(psock)) {
142 sock_map_del_link(sk, psock, link_raw);
143 sk_psock_put(sk, psock);
144 }
145}
146
147static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
148 struct sock *sk)
149{
150 struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
151 bool skb_progs, sk_psock_is_new = false;
152 struct sk_psock *psock;
153 int ret;
154
155 skb_verdict = READ_ONCE(progs->skb_verdict);
156 skb_parser = READ_ONCE(progs->skb_parser);
157 skb_progs = skb_parser && skb_verdict;
158 if (skb_progs) {
159 skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
160 if (IS_ERR(skb_verdict))
161 return PTR_ERR(skb_verdict);
162 skb_parser = bpf_prog_inc_not_zero(skb_parser);
163 if (IS_ERR(skb_parser)) {
164 bpf_prog_put(skb_verdict);
165 return PTR_ERR(skb_parser);
166 }
167 }
168
169 msg_parser = READ_ONCE(progs->msg_parser);
170 if (msg_parser) {
171 msg_parser = bpf_prog_inc_not_zero(msg_parser);
172 if (IS_ERR(msg_parser)) {
173 ret = PTR_ERR(msg_parser);
174 goto out;
175 }
176 }
177
178 psock = sk_psock_get_checked(sk);
179 if (IS_ERR(psock)) {
180 ret = PTR_ERR(psock);
181 goto out_progs;
182 }
183
184 if (psock) {
185 if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
186 (skb_progs && READ_ONCE(psock->progs.skb_parser))) {
187 sk_psock_put(sk, psock);
188 ret = -EBUSY;
189 goto out_progs;
190 }
191 } else {
192 psock = sk_psock_init(sk, map->numa_node);
193 if (!psock) {
194 ret = -ENOMEM;
195 goto out_progs;
196 }
197 sk_psock_is_new = true;
198 }
199
200 if (msg_parser)
201 psock_set_prog(&psock->progs.msg_parser, msg_parser);
202 if (sk_psock_is_new) {
203 ret = tcp_bpf_init(sk);
204 if (ret < 0)
205 goto out_drop;
206 } else {
207 tcp_bpf_reinit(sk);
208 }
209
210 write_lock_bh(&sk->sk_callback_lock);
211 if (skb_progs && !psock->parser.enabled) {
212 ret = sk_psock_init_strp(sk, psock);
213 if (ret) {
214 write_unlock_bh(&sk->sk_callback_lock);
215 goto out_drop;
216 }
217 psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
218 psock_set_prog(&psock->progs.skb_parser, skb_parser);
219 sk_psock_start_strp(sk, psock);
220 }
221 write_unlock_bh(&sk->sk_callback_lock);
222 return 0;
223out_drop:
224 sk_psock_put(sk, psock);
225out_progs:
226 if (msg_parser)
227 bpf_prog_put(msg_parser);
228out:
229 if (skb_progs) {
230 bpf_prog_put(skb_verdict);
231 bpf_prog_put(skb_parser);
232 }
233 return ret;
234}
235
236static void sock_map_free(struct bpf_map *map)
237{
238 struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
239 int i;
240
241 synchronize_rcu();
242 rcu_read_lock();
243 raw_spin_lock_bh(&stab->lock);
244 for (i = 0; i < stab->map.max_entries; i++) {
245 struct sock **psk = &stab->sks[i];
246 struct sock *sk;
247
248 sk = xchg(psk, NULL);
249 if (sk)
250 sock_map_unref(sk, psk);
251 }
252 raw_spin_unlock_bh(&stab->lock);
253 rcu_read_unlock();
254
255 bpf_map_area_free(stab->sks);
256 kfree(stab);
257}
258
259static void sock_map_release_progs(struct bpf_map *map)
260{
261 psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs);
262}
263
264static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
265{
266 struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
267
268 WARN_ON_ONCE(!rcu_read_lock_held());
269
270 if (unlikely(key >= map->max_entries))
271 return NULL;
272 return READ_ONCE(stab->sks[key]);
273}
274
275static void *sock_map_lookup(struct bpf_map *map, void *key)
276{
277 return ERR_PTR(-EOPNOTSUPP);
278}
279
280static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test,
281 struct sock **psk)
282{
283 struct sock *sk;
284
285 raw_spin_lock_bh(&stab->lock);
286 sk = *psk;
287 if (!sk_test || sk_test == sk)
288 *psk = NULL;
289 raw_spin_unlock_bh(&stab->lock);
290 if (unlikely(!sk))
291 return -EINVAL;
292 sock_map_unref(sk, psk);
293 return 0;
294}
295
296static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk,
297 void *link_raw)
298{
299 struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
300
301 __sock_map_delete(stab, sk, link_raw);
302}
303
304static int sock_map_delete_elem(struct bpf_map *map, void *key)
305{
306 struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
307 u32 i = *(u32 *)key;
308 struct sock **psk;
309
310 if (unlikely(i >= map->max_entries))
311 return -EINVAL;
312
313 psk = &stab->sks[i];
314 return __sock_map_delete(stab, NULL, psk);
315}
316
317static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next)
318{
319 struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
320 u32 i = key ? *(u32 *)key : U32_MAX;
321 u32 *key_next = next;
322
323 if (i == stab->map.max_entries - 1)
324 return -ENOENT;
325 if (i >= stab->map.max_entries)
326 *key_next = 0;
327 else
328 *key_next = i + 1;
329 return 0;
330}
331
332static int sock_map_update_common(struct bpf_map *map, u32 idx,
333 struct sock *sk, u64 flags)
334{
335 struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
336 struct sk_psock_link *link;
337 struct sk_psock *psock;
338 struct sock *osk;
339 int ret;
340
341 WARN_ON_ONCE(!rcu_read_lock_held());
342 if (unlikely(flags > BPF_EXIST))
343 return -EINVAL;
344 if (unlikely(idx >= map->max_entries))
345 return -E2BIG;
346
347 link = sk_psock_init_link();
348 if (!link)
349 return -ENOMEM;
350
351 ret = sock_map_link(map, &stab->progs, sk);
352 if (ret < 0)
353 goto out_free;
354
355 psock = sk_psock(sk);
356 WARN_ON_ONCE(!psock);
357
358 raw_spin_lock_bh(&stab->lock);
359 osk = stab->sks[idx];
360 if (osk && flags == BPF_NOEXIST) {
361 ret = -EEXIST;
362 goto out_unlock;
363 } else if (!osk && flags == BPF_EXIST) {
364 ret = -ENOENT;
365 goto out_unlock;
366 }
367
368 sock_map_add_link(psock, link, map, &stab->sks[idx]);
369 stab->sks[idx] = sk;
370 if (osk)
371 sock_map_unref(osk, &stab->sks[idx]);
372 raw_spin_unlock_bh(&stab->lock);
373 return 0;
374out_unlock:
375 raw_spin_unlock_bh(&stab->lock);
376 if (psock)
377 sk_psock_put(sk, psock);
378out_free:
379 sk_psock_free_link(link);
380 return ret;
381}
382
383static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops)
384{
385 return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB ||
386 ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB;
387}
388
389static bool sock_map_sk_is_suitable(const struct sock *sk)
390{
391 return sk->sk_type == SOCK_STREAM &&
392 sk->sk_protocol == IPPROTO_TCP;
393}
394
395static int sock_map_update_elem(struct bpf_map *map, void *key,
396 void *value, u64 flags)
397{
398 u32 ufd = *(u32 *)value;
399 u32 idx = *(u32 *)key;
400 struct socket *sock;
401 struct sock *sk;
402 int ret;
403
404 sock = sockfd_lookup(ufd, &ret);
405 if (!sock)
406 return ret;
407 sk = sock->sk;
408 if (!sk) {
409 ret = -EINVAL;
410 goto out;
411 }
412 if (!sock_map_sk_is_suitable(sk) ||
413 sk->sk_state != TCP_ESTABLISHED) {
414 ret = -EOPNOTSUPP;
415 goto out;
416 }
417
418 sock_map_sk_acquire(sk);
419 ret = sock_map_update_common(map, idx, sk, flags);
420 sock_map_sk_release(sk);
421out:
422 fput(sock->file);
423 return ret;
424}
425
426BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops,
427 struct bpf_map *, map, void *, key, u64, flags)
428{
429 WARN_ON_ONCE(!rcu_read_lock_held());
430
431 if (likely(sock_map_sk_is_suitable(sops->sk) &&
432 sock_map_op_okay(sops)))
433 return sock_map_update_common(map, *(u32 *)key, sops->sk,
434 flags);
435 return -EOPNOTSUPP;
436}
437
438const struct bpf_func_proto bpf_sock_map_update_proto = {
439 .func = bpf_sock_map_update,
440 .gpl_only = false,
441 .pkt_access = true,
442 .ret_type = RET_INTEGER,
443 .arg1_type = ARG_PTR_TO_CTX,
444 .arg2_type = ARG_CONST_MAP_PTR,
445 .arg3_type = ARG_PTR_TO_MAP_KEY,
446 .arg4_type = ARG_ANYTHING,
447};
448
449BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
450 struct bpf_map *, map, u32, key, u64, flags)
451{
452 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
453
454 if (unlikely(flags & ~(BPF_F_INGRESS)))
455 return SK_DROP;
456 tcb->bpf.flags = flags;
457 tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
458 if (!tcb->bpf.sk_redir)
459 return SK_DROP;
460 return SK_PASS;
461}
462
463const struct bpf_func_proto bpf_sk_redirect_map_proto = {
464 .func = bpf_sk_redirect_map,
465 .gpl_only = false,
466 .ret_type = RET_INTEGER,
467 .arg1_type = ARG_PTR_TO_CTX,
468 .arg2_type = ARG_CONST_MAP_PTR,
469 .arg3_type = ARG_ANYTHING,
470 .arg4_type = ARG_ANYTHING,
471};
472
473BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
474 struct bpf_map *, map, u32, key, u64, flags)
475{
476 if (unlikely(flags & ~(BPF_F_INGRESS)))
477 return SK_DROP;
478 msg->flags = flags;
479 msg->sk_redir = __sock_map_lookup_elem(map, key);
480 if (!msg->sk_redir)
481 return SK_DROP;
482 return SK_PASS;
483}
484
485const struct bpf_func_proto bpf_msg_redirect_map_proto = {
486 .func = bpf_msg_redirect_map,
487 .gpl_only = false,
488 .ret_type = RET_INTEGER,
489 .arg1_type = ARG_PTR_TO_CTX,
490 .arg2_type = ARG_CONST_MAP_PTR,
491 .arg3_type = ARG_ANYTHING,
492 .arg4_type = ARG_ANYTHING,
493};
494
495const struct bpf_map_ops sock_map_ops = {
496 .map_alloc = sock_map_alloc,
497 .map_free = sock_map_free,
498 .map_get_next_key = sock_map_get_next_key,
499 .map_update_elem = sock_map_update_elem,
500 .map_delete_elem = sock_map_delete_elem,
501 .map_lookup_elem = sock_map_lookup,
502 .map_release_uref = sock_map_release_progs,
503 .map_check_btf = map_check_no_btf,
504};
505
506struct bpf_htab_elem {
507 struct rcu_head rcu;
508 u32 hash;
509 struct sock *sk;
510 struct hlist_node node;
511 u8 key[0];
512};
513
514struct bpf_htab_bucket {
515 struct hlist_head head;
516 raw_spinlock_t lock;
517};
518
519struct bpf_htab {
520 struct bpf_map map;
521 struct bpf_htab_bucket *buckets;
522 u32 buckets_num;
523 u32 elem_size;
524 struct sk_psock_progs progs;
525 atomic_t count;
526};
527
528static inline u32 sock_hash_bucket_hash(const void *key, u32 len)
529{
530 return jhash(key, len, 0);
531}
532
533static struct bpf_htab_bucket *sock_hash_select_bucket(struct bpf_htab *htab,
534 u32 hash)
535{
536 return &htab->buckets[hash & (htab->buckets_num - 1)];
537}
538
539static struct bpf_htab_elem *
540sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key,
541 u32 key_size)
542{
543 struct bpf_htab_elem *elem;
544
545 hlist_for_each_entry_rcu(elem, head, node) {
546 if (elem->hash == hash &&
547 !memcmp(&elem->key, key, key_size))
548 return elem;
549 }
550
551 return NULL;
552}
553
554static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
555{
556 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
557 u32 key_size = map->key_size, hash;
558 struct bpf_htab_bucket *bucket;
559 struct bpf_htab_elem *elem;
560
561 WARN_ON_ONCE(!rcu_read_lock_held());
562
563 hash = sock_hash_bucket_hash(key, key_size);
564 bucket = sock_hash_select_bucket(htab, hash);
565 elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
566
567 return elem ? elem->sk : NULL;
568}
569
570static void sock_hash_free_elem(struct bpf_htab *htab,
571 struct bpf_htab_elem *elem)
572{
573 atomic_dec(&htab->count);
574 kfree_rcu(elem, rcu);
575}
576
577static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk,
578 void *link_raw)
579{
580 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
581 struct bpf_htab_elem *elem_probe, *elem = link_raw;
582 struct bpf_htab_bucket *bucket;
583
584 WARN_ON_ONCE(!rcu_read_lock_held());
585 bucket = sock_hash_select_bucket(htab, elem->hash);
586
587 /* elem may be deleted in parallel from the map, but access here
 588	 * is okay since it's going away only after an RCU grace period.
589 * However, we need to check whether it's still present.
590 */
591 raw_spin_lock_bh(&bucket->lock);
592 elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash,
593 elem->key, map->key_size);
594 if (elem_probe && elem_probe == elem) {
595 hlist_del_rcu(&elem->node);
596 sock_map_unref(elem->sk, elem);
597 sock_hash_free_elem(htab, elem);
598 }
599 raw_spin_unlock_bh(&bucket->lock);
600}
601
602static int sock_hash_delete_elem(struct bpf_map *map, void *key)
603{
604 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
605 u32 hash, key_size = map->key_size;
606 struct bpf_htab_bucket *bucket;
607 struct bpf_htab_elem *elem;
608 int ret = -ENOENT;
609
610 hash = sock_hash_bucket_hash(key, key_size);
611 bucket = sock_hash_select_bucket(htab, hash);
612
613 raw_spin_lock_bh(&bucket->lock);
614 elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
615 if (elem) {
616 hlist_del_rcu(&elem->node);
617 sock_map_unref(elem->sk, elem);
618 sock_hash_free_elem(htab, elem);
619 ret = 0;
620 }
621 raw_spin_unlock_bh(&bucket->lock);
622 return ret;
623}
624
625static struct bpf_htab_elem *sock_hash_alloc_elem(struct bpf_htab *htab,
626 void *key, u32 key_size,
627 u32 hash, struct sock *sk,
628 struct bpf_htab_elem *old)
629{
630 struct bpf_htab_elem *new;
631
632 if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
633 if (!old) {
634 atomic_dec(&htab->count);
635 return ERR_PTR(-E2BIG);
636 }
637 }
638
639 new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
640 htab->map.numa_node);
641 if (!new) {
642 atomic_dec(&htab->count);
643 return ERR_PTR(-ENOMEM);
644 }
645 memcpy(new->key, key, key_size);
646 new->sk = sk;
647 new->hash = hash;
648 return new;
649}
650
651static int sock_hash_update_common(struct bpf_map *map, void *key,
652 struct sock *sk, u64 flags)
653{
654 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
655 u32 key_size = map->key_size, hash;
656 struct bpf_htab_elem *elem, *elem_new;
657 struct bpf_htab_bucket *bucket;
658 struct sk_psock_link *link;
659 struct sk_psock *psock;
660 int ret;
661
662 WARN_ON_ONCE(!rcu_read_lock_held());
663 if (unlikely(flags > BPF_EXIST))
664 return -EINVAL;
665
666 link = sk_psock_init_link();
667 if (!link)
668 return -ENOMEM;
669
670 ret = sock_map_link(map, &htab->progs, sk);
671 if (ret < 0)
672 goto out_free;
673
674 psock = sk_psock(sk);
675 WARN_ON_ONCE(!psock);
676
677 hash = sock_hash_bucket_hash(key, key_size);
678 bucket = sock_hash_select_bucket(htab, hash);
679
680 raw_spin_lock_bh(&bucket->lock);
681 elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
682 if (elem && flags == BPF_NOEXIST) {
683 ret = -EEXIST;
684 goto out_unlock;
685 } else if (!elem && flags == BPF_EXIST) {
686 ret = -ENOENT;
687 goto out_unlock;
688 }
689
690 elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem);
691 if (IS_ERR(elem_new)) {
692 ret = PTR_ERR(elem_new);
693 goto out_unlock;
694 }
695
696 sock_map_add_link(psock, link, map, elem_new);
 697	/* Add the new element to the head of the list, so that a
 698	 * concurrent search will find it before the old element.
699 */
700 hlist_add_head_rcu(&elem_new->node, &bucket->head);
701 if (elem) {
702 hlist_del_rcu(&elem->node);
703 sock_map_unref(elem->sk, elem);
704 sock_hash_free_elem(htab, elem);
705 }
706 raw_spin_unlock_bh(&bucket->lock);
707 return 0;
708out_unlock:
709 raw_spin_unlock_bh(&bucket->lock);
710 sk_psock_put(sk, psock);
711out_free:
712 sk_psock_free_link(link);
713 return ret;
714}
715
716static int sock_hash_update_elem(struct bpf_map *map, void *key,
717 void *value, u64 flags)
718{
719 u32 ufd = *(u32 *)value;
720 struct socket *sock;
721 struct sock *sk;
722 int ret;
723
724 sock = sockfd_lookup(ufd, &ret);
725 if (!sock)
726 return ret;
727 sk = sock->sk;
728 if (!sk) {
729 ret = -EINVAL;
730 goto out;
731 }
732 if (!sock_map_sk_is_suitable(sk) ||
733 sk->sk_state != TCP_ESTABLISHED) {
734 ret = -EOPNOTSUPP;
735 goto out;
736 }
737
738 sock_map_sk_acquire(sk);
739 ret = sock_hash_update_common(map, key, sk, flags);
740 sock_map_sk_release(sk);
741out:
742 fput(sock->file);
743 return ret;
744}
745
746static int sock_hash_get_next_key(struct bpf_map *map, void *key,
747 void *key_next)
748{
749 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
750 struct bpf_htab_elem *elem, *elem_next;
751 u32 hash, key_size = map->key_size;
752 struct hlist_head *head;
753 int i = 0;
754
755 if (!key)
756 goto find_first_elem;
757 hash = sock_hash_bucket_hash(key, key_size);
758 head = &sock_hash_select_bucket(htab, hash)->head;
759 elem = sock_hash_lookup_elem_raw(head, hash, key, key_size);
760 if (!elem)
761 goto find_first_elem;
762
763 elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)),
764 struct bpf_htab_elem, node);
765 if (elem_next) {
766 memcpy(key_next, elem_next->key, key_size);
767 return 0;
768 }
769
770 i = hash & (htab->buckets_num - 1);
771 i++;
772find_first_elem:
773 for (; i < htab->buckets_num; i++) {
774 head = &sock_hash_select_bucket(htab, i)->head;
775 elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
776 struct bpf_htab_elem, node);
777 if (elem_next) {
778 memcpy(key_next, elem_next->key, key_size);
779 return 0;
780 }
781 }
782
783 return -ENOENT;
784}
785
786static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
787{
788 struct bpf_htab *htab;
789 int i, err;
790 u64 cost;
791
792 if (!capable(CAP_NET_ADMIN))
793 return ERR_PTR(-EPERM);
794 if (attr->max_entries == 0 ||
795 attr->key_size == 0 ||
796 attr->value_size != 4 ||
797 attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
798 return ERR_PTR(-EINVAL);
799 if (attr->key_size > MAX_BPF_STACK)
800 return ERR_PTR(-E2BIG);
801
802 htab = kzalloc(sizeof(*htab), GFP_USER);
803 if (!htab)
804 return ERR_PTR(-ENOMEM);
805
806 bpf_map_init_from_attr(&htab->map, attr);
807
808 htab->buckets_num = roundup_pow_of_two(htab->map.max_entries);
809 htab->elem_size = sizeof(struct bpf_htab_elem) +
810 round_up(htab->map.key_size, 8);
811 if (htab->buckets_num == 0 ||
812 htab->buckets_num > U32_MAX / sizeof(struct bpf_htab_bucket)) {
813 err = -EINVAL;
814 goto free_htab;
815 }
816
817 cost = (u64) htab->buckets_num * sizeof(struct bpf_htab_bucket) +
818 (u64) htab->elem_size * htab->map.max_entries;
819 if (cost >= U32_MAX - PAGE_SIZE) {
820 err = -EINVAL;
821 goto free_htab;
822 }
823
824 htab->buckets = bpf_map_area_alloc(htab->buckets_num *
825 sizeof(struct bpf_htab_bucket),
826 htab->map.numa_node);
827 if (!htab->buckets) {
828 err = -ENOMEM;
829 goto free_htab;
830 }
831
832 for (i = 0; i < htab->buckets_num; i++) {
833 INIT_HLIST_HEAD(&htab->buckets[i].head);
834 raw_spin_lock_init(&htab->buckets[i].lock);
835 }
836
837 return &htab->map;
838free_htab:
839 kfree(htab);
840 return ERR_PTR(err);
841}
842
843static void sock_hash_free(struct bpf_map *map)
844{
845 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
846 struct bpf_htab_bucket *bucket;
847 struct bpf_htab_elem *elem;
848 struct hlist_node *node;
849 int i;
850
851 synchronize_rcu();
852 rcu_read_lock();
853 for (i = 0; i < htab->buckets_num; i++) {
854 bucket = sock_hash_select_bucket(htab, i);
855 raw_spin_lock_bh(&bucket->lock);
856 hlist_for_each_entry_safe(elem, node, &bucket->head, node) {
857 hlist_del_rcu(&elem->node);
858 sock_map_unref(elem->sk, elem);
859 }
860 raw_spin_unlock_bh(&bucket->lock);
861 }
862 rcu_read_unlock();
863
864 bpf_map_area_free(htab->buckets);
865 kfree(htab);
866}
867
868static void sock_hash_release_progs(struct bpf_map *map)
869{
870 psock_progs_drop(&container_of(map, struct bpf_htab, map)->progs);
871}
872
873BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops,
874 struct bpf_map *, map, void *, key, u64, flags)
875{
876 WARN_ON_ONCE(!rcu_read_lock_held());
877
878 if (likely(sock_map_sk_is_suitable(sops->sk) &&
879 sock_map_op_okay(sops)))
880 return sock_hash_update_common(map, key, sops->sk, flags);
881 return -EOPNOTSUPP;
882}
883
884const struct bpf_func_proto bpf_sock_hash_update_proto = {
885 .func = bpf_sock_hash_update,
886 .gpl_only = false,
887 .pkt_access = true,
888 .ret_type = RET_INTEGER,
889 .arg1_type = ARG_PTR_TO_CTX,
890 .arg2_type = ARG_CONST_MAP_PTR,
891 .arg3_type = ARG_PTR_TO_MAP_KEY,
892 .arg4_type = ARG_ANYTHING,
893};
894
895BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
896 struct bpf_map *, map, void *, key, u64, flags)
897{
898 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
899
900 if (unlikely(flags & ~(BPF_F_INGRESS)))
901 return SK_DROP;
902 tcb->bpf.flags = flags;
903 tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
904 if (!tcb->bpf.sk_redir)
905 return SK_DROP;
906 return SK_PASS;
907}
908
909const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
910 .func = bpf_sk_redirect_hash,
911 .gpl_only = false,
912 .ret_type = RET_INTEGER,
913 .arg1_type = ARG_PTR_TO_CTX,
914 .arg2_type = ARG_CONST_MAP_PTR,
915 .arg3_type = ARG_PTR_TO_MAP_KEY,
916 .arg4_type = ARG_ANYTHING,
917};
918
919BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg,
920 struct bpf_map *, map, void *, key, u64, flags)
921{
922 if (unlikely(flags & ~(BPF_F_INGRESS)))
923 return SK_DROP;
924 msg->flags = flags;
925 msg->sk_redir = __sock_hash_lookup_elem(map, key);
926 if (!msg->sk_redir)
927 return SK_DROP;
928 return SK_PASS;
929}
930
931const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
932 .func = bpf_msg_redirect_hash,
933 .gpl_only = false,
934 .ret_type = RET_INTEGER,
935 .arg1_type = ARG_PTR_TO_CTX,
936 .arg2_type = ARG_CONST_MAP_PTR,
937 .arg3_type = ARG_PTR_TO_MAP_KEY,
938 .arg4_type = ARG_ANYTHING,
939};
940
941const struct bpf_map_ops sock_hash_ops = {
942 .map_alloc = sock_hash_alloc,
943 .map_free = sock_hash_free,
944 .map_get_next_key = sock_hash_get_next_key,
945 .map_update_elem = sock_hash_update_elem,
946 .map_delete_elem = sock_hash_delete_elem,
947 .map_lookup_elem = sock_map_lookup,
948 .map_release_uref = sock_hash_release_progs,
949 .map_check_btf = map_check_no_btf,
950};
951
952static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
953{
954 switch (map->map_type) {
955 case BPF_MAP_TYPE_SOCKMAP:
956 return &container_of(map, struct bpf_stab, map)->progs;
957 case BPF_MAP_TYPE_SOCKHASH:
958 return &container_of(map, struct bpf_htab, map)->progs;
959 default:
960 break;
961 }
962
963 return NULL;
964}
965
966int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
967 u32 which)
968{
969 struct sk_psock_progs *progs = sock_map_progs(map);
970
971 if (!progs)
972 return -EOPNOTSUPP;
973
974 switch (which) {
975 case BPF_SK_MSG_VERDICT:
976 psock_set_prog(&progs->msg_parser, prog);
977 break;
978 case BPF_SK_SKB_STREAM_PARSER:
979 psock_set_prog(&progs->skb_parser, prog);
980 break;
981 case BPF_SK_SKB_STREAM_VERDICT:
982 psock_set_prog(&progs->skb_verdict, prog);
983 break;
984 default:
985 return -EOPNOTSUPP;
986 }
987
988 return 0;
989}
990
991void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link)
992{
993 switch (link->map->map_type) {
994 case BPF_MAP_TYPE_SOCKMAP:
995 return sock_map_delete_from_link(link->map, sk,
996 link->link_raw);
997 case BPF_MAP_TYPE_SOCKHASH:
998 return sock_hash_delete_from_link(link->map, sk,
999 link->link_raw);
1000 default:
1001 break;
1002 }
1003}
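
sock_map_update_elem() above expects the map value to be the file descriptor of a TCP socket in the ESTABLISHED state; it resolves the fd with sockfd_lookup(), takes the socket lock plus RCU via sock_map_sk_acquire(), and then runs the common update path. Below is a hedged user-space sketch of that usage, driving the raw bpf(2) syscall so it does not depend on any particular libbpf version; the socket fd is a placeholder.

/* Hedged sketch: create a BPF_MAP_TYPE_SOCKMAP and install an
 * established TCP socket at index 0. Error handling is minimal and
 * the socket fd must be supplied by the caller.
 */
#include <linux/bpf.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>
#include <stdio.h>

static int bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
	int sock_fd = -1;	/* placeholder: an ESTABLISHED TCP socket fd */
	union bpf_attr attr;
	__u32 key = 0, value;
	int map_fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_SOCKMAP;
	attr.key_size    = 4;	/* enforced by sock_map_alloc() */
	attr.value_size  = 4;	/* the value written is a socket fd */
	attr.max_entries = 16;
	map_fd = bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0)
		return 1;

	value = sock_fd;	/* looked up via sockfd_lookup() in-kernel */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (__u64)(unsigned long)&key;
	attr.value  = (__u64)(unsigned long)&value;
	attr.flags  = BPF_ANY;
	if (bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)) < 0)
		perror("BPF_MAP_UPDATE_ELEM");

	return 0;
}
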
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 89b6785cef2a..4b2b194f4f1f 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -94,11 +94,21 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
94 kfree(xa); 94 kfree(xa);
95} 95}
96 96
97static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) 97void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
98{ 98{
99 struct xdp_mem_allocator *xa; 99 struct xdp_mem_allocator *xa;
100 int id = xdp_rxq->mem.id; 100 int id = xdp_rxq->mem.id;
101 101
102 if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
103 WARN(1, "Missing register, driver bug");
104 return;
105 }
106
107 if (xdp_rxq->mem.type != MEM_TYPE_PAGE_POOL &&
108 xdp_rxq->mem.type != MEM_TYPE_ZERO_COPY) {
109 return;
110 }
111
102 if (id == 0) 112 if (id == 0)
103 return; 113 return;
104 114
@@ -110,6 +120,7 @@ static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
110 120
111 mutex_unlock(&mem_id_lock); 121 mutex_unlock(&mem_id_lock);
112} 122}
123EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
113 124
114void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) 125void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
115{ 126{
@@ -119,7 +130,7 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
119 130
120 WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); 131 WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");
121 132
122 __xdp_rxq_info_unreg_mem_model(xdp_rxq); 133 xdp_rxq_info_unreg_mem_model(xdp_rxq);
123 134
124 xdp_rxq->reg_state = REG_STATE_UNREGISTERED; 135 xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
125 xdp_rxq->dev = NULL; 136 xdp_rxq->dev = NULL;
@@ -398,3 +409,41 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
398 info->flags = bpf->flags; 409 info->flags = bpf->flags;
399} 410}
400EXPORT_SYMBOL_GPL(xdp_attachment_setup); 411EXPORT_SYMBOL_GPL(xdp_attachment_setup);
412
413struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
414{
415 unsigned int metasize, totsize;
416 void *addr, *data_to_copy;
417 struct xdp_frame *xdpf;
418 struct page *page;
419
420 /* Clone into a MEM_TYPE_PAGE_ORDER0 xdp_frame. */
421 metasize = xdp_data_meta_unsupported(xdp) ? 0 :
422 xdp->data - xdp->data_meta;
423 totsize = xdp->data_end - xdp->data + metasize;
424
425 if (sizeof(*xdpf) + totsize > PAGE_SIZE)
426 return NULL;
427
428 page = dev_alloc_page();
429 if (!page)
430 return NULL;
431
432 addr = page_to_virt(page);
433 xdpf = addr;
434 memset(xdpf, 0, sizeof(*xdpf));
435
436 addr += sizeof(*xdpf);
437 data_to_copy = metasize ? xdp->data_meta : xdp->data;
438 memcpy(addr, data_to_copy, totsize);
439
440 xdpf->data = addr + metasize;
441 xdpf->len = totsize - metasize;
442 xdpf->headroom = 0;
443 xdpf->metasize = metasize;
444 xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
445
446 xdp_return_buff(xdp);
447 return xdpf;
448}
449EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 875858c8b059..43733accf58e 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -325,7 +325,7 @@ __poll_t dccp_poll(struct file *file, struct socket *sock,
325 __poll_t mask; 325 __poll_t mask;
326 struct sock *sk = sock->sk; 326 struct sock *sk = sock->sk;
327 327
328 sock_poll_wait(file, wait); 328 sock_poll_wait(file, sock, wait);
329 if (sk->sk_state == DCCP_LISTEN) 329 if (sk->sk_state == DCCP_LISTEN)
330 return inet_csk_listen_poll(sk); 330 return inet_csk_listen_poll(sk);
331 331
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index bfd43e8f2c06..d0b3e69c6b39 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -1363,7 +1363,7 @@ static int dn_dev_seq_show(struct seq_file *seq, void *v)
1363 1363
1364 seq_printf(seq, "%-8s %1s %04u %04u %04lu %04lu" 1364 seq_printf(seq, "%-8s %1s %04u %04u %04lu %04lu"
1365 " %04hu %03d %02x %-10s %-7s %-7s\n", 1365 " %04hu %03d %02x %-10s %-7s %-7s\n",
1366 dev->name ? dev->name : "???", 1366 dev->name,
1367 dn_type2asc(dn_db->parms.mode), 1367 dn_type2asc(dn_db->parms.mode),
1368 0, 0, 1368 0, 0,
1369 dn_db->t3, dn_db->parms.t3, 1369 dn_db->t3, dn_db->parms.t3,
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 7f4534828f6c..a65d553e730d 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -29,6 +29,7 @@
29#include <linux/keyctl.h> 29#include <linux/keyctl.h>
30#include <linux/err.h> 30#include <linux/err.h>
31#include <linux/seq_file.h> 31#include <linux/seq_file.h>
32#include <linux/dns_resolver.h>
32#include <keys/dns_resolver-type.h> 33#include <keys/dns_resolver-type.h>
33#include <keys/user-type.h> 34#include <keys/user-type.h>
34#include "internal.h" 35#include "internal.h"
@@ -48,27 +49,86 @@ const struct cred *dns_resolver_cache;
48/* 49/*
49 * Preparse instantiation data for a dns_resolver key. 50 * Preparse instantiation data for a dns_resolver key.
50 * 51 *
51 * The data must be a NUL-terminated string, with the NUL char accounted in 52 * For normal hostname lookups, the data must be a NUL-terminated string, with
52 * datalen. 53 * the NUL char accounted in datalen.
53 * 54 *
 54 * If the data contains a '#' character, then we take the clause after each 55 * If the data contains a '#' character, then we take the clause after each
55 * one to be an option of the form 'key=value'. The actual data of interest is 56 * one to be an option of the form 'key=value'. The actual data of interest is
56 * the string leading up to the first '#'. For instance: 57 * the string leading up to the first '#'. For instance:
57 * 58 *
58 * "ip1,ip2,...#foo=bar" 59 * "ip1,ip2,...#foo=bar"
60 *
61 * For server list requests, the data must begin with a NUL char and be
62 * followed by a byte indicating the version of the data format. Version 1
63 * looks something like (note this is packed):
64 *
 65 *	u8			Non-string marker (i.e. 0)
66 * u8 Content (DNS_PAYLOAD_IS_*)
67 * u8 Version (e.g. 1)
68 * u8 Source of server list
69 * u8 Lookup status of server list
70 * u8 Number of servers
71 * foreach-server {
72 * __le16 Name length
73 * __le16 Priority (as per SRV record, low first)
74 * __le16 Weight (as per SRV record, higher first)
75 * __le16 Port
76 * u8 Source of address list
77 * u8 Lookup status of address list
78 * u8 Protocol (DNS_SERVER_PROTOCOL_*)
79 * u8 Number of addresses
80 * char[] Name (not NUL-terminated)
81 * foreach-address {
82 * u8 Family (DNS_ADDRESS_IS_*)
83 * union {
84 * u8[4] ipv4_addr
85 * u8[16] ipv6_addr
86 * }
87 * }
88 * }
89 *
59 */ 90 */
60static int 91static int
61dns_resolver_preparse(struct key_preparsed_payload *prep) 92dns_resolver_preparse(struct key_preparsed_payload *prep)
62{ 93{
94 const struct dns_payload_header *bin;
63 struct user_key_payload *upayload; 95 struct user_key_payload *upayload;
64 unsigned long derrno; 96 unsigned long derrno;
65 int ret; 97 int ret;
66 int datalen = prep->datalen, result_len = 0; 98 int datalen = prep->datalen, result_len = 0;
67 const char *data = prep->data, *end, *opt; 99 const char *data = prep->data, *end, *opt;
68 100
101 if (datalen <= 1 || !data)
102 return -EINVAL;
103
104 if (data[0] == 0) {
105 /* It may be a server list. */
106 if (datalen <= sizeof(*bin))
107 return -EINVAL;
108
109 bin = (const struct dns_payload_header *)data;
110 kenter("[%u,%u],%u", bin->content, bin->version, datalen);
111 if (bin->content != DNS_PAYLOAD_IS_SERVER_LIST) {
112 pr_warn_ratelimited(
113 "dns_resolver: Unsupported content type (%u)\n",
114 bin->content);
115 return -EINVAL;
116 }
117
118 if (bin->version != 1) {
119 pr_warn_ratelimited(
120 "dns_resolver: Unsupported server list version (%u)\n",
121 bin->version);
122 return -EINVAL;
123 }
124
125 result_len = datalen;
126 goto store_result;
127 }
128
69 kenter("'%*.*s',%u", datalen, datalen, data, datalen); 129 kenter("'%*.*s',%u", datalen, datalen, data, datalen);
70 130
71 if (datalen <= 1 || !data || data[datalen - 1] != '\0') 131 if (!data || data[datalen - 1] != '\0')
72 return -EINVAL; 132 return -EINVAL;
73 datalen--; 133 datalen--;
74 134
@@ -144,6 +204,7 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
144 return 0; 204 return 0;
145 } 205 }
146 206
207store_result:
147 kdebug("store result"); 208 kdebug("store result");
148 prep->quotalen = result_len; 209 prep->quotalen = result_len;
149 210
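
dns_resolver_preparse() now also accepts a binary payload whose first byte is NUL, followed by a content type and version; the block comment above documents the packed version-1 server-list layout. The sketch below restates that layout as packed C structs purely for illustration; the struct and field names are this sketch's own, not the kernel's uapi definitions.

/* Illustrative packed-struct view of the version-1 server-list payload
 * documented above. The kernel's own definitions live in the
 * dns_resolver uapi header.
 */
#include <stdint.h>

struct dns_list_prefix {
	uint8_t  zero;		/* non-string marker, always 0 */
	uint8_t  content;	/* DNS_PAYLOAD_IS_* */
	uint8_t  version;	/* 1 for the format below */
	uint8_t  source;	/* source of the server list */
	uint8_t  status;	/* lookup status of the server list */
	uint8_t  nr_servers;	/* number of server records following */
} __attribute__((packed));

struct dns_server_prefix {	/* followed by name[name_len], then addresses */
	uint16_t name_len;	/* __le16 on the wire */
	uint16_t priority;	/* SRV priority, low first */
	uint16_t weight;	/* SRV weight, higher first */
	uint16_t port;
	uint8_t  source;	/* source of the address list */
	uint8_t  status;	/* lookup status of the address list */
	uint8_t  protocol;	/* DNS_SERVER_PROTOCOL_* */
	uint8_t  nr_addrs;	/* number of address records following */
} __attribute__((packed));

struct dns_address {
	uint8_t  family;	/* DNS_ADDRESS_IS_* */
	union {			/* only family-sized bytes are on the wire */
		uint8_t ipv4[4];
		uint8_t ipv6[16];
	} addr;
} __attribute__((packed));
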
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
index 49da67034f29..76338c38738a 100644
--- a/net/dns_resolver/dns_query.c
+++ b/net/dns_resolver/dns_query.c
@@ -148,12 +148,9 @@ int dns_query(const char *type, const char *name, size_t namelen,
148 148
149 if (_result) { 149 if (_result) {
150 ret = -ENOMEM; 150 ret = -ENOMEM;
151 *_result = kmalloc(len + 1, GFP_KERNEL); 151 *_result = kmemdup_nul(upayload->data, len, GFP_KERNEL);
152 if (!*_result) 152 if (!*_result)
153 goto put; 153 goto put;
154
155 memcpy(*_result, upayload->data, len);
156 (*_result)[len] = '\0';
157 } 154 }
158 155
159 if (_expiry) 156 if (_expiry)
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 4183e4ba27a5..48c41918fb35 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -38,6 +38,9 @@ config NET_DSA_TAG_DSA
38config NET_DSA_TAG_EDSA 38config NET_DSA_TAG_EDSA
39 bool 39 bool
40 40
41config NET_DSA_TAG_GSWIP
42 bool
43
41config NET_DSA_TAG_KSZ 44config NET_DSA_TAG_KSZ
42 bool 45 bool
43 46
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 9e4d3536f977..6e721f7a2947 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -9,6 +9,7 @@ dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
9dsa_core-$(CONFIG_NET_DSA_TAG_BRCM_PREPEND) += tag_brcm.o 9dsa_core-$(CONFIG_NET_DSA_TAG_BRCM_PREPEND) += tag_brcm.o
10dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o 10dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o
11dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o 11dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o
12dsa_core-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o
12dsa_core-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o 13dsa_core-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o
13dsa_core-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o 14dsa_core-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o
14dsa_core-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o 15dsa_core-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 9f3209ff7ffd..a69c1790bbfc 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -52,6 +52,9 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
52#ifdef CONFIG_NET_DSA_TAG_EDSA 52#ifdef CONFIG_NET_DSA_TAG_EDSA
53 [DSA_TAG_PROTO_EDSA] = &edsa_netdev_ops, 53 [DSA_TAG_PROTO_EDSA] = &edsa_netdev_ops,
54#endif 54#endif
55#ifdef CONFIG_NET_DSA_TAG_GSWIP
56 [DSA_TAG_PROTO_GSWIP] = &gswip_netdev_ops,
57#endif
55#ifdef CONFIG_NET_DSA_TAG_KSZ 58#ifdef CONFIG_NET_DSA_TAG_KSZ
56 [DSA_TAG_PROTO_KSZ] = &ksz_netdev_ops, 59 [DSA_TAG_PROTO_KSZ] = &ksz_netdev_ops,
57#endif 60#endif
@@ -70,6 +73,52 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
70 [DSA_TAG_PROTO_NONE] = &none_ops, 73 [DSA_TAG_PROTO_NONE] = &none_ops,
71}; 74};
72 75
76const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops)
77{
78 const char *protocol_name[DSA_TAG_LAST] = {
79#ifdef CONFIG_NET_DSA_TAG_BRCM
80 [DSA_TAG_PROTO_BRCM] = "brcm",
81#endif
82#ifdef CONFIG_NET_DSA_TAG_BRCM_PREPEND
83 [DSA_TAG_PROTO_BRCM_PREPEND] = "brcm-prepend",
84#endif
85#ifdef CONFIG_NET_DSA_TAG_DSA
86 [DSA_TAG_PROTO_DSA] = "dsa",
87#endif
88#ifdef CONFIG_NET_DSA_TAG_EDSA
89 [DSA_TAG_PROTO_EDSA] = "edsa",
90#endif
91#ifdef CONFIG_NET_DSA_TAG_GSWIP
92 [DSA_TAG_PROTO_GSWIP] = "gswip",
93#endif
94#ifdef CONFIG_NET_DSA_TAG_KSZ
95 [DSA_TAG_PROTO_KSZ] = "ksz",
96#endif
97#ifdef CONFIG_NET_DSA_TAG_LAN9303
98 [DSA_TAG_PROTO_LAN9303] = "lan9303",
99#endif
100#ifdef CONFIG_NET_DSA_TAG_MTK
101 [DSA_TAG_PROTO_MTK] = "mtk",
102#endif
103#ifdef CONFIG_NET_DSA_TAG_QCA
104 [DSA_TAG_PROTO_QCA] = "qca",
105#endif
106#ifdef CONFIG_NET_DSA_TAG_TRAILER
107 [DSA_TAG_PROTO_TRAILER] = "trailer",
108#endif
109 [DSA_TAG_PROTO_NONE] = "none",
110 };
111 unsigned int i;
112
113 BUILD_BUG_ON(ARRAY_SIZE(protocol_name) != DSA_TAG_LAST);
114
115 for (i = 0; i < ARRAY_SIZE(dsa_device_ops); i++)
116 if (ops == dsa_device_ops[i])
117 return protocol_name[i];
118
119 return protocol_name[DSA_TAG_PROTO_NONE];
120};
121
73const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol) 122const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol)
74{ 123{
75 const struct dsa_device_ops *ops; 124 const struct dsa_device_ops *ops;
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 3964c6f7a7c0..9e4fd04ab53c 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -86,6 +86,7 @@ struct dsa_slave_priv {
86/* dsa.c */ 86/* dsa.c */
87const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); 87const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
88bool dsa_schedule_work(struct work_struct *work); 88bool dsa_schedule_work(struct work_struct *work);
89const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops);
89 90
90/* legacy.c */ 91/* legacy.c */
91#if IS_ENABLED(CONFIG_NET_DSA_LEGACY) 92#if IS_ENABLED(CONFIG_NET_DSA_LEGACY)
@@ -205,6 +206,9 @@ extern const struct dsa_device_ops dsa_netdev_ops;
205/* tag_edsa.c */ 206/* tag_edsa.c */
206extern const struct dsa_device_ops edsa_netdev_ops; 207extern const struct dsa_device_ops edsa_netdev_ops;
207 208
209/* tag_gswip.c */
210extern const struct dsa_device_ops gswip_netdev_ops;
211
208/* tag_ksz.c */ 212/* tag_ksz.c */
209extern const struct dsa_device_ops ksz_netdev_ops; 213extern const struct dsa_device_ops ksz_netdev_ops;
210 214
diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c
index 42a7b85b84e1..cb42939db776 100644
--- a/net/dsa/legacy.c
+++ b/net/dsa/legacy.c
@@ -392,8 +392,7 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd)
392 } 392 }
393 393
394 /* Drop our reference to the MDIO bus device */ 394 /* Drop our reference to the MDIO bus device */
395 if (pd->chip[i].host_dev) 395 put_device(pd->chip[i].host_dev);
396 put_device(pd->chip[i].host_dev);
397 } 396 }
398 kfree(pd->chip); 397 kfree(pd->chip);
399} 398}
@@ -687,8 +686,7 @@ static void dsa_shutdown(struct platform_device *pdev)
687#ifdef CONFIG_PM_SLEEP 686#ifdef CONFIG_PM_SLEEP
688static int dsa_suspend(struct device *d) 687static int dsa_suspend(struct device *d)
689{ 688{
690 struct platform_device *pdev = to_platform_device(d); 689 struct dsa_switch_tree *dst = dev_get_drvdata(d);
691 struct dsa_switch_tree *dst = platform_get_drvdata(pdev);
692 int i, ret = 0; 690 int i, ret = 0;
693 691
694 for (i = 0; i < dst->pd->nr_chips; i++) { 692 for (i = 0; i < dst->pd->nr_chips; i++) {
@@ -703,8 +701,7 @@ static int dsa_suspend(struct device *d)
703 701
704static int dsa_resume(struct device *d) 702static int dsa_resume(struct device *d)
705{ 703{
706 struct platform_device *pdev = to_platform_device(d); 704 struct dsa_switch_tree *dst = dev_get_drvdata(d);
707 struct dsa_switch_tree *dst = platform_get_drvdata(pdev);
708 int i, ret = 0; 705 int i, ret = 0;
709 706
710 for (i = 0; i < dst->pd->nr_chips; i++) { 707 for (i = 0; i < dst->pd->nr_chips; i++) {
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 1c45c1d6d241..7d0c19e7edcf 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -722,7 +722,7 @@ static void dsa_slave_netpoll_cleanup(struct net_device *dev)
722 722
723 p->netpoll = NULL; 723 p->netpoll = NULL;
724 724
725 __netpoll_free_async(netpoll); 725 __netpoll_free(netpoll);
726} 726}
727 727
728static void dsa_slave_poll_controller(struct net_device *dev) 728static void dsa_slave_poll_controller(struct net_device *dev)
@@ -1058,6 +1058,27 @@ static struct device_type dsa_type = {
1058 .name = "dsa", 1058 .name = "dsa",
1059}; 1059};
1060 1060
1061static ssize_t tagging_show(struct device *d, struct device_attribute *attr,
1062 char *buf)
1063{
1064 struct net_device *dev = to_net_dev(d);
1065 struct dsa_port *dp = dsa_slave_to_port(dev);
1066
1067 return sprintf(buf, "%s\n",
1068 dsa_tag_protocol_to_str(dp->cpu_dp->tag_ops));
1069}
1070static DEVICE_ATTR_RO(tagging);
1071
1072static struct attribute *dsa_slave_attrs[] = {
1073 &dev_attr_tagging.attr,
1074 NULL
1075};
1076
1077static const struct attribute_group dsa_group = {
1078 .name = "dsa",
1079 .attrs = dsa_slave_attrs,
1080};
1081
1061static void dsa_slave_phylink_validate(struct net_device *dev, 1082static void dsa_slave_phylink_validate(struct net_device *dev,
1062 unsigned long *supported, 1083 unsigned long *supported,
1063 struct phylink_link_state *state) 1084 struct phylink_link_state *state)
@@ -1353,8 +1374,14 @@ int dsa_slave_create(struct dsa_port *port)
1353 goto out_phy; 1374 goto out_phy;
1354 } 1375 }
1355 1376
1377 ret = sysfs_create_group(&slave_dev->dev.kobj, &dsa_group);
1378 if (ret)
1379 goto out_unreg;
1380
1356 return 0; 1381 return 0;
1357 1382
1383out_unreg:
1384 unregister_netdev(slave_dev);
1358out_phy: 1385out_phy:
1359 rtnl_lock(); 1386 rtnl_lock();
1360 phylink_disconnect_phy(p->dp->pl); 1387 phylink_disconnect_phy(p->dp->pl);
@@ -1378,6 +1405,7 @@ void dsa_slave_destroy(struct net_device *slave_dev)
1378 rtnl_unlock(); 1405 rtnl_unlock();
1379 1406
1380 dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER); 1407 dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER);
1408 sysfs_remove_group(&slave_dev->dev.kobj, &dsa_group);
1381 unregister_netdev(slave_dev); 1409 unregister_netdev(slave_dev);
1382 phylink_destroy(dp->pl); 1410 phylink_destroy(dp->pl);
1383 free_percpu(p->stats64); 1411 free_percpu(p->stats64);
@@ -1450,6 +1478,7 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work)
1450 netdev_dbg(dev, "fdb add failed err=%d\n", err); 1478 netdev_dbg(dev, "fdb add failed err=%d\n", err);
1451 break; 1479 break;
1452 } 1480 }
1481 fdb_info->offloaded = true;
1453 call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, dev, 1482 call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, dev,
1454 &fdb_info->info); 1483 &fdb_info->info);
1455 break; 1484 break;
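
The new sysfs group registered in dsa_slave_create() exposes a read-only "tagging" attribute under each slave interface, reporting the string returned by dsa_tag_protocol_to_str(). A hedged sketch of reading it from user space; the interface name is hypothetical.

/* Hedged sketch: read the tagging protocol of a DSA slave port via the
 * new sysfs attribute. The interface name "lan1" is hypothetical.
 */
#include <stdio.h>

int main(void)
{
	char proto[32] = "";
	FILE *f = fopen("/sys/class/net/lan1/dsa/tagging", "r");

	if (!f)
		return 1;
	if (fgets(proto, sizeof(proto), f))
		printf("tagging protocol: %s", proto);	/* e.g. "gswip\n" */
	fclose(f);
	return 0;
}
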
diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c
new file mode 100644
index 000000000000..49e9b73f1be3
--- /dev/null
+++ b/net/dsa/tag_gswip.c
@@ -0,0 +1,109 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Intel / Lantiq GSWIP V2.0 PMAC tag support
4 *
5 * Copyright (C) 2017 - 2018 Hauke Mehrtens <hauke@hauke-m.de>
6 */
7
8#include <linux/bitops.h>
9#include <linux/etherdevice.h>
10#include <linux/skbuff.h>
11#include <net/dsa.h>
12
13#include "dsa_priv.h"
14
15#define GSWIP_TX_HEADER_LEN 4
16
17/* special tag in TX path header */
18/* Byte 0 */
19#define GSWIP_TX_SLPID_SHIFT 0 /* source port ID */
20#define GSWIP_TX_SLPID_CPU 2
21#define GSWIP_TX_SLPID_APP1 3
22#define GSWIP_TX_SLPID_APP2 4
23#define GSWIP_TX_SLPID_APP3 5
24#define GSWIP_TX_SLPID_APP4 6
25#define GSWIP_TX_SLPID_APP5 7
26
27/* Byte 1 */
28#define GSWIP_TX_CRCGEN_DIS BIT(7)
29#define GSWIP_TX_DPID_SHIFT 0 /* destination group ID */
30#define GSWIP_TX_DPID_ELAN 0
31#define GSWIP_TX_DPID_EWAN 1
32#define GSWIP_TX_DPID_CPU 2
33#define GSWIP_TX_DPID_APP1 3
34#define GSWIP_TX_DPID_APP2 4
35#define GSWIP_TX_DPID_APP3 5
36#define GSWIP_TX_DPID_APP4 6
37#define GSWIP_TX_DPID_APP5 7
38
39/* Byte 2 */
40#define GSWIP_TX_PORT_MAP_EN BIT(7)
41#define GSWIP_TX_PORT_MAP_SEL BIT(6)
42#define GSWIP_TX_LRN_DIS BIT(5)
43#define GSWIP_TX_CLASS_EN BIT(4)
44#define GSWIP_TX_CLASS_SHIFT 0
45#define GSWIP_TX_CLASS_MASK GENMASK(3, 0)
46
47/* Byte 3 */
48#define GSWIP_TX_DPID_EN BIT(0)
49#define GSWIP_TX_PORT_MAP_SHIFT 1
50#define GSWIP_TX_PORT_MAP_MASK GENMASK(6, 1)
51
52#define GSWIP_RX_HEADER_LEN 8
53
54/* special tag in RX path header */
55/* Byte 7 */
56#define GSWIP_RX_SPPID_SHIFT 4
57#define GSWIP_RX_SPPID_MASK GENMASK(6, 4)
58
59static struct sk_buff *gswip_tag_xmit(struct sk_buff *skb,
60 struct net_device *dev)
61{
62 struct dsa_port *dp = dsa_slave_to_port(dev);
63 int err;
64 u8 *gswip_tag;
65
66 err = skb_cow_head(skb, GSWIP_TX_HEADER_LEN);
67 if (err)
68 return NULL;
69
70 skb_push(skb, GSWIP_TX_HEADER_LEN);
71
72 gswip_tag = skb->data;
73 gswip_tag[0] = GSWIP_TX_SLPID_CPU;
74 gswip_tag[1] = GSWIP_TX_DPID_ELAN;
75 gswip_tag[2] = GSWIP_TX_PORT_MAP_EN | GSWIP_TX_PORT_MAP_SEL;
76 gswip_tag[3] = BIT(dp->index + GSWIP_TX_PORT_MAP_SHIFT) & GSWIP_TX_PORT_MAP_MASK;
77 gswip_tag[3] |= GSWIP_TX_DPID_EN;
78
79 return skb;
80}
81
82static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb,
83 struct net_device *dev,
84 struct packet_type *pt)
85{
86 int port;
87 u8 *gswip_tag;
88
89 if (unlikely(!pskb_may_pull(skb, GSWIP_RX_HEADER_LEN)))
90 return NULL;
91
92 gswip_tag = skb->data - ETH_HLEN;
93
94 /* Get source port information */
95 port = (gswip_tag[7] & GSWIP_RX_SPPID_MASK) >> GSWIP_RX_SPPID_SHIFT;
96 skb->dev = dsa_master_find_slave(dev, 0, port);
97 if (!skb->dev)
98 return NULL;
99
100 /* remove GSWIP tag */
101 skb_pull_rcsum(skb, GSWIP_RX_HEADER_LEN);
102
103 return skb;
104}
105
106const struct dsa_device_ops gswip_netdev_ops = {
107 .xmit = gswip_tag_xmit,
108 .rcv = gswip_tag_rcv,
109};
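
gswip_tag_xmit() prepends a 4-byte PMAC header that selects the egress port through a port-map bitmap in byte 3, and gswip_tag_rcv() recovers the source port from bits 6:4 of byte 7 of the 8-byte RX header. A small illustrative calculation of those byte values for a hypothetical port (constants restated locally for the sketch, not shared with the driver):

/* Illustrative only: compose the 4-byte GSWIP TX header for a
 * hypothetical egress port and decode the source port from an RX
 * header byte, mirroring gswip_tag_xmit()/gswip_tag_rcv() above.
 */
#include <stdio.h>
#include <stdint.h>

#define TX_SLPID_CPU		2
#define TX_DPID_ELAN		0
#define TX_PORT_MAP_EN		(1 << 7)
#define TX_PORT_MAP_SEL		(1 << 6)
#define TX_DPID_EN		(1 << 0)
#define TX_PORT_MAP_SHIFT	1
#define RX_SPPID_SHIFT		4
#define RX_SPPID_MASK		0x70

int main(void)
{
	int port = 3;			/* hypothetical switch port */
	uint8_t tx[4], rx_byte7 = 0x35;

	tx[0] = TX_SLPID_CPU;
	tx[1] = TX_DPID_ELAN;
	tx[2] = TX_PORT_MAP_EN | TX_PORT_MAP_SEL;
	tx[3] = (1 << (port + TX_PORT_MAP_SHIFT)) | TX_DPID_EN;

	printf("tx header: %02x %02x %02x %02x\n", tx[0], tx[1], tx[2], tx[3]);
	printf("rx source port: %d\n",
	       (rx_byte7 & RX_SPPID_MASK) >> RX_SPPID_SHIFT);
	return 0;
}
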
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index e7857a8ac86d..d14226ecfde4 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -260,7 +260,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
260 } 260 }
261 sub_frag_mem_limit(fq->q.net, sum_truesize); 261 sub_frag_mem_limit(fq->q.net, sum_truesize);
262 262
263 head->next = NULL; 263 skb_mark_not_on_list(head);
264 head->dev = ldev; 264 head->dev = ldev;
265 head->tstamp = fq->q.stamp; 265 head->tstamp = fq->q.stamp;
266 266
@@ -463,7 +463,6 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net)
463 463
464 table[0].data = &ieee802154_lowpan->frags.high_thresh; 464 table[0].data = &ieee802154_lowpan->frags.high_thresh;
465 table[0].extra1 = &ieee802154_lowpan->frags.low_thresh; 465 table[0].extra1 = &ieee802154_lowpan->frags.low_thresh;
466 table[0].extra2 = &init_net.ieee802154_lowpan.frags.high_thresh;
467 table[1].data = &ieee802154_lowpan->frags.low_thresh; 466 table[1].data = &ieee802154_lowpan->frags.low_thresh;
468 table[1].extra2 = &ieee802154_lowpan->frags.high_thresh; 467 table[1].extra2 = &ieee802154_lowpan->frags.high_thresh;
469 table[2].data = &ieee802154_lowpan->frags.timeout; 468 table[2].data = &ieee802154_lowpan->frags.timeout;
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 7446b98661d8..58629314eae9 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
63obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o 63obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
64obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o 64obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
65obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o 65obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
66obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
66obj-$(CONFIG_NETLABEL) += cipso_ipv4.o 67obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
67 68
68obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ 69obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 4dd95cdd8070..c01fa791260d 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -461,9 +461,9 @@ static int ah4_err(struct sk_buff *skb, u32 info)
461 return 0; 461 return 0;
462 462
463 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) 463 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
464 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0); 464 ipv4_update_pmtu(skb, net, info, 0, IPPROTO_AH);
465 else 465 else
466 ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0); 466 ipv4_redirect(skb, net, 0, IPPROTO_AH);
467 xfrm_state_put(x); 467 xfrm_state_put(x);
468 468
469 return 0; 469 return 0;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index e90c89ef8c08..850a6f13a082 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1255,6 +1255,8 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event,
1255 change_info = ptr; 1255 change_info = ptr;
1256 if (change_info->flags_changed & IFF_NOARP) 1256 if (change_info->flags_changed & IFF_NOARP)
1257 neigh_changeaddr(&arp_tbl, dev); 1257 neigh_changeaddr(&arp_tbl, dev);
1258 if (!netif_carrier_ok(dev))
1259 neigh_carrier_down(&arp_tbl, dev);
1258 break; 1260 break;
1259 default: 1261 default:
1260 break; 1262 break;
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 82178cc69c96..777fa3b7fb13 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1512,7 +1512,7 @@ static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def,
1512 * 1512 *
1513 * Description: 1513 * Description:
1514 * Parse the packet's IP header looking for a CIPSO option. Returns a pointer 1514 * Parse the packet's IP header looking for a CIPSO option. Returns a pointer
1515 * to the start of the CIPSO option on success, NULL if one if not found. 1515 * to the start of the CIPSO option on success, NULL if one is not found.
1516 * 1516 *
1517 */ 1517 */
1518unsigned char *cipso_v4_optptr(const struct sk_buff *skb) 1518unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
@@ -1522,10 +1522,8 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
1522 int optlen; 1522 int optlen;
1523 int taglen; 1523 int taglen;
1524 1524
1525 for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) { 1525 for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 1; ) {
1526 switch (optptr[0]) { 1526 switch (optptr[0]) {
1527 case IPOPT_CIPSO:
1528 return optptr;
1529 case IPOPT_END: 1527 case IPOPT_END:
1530 return NULL; 1528 return NULL;
1531 case IPOPT_NOOP: 1529 case IPOPT_NOOP:
@@ -1534,6 +1532,11 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
1534 default: 1532 default:
1535 taglen = optptr[1]; 1533 taglen = optptr[1];
1536 } 1534 }
1535 if (!taglen || taglen > optlen)
1536 return NULL;
1537 if (optptr[0] == IPOPT_CIPSO)
1538 return optptr;
1539
1537 optlen -= taglen; 1540 optlen -= taglen;
1538 optptr += taglen; 1541 optptr += taglen;
1539 } 1542 }
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index f915abff1350..300921417f89 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -42,7 +42,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
42 oif = sk->sk_bound_dev_if; 42 oif = sk->sk_bound_dev_if;
43 saddr = inet->inet_saddr; 43 saddr = inet->inet_saddr;
44 if (ipv4_is_multicast(usin->sin_addr.s_addr)) { 44 if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
45 if (!oif) 45 if (!oif || netif_index_is_l3_master(sock_net(sk), oif))
46 oif = inet->mc_index; 46 oif = inet->mc_index;
47 if (!saddr) 47 if (!saddr)
48 saddr = inet->mc_addr; 48 saddr = inet->mc_addr;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index ea4bd8a52422..a34602ae27de 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -100,6 +100,16 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
100 [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, 100 [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
101 [IFA_FLAGS] = { .type = NLA_U32 }, 101 [IFA_FLAGS] = { .type = NLA_U32 },
102 [IFA_RT_PRIORITY] = { .type = NLA_U32 }, 102 [IFA_RT_PRIORITY] = { .type = NLA_U32 },
103 [IFA_TARGET_NETNSID] = { .type = NLA_S32 },
104};
105
106struct inet_fill_args {
107 u32 portid;
108 u32 seq;
109 int event;
110 unsigned int flags;
111 int netnsid;
112 int ifindex;
103}; 113};
104 114
105#define IN4_ADDR_HSIZE_SHIFT 8 115#define IN4_ADDR_HSIZE_SHIFT 8
@@ -773,7 +783,8 @@ static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft,
773} 783}
774 784
775static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, 785static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
776 __u32 *pvalid_lft, __u32 *pprefered_lft) 786 __u32 *pvalid_lft, __u32 *pprefered_lft,
787 struct netlink_ext_ack *extack)
777{ 788{
778 struct nlattr *tb[IFA_MAX+1]; 789 struct nlattr *tb[IFA_MAX+1];
779 struct in_ifaddr *ifa; 790 struct in_ifaddr *ifa;
@@ -783,7 +794,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
783 int err; 794 int err;
784 795
785 err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, 796 err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy,
786 NULL); 797 extack);
787 if (err < 0) 798 if (err < 0)
788 goto errout; 799 goto errout;
789 800
@@ -888,7 +899,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
888 899
889 ASSERT_RTNL(); 900 ASSERT_RTNL();
890 901
891 ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft); 902 ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft, extack);
892 if (IS_ERR(ifa)) 903 if (IS_ERR(ifa))
893 return PTR_ERR(ifa); 904 return PTR_ERR(ifa);
894 905
@@ -1584,13 +1595,14 @@ static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
1584} 1595}
1585 1596
1586static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, 1597static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
1587 u32 portid, u32 seq, int event, unsigned int flags) 1598 struct inet_fill_args *args)
1588{ 1599{
1589 struct ifaddrmsg *ifm; 1600 struct ifaddrmsg *ifm;
1590 struct nlmsghdr *nlh; 1601 struct nlmsghdr *nlh;
1591 u32 preferred, valid; 1602 u32 preferred, valid;
1592 1603
1593 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags); 1604 nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(*ifm),
1605 args->flags);
1594 if (!nlh) 1606 if (!nlh)
1595 return -EMSGSIZE; 1607 return -EMSGSIZE;
1596 1608
@@ -1601,6 +1613,10 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
1601 ifm->ifa_scope = ifa->ifa_scope; 1613 ifm->ifa_scope = ifa->ifa_scope;
1602 ifm->ifa_index = ifa->ifa_dev->dev->ifindex; 1614 ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
1603 1615
1616 if (args->netnsid >= 0 &&
1617 nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
1618 goto nla_put_failure;
1619
1604 if (!(ifm->ifa_flags & IFA_F_PERMANENT)) { 1620 if (!(ifm->ifa_flags & IFA_F_PERMANENT)) {
1605 preferred = ifa->ifa_preferred_lft; 1621 preferred = ifa->ifa_preferred_lft;
1606 valid = ifa->ifa_valid_lft; 1622 valid = ifa->ifa_valid_lft;
@@ -1645,27 +1661,142 @@ nla_put_failure:
1645 return -EMSGSIZE; 1661 return -EMSGSIZE;
1646} 1662}
1647 1663
1664static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh,
1665 struct inet_fill_args *fillargs,
1666 struct net **tgt_net, struct sock *sk,
1667 struct netlink_callback *cb)
1668{
1669 struct netlink_ext_ack *extack = cb->extack;
1670 struct nlattr *tb[IFA_MAX+1];
1671 struct ifaddrmsg *ifm;
1672 int err, i;
1673
1674 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
1675 NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request");
1676 return -EINVAL;
1677 }
1678
1679 ifm = nlmsg_data(nlh);
1680 if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
1681 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request");
1682 return -EINVAL;
1683 }
1684
1685 fillargs->ifindex = ifm->ifa_index;
1686 if (fillargs->ifindex) {
1687 cb->answer_flags |= NLM_F_DUMP_FILTERED;
1688 fillargs->flags |= NLM_F_DUMP_FILTERED;
1689 }
1690
1691 err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
1692 ifa_ipv4_policy, extack);
1693 if (err < 0)
1694 return err;
1695
1696 for (i = 0; i <= IFA_MAX; ++i) {
1697 if (!tb[i])
1698 continue;
1699
1700 if (i == IFA_TARGET_NETNSID) {
1701 struct net *net;
1702
1703 fillargs->netnsid = nla_get_s32(tb[i]);
1704
1705 net = rtnl_get_net_ns_capable(sk, fillargs->netnsid);
1706 if (IS_ERR(net)) {
1707 fillargs->netnsid = -1;
1708 NL_SET_ERR_MSG(extack, "ipv4: Invalid target network namespace id");
1709 return PTR_ERR(net);
1710 }
1711 *tgt_net = net;
1712 } else {
1713 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in dump request");
1714 return -EINVAL;
1715 }
1716 }
1717
1718 return 0;
1719}
1720
1721static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb,
1722 struct netlink_callback *cb, int s_ip_idx,
1723 struct inet_fill_args *fillargs)
1724{
1725 struct in_ifaddr *ifa;
1726 int ip_idx = 0;
1727 int err;
1728
1729 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next, ip_idx++) {
1730 if (ip_idx < s_ip_idx)
1731 continue;
1732
1733 err = inet_fill_ifaddr(skb, ifa, fillargs);
1734 if (err < 0)
1735 goto done;
1736
1737 nl_dump_check_consistent(cb, nlmsg_hdr(skb));
1738 }
1739 err = 0;
1740
1741done:
1742 cb->args[2] = ip_idx;
1743
1744 return err;
1745}
1746
1648static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) 1747static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
1649{ 1748{
1749 const struct nlmsghdr *nlh = cb->nlh;
1750 struct inet_fill_args fillargs = {
1751 .portid = NETLINK_CB(cb->skb).portid,
1752 .seq = nlh->nlmsg_seq,
1753 .event = RTM_NEWADDR,
1754 .flags = NLM_F_MULTI,
1755 .netnsid = -1,
1756 };
1650 struct net *net = sock_net(skb->sk); 1757 struct net *net = sock_net(skb->sk);
1758 struct net *tgt_net = net;
1651 int h, s_h; 1759 int h, s_h;
1652 int idx, s_idx; 1760 int idx, s_idx;
1653 int ip_idx, s_ip_idx; 1761 int s_ip_idx;
1654 struct net_device *dev; 1762 struct net_device *dev;
1655 struct in_device *in_dev; 1763 struct in_device *in_dev;
1656 struct in_ifaddr *ifa;
1657 struct hlist_head *head; 1764 struct hlist_head *head;
1765 int err = 0;
1658 1766
1659 s_h = cb->args[0]; 1767 s_h = cb->args[0];
1660 s_idx = idx = cb->args[1]; 1768 s_idx = idx = cb->args[1];
1661 s_ip_idx = ip_idx = cb->args[2]; 1769 s_ip_idx = cb->args[2];
1770
1771 if (cb->strict_check) {
1772 err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net,
1773 skb->sk, cb);
1774 if (err < 0)
1775 goto put_tgt_net;
1776
1777 err = 0;
1778 if (fillargs.ifindex) {
1779 dev = __dev_get_by_index(tgt_net, fillargs.ifindex);
1780 if (!dev) {
1781 err = -ENODEV;
1782 goto put_tgt_net;
1783 }
1784
1785 in_dev = __in_dev_get_rtnl(dev);
1786 if (in_dev) {
1787 err = in_dev_dump_addr(in_dev, skb, cb, s_ip_idx,
1788 &fillargs);
1789 }
1790 goto put_tgt_net;
1791 }
1792 }
1662 1793
1663 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { 1794 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
1664 idx = 0; 1795 idx = 0;
1665 head = &net->dev_index_head[h]; 1796 head = &tgt_net->dev_index_head[h];
1666 rcu_read_lock(); 1797 rcu_read_lock();
1667 cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^ 1798 cb->seq = atomic_read(&tgt_net->ipv4.dev_addr_genid) ^
1668 net->dev_base_seq; 1799 tgt_net->dev_base_seq;
1669 hlist_for_each_entry_rcu(dev, head, index_hlist) { 1800 hlist_for_each_entry_rcu(dev, head, index_hlist) {
1670 if (idx < s_idx) 1801 if (idx < s_idx)
1671 goto cont; 1802 goto cont;
@@ -1675,18 +1806,11 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
1675 if (!in_dev) 1806 if (!in_dev)
1676 goto cont; 1807 goto cont;
1677 1808
1678 for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; 1809 err = in_dev_dump_addr(in_dev, skb, cb, s_ip_idx,
1679 ifa = ifa->ifa_next, ip_idx++) { 1810 &fillargs);
1680 if (ip_idx < s_ip_idx) 1811 if (err < 0) {
1681 continue; 1812 rcu_read_unlock();
1682 if (inet_fill_ifaddr(skb, ifa, 1813 goto done;
1683 NETLINK_CB(cb->skb).portid,
1684 cb->nlh->nlmsg_seq,
1685 RTM_NEWADDR, NLM_F_MULTI) < 0) {
1686 rcu_read_unlock();
1687 goto done;
1688 }
1689 nl_dump_check_consistent(cb, nlmsg_hdr(skb));
1690 } 1814 }
1691cont: 1815cont:
1692 idx++; 1816 idx++;
@@ -1697,16 +1821,24 @@ cont:
1697done: 1821done:
1698 cb->args[0] = h; 1822 cb->args[0] = h;
1699 cb->args[1] = idx; 1823 cb->args[1] = idx;
1700 cb->args[2] = ip_idx; 1824put_tgt_net:
1825 if (fillargs.netnsid >= 0)
1826 put_net(tgt_net);
1701 1827
1702 return skb->len; 1828 return err < 0 ? err : skb->len;
1703} 1829}
1704 1830
1705static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, 1831static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
1706 u32 portid) 1832 u32 portid)
1707{ 1833{
1834 struct inet_fill_args fillargs = {
1835 .portid = portid,
1836 .seq = nlh ? nlh->nlmsg_seq : 0,
1837 .event = event,
1838 .flags = 0,
1839 .netnsid = -1,
1840 };
1708 struct sk_buff *skb; 1841 struct sk_buff *skb;
1709 u32 seq = nlh ? nlh->nlmsg_seq : 0;
1710 int err = -ENOBUFS; 1842 int err = -ENOBUFS;
1711 struct net *net; 1843 struct net *net;
1712 1844
@@ -1715,7 +1847,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
1715 if (!skb) 1847 if (!skb)
1716 goto errout; 1848 goto errout;
1717 1849
1718 err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0); 1850 err = inet_fill_ifaddr(skb, ifa, &fillargs);
1719 if (err < 0) { 1851 if (err < 0) {
1720 /* -EMSGSIZE implies BUG in inet_nlmsg_size() */ 1852 /* -EMSGSIZE implies BUG in inet_nlmsg_size() */
1721 WARN_ON(err == -EMSGSIZE); 1853 WARN_ON(err == -EMSGSIZE);
@@ -1995,6 +2127,7 @@ errout:
1995static int inet_netconf_dump_devconf(struct sk_buff *skb, 2127static int inet_netconf_dump_devconf(struct sk_buff *skb,
1996 struct netlink_callback *cb) 2128 struct netlink_callback *cb)
1997{ 2129{
2130 const struct nlmsghdr *nlh = cb->nlh;
1998 struct net *net = sock_net(skb->sk); 2131 struct net *net = sock_net(skb->sk);
1999 int h, s_h; 2132 int h, s_h;
2000 int idx, s_idx; 2133 int idx, s_idx;
@@ -2002,6 +2135,21 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb,
2002 struct in_device *in_dev; 2135 struct in_device *in_dev;
2003 struct hlist_head *head; 2136 struct hlist_head *head;
2004 2137
2138 if (cb->strict_check) {
2139 struct netlink_ext_ack *extack = cb->extack;
2140 struct netconfmsg *ncm;
2141
2142 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) {
2143 NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf dump request");
2144 return -EINVAL;
2145 }
2146
2147 if (nlmsg_attrlen(nlh, sizeof(*ncm))) {
2148 NL_SET_ERR_MSG(extack, "ipv4: Invalid data after header in netconf dump request");
2149 return -EINVAL;
2150 }
2151 }
2152
2005 s_h = cb->args[0]; 2153 s_h = cb->args[0];
2006 s_idx = idx = cb->args[1]; 2154 s_idx = idx = cb->args[1];
2007 2155
@@ -2021,7 +2169,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb,
2021 if (inet_netconf_fill_devconf(skb, dev->ifindex, 2169 if (inet_netconf_fill_devconf(skb, dev->ifindex,
2022 &in_dev->cnf, 2170 &in_dev->cnf,
2023 NETLINK_CB(cb->skb).portid, 2171 NETLINK_CB(cb->skb).portid,
2024 cb->nlh->nlmsg_seq, 2172 nlh->nlmsg_seq,
2025 RTM_NEWNETCONF, 2173 RTM_NEWNETCONF,
2026 NLM_F_MULTI, 2174 NLM_F_MULTI,
2027 NETCONFA_ALL) < 0) { 2175 NETCONFA_ALL) < 0) {
@@ -2038,7 +2186,7 @@ cont:
2038 if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, 2186 if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
2039 net->ipv4.devconf_all, 2187 net->ipv4.devconf_all,
2040 NETLINK_CB(cb->skb).portid, 2188 NETLINK_CB(cb->skb).portid,
2041 cb->nlh->nlmsg_seq, 2189 nlh->nlmsg_seq,
2042 RTM_NEWNETCONF, NLM_F_MULTI, 2190 RTM_NEWNETCONF, NLM_F_MULTI,
2043 NETCONFA_ALL) < 0) 2191 NETCONFA_ALL) < 0)
2044 goto done; 2192 goto done;
@@ -2049,7 +2197,7 @@ cont:
2049 if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, 2197 if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
2050 net->ipv4.devconf_dflt, 2198 net->ipv4.devconf_dflt,
2051 NETLINK_CB(cb->skb).portid, 2199 NETLINK_CB(cb->skb).portid,
2052 cb->nlh->nlmsg_seq, 2200 nlh->nlmsg_seq,
2053 RTM_NEWNETCONF, NLM_F_MULTI, 2201 RTM_NEWNETCONF, NLM_F_MULTI,
2054 NETCONFA_ALL) < 0) 2202 NETCONFA_ALL) < 0)
2055 goto done; 2203 goto done;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 97689012b357..9e1c840596c5 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -683,12 +683,11 @@ static void esp_input_done_esn(struct crypto_async_request *base, int err)
683 */ 683 */
684static int esp_input(struct xfrm_state *x, struct sk_buff *skb) 684static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
685{ 685{
686 struct ip_esp_hdr *esph;
687 struct crypto_aead *aead = x->data; 686 struct crypto_aead *aead = x->data;
688 struct aead_request *req; 687 struct aead_request *req;
689 struct sk_buff *trailer; 688 struct sk_buff *trailer;
690 int ivlen = crypto_aead_ivsize(aead); 689 int ivlen = crypto_aead_ivsize(aead);
691 int elen = skb->len - sizeof(*esph) - ivlen; 690 int elen = skb->len - sizeof(struct ip_esp_hdr) - ivlen;
692 int nfrags; 691 int nfrags;
693 int assoclen; 692 int assoclen;
694 int seqhilen; 693 int seqhilen;
@@ -698,13 +697,13 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
698 struct scatterlist *sg; 697 struct scatterlist *sg;
699 int err = -EINVAL; 698 int err = -EINVAL;
700 699
701 if (!pskb_may_pull(skb, sizeof(*esph) + ivlen)) 700 if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + ivlen))
702 goto out; 701 goto out;
703 702
704 if (elen <= 0) 703 if (elen <= 0)
705 goto out; 704 goto out;
706 705
707 assoclen = sizeof(*esph); 706 assoclen = sizeof(struct ip_esp_hdr);
708 seqhilen = 0; 707 seqhilen = 0;
709 708
710 if (x->props.flags & XFRM_STATE_ESN) { 709 if (x->props.flags & XFRM_STATE_ESN) {
@@ -820,9 +819,9 @@ static int esp4_err(struct sk_buff *skb, u32 info)
820 return 0; 819 return 0;
821 820
822 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) 821 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
823 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0); 822 ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ESP);
824 else 823 else
825 ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0); 824 ipv4_redirect(skb, net, 0, IPPROTO_ESP);
826 xfrm_state_put(x); 825 xfrm_state_put(x);
827 826
828 return 0; 827 return 0;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 0113993e9b2c..6df95be96311 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -315,6 +315,32 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
315 return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); 315 return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
316} 316}
317 317
318bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
319{
320 bool dev_match = false;
321#ifdef CONFIG_IP_ROUTE_MULTIPATH
322 int ret;
323
324 for (ret = 0; ret < fi->fib_nhs; ret++) {
325 struct fib_nh *nh = &fi->fib_nh[ret];
326
327 if (nh->nh_dev == dev) {
328 dev_match = true;
329 break;
330 } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) {
331 dev_match = true;
332 break;
333 }
334 }
335#else
336 if (fi->fib_nh[0].nh_dev == dev)
337 dev_match = true;
338#endif
339
340 return dev_match;
341}
342EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev);
343
318/* Given (packet source, input interface) and optional (dst, oif, tos): 344/* Given (packet source, input interface) and optional (dst, oif, tos):
319 * - (main) check, that source is valid i.e. not broadcast or our local 345 * - (main) check, that source is valid i.e. not broadcast or our local
320 * address. 346 * address.
@@ -361,24 +387,8 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
361 (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) 387 (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
362 goto e_inval; 388 goto e_inval;
363 fib_combine_itag(itag, &res); 389 fib_combine_itag(itag, &res);
364 dev_match = false;
365 390
366#ifdef CONFIG_IP_ROUTE_MULTIPATH 391 dev_match = fib_info_nh_uses_dev(res.fi, dev);
367 for (ret = 0; ret < res.fi->fib_nhs; ret++) {
368 struct fib_nh *nh = &res.fi->fib_nh[ret];
369
370 if (nh->nh_dev == dev) {
371 dev_match = true;
372 break;
373 } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) {
374 dev_match = true;
375 break;
376 }
377 }
378#else
379 if (FIB_RES_DEV(res) == dev)
380 dev_match = true;
381#endif
382 if (dev_match) { 392 if (dev_match) {
383 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 393 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
384 return ret; 394 return ret;
@@ -792,19 +802,115 @@ errout:
792 return err; 802 return err;
793} 803}
794 804
805int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
806 struct fib_dump_filter *filter,
807 struct netlink_callback *cb)
808{
809 struct netlink_ext_ack *extack = cb->extack;
810 struct nlattr *tb[RTA_MAX + 1];
811 struct rtmsg *rtm;
812 int err, i;
813
814 ASSERT_RTNL();
815
816 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
817 NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
818 return -EINVAL;
819 }
820
821 rtm = nlmsg_data(nlh);
822 if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos ||
823 rtm->rtm_scope) {
824 NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
825 return -EINVAL;
826 }
827 if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) {
828 NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request");
829 return -EINVAL;
830 }
831
832 filter->dump_all_families = (rtm->rtm_family == AF_UNSPEC);
833 filter->flags = rtm->rtm_flags;
834 filter->protocol = rtm->rtm_protocol;
835 filter->rt_type = rtm->rtm_type;
836 filter->table_id = rtm->rtm_table;
837
838 err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
839 rtm_ipv4_policy, extack);
840 if (err < 0)
841 return err;
842
843 for (i = 0; i <= RTA_MAX; ++i) {
844 int ifindex;
845
846 if (!tb[i])
847 continue;
848
849 switch (i) {
850 case RTA_TABLE:
851 filter->table_id = nla_get_u32(tb[i]);
852 break;
853 case RTA_OIF:
854 ifindex = nla_get_u32(tb[i]);
855 filter->dev = __dev_get_by_index(net, ifindex);
856 if (!filter->dev)
857 return -ENODEV;
858 break;
859 default:
860 NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
861 return -EINVAL;
862 }
863 }
864
865 if (filter->flags || filter->protocol || filter->rt_type ||
866 filter->table_id || filter->dev) {
867 filter->filter_set = 1;
868 cb->answer_flags = NLM_F_DUMP_FILTERED;
869 }
870
871 return 0;
872}
873EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);
874
795static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) 875static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
796{ 876{
877 const struct nlmsghdr *nlh = cb->nlh;
797 struct net *net = sock_net(skb->sk); 878 struct net *net = sock_net(skb->sk);
879 struct fib_dump_filter filter = {};
798 unsigned int h, s_h; 880 unsigned int h, s_h;
799 unsigned int e = 0, s_e; 881 unsigned int e = 0, s_e;
800 struct fib_table *tb; 882 struct fib_table *tb;
801 struct hlist_head *head; 883 struct hlist_head *head;
802 int dumped = 0, err; 884 int dumped = 0, err;
803 885
804 if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && 886 if (cb->strict_check) {
805 ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED) 887 err = ip_valid_fib_dump_req(net, nlh, &filter, cb);
888 if (err < 0)
889 return err;
890 } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
891 struct rtmsg *rtm = nlmsg_data(nlh);
892
893 filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
894 }
895
896 /* fib entries are never clones and ipv4 does not use prefix flag */
897 if (filter.flags & (RTM_F_PREFIX | RTM_F_CLONED))
806 return skb->len; 898 return skb->len;
807 899
900 if (filter.table_id) {
901 tb = fib_get_table(net, filter.table_id);
902 if (!tb) {
903 if (filter.dump_all_families)
904 return skb->len;
905
906 NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist");
907 return -ENOENT;
908 }
909
910 err = fib_table_dump(tb, skb, cb, &filter);
911 return skb->len ? : err;
912 }
913
808 s_h = cb->args[0]; 914 s_h = cb->args[0];
809 s_e = cb->args[1]; 915 s_e = cb->args[1];
810 916
@@ -819,7 +925,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
819 if (dumped) 925 if (dumped)
820 memset(&cb->args[2], 0, sizeof(cb->args) - 926 memset(&cb->args[2], 0, sizeof(cb->args) -
821 2 * sizeof(cb->args[0])); 927 2 * sizeof(cb->args[0]));
822 err = fib_table_dump(tb, skb, cb); 928 err = fib_table_dump(tb, skb, cb, &filter);
823 if (err < 0) { 929 if (err < 0) {
824 if (likely(skb->len)) 930 if (likely(skb->len))
825 goto out; 931 goto out;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 446204ca7406..b5c3937ca6ec 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -208,7 +208,6 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
208static void free_fib_info_rcu(struct rcu_head *head) 208static void free_fib_info_rcu(struct rcu_head *head)
209{ 209{
210 struct fib_info *fi = container_of(head, struct fib_info, rcu); 210 struct fib_info *fi = container_of(head, struct fib_info, rcu);
211 struct dst_metrics *m;
212 211
213 change_nexthops(fi) { 212 change_nexthops(fi) {
214 if (nexthop_nh->nh_dev) 213 if (nexthop_nh->nh_dev)
@@ -219,9 +218,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
219 rt_fibinfo_free(&nexthop_nh->nh_rth_input); 218 rt_fibinfo_free(&nexthop_nh->nh_rth_input);
220 } endfor_nexthops(fi); 219 } endfor_nexthops(fi);
221 220
222 m = fi->fib_metrics; 221 ip_fib_metrics_put(fi->fib_metrics);
223 if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt)) 222
224 kfree(m);
225 kfree(fi); 223 kfree(fi);
226} 224}
227 225
@@ -797,8 +795,10 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
797 return -EINVAL; 795 return -EINVAL;
798 } 796 }
799 dev = __dev_get_by_index(net, nh->nh_oif); 797 dev = __dev_get_by_index(net, nh->nh_oif);
800 if (!dev) 798 if (!dev) {
799 NL_SET_ERR_MSG(extack, "Nexthop device required for onlink");
801 return -ENODEV; 800 return -ENODEV;
801 }
802 if (!(dev->flags & IFF_UP)) { 802 if (!(dev->flags & IFF_UP)) {
803 NL_SET_ERR_MSG(extack, 803 NL_SET_ERR_MSG(extack,
804 "Nexthop device is not up"); 804 "Nexthop device is not up");
@@ -1018,13 +1018,6 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
1018 return true; 1018 return true;
1019} 1019}
1020 1020
1021static int
1022fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
1023{
1024 return ip_metrics_convert(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len,
1025 fi->fib_metrics->metrics);
1026}
1027
1028struct fib_info *fib_create_info(struct fib_config *cfg, 1021struct fib_info *fib_create_info(struct fib_config *cfg,
1029 struct netlink_ext_ack *extack) 1022 struct netlink_ext_ack *extack)
1030{ 1023{
@@ -1082,16 +1075,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
1082 fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); 1075 fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
1083 if (!fi) 1076 if (!fi)
1084 goto failure; 1077 goto failure;
1085 if (cfg->fc_mx) { 1078 fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx,
1086 fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL); 1079 cfg->fc_mx_len);
1087 if (unlikely(!fi->fib_metrics)) { 1080 if (unlikely(IS_ERR(fi->fib_metrics))) {
1088 kfree(fi); 1081 err = PTR_ERR(fi->fib_metrics);
1089 return ERR_PTR(err); 1082 kfree(fi);
1090 } 1083 return ERR_PTR(err);
1091 refcount_set(&fi->fib_metrics->refcnt, 1);
1092 } else {
1093 fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics;
1094 } 1084 }
1085
1095 fib_info_cnt++; 1086 fib_info_cnt++;
1096 fi->fib_net = net; 1087 fi->fib_net = net;
1097 fi->fib_protocol = cfg->fc_protocol; 1088 fi->fib_protocol = cfg->fc_protocol;
@@ -1110,10 +1101,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
1110 goto failure; 1101 goto failure;
1111 } endfor_nexthops(fi) 1102 } endfor_nexthops(fi)
1112 1103
1113 err = fib_convert_metrics(fi, cfg);
1114 if (err)
1115 goto failure;
1116
1117 if (cfg->fc_mp) { 1104 if (cfg->fc_mp) {
1118#ifdef CONFIG_IP_ROUTE_MULTIPATH 1105#ifdef CONFIG_IP_ROUTE_MULTIPATH
1119 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack); 1106 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 5bc0c89e81e4..237c9f72b265 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2003,12 +2003,17 @@ void fib_free_table(struct fib_table *tb)
2003} 2003}
2004 2004
2005static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, 2005static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
2006 struct sk_buff *skb, struct netlink_callback *cb) 2006 struct sk_buff *skb, struct netlink_callback *cb,
2007 struct fib_dump_filter *filter)
2007{ 2008{
2009 unsigned int flags = NLM_F_MULTI;
2008 __be32 xkey = htonl(l->key); 2010 __be32 xkey = htonl(l->key);
2009 struct fib_alias *fa; 2011 struct fib_alias *fa;
2010 int i, s_i; 2012 int i, s_i;
2011 2013
2014 if (filter->filter_set)
2015 flags |= NLM_F_DUMP_FILTERED;
2016
2012 s_i = cb->args[4]; 2017 s_i = cb->args[4];
2013 i = 0; 2018 i = 0;
2014 2019
@@ -2016,25 +2021,35 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
2016 hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { 2021 hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
2017 int err; 2022 int err;
2018 2023
2019 if (i < s_i) { 2024 if (i < s_i)
2020 i++; 2025 goto next;
2021 continue;
2022 }
2023 2026
2024 if (tb->tb_id != fa->tb_id) { 2027 if (tb->tb_id != fa->tb_id)
2025 i++; 2028 goto next;
2026 continue; 2029
2030 if (filter->filter_set) {
2031 if (filter->rt_type && fa->fa_type != filter->rt_type)
2032 goto next;
2033
2034 if ((filter->protocol &&
2035 fa->fa_info->fib_protocol != filter->protocol))
2036 goto next;
2037
2038 if (filter->dev &&
2039 !fib_info_nh_uses_dev(fa->fa_info, filter->dev))
2040 goto next;
2027 } 2041 }
2028 2042
2029 err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, 2043 err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
2030 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 2044 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2031 tb->tb_id, fa->fa_type, 2045 tb->tb_id, fa->fa_type,
2032 xkey, KEYLENGTH - fa->fa_slen, 2046 xkey, KEYLENGTH - fa->fa_slen,
2033 fa->fa_tos, fa->fa_info, NLM_F_MULTI); 2047 fa->fa_tos, fa->fa_info, flags);
2034 if (err < 0) { 2048 if (err < 0) {
2035 cb->args[4] = i; 2049 cb->args[4] = i;
2036 return err; 2050 return err;
2037 } 2051 }
2052next:
2038 i++; 2053 i++;
2039 } 2054 }
2040 2055
@@ -2044,7 +2059,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
2044 2059
2045/* rcu_read_lock needs to be hold by caller from readside */ 2060/* rcu_read_lock needs to be hold by caller from readside */
2046int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, 2061int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
2047 struct netlink_callback *cb) 2062 struct netlink_callback *cb, struct fib_dump_filter *filter)
2048{ 2063{
2049 struct trie *t = (struct trie *)tb->tb_data; 2064 struct trie *t = (struct trie *)tb->tb_data;
2050 struct key_vector *l, *tp = t->kv; 2065 struct key_vector *l, *tp = t->kv;
@@ -2057,7 +2072,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
2057 while ((l = leaf_walk_rcu(&tp, key)) != NULL) { 2072 while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
2058 int err; 2073 int err;
2059 2074
2060 err = fn_trie_dump_leaf(l, tb, skb, cb); 2075 err = fn_trie_dump_leaf(l, tb, skb, cb, filter);
2061 if (err < 0) { 2076 if (err < 0) {
2062 cb->args[3] = key; 2077 cb->args[3] = key;
2063 cb->args[2] = count; 2078 cb->args[2] = count;
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index b798862b6be5..7efe740c06eb 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -86,13 +86,14 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
86 86
87 options = (__be32 *)(greh + 1); 87 options = (__be32 *)(greh + 1);
88 if (greh->flags & GRE_CSUM) { 88 if (greh->flags & GRE_CSUM) {
89 if (skb_checksum_simple_validate(skb)) { 89 if (!skb_checksum_simple_validate(skb)) {
90 skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
91 null_compute_pseudo);
92 } else if (csum_err) {
90 *csum_err = true; 93 *csum_err = true;
91 return -EINVAL; 94 return -EINVAL;
92 } 95 }
93 96
94 skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
95 null_compute_pseudo);
96 options++; 97 options++;
97 } 98 }
98 99
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 695979b7ef6d..d832beed6e3a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1098,9 +1098,9 @@ void icmp_err(struct sk_buff *skb, u32 info)
1098 } 1098 }
1099 1099
1100 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) 1100 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
1101 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0); 1101 ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP);
1102 else if (type == ICMP_REDIRECT) 1102 else if (type == ICMP_REDIRECT)
1103 ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0); 1103 ipv4_redirect(skb, net, 0, IPPROTO_ICMP);
1104} 1104}
1105 1105
1106/* 1106/*
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index e7227128df2c..9b0158fa431f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -260,8 +260,7 @@ out:
260 spin_unlock(&qp->q.lock); 260 spin_unlock(&qp->q.lock);
261out_rcu_unlock: 261out_rcu_unlock:
262 rcu_read_unlock(); 262 rcu_read_unlock();
263 if (head) 263 kfree_skb(head);
264 kfree_skb(head);
265 ipq_put(qp); 264 ipq_put(qp);
266} 265}
267 266
@@ -382,7 +381,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
382 */ 381 */
383 if (end < qp->q.len || 382 if (end < qp->q.len ||
384 ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len)) 383 ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
385 goto err; 384 goto discard_qp;
386 qp->q.flags |= INET_FRAG_LAST_IN; 385 qp->q.flags |= INET_FRAG_LAST_IN;
387 qp->q.len = end; 386 qp->q.len = end;
388 } else { 387 } else {
@@ -394,20 +393,20 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
394 if (end > qp->q.len) { 393 if (end > qp->q.len) {
395 /* Some bits beyond end -> corruption. */ 394 /* Some bits beyond end -> corruption. */
396 if (qp->q.flags & INET_FRAG_LAST_IN) 395 if (qp->q.flags & INET_FRAG_LAST_IN)
397 goto err; 396 goto discard_qp;
398 qp->q.len = end; 397 qp->q.len = end;
399 } 398 }
400 } 399 }
401 if (end == offset) 400 if (end == offset)
402 goto err; 401 goto discard_qp;
403 402
404 err = -ENOMEM; 403 err = -ENOMEM;
405 if (!pskb_pull(skb, skb_network_offset(skb) + ihl)) 404 if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
406 goto err; 405 goto discard_qp;
407 406
408 err = pskb_trim_rcsum(skb, end - offset); 407 err = pskb_trim_rcsum(skb, end - offset);
409 if (err) 408 if (err)
410 goto err; 409 goto discard_qp;
411 410
412 /* Note : skb->rbnode and skb->dev share the same location. */ 411 /* Note : skb->rbnode and skb->dev share the same location. */
413 dev = skb->dev; 412 dev = skb->dev;
@@ -423,6 +422,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
423 * We do the same here for IPv4 (and increment an snmp counter). 422 * We do the same here for IPv4 (and increment an snmp counter).
424 */ 423 */
425 424
425 err = -EINVAL;
426 /* Find out where to put this fragment. */ 426 /* Find out where to put this fragment. */
427 prev_tail = qp->q.fragments_tail; 427 prev_tail = qp->q.fragments_tail;
428 if (!prev_tail) 428 if (!prev_tail)
@@ -431,7 +431,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
431 /* This is the common case: skb goes to the end. */ 431 /* This is the common case: skb goes to the end. */
432 /* Detect and discard overlaps. */ 432 /* Detect and discard overlaps. */
433 if (offset < prev_tail->ip_defrag_offset + prev_tail->len) 433 if (offset < prev_tail->ip_defrag_offset + prev_tail->len)
434 goto discard_qp; 434 goto overlap;
435 if (offset == prev_tail->ip_defrag_offset + prev_tail->len) 435 if (offset == prev_tail->ip_defrag_offset + prev_tail->len)
436 ip4_frag_append_to_last_run(&qp->q, skb); 436 ip4_frag_append_to_last_run(&qp->q, skb);
437 else 437 else
@@ -450,7 +450,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
450 FRAG_CB(skb1)->frag_run_len) 450 FRAG_CB(skb1)->frag_run_len)
451 rbn = &parent->rb_right; 451 rbn = &parent->rb_right;
452 else /* Found an overlap with skb1. */ 452 else /* Found an overlap with skb1. */
453 goto discard_qp; 453 goto overlap;
454 } while (*rbn); 454 } while (*rbn);
455 /* Here we have parent properly set, and rbn pointing to 455 /* Here we have parent properly set, and rbn pointing to
456 * one of its NULL left/right children. Insert skb. 456 * one of its NULL left/right children. Insert skb.
@@ -487,16 +487,18 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
487 skb->_skb_refdst = 0UL; 487 skb->_skb_refdst = 0UL;
488 err = ip_frag_reasm(qp, skb, prev_tail, dev); 488 err = ip_frag_reasm(qp, skb, prev_tail, dev);
489 skb->_skb_refdst = orefdst; 489 skb->_skb_refdst = orefdst;
490 if (err)
491 inet_frag_kill(&qp->q);
490 return err; 492 return err;
491 } 493 }
492 494
493 skb_dst_drop(skb); 495 skb_dst_drop(skb);
494 return -EINPROGRESS; 496 return -EINPROGRESS;
495 497
498overlap:
499 __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
496discard_qp: 500discard_qp:
497 inet_frag_kill(&qp->q); 501 inet_frag_kill(&qp->q);
498 err = -EINVAL;
499 __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
500err: 502err:
501 kfree_skb(skb); 503 kfree_skb(skb);
502 return err; 504 return err;
@@ -621,7 +623,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
621 sub_frag_mem_limit(qp->q.net, head->truesize); 623 sub_frag_mem_limit(qp->q.net, head->truesize);
622 624
623 *nextp = NULL; 625 *nextp = NULL;
624 head->next = NULL; 626 skb_mark_not_on_list(head);
625 head->prev = NULL; 627 head->prev = NULL;
626 head->dev = dev; 628 head->dev = dev;
627 head->tstamp = qp->q.stamp; 629 head->tstamp = qp->q.stamp;
@@ -820,7 +822,6 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
820 822
821 table[0].data = &net->ipv4.frags.high_thresh; 823 table[0].data = &net->ipv4.frags.high_thresh;
822 table[0].extra1 = &net->ipv4.frags.low_thresh; 824 table[0].extra1 = &net->ipv4.frags.low_thresh;
823 table[0].extra2 = &init_net.ipv4.frags.high_thresh;
824 table[1].data = &net->ipv4.frags.low_thresh; 825 table[1].data = &net->ipv4.frags.low_thresh;
825 table[1].extra2 = &net->ipv4.frags.high_thresh; 826 table[1].extra2 = &net->ipv4.frags.high_thresh;
826 table[2].data = &net->ipv4.frags.timeout; 827 table[2].data = &net->ipv4.frags.timeout;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 8cce0e9ea08c..38befe829caf 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -232,22 +232,19 @@ static void gre_err(struct sk_buff *skb, u32 info)
232 const int type = icmp_hdr(skb)->type; 232 const int type = icmp_hdr(skb)->type;
233 const int code = icmp_hdr(skb)->code; 233 const int code = icmp_hdr(skb)->code;
234 struct tnl_ptk_info tpi; 234 struct tnl_ptk_info tpi;
235 bool csum_err = false;
236 235
237 if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 236 if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP),
238 iph->ihl * 4) < 0) { 237 iph->ihl * 4) < 0)
239 if (!csum_err) /* ignore csum errors. */ 238 return;
240 return;
241 }
242 239
243 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { 240 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
244 ipv4_update_pmtu(skb, dev_net(skb->dev), info, 241 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
245 skb->dev->ifindex, 0, IPPROTO_GRE, 0); 242 skb->dev->ifindex, IPPROTO_GRE);
246 return; 243 return;
247 } 244 }
248 if (type == ICMP_REDIRECT) { 245 if (type == ICMP_REDIRECT) {
249 ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0, 246 ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex,
250 IPPROTO_GRE, 0); 247 IPPROTO_GRE);
251 return; 248 return;
252 } 249 }
253 250
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 3196cf58f418..35a786c0aaa0 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -531,11 +531,7 @@ static void ip_sublist_rcv_finish(struct list_head *head)
531 struct sk_buff *skb, *next; 531 struct sk_buff *skb, *next;
532 532
533 list_for_each_entry_safe(skb, next, head, list) { 533 list_for_each_entry_safe(skb, next, head, list) {
534 list_del(&skb->list); 534 skb_list_del_init(skb);
535 /* Handle ip{6}_forward case, as sch_direct_xmit have
536 * another kind of SKB-list usage (see validate_xmit_skb_list)
537 */
538 skb->next = NULL;
539 dst_input(skb); 535 dst_input(skb);
540 } 536 }
541} 537}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9c4e72e9c60a..c09219e7f230 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -278,7 +278,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
278 struct sk_buff *nskb = segs->next; 278 struct sk_buff *nskb = segs->next;
279 int err; 279 int err;
280 280
281 segs->next = NULL; 281 skb_mark_not_on_list(segs);
282 err = ip_fragment(net, sk, segs, mtu, ip_finish_output2); 282 err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
283 283
284 if (err && ret == 0) 284 if (err && ret == 0)
@@ -684,7 +684,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
684 684
685 skb = frag; 685 skb = frag;
686 frag = skb->next; 686 frag = skb->next;
687 skb->next = NULL; 687 skb_mark_not_on_list(skb);
688 } 688 }
689 689
690 if (err == 0) { 690 if (err == 0) {
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index f38cb21d773d..de31b302d69c 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -318,9 +318,9 @@ static int vti4_err(struct sk_buff *skb, u32 info)
318 return 0; 318 return 0;
319 319
320 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) 320 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
321 ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0); 321 ipv4_update_pmtu(skb, net, info, 0, protocol);
322 else 322 else
323 ipv4_redirect(skb, net, 0, 0, protocol, 0); 323 ipv4_redirect(skb, net, 0, protocol);
324 xfrm_state_put(x); 324 xfrm_state_put(x);
325 325
326 return 0; 326 return 0;
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index d97f4f2787f5..9119d012ba46 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -48,9 +48,9 @@ static int ipcomp4_err(struct sk_buff *skb, u32 info)
48 return 0; 48 return 0;
49 49
50 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) 50 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
51 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0); 51 ipv4_update_pmtu(skb, net, info, 0, IPPROTO_COMP);
52 else 52 else
53 ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0); 53 ipv4_redirect(skb, net, 0, IPPROTO_COMP);
54 xfrm_state_put(x); 54 xfrm_state_put(x);
55 55
56 return 0; 56 return 0;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index c891235b4966..e65287c27e3d 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -175,13 +175,12 @@ static int ipip_err(struct sk_buff *skb, u32 info)
175 } 175 }
176 176
177 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { 177 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
178 ipv4_update_pmtu(skb, net, info, t->parms.link, 0, 178 ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol);
179 iph->protocol, 0);
180 goto out; 179 goto out;
181 } 180 }
182 181
183 if (type == ICMP_REDIRECT) { 182 if (type == ICMP_REDIRECT) {
184 ipv4_redirect(skb, net, t->parms.link, 0, iph->protocol, 0); 183 ipv4_redirect(skb, net, t->parms.link, iph->protocol);
185 goto out; 184 goto out;
186 } 185 }
187 186
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 5660adcf7a04..a6defbec4f1b 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2527,8 +2527,34 @@ errout_free:
2527 2527
2528static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) 2528static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2529{ 2529{
2530 struct fib_dump_filter filter = {};
2531 int err;
2532
2533 if (cb->strict_check) {
2534 err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh,
2535 &filter, cb);
2536 if (err < 0)
2537 return err;
2538 }
2539
2540 if (filter.table_id) {
2541 struct mr_table *mrt;
2542
2543 mrt = ipmr_get_table(sock_net(skb->sk), filter.table_id);
2544 if (!mrt) {
2545 if (filter.dump_all_families)
2546 return skb->len;
2547
2548 NL_SET_ERR_MSG(cb->extack, "ipv4: MR table does not exist");
2549 return -ENOENT;
2550 }
2551 err = mr_table_dump(mrt, skb, cb, _ipmr_fill_mroute,
2552 &mfc_unres_lock, &filter);
2553 return skb->len ? : err;
2554 }
2555
2530 return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter, 2556 return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter,
2531 _ipmr_fill_mroute, &mfc_unres_lock); 2557 _ipmr_fill_mroute, &mfc_unres_lock, &filter);
2532} 2558}
2533 2559
2534static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = { 2560static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
@@ -2710,6 +2736,31 @@ static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb)
2710 return true; 2736 return true;
2711} 2737}
2712 2738
2739static int ipmr_valid_dumplink(const struct nlmsghdr *nlh,
2740 struct netlink_ext_ack *extack)
2741{
2742 struct ifinfomsg *ifm;
2743
2744 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
2745 NL_SET_ERR_MSG(extack, "ipv4: Invalid header for ipmr link dump");
2746 return -EINVAL;
2747 }
2748
2749 if (nlmsg_attrlen(nlh, sizeof(*ifm))) {
2750 NL_SET_ERR_MSG(extack, "Invalid data after header in ipmr link dump");
2751 return -EINVAL;
2752 }
2753
2754 ifm = nlmsg_data(nlh);
2755 if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
2756 ifm->ifi_change || ifm->ifi_index) {
2757 NL_SET_ERR_MSG(extack, "Invalid values in header for ipmr link dump request");
2758 return -EINVAL;
2759 }
2760
2761 return 0;
2762}
2763
2713static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) 2764static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
2714{ 2765{
2715 struct net *net = sock_net(skb->sk); 2766 struct net *net = sock_net(skb->sk);
@@ -2718,6 +2769,13 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
2718 unsigned int e = 0, s_e; 2769 unsigned int e = 0, s_e;
2719 struct mr_table *mrt; 2770 struct mr_table *mrt;
2720 2771
2772 if (cb->strict_check) {
2773 int err = ipmr_valid_dumplink(cb->nlh, cb->extack);
2774
2775 if (err < 0)
2776 return err;
2777 }
2778
2721 s_t = cb->args[0]; 2779 s_t = cb->args[0];
2722 s_e = cb->args[1]; 2780 s_e = cb->args[1];
2723 2781
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index eab8cd5ec2f5..3e614cc824f7 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -268,6 +268,81 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
268} 268}
269EXPORT_SYMBOL(mr_fill_mroute); 269EXPORT_SYMBOL(mr_fill_mroute);
270 270
271static bool mr_mfc_uses_dev(const struct mr_table *mrt,
272 const struct mr_mfc *c,
273 const struct net_device *dev)
274{
275 int ct;
276
277 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
278 if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
279 const struct vif_device *vif;
280
281 vif = &mrt->vif_table[ct];
282 if (vif->dev == dev)
283 return true;
284 }
285 }
286 return false;
287}
288
289int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb,
290 struct netlink_callback *cb,
291 int (*fill)(struct mr_table *mrt, struct sk_buff *skb,
292 u32 portid, u32 seq, struct mr_mfc *c,
293 int cmd, int flags),
294 spinlock_t *lock, struct fib_dump_filter *filter)
295{
296 unsigned int e = 0, s_e = cb->args[1];
297 unsigned int flags = NLM_F_MULTI;
298 struct mr_mfc *mfc;
299 int err;
300
301 if (filter->filter_set)
302 flags |= NLM_F_DUMP_FILTERED;
303
304 list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
305 if (e < s_e)
306 goto next_entry;
307 if (filter->dev &&
308 !mr_mfc_uses_dev(mrt, mfc, filter->dev))
309 goto next_entry;
310
311 err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
312 cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
313 if (err < 0)
314 goto out;
315next_entry:
316 e++;
317 }
318
319 spin_lock_bh(lock);
320 list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
321 if (e < s_e)
322 goto next_entry2;
323 if (filter->dev &&
324 !mr_mfc_uses_dev(mrt, mfc, filter->dev))
325 goto next_entry2;
326
327 err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
328 cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
329 if (err < 0) {
330 spin_unlock_bh(lock);
331 goto out;
332 }
333next_entry2:
334 e++;
335 }
336 spin_unlock_bh(lock);
337 err = 0;
338 e = 0;
339
340out:
341 cb->args[1] = e;
342 return err;
343}
344EXPORT_SYMBOL(mr_table_dump);
345
271int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, 346int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
272 struct mr_table *(*iter)(struct net *net, 347 struct mr_table *(*iter)(struct net *net,
273 struct mr_table *mrt), 348 struct mr_table *mrt),
@@ -275,51 +350,35 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
275 struct sk_buff *skb, 350 struct sk_buff *skb,
276 u32 portid, u32 seq, struct mr_mfc *c, 351 u32 portid, u32 seq, struct mr_mfc *c,
277 int cmd, int flags), 352 int cmd, int flags),
278 spinlock_t *lock) 353 spinlock_t *lock, struct fib_dump_filter *filter)
279{ 354{
280 unsigned int t = 0, e = 0, s_t = cb->args[0], s_e = cb->args[1]; 355 unsigned int t = 0, s_t = cb->args[0];
281 struct net *net = sock_net(skb->sk); 356 struct net *net = sock_net(skb->sk);
282 struct mr_table *mrt; 357 struct mr_table *mrt;
283 struct mr_mfc *mfc; 358 int err;
359
360 /* multicast does not track protocol or have route type other
361 * than RTN_MULTICAST
362 */
363 if (filter->filter_set) {
364 if (filter->protocol || filter->flags ||
365 (filter->rt_type && filter->rt_type != RTN_MULTICAST))
366 return skb->len;
367 }
284 368
285 rcu_read_lock(); 369 rcu_read_lock();
286 for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) { 370 for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) {
287 if (t < s_t) 371 if (t < s_t)
288 goto next_table; 372 goto next_table;
289 list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
290 if (e < s_e)
291 goto next_entry;
292 if (fill(mrt, skb, NETLINK_CB(cb->skb).portid,
293 cb->nlh->nlmsg_seq, mfc,
294 RTM_NEWROUTE, NLM_F_MULTI) < 0)
295 goto done;
296next_entry:
297 e++;
298 }
299 373
300 spin_lock_bh(lock); 374 err = mr_table_dump(mrt, skb, cb, fill, lock, filter);
301 list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { 375 if (err < 0)
302 if (e < s_e) 376 break;
303 goto next_entry2;
304 if (fill(mrt, skb, NETLINK_CB(cb->skb).portid,
305 cb->nlh->nlmsg_seq, mfc,
306 RTM_NEWROUTE, NLM_F_MULTI) < 0) {
307 spin_unlock_bh(lock);
308 goto done;
309 }
310next_entry2:
311 e++;
312 }
313 spin_unlock_bh(lock);
314 e = 0;
315 s_e = 0;
316next_table: 377next_table:
317 t++; 378 t++;
318 } 379 }
319done:
320 rcu_read_unlock(); 380 rcu_read_unlock();
321 381
322 cb->args[1] = e;
323 cb->args[0] = t; 382 cb->args[0] = t;
324 383
325 return skb->len; 384 return skb->len;
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
index 04311f7067e2..6d218f5a2e71 100644
--- a/net/ipv4/metrics.c
+++ b/net/ipv4/metrics.c
@@ -5,8 +5,8 @@
5#include <net/net_namespace.h> 5#include <net/net_namespace.h>
6#include <net/tcp.h> 6#include <net/tcp.h>
7 7
8int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, 8static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
9 u32 *metrics) 9 int fc_mx_len, u32 *metrics)
10{ 10{
11 bool ecn_ca = false; 11 bool ecn_ca = false;
12 struct nlattr *nla; 12 struct nlattr *nla;
@@ -52,4 +52,28 @@ int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len,
52 52
53 return 0; 53 return 0;
54} 54}
55EXPORT_SYMBOL_GPL(ip_metrics_convert); 55
56struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
57 int fc_mx_len)
58{
59 struct dst_metrics *fib_metrics;
60 int err;
61
62 if (!fc_mx)
63 return (struct dst_metrics *)&dst_default_metrics;
64
65 fib_metrics = kzalloc(sizeof(*fib_metrics), GFP_KERNEL);
66 if (unlikely(!fib_metrics))
67 return ERR_PTR(-ENOMEM);
68
69 err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics);
70 if (!err) {
71 refcount_set(&fib_metrics->refcnt, 1);
72 } else {
73 kfree(fib_metrics);
74 fib_metrics = ERR_PTR(err);
75 }
76
77 return fib_metrics;
78}
79EXPORT_SYMBOL_GPL(ip_fib_metrics_init);
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index 12843c9ef142..0b10d8812828 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -36,7 +36,6 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
36 const struct net_device *dev, u8 flags) 36 const struct net_device *dev, u8 flags)
37{ 37{
38 struct fib_result res; 38 struct fib_result res;
39 bool dev_match;
40 int ret __maybe_unused; 39 int ret __maybe_unused;
41 40
42 if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE)) 41 if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
@@ -46,21 +45,7 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
46 if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL)) 45 if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL))
47 return false; 46 return false;
48 } 47 }
49 dev_match = false; 48 return fib_info_nh_uses_dev(res.fi, dev) || flags & XT_RPFILTER_LOOSE;
50#ifdef CONFIG_IP_ROUTE_MULTIPATH
51 for (ret = 0; ret < res.fi->fib_nhs; ret++) {
52 struct fib_nh *nh = &res.fi->fib_nh[ret];
53
54 if (nh->nh_dev == dev) {
55 dev_match = true;
56 break;
57 }
58 }
59#else
60 if (FIB_RES_DEV(res) == dev)
61 dev_match = true;
62#endif
63 return dev_match || flags & XT_RPFILTER_LOOSE;
64} 49}
65 50
66static bool 51static bool
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 6115bf1ff6f0..78a67f961d86 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -264,7 +264,6 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
264 264
265 return nf_nat_inet_fn(priv, skb, state); 265 return nf_nat_inet_fn(priv, skb, state);
266} 266}
267EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn);
268 267
269static unsigned int 268static unsigned int
270nf_nat_ipv4_in(void *priv, struct sk_buff *skb, 269nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
index ad3aeff152ed..a9d5e013e555 100644
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -104,12 +104,26 @@ static int masq_device_event(struct notifier_block *this,
104 return NOTIFY_DONE; 104 return NOTIFY_DONE;
105} 105}
106 106
107static int inet_cmp(struct nf_conn *ct, void *ptr)
108{
109 struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
110 struct net_device *dev = ifa->ifa_dev->dev;
111 struct nf_conntrack_tuple *tuple;
112
113 if (!device_cmp(ct, (void *)(long)dev->ifindex))
114 return 0;
115
116 tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
117
118 return ifa->ifa_address == tuple->dst.u3.ip;
119}
120
107static int masq_inet_event(struct notifier_block *this, 121static int masq_inet_event(struct notifier_block *this,
108 unsigned long event, 122 unsigned long event,
109 void *ptr) 123 void *ptr)
110{ 124{
111 struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev; 125 struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev;
112 struct netdev_notifier_info info; 126 struct net *net = dev_net(idev->dev);
113 127
114 /* The masq_dev_notifier will catch the case of the device going 128 /* The masq_dev_notifier will catch the case of the device going
115 * down. So if the inetdev is dead and being destroyed we have 129 * down. So if the inetdev is dead and being destroyed we have
@@ -119,8 +133,10 @@ static int masq_inet_event(struct notifier_block *this,
119 if (idev->dead) 133 if (idev->dead)
120 return NOTIFY_DONE; 134 return NOTIFY_DONE;
121 135
122 netdev_notifier_info_init(&info, idev->dev); 136 if (event == NETDEV_DOWN)
123 return masq_device_event(this, event, &info); 137 nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0);
138
139 return NOTIFY_DONE;
124} 140}
125 141
126static struct notifier_block masq_dev_notifier = { 142static struct notifier_block masq_dev_notifier = {
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
index ac110c1d55b5..a0aa13bcabda 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
@@ -60,6 +60,7 @@ MODULE_LICENSE("GPL");
60MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); 60MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
61MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway"); 61MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway");
62MODULE_ALIAS("ip_nat_snmp_basic"); 62MODULE_ALIAS("ip_nat_snmp_basic");
63MODULE_ALIAS_NFCT_HELPER("snmp_trap");
63 64
64#define SNMP_PORT 161 65#define SNMP_PORT 161
65#define SNMP_TRAP_PORT 162 66#define SNMP_TRAP_PORT 162
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
index e50976e3c213..94eb25bc8d7e 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -76,10 +76,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
76 .flowi4_iif = LOOPBACK_IFINDEX, 76 .flowi4_iif = LOOPBACK_IFINDEX,
77 }; 77 };
78 const struct net_device *oif; 78 const struct net_device *oif;
79 struct net_device *found; 79 const struct net_device *found;
80#ifdef CONFIG_IP_ROUTE_MULTIPATH
81 int i;
82#endif
83 80
84 /* 81 /*
85 * Do not set flowi4_oif, it restricts results (for example, asking 82 * Do not set flowi4_oif, it restricts results (for example, asking
@@ -146,25 +143,13 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
146 143
147 if (!oif) { 144 if (!oif) {
148 found = FIB_RES_DEV(res); 145 found = FIB_RES_DEV(res);
149 goto ok; 146 } else {
150 } 147 if (!fib_info_nh_uses_dev(res.fi, oif))
151 148 return;
152#ifdef CONFIG_IP_ROUTE_MULTIPATH
153 for (i = 0; i < res.fi->fib_nhs; i++) {
154 struct fib_nh *nh = &res.fi->fib_nh[i];
155 149
156 if (nh->nh_dev == oif) { 150 found = oif;
157 found = nh->nh_dev;
158 goto ok;
159 }
160 } 151 }
161 return; 152
162#else
163 found = FIB_RES_DEV(res);
164 if (found != oif)
165 return;
166#endif
167ok:
168 switch (priv->result) { 153 switch (priv->result) {
169 case NFT_FIB_RESULT_OIF: 154 case NFT_FIB_RESULT_OIF:
170 *dest = found->ifindex; 155 *dest = found->ifindex;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 8d7aaf118a30..7ccb5f87f70b 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -779,7 +779,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
779 } 779 }
780 780
781 if (ipv4_is_multicast(daddr)) { 781 if (ipv4_is_multicast(daddr)) {
782 if (!ipc.oif) 782 if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
783 ipc.oif = inet->mc_index; 783 ipc.oif = inet->mc_index;
784 if (!saddr) 784 if (!saddr)
785 saddr = inet->mc_addr; 785 saddr = inet->mc_addr;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 33df4d76db2d..8ca3eb06ba04 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -608,7 +608,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
608 tos |= RTO_ONLINK; 608 tos |= RTO_ONLINK;
609 609
610 if (ipv4_is_multicast(daddr)) { 610 if (ipv4_is_multicast(daddr)) {
611 if (!ipc.oif) 611 if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
612 ipc.oif = inet->mc_index; 612 ipc.oif = inet->mc_index;
613 if (!saddr) 613 if (!saddr)
614 saddr = inet->mc_addr; 614 saddr = inet->mc_addr;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 8501554e96a4..c0a9d26c06ce 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1041,17 +1041,15 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1041} 1041}
1042 1042
1043void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, 1043void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1044 int oif, u32 mark, u8 protocol, int flow_flags) 1044 int oif, u8 protocol)
1045{ 1045{
1046 const struct iphdr *iph = (const struct iphdr *) skb->data; 1046 const struct iphdr *iph = (const struct iphdr *) skb->data;
1047 struct flowi4 fl4; 1047 struct flowi4 fl4;
1048 struct rtable *rt; 1048 struct rtable *rt;
1049 1049 u32 mark = IP4_REPLY_MARK(net, skb->mark);
1050 if (!mark)
1051 mark = IP4_REPLY_MARK(net, skb->mark);
1052 1050
1053 __build_flow_key(net, &fl4, NULL, iph, oif, 1051 __build_flow_key(net, &fl4, NULL, iph, oif,
1054 RT_TOS(iph->tos), protocol, mark, flow_flags); 1052 RT_TOS(iph->tos), protocol, mark, 0);
1055 rt = __ip_route_output_key(net, &fl4); 1053 rt = __ip_route_output_key(net, &fl4);
1056 if (!IS_ERR(rt)) { 1054 if (!IS_ERR(rt)) {
1057 __ip_rt_update_pmtu(rt, &fl4, mtu); 1055 __ip_rt_update_pmtu(rt, &fl4, mtu);
@@ -1133,14 +1131,14 @@ out:
1133EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); 1131EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1134 1132
1135void ipv4_redirect(struct sk_buff *skb, struct net *net, 1133void ipv4_redirect(struct sk_buff *skb, struct net *net,
1136 int oif, u32 mark, u8 protocol, int flow_flags) 1134 int oif, u8 protocol)
1137{ 1135{
1138 const struct iphdr *iph = (const struct iphdr *) skb->data; 1136 const struct iphdr *iph = (const struct iphdr *) skb->data;
1139 struct flowi4 fl4; 1137 struct flowi4 fl4;
1140 struct rtable *rt; 1138 struct rtable *rt;
1141 1139
1142 __build_flow_key(net, &fl4, NULL, iph, oif, 1140 __build_flow_key(net, &fl4, NULL, iph, oif,
1143 RT_TOS(iph->tos), protocol, mark, flow_flags); 1141 RT_TOS(iph->tos), protocol, 0, 0);
1144 rt = __ip_route_output_key(net, &fl4); 1142 rt = __ip_route_output_key(net, &fl4);
1145 if (!IS_ERR(rt)) { 1143 if (!IS_ERR(rt)) {
1146 __ip_do_redirect(rt, skb, &fl4, false); 1144 __ip_do_redirect(rt, skb, &fl4, false);
@@ -1220,18 +1218,15 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1220 src = ip_hdr(skb)->saddr; 1218 src = ip_hdr(skb)->saddr;
1221 else { 1219 else {
1222 struct fib_result res; 1220 struct fib_result res;
1223 struct flowi4 fl4; 1221 struct iphdr *iph = ip_hdr(skb);
1224 struct iphdr *iph; 1222 struct flowi4 fl4 = {
1225 1223 .daddr = iph->daddr,
1226 iph = ip_hdr(skb); 1224 .saddr = iph->saddr,
1227 1225 .flowi4_tos = RT_TOS(iph->tos),
1228 memset(&fl4, 0, sizeof(fl4)); 1226 .flowi4_oif = rt->dst.dev->ifindex,
1229 fl4.daddr = iph->daddr; 1227 .flowi4_iif = skb->dev->ifindex,
1230 fl4.saddr = iph->saddr; 1228 .flowi4_mark = skb->mark,
1231 fl4.flowi4_tos = RT_TOS(iph->tos); 1229 };
1232 fl4.flowi4_oif = rt->dst.dev->ifindex;
1233 fl4.flowi4_iif = skb->dev->ifindex;
1234 fl4.flowi4_mark = skb->mark;
1235 1230
1236 rcu_read_lock(); 1231 rcu_read_lock();
1237 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) 1232 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
@@ -1482,12 +1477,9 @@ void rt_del_uncached_list(struct rtable *rt)
1482 1477
1483static void ipv4_dst_destroy(struct dst_entry *dst) 1478static void ipv4_dst_destroy(struct dst_entry *dst)
1484{ 1479{
1485 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1486 struct rtable *rt = (struct rtable *)dst; 1480 struct rtable *rt = (struct rtable *)dst;
1487 1481
1488 if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) 1482 ip_dst_metrics_put(dst);
1489 kfree(p);
1490
1491 rt_del_uncached_list(rt); 1483 rt_del_uncached_list(rt);
1492} 1484}
1493 1485
@@ -1534,11 +1526,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1534 rt->rt_gateway = nh->nh_gw; 1526 rt->rt_gateway = nh->nh_gw;
1535 rt->rt_uses_gateway = 1; 1527 rt->rt_uses_gateway = 1;
1536 } 1528 }
1537 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true); 1529 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1538 if (fi->fib_metrics != &dst_default_metrics) { 1530
1539 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1540 refcount_inc(&fi->fib_metrics->refcnt);
1541 }
1542#ifdef CONFIG_IP_ROUTE_CLASSID 1531#ifdef CONFIG_IP_ROUTE_CLASSID
1543 rt->dst.tclassid = nh->nh_tclassid; 1532 rt->dst.tclassid = nh->nh_tclassid;
1544#endif 1533#endif
@@ -2786,7 +2775,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2786 struct rtable *rt = NULL; 2775 struct rtable *rt = NULL;
2787 struct sk_buff *skb; 2776 struct sk_buff *skb;
2788 struct rtmsg *rtm; 2777 struct rtmsg *rtm;
2789 struct flowi4 fl4; 2778 struct flowi4 fl4 = {};
2790 __be32 dst = 0; 2779 __be32 dst = 0;
2791 __be32 src = 0; 2780 __be32 src = 0;
2792 kuid_t uid; 2781 kuid_t uid;
@@ -2826,7 +2815,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2826 if (!skb) 2815 if (!skb)
2827 return -ENOBUFS; 2816 return -ENOBUFS;
2828 2817
2829 memset(&fl4, 0, sizeof(fl4));
2830 fl4.daddr = dst; 2818 fl4.daddr = dst;
2831 fl4.saddr = src; 2819 fl4.saddr = src;
2832 fl4.flowi4_tos = rtm->rtm_tos; 2820 fl4.flowi4_tos = rtm->rtm_tos;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index c3387dfd725b..606f868d9f3f 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -88,7 +88,7 @@ u64 cookie_init_timestamp(struct request_sock *req)
88 ts <<= TSBITS; 88 ts <<= TSBITS;
89 ts |= options; 89 ts |= options;
90 } 90 }
91 return (u64)ts * (USEC_PER_SEC / TCP_TS_HZ); 91 return (u64)ts * (NSEC_PER_SEC / TCP_TS_HZ);
92} 92}
93 93
94 94
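The multiplier changes because the returned value now seeds a nanosecond-based internal clock rather than a microsecond one. A standalone sketch (not kernel code) of the packing and scaling, assuming TCP_TS_HZ of 1000 (millisecond timestamp ticks) and TSBITS of 6 as used elsewhere in this file:

#include <stdio.h>
#include <stdint.h>

#define TSBITS        6              /* assumed: bits reserved for options */
#define TCP_TS_HZ     1000ULL        /* assumed: timestamps tick in milliseconds */
#define NSEC_PER_SEC  1000000000ULL

int main(void)
{
    uint64_t ts = 123456;            /* example timestamp in TCP_TS_HZ ticks */
    uint64_t options = 5;            /* example encoded options (wscale etc.) */

    ts <<= TSBITS;                   /* make room for the options in the low bits */
    ts |= options;
    /* one tick is NSEC_PER_SEC / TCP_TS_HZ = 1,000,000 ns; previously the
     * value was scaled with USEC_PER_SEC / TCP_TS_HZ = 1,000 us per tick */
    printf("cookie timestamp = %llu ns\n",
           (unsigned long long)(ts * (NSEC_PER_SEC / TCP_TS_HZ)));
    return 0;
}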
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 10c6246396cc..1834818ed07b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -507,7 +507,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
507 const struct tcp_sock *tp = tcp_sk(sk); 507 const struct tcp_sock *tp = tcp_sk(sk);
508 int state; 508 int state;
509 509
510 sock_poll_wait(file, wait); 510 sock_poll_wait(file, sock, wait);
511 511
512 state = inet_sk_state_load(sk); 512 state = inet_sk_state_load(sk);
513 if (state == TCP_LISTEN) 513 if (state == TCP_LISTEN)
@@ -1295,7 +1295,7 @@ new_segment:
1295 copy = size_goal; 1295 copy = size_goal;
1296 1296
1297 /* All packets are restored as if they have 1297 /* All packets are restored as if they have
1298 * already been sent. skb_mstamp isn't set to 1298 * already been sent. skb_mstamp_ns isn't set to
1299 * avoid wrong rtt estimation. 1299 * avoid wrong rtt estimation.
1300 */ 1300 */
1301 if (tp->repair) 1301 if (tp->repair)
@@ -1753,6 +1753,7 @@ static int tcp_zerocopy_receive(struct sock *sk,
1753 struct vm_area_struct *vma; 1753 struct vm_area_struct *vma;
1754 struct sk_buff *skb = NULL; 1754 struct sk_buff *skb = NULL;
1755 struct tcp_sock *tp; 1755 struct tcp_sock *tp;
1756 int inq;
1756 int ret; 1757 int ret;
1757 1758
1758 if (address & (PAGE_SIZE - 1) || address != zc->address) 1759 if (address & (PAGE_SIZE - 1) || address != zc->address)
@@ -1773,12 +1774,15 @@ static int tcp_zerocopy_receive(struct sock *sk,
1773 1774
1774 tp = tcp_sk(sk); 1775 tp = tcp_sk(sk);
1775 seq = tp->copied_seq; 1776 seq = tp->copied_seq;
1776 zc->length = min_t(u32, zc->length, tcp_inq(sk)); 1777 inq = tcp_inq(sk);
1778 zc->length = min_t(u32, zc->length, inq);
1777 zc->length &= ~(PAGE_SIZE - 1); 1779 zc->length &= ~(PAGE_SIZE - 1);
1778 1780 if (zc->length) {
1779 zap_page_range(vma, address, zc->length); 1781 zap_page_range(vma, address, zc->length);
1780 1782 zc->recv_skip_hint = 0;
1781 zc->recv_skip_hint = 0; 1783 } else {
1784 zc->recv_skip_hint = inq;
1785 }
1782 ret = 0; 1786 ret = 0;
1783 while (length + PAGE_SIZE <= zc->length) { 1787 while (length + PAGE_SIZE <= zc->length) {
1784 if (zc->recv_skip_hint < PAGE_SIZE) { 1788 if (zc->recv_skip_hint < PAGE_SIZE) {
@@ -1801,8 +1805,17 @@ static int tcp_zerocopy_receive(struct sock *sk,
1801 frags++; 1805 frags++;
1802 } 1806 }
1803 } 1807 }
1804 if (frags->size != PAGE_SIZE || frags->page_offset) 1808 if (frags->size != PAGE_SIZE || frags->page_offset) {
1809 int remaining = zc->recv_skip_hint;
1810
1811 while (remaining && (frags->size != PAGE_SIZE ||
1812 frags->page_offset)) {
1813 remaining -= frags->size;
1814 frags++;
1815 }
1816 zc->recv_skip_hint -= remaining;
1805 break; 1817 break;
1818 }
1806 ret = vm_insert_page(vma, address + length, 1819 ret = vm_insert_page(vma, address + length,
1807 skb_frag_page(frags)); 1820 skb_frag_page(frags));
1808 if (ret) 1821 if (ret)
@@ -2403,16 +2416,10 @@ adjudge_to_death:
2403 sock_hold(sk); 2416 sock_hold(sk);
2404 sock_orphan(sk); 2417 sock_orphan(sk);
2405 2418
2406 /* It is the last release_sock in its life. It will remove backlog. */
2407 release_sock(sk);
2408
2409
2410 /* Now socket is owned by kernel and we acquire BH lock
2411 * to finish close. No need to check for user refs.
2412 */
2413 local_bh_disable(); 2419 local_bh_disable();
2414 bh_lock_sock(sk); 2420 bh_lock_sock(sk);
2415 WARN_ON(sock_owned_by_user(sk)); 2421 /* remove backlog if any, without releasing ownership. */
2422 __release_sock(sk);
2416 2423
2417 percpu_counter_inc(sk->sk_prot->orphan_count); 2424 percpu_counter_inc(sk->sk_prot->orphan_count);
2418 2425
@@ -2481,6 +2488,7 @@ adjudge_to_death:
2481out: 2488out:
2482 bh_unlock_sock(sk); 2489 bh_unlock_sock(sk);
2483 local_bh_enable(); 2490 local_bh_enable();
2491 release_sock(sk);
2484 sock_put(sk); 2492 sock_put(sk);
2485} 2493}
2486EXPORT_SYMBOL(tcp_close); 2494EXPORT_SYMBOL(tcp_close);
@@ -2595,6 +2603,8 @@ int tcp_disconnect(struct sock *sk, int flags)
2595 tp->compressed_ack = 0; 2603 tp->compressed_ack = 0;
2596 tp->bytes_sent = 0; 2604 tp->bytes_sent = 0;
2597 tp->bytes_retrans = 0; 2605 tp->bytes_retrans = 0;
2606 tp->duplicate_sack[0].start_seq = 0;
2607 tp->duplicate_sack[0].end_seq = 0;
2598 tp->dsack_dups = 0; 2608 tp->dsack_dups = 0;
2599 tp->reord_seen = 0; 2609 tp->reord_seen = 0;
2600 2610
@@ -3101,10 +3111,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
3101{ 3111{
3102 const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ 3112 const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
3103 const struct inet_connection_sock *icsk = inet_csk(sk); 3113 const struct inet_connection_sock *icsk = inet_csk(sk);
3114 unsigned long rate;
3104 u32 now; 3115 u32 now;
3105 u64 rate64; 3116 u64 rate64;
3106 bool slow; 3117 bool slow;
3107 u32 rate;
3108 3118
3109 memset(info, 0, sizeof(*info)); 3119 memset(info, 0, sizeof(*info));
3110 if (sk->sk_type != SOCK_STREAM) 3120 if (sk->sk_type != SOCK_STREAM)
@@ -3114,11 +3124,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
3114 3124
3115 /* Report meaningful fields for all TCP states, including listeners */ 3125 /* Report meaningful fields for all TCP states, including listeners */
3116 rate = READ_ONCE(sk->sk_pacing_rate); 3126 rate = READ_ONCE(sk->sk_pacing_rate);
3117 rate64 = rate != ~0U ? rate : ~0ULL; 3127 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3118 info->tcpi_pacing_rate = rate64; 3128 info->tcpi_pacing_rate = rate64;
3119 3129
3120 rate = READ_ONCE(sk->sk_max_pacing_rate); 3130 rate = READ_ONCE(sk->sk_max_pacing_rate);
3121 rate64 = rate != ~0U ? rate : ~0ULL; 3131 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3122 info->tcpi_max_pacing_rate = rate64; 3132 info->tcpi_max_pacing_rate = rate64;
3123 3133
3124 info->tcpi_reordering = tp->reordering; 3134 info->tcpi_reordering = tp->reordering;
@@ -3244,8 +3254,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
3244 const struct tcp_sock *tp = tcp_sk(sk); 3254 const struct tcp_sock *tp = tcp_sk(sk);
3245 struct sk_buff *stats; 3255 struct sk_buff *stats;
3246 struct tcp_info info; 3256 struct tcp_info info;
3257 unsigned long rate;
3247 u64 rate64; 3258 u64 rate64;
3248 u32 rate;
3249 3259
3250 stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC); 3260 stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
3251 if (!stats) 3261 if (!stats)
@@ -3264,7 +3274,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
3264 tp->total_retrans, TCP_NLA_PAD); 3274 tp->total_retrans, TCP_NLA_PAD);
3265 3275
3266 rate = READ_ONCE(sk->sk_pacing_rate); 3276 rate = READ_ONCE(sk->sk_pacing_rate);
3267 rate64 = rate != ~0U ? rate : ~0ULL; 3277 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3268 nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD); 3278 nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
3269 3279
3270 rate64 = tcp_compute_delivery_rate(tp); 3280 rate64 = tcp_compute_delivery_rate(tp);
@@ -3894,8 +3904,8 @@ void __init tcp_init(void)
3894 init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); 3904 init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
3895 3905
3896 init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; 3906 init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
3897 init_net.ipv4.sysctl_tcp_rmem[1] = 87380; 3907 init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
3898 init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare); 3908 init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
3899 3909
3900 pr_info("Hash tables configured (established %u bind %u)\n", 3910 pr_info("Hash tables configured (established %u bind %u)\n",
3901 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); 3911 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 02ff2dde9609..9277abdd822a 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -128,6 +128,9 @@ static const u32 bbr_probe_rtt_mode_ms = 200;
128/* Skip TSO below the following bandwidth (bits/sec): */ 128/* Skip TSO below the following bandwidth (bits/sec): */
129static const int bbr_min_tso_rate = 1200000; 129static const int bbr_min_tso_rate = 1200000;
130 130
131/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */
132static const int bbr_pacing_margin_percent = 1;
133
131/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain 134/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
132 * that will allow a smoothly increasing pacing rate that will double each RTT 135 * that will allow a smoothly increasing pacing rate that will double each RTT
133 * and send the same number of packets per RTT that an un-paced, slow-starting 136 * and send the same number of packets per RTT that an un-paced, slow-starting
@@ -208,17 +211,15 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
208{ 211{
209 unsigned int mss = tcp_sk(sk)->mss_cache; 212 unsigned int mss = tcp_sk(sk)->mss_cache;
210 213
211 if (!tcp_needs_internal_pacing(sk))
212 mss = tcp_mss_to_mtu(sk, mss);
213 rate *= mss; 214 rate *= mss;
214 rate *= gain; 215 rate *= gain;
215 rate >>= BBR_SCALE; 216 rate >>= BBR_SCALE;
216 rate *= USEC_PER_SEC; 217 rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
217 return rate >> BW_SCALE; 218 return rate >> BW_SCALE;
218} 219}
219 220
220/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ 221/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
221static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) 222static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
222{ 223{
223 u64 rate = bw; 224 u64 rate = bw;
224 225
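The pacing rate is now derived with a fixed ~1% headroom instead of the MTU-based adjustment that was removed. A standalone sketch (not kernel code) of the arithmetic, assuming BW_SCALE of 24 and BBR_SCALE of 8 (so BBR_UNIT == 256 means a gain of 1.0), with an example 100 Mbit/s bandwidth sample and a 1448-byte MSS:

#include <stdio.h>
#include <stdint.h>

#define BW_SCALE      24
#define BBR_SCALE     8
#define BBR_UNIT      (1 << BBR_SCALE)
#define USEC_PER_SEC  1000000ULL

static uint64_t rate_bytes_per_sec(uint64_t bw, uint64_t mss, int gain, int margin_pct)
{
    uint64_t rate = bw;              /* packets per usec << BW_SCALE */

    rate *= mss;                     /* bytes per usec << BW_SCALE */
    rate *= gain;                    /* apply pacing gain (BBR_UNIT == 1.0) */
    rate >>= BBR_SCALE;
    rate *= USEC_PER_SEC / 100 * (100 - margin_pct); /* per second, minus margin */
    return rate >> BW_SCALE;
}

int main(void)
{
    /* ~100 Mbit/s with a 1448-byte MSS: ~8633 pkts/s = ~0.00863 pkts/usec */
    uint64_t bw = (uint64_t)(0.0086326 * (1 << BW_SCALE));

    printf("pacing rate: %llu bytes/sec\n",
           (unsigned long long)rate_bytes_per_sec(bw, 1448, BBR_UNIT, 1));
    /* prints roughly 12.37 MB/s, i.e. about 1% below the 12.5 MB/s estimate */
    return 0;
}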
@@ -257,7 +258,7 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
257{ 258{
258 struct tcp_sock *tp = tcp_sk(sk); 259 struct tcp_sock *tp = tcp_sk(sk);
259 struct bbr *bbr = inet_csk_ca(sk); 260 struct bbr *bbr = inet_csk_ca(sk);
260 u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain); 261 unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain);
261 262
262 if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) 263 if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
263 bbr_init_pacing_rate_from_rtt(sk); 264 bbr_init_pacing_rate_from_rtt(sk);
@@ -279,7 +280,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk)
279 /* Sort of tcp_tso_autosize() but ignoring 280 /* Sort of tcp_tso_autosize() but ignoring
280 * driver provided sk_gso_max_size. 281 * driver provided sk_gso_max_size.
281 */ 282 */
282 bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift, 283 bytes = min_t(unsigned long, sk->sk_pacing_rate >> sk->sk_pacing_shift,
283 GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); 284 GSO_MAX_SIZE - 1 - MAX_TCP_HEADER);
284 segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); 285 segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
285 286
@@ -368,6 +369,39 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
368 return cwnd; 369 return cwnd;
369} 370}
370 371
372/* With pacing at lower layers, there's often less data "in the network" than
373 * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq),
374 * we often have several skbs queued in the pacing layer with a pre-scheduled
375 * earliest departure time (EDT). BBR adapts its pacing rate based on the
376 * inflight level that it estimates has already been "baked in" by previous
377 * departure time decisions. We calculate a rough estimate of the number of our
378 * packets that might be in the network at the earliest departure time for the
379 * next skb scheduled:
380 * in_network_at_edt = inflight_at_edt - (EDT - now) * bw
381 * If we're increasing inflight, then we want to know if the transmit of the
382 * EDT skb will push inflight above the target, so inflight_at_edt includes
383 * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight,
384 * then estimate if inflight will sink too low just before the EDT transmit.
385 */
386static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
387{
388 struct tcp_sock *tp = tcp_sk(sk);
389 struct bbr *bbr = inet_csk_ca(sk);
390 u64 now_ns, edt_ns, interval_us;
391 u32 interval_delivered, inflight_at_edt;
392
393 now_ns = tp->tcp_clock_cache;
394 edt_ns = max(tp->tcp_wstamp_ns, now_ns);
395 interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC);
396 interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE;
397 inflight_at_edt = inflight_now;
398 if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */
399 inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */
400 if (interval_delivered >= inflight_at_edt)
401 return 0;
402 return inflight_at_edt - interval_delivered;
403}
404
371/* An optimization in BBR to reduce losses: On the first round of recovery, we 405/* An optimization in BBR to reduce losses: On the first round of recovery, we
372 * follow the packet conservation principle: send P packets per P packets acked. 406 * follow the packet conservation principle: send P packets per P packets acked.
373 * After that, we slow-start and send at most 2*P packets per P packets acked. 407 * After that, we slow-start and send at most 2*P packets per P packets acked.
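A standalone sketch (not kernel code) of the estimate described in the comment above, with assumed example values for bandwidth, inflight and the EDT offset; BW_SCALE of 24 is assumed for the fixed-point bandwidth:

#include <stdio.h>
#include <stdint.h>

#define BW_SCALE       24
#define NSEC_PER_USEC  1000ULL

static uint32_t packets_in_net_at_edt(uint64_t now_ns, uint64_t edt_ns,
                                      uint64_t bw, uint32_t inflight_now,
                                      uint32_t tso_goal, int increasing)
{
    uint64_t interval_us = (edt_ns - now_ns) / NSEC_PER_USEC;
    uint32_t interval_delivered = (uint32_t)((bw * interval_us) >> BW_SCALE);
    uint32_t inflight_at_edt = inflight_now;

    if (increasing)                  /* pacing_gain > 1: include the EDT skb */
        inflight_at_edt += tso_goal;
    if (interval_delivered >= inflight_at_edt)
        return 0;
    return inflight_at_edt - interval_delivered;
}

int main(void)
{
    /* bw of 0.01 pkts/usec (~10k pkts/sec), EDT 2 ms ahead, 100 pkts in flight */
    uint64_t bw = (uint64_t)(0.01 * (1 << BW_SCALE));

    printf("%u packets estimated in the network at EDT\n",
           packets_in_net_at_edt(0, 2000 * NSEC_PER_USEC, bw, 100, 0, 0));
    /* about 20 packets drain in those 2 ms, so roughly 80 remain */
    return 0;
}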
@@ -459,7 +493,7 @@ static bool bbr_is_next_cycle_phase(struct sock *sk,
459 if (bbr->pacing_gain == BBR_UNIT) 493 if (bbr->pacing_gain == BBR_UNIT)
460 return is_full_length; /* just use wall clock time */ 494 return is_full_length; /* just use wall clock time */
461 495
462 inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ 496 inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
463 bw = bbr_max_bw(sk); 497 bw = bbr_max_bw(sk);
464 498
465 /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at 499 /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
@@ -487,8 +521,6 @@ static void bbr_advance_cycle_phase(struct sock *sk)
487 521
488 bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); 522 bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
489 bbr->cycle_mstamp = tp->delivered_mstamp; 523 bbr->cycle_mstamp = tp->delivered_mstamp;
490 bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT :
491 bbr_pacing_gain[bbr->cycle_idx];
492} 524}
493 525
494/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ 526/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
@@ -506,8 +538,6 @@ static void bbr_reset_startup_mode(struct sock *sk)
506 struct bbr *bbr = inet_csk_ca(sk); 538 struct bbr *bbr = inet_csk_ca(sk);
507 539
508 bbr->mode = BBR_STARTUP; 540 bbr->mode = BBR_STARTUP;
509 bbr->pacing_gain = bbr_high_gain;
510 bbr->cwnd_gain = bbr_high_gain;
511} 541}
512 542
513static void bbr_reset_probe_bw_mode(struct sock *sk) 543static void bbr_reset_probe_bw_mode(struct sock *sk)
@@ -515,8 +545,6 @@ static void bbr_reset_probe_bw_mode(struct sock *sk)
515 struct bbr *bbr = inet_csk_ca(sk); 545 struct bbr *bbr = inet_csk_ca(sk);
516 546
517 bbr->mode = BBR_PROBE_BW; 547 bbr->mode = BBR_PROBE_BW;
518 bbr->pacing_gain = BBR_UNIT;
519 bbr->cwnd_gain = bbr_cwnd_gain;
520 bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); 548 bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
521 bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ 549 bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */
522} 550}
@@ -734,13 +762,11 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
734 762
735 if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { 763 if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
736 bbr->mode = BBR_DRAIN; /* drain queue we created */ 764 bbr->mode = BBR_DRAIN; /* drain queue we created */
737 bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */
738 bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */
739 tcp_sk(sk)->snd_ssthresh = 765 tcp_sk(sk)->snd_ssthresh =
740 bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT); 766 bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT);
741 } /* fall through to check if in-flight is already small: */ 767 } /* fall through to check if in-flight is already small: */
742 if (bbr->mode == BBR_DRAIN && 768 if (bbr->mode == BBR_DRAIN &&
743 tcp_packets_in_flight(tcp_sk(sk)) <= 769 bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
744 bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) 770 bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
745 bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ 771 bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */
746} 772}
@@ -797,8 +823,6 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
797 if (bbr_probe_rtt_mode_ms > 0 && filter_expired && 823 if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
798 !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { 824 !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
799 bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ 825 bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */
800 bbr->pacing_gain = BBR_UNIT;
801 bbr->cwnd_gain = BBR_UNIT;
802 bbr_save_cwnd(sk); /* note cwnd so we can restore it */ 826 bbr_save_cwnd(sk); /* note cwnd so we can restore it */
803 bbr->probe_rtt_done_stamp = 0; 827 bbr->probe_rtt_done_stamp = 0;
804 } 828 }
@@ -826,6 +850,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
826 bbr->idle_restart = 0; 850 bbr->idle_restart = 0;
827} 851}
828 852
853static void bbr_update_gains(struct sock *sk)
854{
855 struct bbr *bbr = inet_csk_ca(sk);
856
857 switch (bbr->mode) {
858 case BBR_STARTUP:
859 bbr->pacing_gain = bbr_high_gain;
860 bbr->cwnd_gain = bbr_high_gain;
861 break;
862 case BBR_DRAIN:
863 bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */
864 bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */
865 break;
866 case BBR_PROBE_BW:
867 bbr->pacing_gain = (bbr->lt_use_bw ?
868 BBR_UNIT :
869 bbr_pacing_gain[bbr->cycle_idx]);
870 bbr->cwnd_gain = bbr_cwnd_gain;
871 break;
872 case BBR_PROBE_RTT:
873 bbr->pacing_gain = BBR_UNIT;
874 bbr->cwnd_gain = BBR_UNIT;
875 break;
876 default:
877 WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode);
878 break;
879 }
880}
881
829static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) 882static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
830{ 883{
831 bbr_update_bw(sk, rs); 884 bbr_update_bw(sk, rs);
@@ -833,6 +886,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
833 bbr_check_full_bw_reached(sk, rs); 886 bbr_check_full_bw_reached(sk, rs);
834 bbr_check_drain(sk, rs); 887 bbr_check_drain(sk, rs);
835 bbr_update_min_rtt(sk, rs); 888 bbr_update_min_rtt(sk, rs);
889 bbr_update_gains(sk);
836} 890}
837 891
838static void bbr_main(struct sock *sk, const struct rate_sample *rs) 892static void bbr_main(struct sock *sk, const struct rate_sample *rs)
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
new file mode 100644
index 000000000000..b7918d4caa30
--- /dev/null
+++ b/net/ipv4/tcp_bpf.c
@@ -0,0 +1,668 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
3
4#include <linux/skmsg.h>
5#include <linux/filter.h>
6#include <linux/bpf.h>
7#include <linux/init.h>
8#include <linux/wait.h>
9
10#include <net/inet_common.h>
11
12static bool tcp_bpf_stream_read(const struct sock *sk)
13{
14 struct sk_psock *psock;
15 bool empty = true;
16
17 rcu_read_lock();
18 psock = sk_psock(sk);
19 if (likely(psock))
20 empty = list_empty(&psock->ingress_msg);
21 rcu_read_unlock();
22 return !empty;
23}
24
25static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
26 int flags, long timeo, int *err)
27{
28 DEFINE_WAIT_FUNC(wait, woken_wake_function);
29 int ret;
30
31 add_wait_queue(sk_sleep(sk), &wait);
32 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
33 ret = sk_wait_event(sk, &timeo,
34 !list_empty(&psock->ingress_msg) ||
35 !skb_queue_empty(&sk->sk_receive_queue), &wait);
36 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
37 remove_wait_queue(sk_sleep(sk), &wait);
38 return ret;
39}
40
41int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
42 struct msghdr *msg, int len, int flags)
43{
44 struct iov_iter *iter = &msg->msg_iter;
45 int peek = flags & MSG_PEEK;
46 int i, ret, copied = 0;
47 struct sk_msg *msg_rx;
48
49 msg_rx = list_first_entry_or_null(&psock->ingress_msg,
50 struct sk_msg, list);
51
52 while (copied != len) {
53 struct scatterlist *sge;
54
55 if (unlikely(!msg_rx))
56 break;
57
58 i = msg_rx->sg.start;
59 do {
60 struct page *page;
61 int copy;
62
63 sge = sk_msg_elem(msg_rx, i);
64 copy = sge->length;
65 page = sg_page(sge);
66 if (copied + copy > len)
67 copy = len - copied;
68 ret = copy_page_to_iter(page, sge->offset, copy, iter);
69 if (ret != copy) {
70 msg_rx->sg.start = i;
71 return -EFAULT;
72 }
73
74 copied += copy;
75 if (likely(!peek)) {
76 sge->offset += copy;
77 sge->length -= copy;
78 sk_mem_uncharge(sk, copy);
79 msg_rx->sg.size -= copy;
80
81 if (!sge->length) {
82 sk_msg_iter_var_next(i);
83 if (!msg_rx->skb)
84 put_page(page);
85 }
86 } else {
87 sk_msg_iter_var_next(i);
88 }
89
90 if (copied == len)
91 break;
92 } while (i != msg_rx->sg.end);
93
94 if (unlikely(peek)) {
95 msg_rx = list_next_entry(msg_rx, list);
96 continue;
97 }
98
99 msg_rx->sg.start = i;
100 if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
101 list_del(&msg_rx->list);
102 if (msg_rx->skb)
103 consume_skb(msg_rx->skb);
104 kfree(msg_rx);
105 }
106 msg_rx = list_first_entry_or_null(&psock->ingress_msg,
107 struct sk_msg, list);
108 }
109
110 return copied;
111}
112EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg);
113
114int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
115 int nonblock, int flags, int *addr_len)
116{
117 struct sk_psock *psock;
118 int copied, ret;
119
120 if (unlikely(flags & MSG_ERRQUEUE))
121 return inet_recv_error(sk, msg, len, addr_len);
122 if (!skb_queue_empty(&sk->sk_receive_queue))
123 return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
124
125 psock = sk_psock_get(sk);
126 if (unlikely(!psock))
127 return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
128 lock_sock(sk);
129msg_bytes_ready:
130 copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
131 if (!copied) {
132 int data, err = 0;
133 long timeo;
134
135 timeo = sock_rcvtimeo(sk, nonblock);
136 data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
137 if (data) {
138 if (skb_queue_empty(&sk->sk_receive_queue))
139 goto msg_bytes_ready;
140 release_sock(sk);
141 sk_psock_put(sk, psock);
142 return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
143 }
144 if (err) {
145 ret = err;
146 goto out;
147 }
148 }
149 ret = copied;
150out:
151 release_sock(sk);
152 sk_psock_put(sk, psock);
153 return ret;
154}
155
156static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
157 struct sk_msg *msg, u32 apply_bytes, int flags)
158{
159 bool apply = apply_bytes;
160 struct scatterlist *sge;
161 u32 size, copied = 0;
162 struct sk_msg *tmp;
163 int i, ret = 0;
164
165 tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL);
166 if (unlikely(!tmp))
167 return -ENOMEM;
168
169 lock_sock(sk);
170 tmp->sg.start = msg->sg.start;
171 i = msg->sg.start;
172 do {
173 sge = sk_msg_elem(msg, i);
174 size = (apply && apply_bytes < sge->length) ?
175 apply_bytes : sge->length;
176 if (!sk_wmem_schedule(sk, size)) {
177 if (!copied)
178 ret = -ENOMEM;
179 break;
180 }
181
182 sk_mem_charge(sk, size);
183 sk_msg_xfer(tmp, msg, i, size);
184 copied += size;
185 if (sge->length)
186 get_page(sk_msg_page(tmp, i));
187 sk_msg_iter_var_next(i);
188 tmp->sg.end = i;
189 if (apply) {
190 apply_bytes -= size;
191 if (!apply_bytes)
192 break;
193 }
194 } while (i != msg->sg.end);
195
196 if (!ret) {
197 msg->sg.start = i;
198 msg->sg.size -= apply_bytes;
199 sk_psock_queue_msg(psock, tmp);
200 sk->sk_data_ready(sk);
201 } else {
202 sk_msg_free(sk, tmp);
203 kfree(tmp);
204 }
205
206 release_sock(sk);
207 return ret;
208}
209
210static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
211 int flags, bool uncharge)
212{
213 bool apply = apply_bytes;
214 struct scatterlist *sge;
215 struct page *page;
216 int size, ret = 0;
217 u32 off;
218
219 while (1) {
220 sge = sk_msg_elem(msg, msg->sg.start);
221 size = (apply && apply_bytes < sge->length) ?
222 apply_bytes : sge->length;
223 off = sge->offset;
224 page = sg_page(sge);
225
226 tcp_rate_check_app_limited(sk);
227retry:
228 ret = do_tcp_sendpages(sk, page, off, size, flags);
229 if (ret <= 0)
230 return ret;
231 if (apply)
232 apply_bytes -= ret;
233 msg->sg.size -= ret;
234 sge->offset += ret;
235 sge->length -= ret;
236 if (uncharge)
237 sk_mem_uncharge(sk, ret);
238 if (ret != size) {
239 size -= ret;
240 off += ret;
241 goto retry;
242 }
243 if (!sge->length) {
244 put_page(page);
245 sk_msg_iter_next(msg, start);
246 sg_init_table(sge, 1);
247 if (msg->sg.start == msg->sg.end)
248 break;
249 }
250 if (apply && !apply_bytes)
251 break;
252 }
253
254 return 0;
255}
256
257static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg,
258 u32 apply_bytes, int flags, bool uncharge)
259{
260 int ret;
261
262 lock_sock(sk);
263 ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge);
264 release_sock(sk);
265 return ret;
266}
267
268int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
269 u32 bytes, int flags)
270{
271 bool ingress = sk_msg_to_ingress(msg);
272 struct sk_psock *psock = sk_psock_get(sk);
273 int ret;
274
275 if (unlikely(!psock)) {
276 sk_msg_free(sk, msg);
277 return 0;
278 }
279 ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) :
280 tcp_bpf_push_locked(sk, msg, bytes, flags, false);
281 sk_psock_put(sk, psock);
282 return ret;
283}
284EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
285
286static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
287 struct sk_msg *msg, int *copied, int flags)
288{
289 bool cork = false, enospc = msg->sg.start == msg->sg.end;
290 struct sock *sk_redir;
291 u32 tosend;
292 int ret;
293
294more_data:
295 if (psock->eval == __SK_NONE)
296 psock->eval = sk_psock_msg_verdict(sk, psock, msg);
297
298 if (msg->cork_bytes &&
299 msg->cork_bytes > msg->sg.size && !enospc) {
300 psock->cork_bytes = msg->cork_bytes - msg->sg.size;
301 if (!psock->cork) {
302 psock->cork = kzalloc(sizeof(*psock->cork),
303 GFP_ATOMIC | __GFP_NOWARN);
304 if (!psock->cork)
305 return -ENOMEM;
306 }
307 memcpy(psock->cork, msg, sizeof(*msg));
308 return 0;
309 }
310
311 tosend = msg->sg.size;
312 if (psock->apply_bytes && psock->apply_bytes < tosend)
313 tosend = psock->apply_bytes;
314
315 switch (psock->eval) {
316 case __SK_PASS:
317 ret = tcp_bpf_push(sk, msg, tosend, flags, true);
318 if (unlikely(ret)) {
319 *copied -= sk_msg_free(sk, msg);
320 break;
321 }
322 sk_msg_apply_bytes(psock, tosend);
323 break;
324 case __SK_REDIRECT:
325 sk_redir = psock->sk_redir;
326 sk_msg_apply_bytes(psock, tosend);
327 if (psock->cork) {
328 cork = true;
329 psock->cork = NULL;
330 }
331 sk_msg_return(sk, msg, tosend);
332 release_sock(sk);
333 ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags);
334 lock_sock(sk);
335 if (unlikely(ret < 0)) {
336 int free = sk_msg_free_nocharge(sk, msg);
337
338 if (!cork)
339 *copied -= free;
340 }
341 if (cork) {
342 sk_msg_free(sk, msg);
343 kfree(msg);
344 msg = NULL;
345 ret = 0;
346 }
347 break;
348 case __SK_DROP:
349 default:
350 sk_msg_free_partial(sk, msg, tosend);
351 sk_msg_apply_bytes(psock, tosend);
352 *copied -= tosend;
353 return -EACCES;
354 }
355
356 if (likely(!ret)) {
357 if (!psock->apply_bytes) {
358 psock->eval = __SK_NONE;
359 if (psock->sk_redir) {
360 sock_put(psock->sk_redir);
361 psock->sk_redir = NULL;
362 }
363 }
364 if (msg &&
365 msg->sg.data[msg->sg.start].page_link &&
366 msg->sg.data[msg->sg.start].length)
367 goto more_data;
368 }
369 return ret;
370}
371
372static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
373{
374 struct sk_msg tmp, *msg_tx = NULL;
375 int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
376 int copied = 0, err = 0;
377 struct sk_psock *psock;
378 long timeo;
379
380 psock = sk_psock_get(sk);
381 if (unlikely(!psock))
382 return tcp_sendmsg(sk, msg, size);
383
384 lock_sock(sk);
385 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
386 while (msg_data_left(msg)) {
387 bool enospc = false;
388 u32 copy, osize;
389
390 if (sk->sk_err) {
391 err = -sk->sk_err;
392 goto out_err;
393 }
394
395 copy = msg_data_left(msg);
396 if (!sk_stream_memory_free(sk))
397 goto wait_for_sndbuf;
398 if (psock->cork) {
399 msg_tx = psock->cork;
400 } else {
401 msg_tx = &tmp;
402 sk_msg_init(msg_tx);
403 }
404
405 osize = msg_tx->sg.size;
406 err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1);
407 if (err) {
408 if (err != -ENOSPC)
409 goto wait_for_memory;
410 enospc = true;
411 copy = msg_tx->sg.size - osize;
412 }
413
414 err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx,
415 copy);
416 if (err < 0) {
417 sk_msg_trim(sk, msg_tx, osize);
418 goto out_err;
419 }
420
421 copied += copy;
422 if (psock->cork_bytes) {
423 if (size > psock->cork_bytes)
424 psock->cork_bytes = 0;
425 else
426 psock->cork_bytes -= size;
427 if (psock->cork_bytes && !enospc)
428 goto out_err;
429 /* All cork bytes are accounted, rerun the prog. */
430 psock->eval = __SK_NONE;
431 psock->cork_bytes = 0;
432 }
433
434 err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags);
435 if (unlikely(err < 0))
436 goto out_err;
437 continue;
438wait_for_sndbuf:
439 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
440wait_for_memory:
441 err = sk_stream_wait_memory(sk, &timeo);
442 if (err) {
443 if (msg_tx && msg_tx != psock->cork)
444 sk_msg_free(sk, msg_tx);
445 goto out_err;
446 }
447 }
448out_err:
449 if (err < 0)
450 err = sk_stream_error(sk, msg->msg_flags, err);
451 release_sock(sk);
452 sk_psock_put(sk, psock);
453 return copied ? copied : err;
454}
455
456static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset,
457 size_t size, int flags)
458{
459 struct sk_msg tmp, *msg = NULL;
460 int err = 0, copied = 0;
461 struct sk_psock *psock;
462 bool enospc = false;
463
464 psock = sk_psock_get(sk);
465 if (unlikely(!psock))
466 return tcp_sendpage(sk, page, offset, size, flags);
467
468 lock_sock(sk);
469 if (psock->cork) {
470 msg = psock->cork;
471 } else {
472 msg = &tmp;
473 sk_msg_init(msg);
474 }
475
476 /* Catch case where ring is full and sendpage is stalled. */
477 if (unlikely(sk_msg_full(msg)))
478 goto out_err;
479
480 sk_msg_page_add(msg, page, size, offset);
481 sk_mem_charge(sk, size);
482 copied = size;
483 if (sk_msg_full(msg))
484 enospc = true;
485 if (psock->cork_bytes) {
486 if (size > psock->cork_bytes)
487 psock->cork_bytes = 0;
488 else
489 psock->cork_bytes -= size;
490 if (psock->cork_bytes && !enospc)
491 goto out_err;
492 /* All cork bytes are accounted, rerun the prog. */
493 psock->eval = __SK_NONE;
494 psock->cork_bytes = 0;
495 }
496
497 err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags);
498out_err:
499 release_sock(sk);
500 sk_psock_put(sk, psock);
501 return copied ? copied : err;
502}
503
504static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock)
505{
506 struct sk_psock_link *link;
507
508 sk_psock_cork_free(psock);
509 __sk_psock_purge_ingress_msg(psock);
510 while ((link = sk_psock_link_pop(psock))) {
511 sk_psock_unlink(sk, link);
512 sk_psock_free_link(link);
513 }
514}
515
516static void tcp_bpf_unhash(struct sock *sk)
517{
518 void (*saved_unhash)(struct sock *sk);
519 struct sk_psock *psock;
520
521 rcu_read_lock();
522 psock = sk_psock(sk);
523 if (unlikely(!psock)) {
524 rcu_read_unlock();
525 if (sk->sk_prot->unhash)
526 sk->sk_prot->unhash(sk);
527 return;
528 }
529
530 saved_unhash = psock->saved_unhash;
531 tcp_bpf_remove(sk, psock);
532 rcu_read_unlock();
533 saved_unhash(sk);
534}
535
536static void tcp_bpf_close(struct sock *sk, long timeout)
537{
538 void (*saved_close)(struct sock *sk, long timeout);
539 struct sk_psock *psock;
540
541 lock_sock(sk);
542 rcu_read_lock();
543 psock = sk_psock(sk);
544 if (unlikely(!psock)) {
545 rcu_read_unlock();
546 release_sock(sk);
547 return sk->sk_prot->close(sk, timeout);
548 }
549
550 saved_close = psock->saved_close;
551 tcp_bpf_remove(sk, psock);
552 rcu_read_unlock();
553 release_sock(sk);
554 saved_close(sk, timeout);
555}
556
557enum {
558 TCP_BPF_IPV4,
559 TCP_BPF_IPV6,
560 TCP_BPF_NUM_PROTS,
561};
562
563enum {
564 TCP_BPF_BASE,
565 TCP_BPF_TX,
566 TCP_BPF_NUM_CFGS,
567};
568
569static struct proto *tcpv6_prot_saved __read_mostly;
570static DEFINE_SPINLOCK(tcpv6_prot_lock);
571static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS];
572
573static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
574 struct proto *base)
575{
576 prot[TCP_BPF_BASE] = *base;
577 prot[TCP_BPF_BASE].unhash = tcp_bpf_unhash;
578 prot[TCP_BPF_BASE].close = tcp_bpf_close;
579 prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg;
580 prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read;
581
582 prot[TCP_BPF_TX] = prot[TCP_BPF_BASE];
583 prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg;
584 prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage;
585}
586
587static void tcp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops)
588{
589 if (sk->sk_family == AF_INET6 &&
590 unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) {
591 spin_lock_bh(&tcpv6_prot_lock);
592 if (likely(ops != tcpv6_prot_saved)) {
593 tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops);
594 smp_store_release(&tcpv6_prot_saved, ops);
595 }
596 spin_unlock_bh(&tcpv6_prot_lock);
597 }
598}
599
600static int __init tcp_bpf_v4_build_proto(void)
601{
602 tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot);
603 return 0;
604}
605core_initcall(tcp_bpf_v4_build_proto);
606
607static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock)
608{
609 int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
610 int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
611
612 sk_psock_update_proto(sk, psock, &tcp_bpf_prots[family][config]);
613}
614
615static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock)
616{
617 int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
618 int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
619
620 /* Reinit occurs when program types change e.g. TCP_BPF_TX is removed
621 * or added requiring sk_prot hook updates. We keep original saved
622 * hooks in this case.
623 */
624 sk->sk_prot = &tcp_bpf_prots[family][config];
625}
626
627static int tcp_bpf_assert_proto_ops(struct proto *ops)
628{
629 /* In order to avoid retpoline, we make assumptions when we call
630 * into ops if e.g. a psock is not present. Make sure they are
631 * indeed valid assumptions.
632 */
633 return ops->recvmsg == tcp_recvmsg &&
634 ops->sendmsg == tcp_sendmsg &&
635 ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP;
636}
637
638void tcp_bpf_reinit(struct sock *sk)
639{
640 struct sk_psock *psock;
641
642 sock_owned_by_me(sk);
643
644 rcu_read_lock();
645 psock = sk_psock(sk);
646 tcp_bpf_reinit_sk_prot(sk, psock);
647 rcu_read_unlock();
648}
649
650int tcp_bpf_init(struct sock *sk)
651{
652 struct proto *ops = READ_ONCE(sk->sk_prot);
653 struct sk_psock *psock;
654
655 sock_owned_by_me(sk);
656
657 rcu_read_lock();
658 psock = sk_psock(sk);
659 if (unlikely(!psock || psock->sk_proto ||
660 tcp_bpf_assert_proto_ops(ops))) {
661 rcu_read_unlock();
662 return -EINVAL;
663 }
664 tcp_bpf_check_v6_needs_rebuild(sk, ops);
665 tcp_bpf_update_sk_prot(sk, psock);
666 rcu_read_unlock();
667 return 0;
668}
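For context, these hooks only come into play after a TCP socket is inserted into a BPF sockmap; the kernel then swaps sk_prot to one of the tcp_bpf_prots entries above (the TX variant when a msg-verdict program is attached). A rough, unverified userspace sketch of that flow, assuming libbpf's bpf_create_map/bpf_prog_attach/bpf_map_update_elem wrappers and a prog_fd that already holds a loaded BPF_PROG_TYPE_SK_MSG program:

#include <unistd.h>
#include <bpf/bpf.h>             /* libbpf syscall wrappers (assumed available) */

/* sock_fd: an established TCP socket; prog_fd: a loaded SK_MSG program.
 * Adding the socket to the sockmap is what causes the kernel to switch its
 * sk_prot to the tcp_bpf variants (TX variant because of the msg program). */
static int attach_sockmap(int sock_fd, int prog_fd)
{
    int map_fd, key = 0, err;

    map_fd = bpf_create_map(BPF_MAP_TYPE_SOCKMAP, sizeof(int),
                            sizeof(int), 1, 0);
    if (map_fd < 0)
        return map_fd;

    err = bpf_prog_attach(prog_fd, map_fd, BPF_SK_MSG_VERDICT, 0);
    if (!err)
        err = bpf_map_update_elem(map_fd, &key, &sock_fd, BPF_ANY);
    if (err) {
        close(map_fd);
        return err;
    }
    return map_fd;
}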
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 06fbe102a425..37eebd910396 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -146,7 +146,7 @@ static void tcp_cdg_hystart_update(struct sock *sk)
146 return; 146 return;
147 147
148 if (hystart_detect & HYSTART_ACK_TRAIN) { 148 if (hystart_detect & HYSTART_ACK_TRAIN) {
149 u32 now_us = div_u64(local_clock(), NSEC_PER_USEC); 149 u32 now_us = tp->tcp_mstamp;
150 150
151 if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) { 151 if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) {
152 ca->last_ack = now_us; 152 ca->last_ack = now_us;
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index ca61e2a659e7..cd4814f7e962 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -44,6 +44,7 @@
44#include <linux/mm.h> 44#include <linux/mm.h>
45#include <net/tcp.h> 45#include <net/tcp.h>
46#include <linux/inet_diag.h> 46#include <linux/inet_diag.h>
47#include "tcp_dctcp.h"
47 48
48#define DCTCP_MAX_ALPHA 1024U 49#define DCTCP_MAX_ALPHA 1024U
49 50
@@ -118,54 +119,6 @@ static u32 dctcp_ssthresh(struct sock *sk)
118 return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); 119 return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
119} 120}
120 121
121/* Minimal DCTP CE state machine:
122 *
123 * S: 0 <- last pkt was non-CE
124 * 1 <- last pkt was CE
125 */
126
127static void dctcp_ce_state_0_to_1(struct sock *sk)
128{
129 struct dctcp *ca = inet_csk_ca(sk);
130 struct tcp_sock *tp = tcp_sk(sk);
131
132 if (!ca->ce_state) {
133 /* State has changed from CE=0 to CE=1, force an immediate
134 * ACK to reflect the new CE state. If an ACK was delayed,
135 * send that first to reflect the prior CE state.
136 */
137 if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
138 __tcp_send_ack(sk, ca->prior_rcv_nxt);
139 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
140 }
141
142 ca->prior_rcv_nxt = tp->rcv_nxt;
143 ca->ce_state = 1;
144
145 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
146}
147
148static void dctcp_ce_state_1_to_0(struct sock *sk)
149{
150 struct dctcp *ca = inet_csk_ca(sk);
151 struct tcp_sock *tp = tcp_sk(sk);
152
153 if (ca->ce_state) {
154 /* State has changed from CE=1 to CE=0, force an immediate
155 * ACK to reflect the new CE state. If an ACK was delayed,
156 * send that first to reflect the prior CE state.
157 */
158 if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
159 __tcp_send_ack(sk, ca->prior_rcv_nxt);
160 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
161 }
162
163 ca->prior_rcv_nxt = tp->rcv_nxt;
164 ca->ce_state = 0;
165
166 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
167}
168
169static void dctcp_update_alpha(struct sock *sk, u32 flags) 122static void dctcp_update_alpha(struct sock *sk, u32 flags)
170{ 123{
171 const struct tcp_sock *tp = tcp_sk(sk); 124 const struct tcp_sock *tp = tcp_sk(sk);
@@ -230,12 +183,12 @@ static void dctcp_state(struct sock *sk, u8 new_state)
230 183
231static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) 184static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
232{ 185{
186 struct dctcp *ca = inet_csk_ca(sk);
187
233 switch (ev) { 188 switch (ev) {
234 case CA_EVENT_ECN_IS_CE: 189 case CA_EVENT_ECN_IS_CE:
235 dctcp_ce_state_0_to_1(sk);
236 break;
237 case CA_EVENT_ECN_NO_CE: 190 case CA_EVENT_ECN_NO_CE:
238 dctcp_ce_state_1_to_0(sk); 191 dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state);
239 break; 192 break;
240 default: 193 default:
241 /* Don't care for the rest. */ 194 /* Don't care for the rest. */
diff --git a/net/ipv4/tcp_dctcp.h b/net/ipv4/tcp_dctcp.h
new file mode 100644
index 000000000000..d69a77cbd0c7
--- /dev/null
+++ b/net/ipv4/tcp_dctcp.h
@@ -0,0 +1,40 @@
1#ifndef _TCP_DCTCP_H
2#define _TCP_DCTCP_H
3
4static inline void dctcp_ece_ack_cwr(struct sock *sk, u32 ce_state)
5{
6 struct tcp_sock *tp = tcp_sk(sk);
7
8 if (ce_state == 1)
9 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
10 else
11 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
12}
13
 14/* Minimal DCTCP CE state machine:
15 *
16 * S: 0 <- last pkt was non-CE
17 * 1 <- last pkt was CE
18 */
19static inline void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt,
20 u32 *prior_rcv_nxt, u32 *ce_state)
21{
22 u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0;
23
24 if (*ce_state != new_ce_state) {
25 /* CE state has changed, force an immediate ACK to
26 * reflect the new CE state. If an ACK was delayed,
27 * send that first to reflect the prior CE state.
28 */
29 if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
30 dctcp_ece_ack_cwr(sk, *ce_state);
31 __tcp_send_ack(sk, *prior_rcv_nxt);
32 }
33 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
34 }
35 *prior_rcv_nxt = tcp_sk(sk)->rcv_nxt;
36 *ce_state = new_ce_state;
37 dctcp_ece_ack_cwr(sk, new_ce_state);
38}
39
40#endif
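A standalone sketch (not kernel code) of the bookkeeping dctcp_ece_ack_update performs, with made-up segment numbers: it reports the points where the CE bit flips and the real code would force an immediate ACK, after first flushing any delayed ACK covering the prior rcv_nxt.

#include <stdio.h>
#include <stdint.h>

struct ce_tracker {
    uint32_t prior_rcv_nxt;
    uint32_t ce_state;
};

/* Returns 1 when the CE state changed, i.e. the real code would force an
 * immediate ACK (sending any delayed ACK for *old_rcv_nxt first). */
static int ce_ack_update(struct ce_tracker *t, int pkt_is_ce,
                         uint32_t rcv_nxt, uint32_t *old_rcv_nxt)
{
    uint32_t new_state = pkt_is_ce ? 1 : 0;
    int changed = (t->ce_state != new_state);

    *old_rcv_nxt = t->prior_rcv_nxt;
    t->prior_rcv_nxt = rcv_nxt;
    t->ce_state = new_state;
    return changed;
}

int main(void)
{
    struct ce_tracker t = { .prior_rcv_nxt = 1000, .ce_state = 0 };
    int ce_bits[] = { 0, 0, 1, 1, 0 };        /* CE marks of five segments */
    uint32_t rcv_nxt = 1000, old;

    for (int i = 0; i < 5; i++) {
        rcv_nxt += 1448;                      /* each segment advances rcv_nxt */
        if (ce_ack_update(&t, ce_bits[i], rcv_nxt, &old))
            printf("seg %d: CE flip -> ACK now (delayed ACK would cover %u)\n",
                   i, old);
    }
    return 0;
}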
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 47e08c1b5bc3..2868ef28ce52 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -426,26 +426,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
426 } 426 }
427} 427}
428 428
429/* 3. Tuning rcvbuf, when connection enters established state. */ 429/* 3. Try to fixup all. It is made immediately after connection enters
430static void tcp_fixup_rcvbuf(struct sock *sk)
431{
432 u32 mss = tcp_sk(sk)->advmss;
433 int rcvmem;
434
435 rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
436 tcp_default_init_rwnd(mss);
437
438 /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
439 * Allow enough cushion so that sender is not limited by our window
440 */
441 if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf)
442 rcvmem <<= 2;
443
444 if (sk->sk_rcvbuf < rcvmem)
445 sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
446}
447
448/* 4. Try to fixup all. It is made immediately after connection enters
449 * established state. 430 * established state.
450 */ 431 */
451void tcp_init_buffer_space(struct sock *sk) 432void tcp_init_buffer_space(struct sock *sk)
@@ -454,12 +435,10 @@ void tcp_init_buffer_space(struct sock *sk)
454 struct tcp_sock *tp = tcp_sk(sk); 435 struct tcp_sock *tp = tcp_sk(sk);
455 int maxwin; 436 int maxwin;
456 437
457 if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
458 tcp_fixup_rcvbuf(sk);
459 if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) 438 if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
460 tcp_sndbuf_expand(sk); 439 tcp_sndbuf_expand(sk);
461 440
462 tp->rcvq_space.space = tp->rcv_wnd; 441 tp->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss);
463 tcp_mstamp_refresh(tp); 442 tcp_mstamp_refresh(tp);
464 tp->rcvq_space.time = tp->tcp_mstamp; 443 tp->rcvq_space.time = tp->tcp_mstamp;
465 tp->rcvq_space.seq = tp->copied_seq; 444 tp->rcvq_space.seq = tp->copied_seq;
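The initial DRS sample is now capped at one initial congestion window of payload instead of the full receive window. A worked sketch (not kernel code) of the new bound, assuming TCP_INIT_CWND of 10 and example handshake values:

#include <stdio.h>
#include <stdint.h>

#define TCP_INIT_CWND 10U   /* assumed initial congestion window, in packets */

static uint32_t min_u32(uint32_t a, uint32_t b) { return a < b ? a : b; }

int main(void)
{
    uint32_t rcv_wnd = 65535;   /* example receive window from the handshake */
    uint32_t advmss  = 1460;    /* example advertised MSS */

    /* old behaviour: rcvq_space.space = rcv_wnd (65535 here);
     * new behaviour caps it at TCP_INIT_CWND * advmss = 14600, so the first
     * DRS measurement reflects what the peer can actually send in one RTT. */
    printf("rcvq_space.space = %u\n",
           min_u32(rcv_wnd, TCP_INIT_CWND * advmss));
    return 0;
}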
@@ -485,7 +464,7 @@ void tcp_init_buffer_space(struct sock *sk)
485 tp->snd_cwnd_stamp = tcp_jiffies32; 464 tp->snd_cwnd_stamp = tcp_jiffies32;
486} 465}
487 466
488/* 5. Recalculate window clamp after socket hit its memory bounds. */ 467/* 4. Recalculate window clamp after socket hit its memory bounds. */
489static void tcp_clamp_window(struct sock *sk) 468static void tcp_clamp_window(struct sock *sk)
490{ 469{
491 struct tcp_sock *tp = tcp_sk(sk); 470 struct tcp_sock *tp = tcp_sk(sk);
@@ -1305,7 +1284,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
1305 */ 1284 */
1306 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, 1285 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
1307 start_seq, end_seq, dup_sack, pcount, 1286 start_seq, end_seq, dup_sack, pcount,
1308 skb->skb_mstamp); 1287 tcp_skb_timestamp_us(skb));
1309 tcp_rate_skb_delivered(sk, skb, state->rate); 1288 tcp_rate_skb_delivered(sk, skb, state->rate);
1310 1289
1311 if (skb == tp->lost_skb_hint) 1290 if (skb == tp->lost_skb_hint)
@@ -1580,7 +1559,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1580 TCP_SKB_CB(skb)->end_seq, 1559 TCP_SKB_CB(skb)->end_seq,
1581 dup_sack, 1560 dup_sack,
1582 tcp_skb_pcount(skb), 1561 tcp_skb_pcount(skb),
1583 skb->skb_mstamp); 1562 tcp_skb_timestamp_us(skb));
1584 tcp_rate_skb_delivered(sk, skb, state->rate); 1563 tcp_rate_skb_delivered(sk, skb, state->rate);
1585 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) 1564 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
1586 list_del_init(&skb->tcp_tsorted_anchor); 1565 list_del_init(&skb->tcp_tsorted_anchor);
@@ -3000,8 +2979,8 @@ void tcp_rearm_rto(struct sock *sk)
3000 */ 2979 */
3001 rto = usecs_to_jiffies(max_t(int, delta_us, 1)); 2980 rto = usecs_to_jiffies(max_t(int, delta_us, 1));
3002 } 2981 }
3003 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, 2982 tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3004 TCP_RTO_MAX); 2983 TCP_RTO_MAX, tcp_rtx_queue_head(sk));
3005 } 2984 }
3006} 2985}
3007 2986
@@ -3103,7 +3082,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
3103 tp->retrans_out -= acked_pcount; 3082 tp->retrans_out -= acked_pcount;
3104 flag |= FLAG_RETRANS_DATA_ACKED; 3083 flag |= FLAG_RETRANS_DATA_ACKED;
3105 } else if (!(sacked & TCPCB_SACKED_ACKED)) { 3084 } else if (!(sacked & TCPCB_SACKED_ACKED)) {
3106 last_ackt = skb->skb_mstamp; 3085 last_ackt = tcp_skb_timestamp_us(skb);
3107 WARN_ON_ONCE(last_ackt == 0); 3086 WARN_ON_ONCE(last_ackt == 0);
3108 if (!first_ackt) 3087 if (!first_ackt)
3109 first_ackt = last_ackt; 3088 first_ackt = last_ackt;
@@ -3121,7 +3100,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
3121 tp->delivered += acked_pcount; 3100 tp->delivered += acked_pcount;
3122 if (!tcp_skb_spurious_retrans(tp, skb)) 3101 if (!tcp_skb_spurious_retrans(tp, skb))
3123 tcp_rack_advance(tp, sacked, scb->end_seq, 3102 tcp_rack_advance(tp, sacked, scb->end_seq,
3124 skb->skb_mstamp); 3103 tcp_skb_timestamp_us(skb));
3125 } 3104 }
3126 if (sacked & TCPCB_LOST) 3105 if (sacked & TCPCB_LOST)
3127 tp->lost_out -= acked_pcount; 3106 tp->lost_out -= acked_pcount;
@@ -3215,7 +3194,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
3215 tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); 3194 tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
3216 } 3195 }
3217 } else if (skb && rtt_update && sack_rtt_us >= 0 && 3196 } else if (skb && rtt_update && sack_rtt_us >= 0 &&
3218 sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { 3197 sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
3198 tcp_skb_timestamp_us(skb))) {
3219 /* Do not re-arm RTO if the sack RTT is measured from data sent 3199 /* Do not re-arm RTO if the sack RTT is measured from data sent
3220 * after when the head was last (re)transmitted. Otherwise the 3200 * after when the head was last (re)transmitted. Otherwise the
3221 * timeout may continue to extend in loss recovery. 3201 * timeout may continue to extend in loss recovery.
@@ -3275,8 +3255,8 @@ static void tcp_ack_probe(struct sock *sk)
3275 } else { 3255 } else {
3276 unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); 3256 unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
3277 3257
3278 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, 3258 tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3279 when, TCP_RTO_MAX); 3259 when, TCP_RTO_MAX, NULL);
3280 } 3260 }
3281} 3261}
3282 3262
@@ -4199,6 +4179,17 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
4199 tcp_sack_extend(tp->duplicate_sack, seq, end_seq); 4179 tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
4200} 4180}
4201 4181
4182static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
4183{
4184 /* When the ACK path fails or drops most ACKs, the sender would
4185 * timeout and spuriously retransmit the same segment repeatedly.
4186 * The receiver remembers and reflects via DSACKs. Leverage the
4187 * DSACK state and change the txhash to re-route speculatively.
4188 */
4189 if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq)
4190 sk_rethink_txhash(sk);
4191}
4192
4202static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) 4193static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
4203{ 4194{
4204 struct tcp_sock *tp = tcp_sk(sk); 4195 struct tcp_sock *tp = tcp_sk(sk);
@@ -4211,6 +4202,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
4211 if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { 4202 if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
4212 u32 end_seq = TCP_SKB_CB(skb)->end_seq; 4203 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
4213 4204
4205 tcp_rcv_spurious_retrans(sk, skb);
4214 if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) 4206 if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
4215 end_seq = tp->rcv_nxt; 4207 end_seq = tp->rcv_nxt;
4216 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq); 4208 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
@@ -4755,6 +4747,7 @@ queue_and_out:
4755 } 4747 }
4756 4748
4757 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { 4749 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
4750 tcp_rcv_spurious_retrans(sk, skb);
4758 /* A retransmit, 2nd most common case. Force an immediate ack. */ 4751 /* A retransmit, 2nd most common case. Force an immediate ack. */
4759 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); 4752 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
4760 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); 4753 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
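Both tcp_input.c call sites above stop reading skb->skb_mstamp directly and go through tcp_skb_timestamp_us(), since the skb now carries a nanosecond send stamp (skb_mstamp_ns). A minimal userspace sketch of the assumed conversion; the helper name and the RTT computation around it are illustrative, not kernel source:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL

/* assumed shape of the accessor: a nanosecond send stamp reduced to
 * microseconds for RTT/RACK math that still works in usec */
static uint64_t skb_timestamp_us(uint64_t skb_mstamp_ns)
{
	return skb_mstamp_ns / NSEC_PER_USEC;
}

int main(void)
{
	uint64_t sent_ns = 2500000;        /* sent at t = 2.5 ms */
	uint64_t now_us  = 4000;           /* ACKed at t = 4 ms  */

	printf("rtt sample: %llu us\n",
	       (unsigned long long)(now_us - skb_timestamp_us(sent_ns)));
	return 0;
}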
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index cd426313a298..de47038afdf0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -544,7 +544,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
544 BUG_ON(!skb); 544 BUG_ON(!skb);
545 545
546 tcp_mstamp_refresh(tp); 546 tcp_mstamp_refresh(tp);
547 delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp); 547 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
548 remaining = icsk->icsk_rto - 548 remaining = icsk->icsk_rto -
549 usecs_to_jiffies(delta_us); 549 usecs_to_jiffies(delta_us);
550 550
@@ -2551,7 +2551,7 @@ static int __net_init tcp_sk_init(struct net *net)
2551 net->ipv4.sysctl_tcp_tw_reuse = 2; 2551 net->ipv4.sysctl_tcp_tw_reuse = 2;
2552 2552
2553 cnt = tcp_hashinfo.ehash_mask + 1; 2553 cnt = tcp_hashinfo.ehash_mask + 1;
2554 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2; 2554 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2555 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; 2555 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2556 2556
2557 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256); 2557 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 597dbd749f05..9c34b97d365d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -45,6 +45,21 @@
45 45
46#include <trace/events/tcp.h> 46#include <trace/events/tcp.h>
47 47
48/* Refresh clocks of a TCP socket,
 49 * ensuring monotonically increasing values.
50 */
51void tcp_mstamp_refresh(struct tcp_sock *tp)
52{
53 u64 val = tcp_clock_ns();
54
55 if (val > tp->tcp_clock_cache)
56 tp->tcp_clock_cache = val;
57
58 val = div_u64(val, NSEC_PER_USEC);
59 if (val > tp->tcp_mstamp)
60 tp->tcp_mstamp = val;
61}
62
48static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, 63static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
49 int push_one, gfp_t gfp); 64 int push_one, gfp_t gfp);
50 65
@@ -179,21 +194,6 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
179 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); 194 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
180} 195}
181 196
182
183u32 tcp_default_init_rwnd(u32 mss)
184{
185 /* Initial receive window should be twice of TCP_INIT_CWND to
186 * enable proper sending of new unsent data during fast recovery
187 * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
188 * limit when mss is larger than 1460.
189 */
190 u32 init_rwnd = TCP_INIT_CWND * 2;
191
192 if (mss > 1460)
193 init_rwnd = max((1460 * init_rwnd) / mss, 2U);
194 return init_rwnd;
195}
196
197/* Determine a window scaling and initial window to offer. 197/* Determine a window scaling and initial window to offer.
198 * Based on the assumption that the given amount of space 198 * Based on the assumption that the given amount of space
199 * will be offered. Store the results in the tp structure. 199 * will be offered. Store the results in the tp structure.
@@ -228,7 +228,10 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
228 if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) 228 if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
229 (*rcv_wnd) = min(space, MAX_TCP_WINDOW); 229 (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
230 else 230 else
231 (*rcv_wnd) = space; 231 (*rcv_wnd) = min_t(u32, space, U16_MAX);
232
233 if (init_rcv_wnd)
234 *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
232 235
233 (*rcv_wscale) = 0; 236 (*rcv_wscale) = 0;
234 if (wscale_ok) { 237 if (wscale_ok) {
@@ -241,11 +244,6 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
241 (*rcv_wscale)++; 244 (*rcv_wscale)++;
242 } 245 }
243 } 246 }
244
245 if (!init_rcv_wnd) /* Use default unless specified otherwise */
246 init_rcv_wnd = tcp_default_init_rwnd(mss);
247 *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
248
249 /* Set the clamp no higher than max representable value */ 247 /* Set the clamp no higher than max representable value */
250 (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); 248 (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
251} 249}
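With the hunk above, the offered receive window is capped at U16_MAX up front and, when the caller passes init_rcv_wnd, additionally clamped to init_rcv_wnd * mss; the removed tcp_default_init_rwnd() fallback no longer applies. A small arithmetic sketch with made-up values (the signed-window workaround branch is ignored here):

#include <stdint.h>
#include <stdio.h>

#define U16_MAX 65535u

static uint32_t min_u32(uint32_t a, uint32_t b) { return a < b ? a : b; }

int main(void)
{
	uint32_t space = 131072;     /* hypothetical buffer-derived space        */
	uint32_t mss = 1460;
	uint32_t init_rcv_wnd = 10;  /* caller-provided initial window, packets  */

	uint32_t rcv_wnd = min_u32(space, U16_MAX);            /* 65535 */
	if (init_rcv_wnd)
		rcv_wnd = min_u32(rcv_wnd, init_rcv_wnd * mss); /* 14600 */

	printf("rcv_wnd = %u bytes\n", rcv_wnd);
	return 0;
}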
@@ -977,28 +975,28 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
977 return HRTIMER_NORESTART; 975 return HRTIMER_NORESTART;
978} 976}
979 977
980static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) 978static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
979 u64 prior_wstamp)
981{ 980{
982 u64 len_ns; 981 struct tcp_sock *tp = tcp_sk(sk);
983 u32 rate;
984 982
985 if (!tcp_needs_internal_pacing(sk)) 983 skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
986 return; 984 if (sk->sk_pacing_status != SK_PACING_NONE) {
987 rate = sk->sk_pacing_rate; 985 unsigned long rate = sk->sk_pacing_rate;
988 if (!rate || rate == ~0U)
989 return;
990 986
991 len_ns = (u64)skb->len * NSEC_PER_SEC; 987 /* Original sch_fq does not pace first 10 MSS
992 do_div(len_ns, rate); 988 * Note that tp->data_segs_out overflows after 2^32 packets,
993 hrtimer_start(&tcp_sk(sk)->pacing_timer, 989 * this is a minor annoyance.
994 ktime_add_ns(ktime_get(), len_ns), 990 */
995 HRTIMER_MODE_ABS_PINNED_SOFT); 991 if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
996 sock_hold(sk); 992 u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
997} 993 u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
998 994
999static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) 995 /* take into account OS jitter */
1000{ 996 len_ns -= min_t(u64, len_ns / 2, credit);
1001 skb->skb_mstamp = tp->tcp_mstamp; 997 tp->tcp_wstamp_ns += len_ns;
998 }
999 }
1002 list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); 1000 list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
1003} 1001}
1004 1002
@@ -1025,6 +1023,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
1025 struct sk_buff *oskb = NULL; 1023 struct sk_buff *oskb = NULL;
1026 struct tcp_md5sig_key *md5; 1024 struct tcp_md5sig_key *md5;
1027 struct tcphdr *th; 1025 struct tcphdr *th;
1026 u64 prior_wstamp;
1028 int err; 1027 int err;
1029 1028
1030 BUG_ON(!skb || !tcp_skb_pcount(skb)); 1029 BUG_ON(!skb || !tcp_skb_pcount(skb));
@@ -1045,7 +1044,11 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
1045 if (unlikely(!skb)) 1044 if (unlikely(!skb))
1046 return -ENOBUFS; 1045 return -ENOBUFS;
1047 } 1046 }
1048 skb->skb_mstamp = tp->tcp_mstamp; 1047
1048 prior_wstamp = tp->tcp_wstamp_ns;
1049 tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
1050
1051 skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
1049 1052
1050 inet = inet_sk(sk); 1053 inet = inet_sk(sk);
1051 tcb = TCP_SKB_CB(skb); 1054 tcb = TCP_SKB_CB(skb);
@@ -1137,7 +1140,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
1137 tcp_event_data_sent(tp, sk); 1140 tcp_event_data_sent(tp, sk);
1138 tp->data_segs_out += tcp_skb_pcount(skb); 1141 tp->data_segs_out += tcp_skb_pcount(skb);
1139 tp->bytes_sent += skb->len - tcp_header_size; 1142 tp->bytes_sent += skb->len - tcp_header_size;
1140 tcp_internal_pacing(sk, skb);
1141 } 1143 }
1142 1144
1143 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) 1145 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
@@ -1149,8 +1151,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
1149 skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); 1151 skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
1150 skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); 1152 skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
1151 1153
1152 /* Our usage of tstamp should remain private */ 1154 /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
1153 skb->tstamp = 0;
1154 1155
1155 /* Cleanup our debris for IP stacks */ 1156 /* Cleanup our debris for IP stacks */
1156 memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), 1157 memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
@@ -1163,7 +1164,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
1163 err = net_xmit_eval(err); 1164 err = net_xmit_eval(err);
1164 } 1165 }
1165 if (!err && oskb) { 1166 if (!err && oskb) {
1166 tcp_update_skb_after_send(tp, oskb); 1167 tcp_update_skb_after_send(sk, oskb, prior_wstamp);
1167 tcp_rate_skb_sent(sk, oskb); 1168 tcp_rate_skb_sent(sk, oskb);
1168 } 1169 }
1169 return err; 1170 return err;
@@ -1698,8 +1699,9 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
1698{ 1699{
1699 u32 bytes, segs; 1700 u32 bytes, segs;
1700 1701
1701 bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift, 1702 bytes = min_t(unsigned long,
1702 sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); 1703 sk->sk_pacing_rate >> sk->sk_pacing_shift,
1704 sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
1703 1705
1704 /* Goal is to send at least one packet per ms, 1706 /* Goal is to send at least one packet per ms,
1705 * not one big TSO packet every 100 ms. 1707 * not one big TSO packet every 100 ms.
@@ -1966,7 +1968,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
1966 head = tcp_rtx_queue_head(sk); 1968 head = tcp_rtx_queue_head(sk);
1967 if (!head) 1969 if (!head)
1968 goto send_now; 1970 goto send_now;
1969 age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); 1971 age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head));
1970 /* If next ACK is likely to come too late (half srtt), do not defer */ 1972 /* If next ACK is likely to come too late (half srtt), do not defer */
1971 if (age < (tp->srtt_us >> 4)) 1973 if (age < (tp->srtt_us >> 4))
1972 goto send_now; 1974 goto send_now;
@@ -2172,10 +2174,23 @@ static int tcp_mtu_probe(struct sock *sk)
2172 return -1; 2174 return -1;
2173} 2175}
2174 2176
2175static bool tcp_pacing_check(const struct sock *sk) 2177static bool tcp_pacing_check(struct sock *sk)
2176{ 2178{
2177 return tcp_needs_internal_pacing(sk) && 2179 struct tcp_sock *tp = tcp_sk(sk);
2178 hrtimer_is_queued(&tcp_sk(sk)->pacing_timer); 2180
2181 if (!tcp_needs_internal_pacing(sk))
2182 return false;
2183
2184 if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
2185 return false;
2186
2187 if (!hrtimer_is_queued(&tp->pacing_timer)) {
2188 hrtimer_start(&tp->pacing_timer,
2189 ns_to_ktime(tp->tcp_wstamp_ns),
2190 HRTIMER_MODE_ABS_PINNED_SOFT);
2191 sock_hold(sk);
2192 }
2193 return true;
2179} 2194}
2180 2195
2181/* TCP Small Queues : 2196/* TCP Small Queues :
@@ -2192,10 +2207,12 @@ static bool tcp_pacing_check(const struct sock *sk)
2192static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, 2207static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
2193 unsigned int factor) 2208 unsigned int factor)
2194{ 2209{
2195 unsigned int limit; 2210 unsigned long limit;
2196 2211
2197 limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift); 2212 limit = max_t(unsigned long,
2198 limit = min_t(u32, limit, 2213 2 * skb->truesize,
2214 sk->sk_pacing_rate >> sk->sk_pacing_shift);
2215 limit = min_t(unsigned long, limit,
2199 sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); 2216 sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
2200 limit <<= factor; 2217 limit <<= factor;
2201 2218
@@ -2304,18 +2321,19 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2304 while ((skb = tcp_send_head(sk))) { 2321 while ((skb = tcp_send_head(sk))) {
2305 unsigned int limit; 2322 unsigned int limit;
2306 2323
2324 if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
2325 /* "skb_mstamp_ns" is used as a start point for the retransmit timer */
2326 skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
2327 list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
2328 goto repair; /* Skip network transmission */
2329 }
2330
2307 if (tcp_pacing_check(sk)) 2331 if (tcp_pacing_check(sk))
2308 break; 2332 break;
2309 2333
2310 tso_segs = tcp_init_tso_segs(skb, mss_now); 2334 tso_segs = tcp_init_tso_segs(skb, mss_now);
2311 BUG_ON(!tso_segs); 2335 BUG_ON(!tso_segs);
2312 2336
2313 if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
2314 /* "skb_mstamp" is used as a start point for the retransmit timer */
2315 tcp_update_skb_after_send(tp, skb);
2316 goto repair; /* Skip network transmission */
2317 }
2318
2319 cwnd_quota = tcp_cwnd_test(tp, skb); 2337 cwnd_quota = tcp_cwnd_test(tp, skb);
2320 if (!cwnd_quota) { 2338 if (!cwnd_quota) {
2321 if (push_one == 2) 2339 if (push_one == 2)
@@ -2437,8 +2455,8 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
2437 if (rto_delta_us > 0) 2455 if (rto_delta_us > 0)
2438 timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); 2456 timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
2439 2457
2440 inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, 2458 tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
2441 TCP_RTO_MAX); 2459 TCP_RTO_MAX, NULL);
2442 return true; 2460 return true;
2443} 2461}
2444 2462
@@ -2887,7 +2905,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2887 } tcp_skb_tsorted_restore(skb); 2905 } tcp_skb_tsorted_restore(skb);
2888 2906
2889 if (!err) { 2907 if (!err) {
2890 tcp_update_skb_after_send(tp, skb); 2908 tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
2891 tcp_rate_skb_sent(sk, skb); 2909 tcp_rate_skb_sent(sk, skb);
2892 } 2910 }
2893 } else { 2911 } else {
@@ -3002,9 +3020,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
3002 3020
3003 if (skb == rtx_head && 3021 if (skb == rtx_head &&
3004 icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) 3022 icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
3005 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 3023 tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3006 inet_csk(sk)->icsk_rto, 3024 inet_csk(sk)->icsk_rto,
3007 TCP_RTO_MAX); 3025 TCP_RTO_MAX,
3026 skb);
3008 } 3027 }
3009} 3028}
3010 3029
@@ -3205,10 +3224,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
3205 memset(&opts, 0, sizeof(opts)); 3224 memset(&opts, 0, sizeof(opts));
3206#ifdef CONFIG_SYN_COOKIES 3225#ifdef CONFIG_SYN_COOKIES
3207 if (unlikely(req->cookie_ts)) 3226 if (unlikely(req->cookie_ts))
3208 skb->skb_mstamp = cookie_init_timestamp(req); 3227 skb->skb_mstamp_ns = cookie_init_timestamp(req);
3209 else 3228 else
3210#endif 3229#endif
3211 skb->skb_mstamp = tcp_clock_us(); 3230 skb->skb_mstamp_ns = tcp_clock_ns();
3212 3231
3213#ifdef CONFIG_TCP_MD5SIG 3232#ifdef CONFIG_TCP_MD5SIG
3214 rcu_read_lock(); 3233 rcu_read_lock();
@@ -3424,7 +3443,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
3424 3443
3425 err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); 3444 err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
3426 3445
3427 syn->skb_mstamp = syn_data->skb_mstamp; 3446 syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
3428 3447
3429 /* Now full SYN+DATA was cloned and sent (or not), 3448 /* Now full SYN+DATA was cloned and sent (or not),
3430 * remove the SYN from the original skb (syn_data) 3449 * remove the SYN from the original skb (syn_data)
@@ -3734,9 +3753,10 @@ void tcp_send_probe0(struct sock *sk)
3734 icsk->icsk_probes_out = 1; 3753 icsk->icsk_probes_out = 1;
3735 probe_max = TCP_RESOURCE_PROBE_INTERVAL; 3754 probe_max = TCP_RESOURCE_PROBE_INTERVAL;
3736 } 3755 }
3737 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, 3756 tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3738 tcp_probe0_when(sk, probe_max), 3757 tcp_probe0_when(sk, probe_max),
3739 TCP_RTO_MAX); 3758 TCP_RTO_MAX,
3759 NULL);
3740} 3760}
3741 3761
3742int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) 3762int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index 4dff40dad4dc..baed2186c7c6 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -55,8 +55,10 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
55 * bandwidth estimate. 55 * bandwidth estimate.
56 */ 56 */
57 if (!tp->packets_out) { 57 if (!tp->packets_out) {
58 tp->first_tx_mstamp = skb->skb_mstamp; 58 u64 tstamp_us = tcp_skb_timestamp_us(skb);
59 tp->delivered_mstamp = skb->skb_mstamp; 59
60 tp->first_tx_mstamp = tstamp_us;
61 tp->delivered_mstamp = tstamp_us;
60 } 62 }
61 63
62 TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; 64 TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
@@ -88,13 +90,12 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
88 rs->is_app_limited = scb->tx.is_app_limited; 90 rs->is_app_limited = scb->tx.is_app_limited;
89 rs->is_retrans = scb->sacked & TCPCB_RETRANS; 91 rs->is_retrans = scb->sacked & TCPCB_RETRANS;
90 92
93 /* Record send time of most recently ACKed packet: */
94 tp->first_tx_mstamp = tcp_skb_timestamp_us(skb);
91 /* Find the duration of the "send phase" of this window: */ 95 /* Find the duration of the "send phase" of this window: */
92 rs->interval_us = tcp_stamp_us_delta( 96 rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
93 skb->skb_mstamp, 97 scb->tx.first_tx_mstamp);
94 scb->tx.first_tx_mstamp);
95 98
96 /* Record send time of most recently ACKed packet: */
97 tp->first_tx_mstamp = skb->skb_mstamp;
98 } 99 }
99 /* Mark off the skb delivered once it's sacked to avoid being 100 /* Mark off the skb delivered once it's sacked to avoid being
100 * used again when it's cumulatively acked. For acked packets 101 * used again when it's cumulatively acked. For acked packets
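The tcp_rate.c hunk reorders the bookkeeping so tp->first_tx_mstamp records the send time of the most recently ACKed skb before the send-phase interval is computed against the window's first transmit stamp, both now taken via tcp_skb_timestamp_us(). A trivial sketch of that interval, with made-up microsecond stamps:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* hypothetical send times, already reduced to microseconds */
	uint64_t window_first_tx_us = 10000;   /* first skb of the sampled window */
	uint64_t acked_skb_tx_us    = 12500;   /* most recently (S)ACKed skb      */

	uint64_t first_tx_mstamp = acked_skb_tx_us;   /* recorded for the next sample */
	uint64_t interval_us = first_tx_mstamp - window_first_tx_us;

	printf("send-phase interval: %llu us\n", (unsigned long long)interval_us);
	return 0;
}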
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index c81aadff769b..fdb715bdd2d1 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -50,7 +50,7 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk)
50s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd) 50s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd)
51{ 51{
52 return tp->rack.rtt_us + reo_wnd - 52 return tp->rack.rtt_us + reo_wnd -
53 tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); 53 tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb));
54} 54}
55 55
56/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): 56/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
@@ -91,7 +91,8 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
91 !(scb->sacked & TCPCB_SACKED_RETRANS)) 91 !(scb->sacked & TCPCB_SACKED_RETRANS))
92 continue; 92 continue;
93 93
94 if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, 94 if (!tcp_rack_sent_after(tp->rack.mstamp,
95 tcp_skb_timestamp_us(skb),
95 tp->rack.end_seq, scb->end_seq)) 96 tp->rack.end_seq, scb->end_seq))
96 break; 97 break;
97 98
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 7fdf222a0bdf..676020663ce8 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -360,7 +360,7 @@ static void tcp_probe_timer(struct sock *sk)
360 */ 360 */
361 start_ts = tcp_skb_timestamp(skb); 361 start_ts = tcp_skb_timestamp(skb);
362 if (!start_ts) 362 if (!start_ts)
363 skb->skb_mstamp = tp->tcp_mstamp; 363 skb->skb_mstamp_ns = tp->tcp_clock_cache;
364 else if (icsk->icsk_user_timeout && 364 else if (icsk->icsk_user_timeout &&
365 (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout) 365 (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
366 goto abort; 366 goto abort;
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index a5995bb2eaca..95df7f7f6328 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -6,7 +6,7 @@
6 * 6 *
7 */ 7 */
8 8
9#include<linux/module.h> 9#include <linux/module.h>
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/types.h> 11#include <linux/types.h>
12#include <linux/list.h> 12#include <linux/list.h>
@@ -29,18 +29,6 @@ static struct tcp_ulp_ops *tcp_ulp_find(const char *name)
29 return NULL; 29 return NULL;
30} 30}
31 31
32static struct tcp_ulp_ops *tcp_ulp_find_id(const int ulp)
33{
34 struct tcp_ulp_ops *e;
35
36 list_for_each_entry_rcu(e, &tcp_ulp_list, list) {
37 if (e->uid == ulp)
38 return e;
39 }
40
41 return NULL;
42}
43
44static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) 32static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
45{ 33{
46 const struct tcp_ulp_ops *ulp = NULL; 34 const struct tcp_ulp_ops *ulp = NULL;
@@ -63,18 +51,6 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
63 return ulp; 51 return ulp;
64} 52}
65 53
66static const struct tcp_ulp_ops *__tcp_ulp_lookup(const int uid)
67{
68 const struct tcp_ulp_ops *ulp;
69
70 rcu_read_lock();
71 ulp = tcp_ulp_find_id(uid);
72 if (!ulp || !try_module_get(ulp->owner))
73 ulp = NULL;
74 rcu_read_unlock();
75 return ulp;
76}
77
78/* Attach new upper layer protocol to the list 54/* Attach new upper layer protocol to the list
79 * of available protocols. 55 * of available protocols.
80 */ 56 */
@@ -123,6 +99,10 @@ void tcp_cleanup_ulp(struct sock *sk)
123{ 99{
124 struct inet_connection_sock *icsk = inet_csk(sk); 100 struct inet_connection_sock *icsk = inet_csk(sk);
125 101
102 /* No sock_owned_by_me() check here as at the time the
103 * stack calls this function, the socket is dead and
104 * about to be destroyed.
105 */
126 if (!icsk->icsk_ulp_ops) 106 if (!icsk->icsk_ulp_ops)
127 return; 107 return;
128 108
@@ -133,54 +113,35 @@ void tcp_cleanup_ulp(struct sock *sk)
133 icsk->icsk_ulp_ops = NULL; 113 icsk->icsk_ulp_ops = NULL;
134} 114}
135 115
136/* Change upper layer protocol for socket */ 116static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops)
137int tcp_set_ulp(struct sock *sk, const char *name)
138{ 117{
139 struct inet_connection_sock *icsk = inet_csk(sk); 118 struct inet_connection_sock *icsk = inet_csk(sk);
140 const struct tcp_ulp_ops *ulp_ops; 119 int err;
141 int err = 0;
142 120
121 err = -EEXIST;
143 if (icsk->icsk_ulp_ops) 122 if (icsk->icsk_ulp_ops)
144 return -EEXIST; 123 goto out_err;
145
146 ulp_ops = __tcp_ulp_find_autoload(name);
147 if (!ulp_ops)
148 return -ENOENT;
149
150 if (!ulp_ops->user_visible) {
151 module_put(ulp_ops->owner);
152 return -ENOENT;
153 }
154 124
155 err = ulp_ops->init(sk); 125 err = ulp_ops->init(sk);
156 if (err) { 126 if (err)
157 module_put(ulp_ops->owner); 127 goto out_err;
158 return err;
159 }
160 128
161 icsk->icsk_ulp_ops = ulp_ops; 129 icsk->icsk_ulp_ops = ulp_ops;
162 return 0; 130 return 0;
131out_err:
132 module_put(ulp_ops->owner);
133 return err;
163} 134}
164 135
165int tcp_set_ulp_id(struct sock *sk, int ulp) 136int tcp_set_ulp(struct sock *sk, const char *name)
166{ 137{
167 struct inet_connection_sock *icsk = inet_csk(sk);
168 const struct tcp_ulp_ops *ulp_ops; 138 const struct tcp_ulp_ops *ulp_ops;
169 int err;
170 139
171 if (icsk->icsk_ulp_ops) 140 sock_owned_by_me(sk);
172 return -EEXIST;
173 141
174 ulp_ops = __tcp_ulp_lookup(ulp); 142 ulp_ops = __tcp_ulp_find_autoload(name);
175 if (!ulp_ops) 143 if (!ulp_ops)
176 return -ENOENT; 144 return -ENOENT;
177 145
178 err = ulp_ops->init(sk); 146 return __tcp_set_ulp(sk, ulp_ops);
179 if (err) {
180 module_put(ulp_ops->owner);
181 return err;
182 }
183
184 icsk->icsk_ulp_ops = ulp_ops;
185 return 0;
186} 147}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c32a4c16b7ff..ca3ed931f2a9 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -609,8 +609,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
609 struct net *net = dev_net(skb->dev); 609 struct net *net = dev_net(skb->dev);
610 610
611 sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, 611 sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
612 iph->saddr, uh->source, skb->dev->ifindex, 0, 612 iph->saddr, uh->source, skb->dev->ifindex,
613 udptable, NULL); 613 inet_sdif(skb), udptable, NULL);
614 if (!sk) { 614 if (!sk) {
615 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 615 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
616 return; /* No socket for error */ 616 return; /* No socket for error */
@@ -1042,7 +1042,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
1042 } 1042 }
1043 1043
1044 if (ipv4_is_multicast(daddr)) { 1044 if (ipv4_is_multicast(daddr)) {
1045 if (!ipc.oif) 1045 if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
1046 ipc.oif = inet->mc_index; 1046 ipc.oif = inet->mc_index;
1047 if (!saddr) 1047 if (!saddr)
1048 saddr = inet->mc_addr; 1048 saddr = inet->mc_addr;
@@ -1889,7 +1889,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1889 return 0; 1889 return 0;
1890} 1890}
1891 1891
1892static DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); 1892DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
1893void udp_encap_enable(void) 1893void udp_encap_enable(void)
1894{ 1894{
1895 static_branch_enable(&udp_encap_needed_key); 1895 static_branch_enable(&udp_encap_needed_key);
@@ -2120,8 +2120,24 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
2120 /* Note, we are only interested in != 0 or == 0, thus the 2120 /* Note, we are only interested in != 0 or == 0, thus the
2121 * force to int. 2121 * force to int.
2122 */ 2122 */
2123 return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, 2123 err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
2124 inet_compute_pseudo); 2124 inet_compute_pseudo);
2125 if (err)
2126 return err;
2127
2128 if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) {
2129 /* If SW calculated the value, we know it's bad */
2130 if (skb->csum_complete_sw)
2131 return 1;
2132
2133 /* HW says the value is bad. Let's validate that.
2134 * skb->csum is no longer the full packet checksum,
2135 * so don't treat it as such.
2136 */
2137 skb_checksum_complete_unset(skb);
2138 }
2139
2140 return 0;
2125} 2141}
2126 2142
2127/* wrapper for udp_queue_rcv_skb taking care of csum conversion and 2143
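The udp4_csum_init() change above handles the case where the device reported CHECKSUM_COMPLETE but the pseudo-header check did not validate it: a software-computed sum is known bad, while a hardware-provided one is discarded so a later full checksum_complete() re-derives it instead of trusting the stale skb->csum. A hedged sketch of just that decision (the function name and return values are illustrative):

#include <stdbool.h>
#include <stdio.h>

static int csum_complete_followup(bool csum_valid, bool csum_complete_sw)
{
	if (csum_valid)
		return 0;            /* pseudo-header check passed, nothing to do   */
	if (csum_complete_sw)
		return 1;            /* software computed the sum: it is really bad */
	/* hardware-provided sum disagreed: forget it and let a later full
	 * checksum pass recompute instead of reusing skb->csum */
	return 0;
}

int main(void)
{
	printf("%d %d\n", csum_complete_followup(false, true),
			  csum_complete_followup(false, false));
	return 0;
}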
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 0c0522b79b43..802f2bc00d69 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -405,7 +405,7 @@ static struct sk_buff *udp4_gro_receive(struct list_head *head,
405{ 405{
406 struct udphdr *uh = udp_gro_udphdr(skb); 406 struct udphdr *uh = udp_gro_udphdr(skb);
407 407
408 if (unlikely(!uh)) 408 if (unlikely(!uh) || !static_branch_unlikely(&udp_encap_needed_key))
409 goto flush; 409 goto flush;
410 410
411 /* Don't bother verifying checksum if we're going to flush anyway. */ 411 /* Don't bother verifying checksum if we're going to flush anyway. */
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4e81ff2f4588..63a808d5af15 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -666,6 +666,7 @@ errout:
666static int inet6_netconf_dump_devconf(struct sk_buff *skb, 666static int inet6_netconf_dump_devconf(struct sk_buff *skb,
667 struct netlink_callback *cb) 667 struct netlink_callback *cb)
668{ 668{
669 const struct nlmsghdr *nlh = cb->nlh;
669 struct net *net = sock_net(skb->sk); 670 struct net *net = sock_net(skb->sk);
670 int h, s_h; 671 int h, s_h;
671 int idx, s_idx; 672 int idx, s_idx;
@@ -673,6 +674,21 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb,
673 struct inet6_dev *idev; 674 struct inet6_dev *idev;
674 struct hlist_head *head; 675 struct hlist_head *head;
675 676
677 if (cb->strict_check) {
678 struct netlink_ext_ack *extack = cb->extack;
679 struct netconfmsg *ncm;
680
681 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) {
682 NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf dump request");
683 return -EINVAL;
684 }
685
686 if (nlmsg_attrlen(nlh, sizeof(*ncm))) {
687 NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in netconf dump request");
688 return -EINVAL;
689 }
690 }
691
676 s_h = cb->args[0]; 692 s_h = cb->args[0];
677 s_idx = idx = cb->args[1]; 693 s_idx = idx = cb->args[1];
678 694
@@ -692,7 +708,7 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb,
692 if (inet6_netconf_fill_devconf(skb, dev->ifindex, 708 if (inet6_netconf_fill_devconf(skb, dev->ifindex,
693 &idev->cnf, 709 &idev->cnf,
694 NETLINK_CB(cb->skb).portid, 710 NETLINK_CB(cb->skb).portid,
695 cb->nlh->nlmsg_seq, 711 nlh->nlmsg_seq,
696 RTM_NEWNETCONF, 712 RTM_NEWNETCONF,
697 NLM_F_MULTI, 713 NLM_F_MULTI,
698 NETCONFA_ALL) < 0) { 714 NETCONFA_ALL) < 0) {
@@ -709,7 +725,7 @@ cont:
709 if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, 725 if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
710 net->ipv6.devconf_all, 726 net->ipv6.devconf_all,
711 NETLINK_CB(cb->skb).portid, 727 NETLINK_CB(cb->skb).portid,
712 cb->nlh->nlmsg_seq, 728 nlh->nlmsg_seq,
713 RTM_NEWNETCONF, NLM_F_MULTI, 729 RTM_NEWNETCONF, NLM_F_MULTI,
714 NETCONFA_ALL) < 0) 730 NETCONFA_ALL) < 0)
715 goto done; 731 goto done;
@@ -720,7 +736,7 @@ cont:
720 if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, 736 if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
721 net->ipv6.devconf_dflt, 737 net->ipv6.devconf_dflt,
722 NETLINK_CB(cb->skb).portid, 738 NETLINK_CB(cb->skb).portid,
723 cb->nlh->nlmsg_seq, 739 nlh->nlmsg_seq,
724 RTM_NEWNETCONF, NLM_F_MULTI, 740 RTM_NEWNETCONF, NLM_F_MULTI,
725 NETCONFA_ALL) < 0) 741 NETCONFA_ALL) < 0)
726 goto done; 742 goto done;
@@ -997,6 +1013,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
997 if (addr_type == IPV6_ADDR_ANY || 1013 if (addr_type == IPV6_ADDR_ANY ||
998 addr_type & IPV6_ADDR_MULTICAST || 1014 addr_type & IPV6_ADDR_MULTICAST ||
999 (!(idev->dev->flags & IFF_LOOPBACK) && 1015 (!(idev->dev->flags & IFF_LOOPBACK) &&
1016 !netif_is_l3_master(idev->dev) &&
1000 addr_type & IPV6_ADDR_LOOPBACK)) 1017 addr_type & IPV6_ADDR_LOOPBACK))
1001 return ERR_PTR(-EADDRNOTAVAIL); 1018 return ERR_PTR(-EADDRNOTAVAIL);
1002 1019
@@ -4489,6 +4506,7 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = {
4489 [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, 4506 [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
4490 [IFA_FLAGS] = { .len = sizeof(u32) }, 4507 [IFA_FLAGS] = { .len = sizeof(u32) },
4491 [IFA_RT_PRIORITY] = { .len = sizeof(u32) }, 4508 [IFA_RT_PRIORITY] = { .len = sizeof(u32) },
4509 [IFA_TARGET_NETNSID] = { .type = NLA_S32 },
4492}; 4510};
4493 4511
4494static int 4512static int
@@ -4791,19 +4809,40 @@ static inline int inet6_ifaddr_msgsize(void)
4791 + nla_total_size(4) /* IFA_RT_PRIORITY */; 4809 + nla_total_size(4) /* IFA_RT_PRIORITY */;
4792} 4810}
4793 4811
4812enum addr_type_t {
4813 UNICAST_ADDR,
4814 MULTICAST_ADDR,
4815 ANYCAST_ADDR,
4816};
4817
4818struct inet6_fill_args {
4819 u32 portid;
4820 u32 seq;
4821 int event;
4822 unsigned int flags;
4823 int netnsid;
4824 int ifindex;
4825 enum addr_type_t type;
4826};
4827
4794static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, 4828static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
4795 u32 portid, u32 seq, int event, unsigned int flags) 4829 struct inet6_fill_args *args)
4796{ 4830{
4797 struct nlmsghdr *nlh; 4831 struct nlmsghdr *nlh;
4798 u32 preferred, valid; 4832 u32 preferred, valid;
4799 4833
4800 nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags); 4834 nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
4835 sizeof(struct ifaddrmsg), args->flags);
4801 if (!nlh) 4836 if (!nlh)
4802 return -EMSGSIZE; 4837 return -EMSGSIZE;
4803 4838
4804 put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope), 4839 put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope),
4805 ifa->idev->dev->ifindex); 4840 ifa->idev->dev->ifindex);
4806 4841
4842 if (args->netnsid >= 0 &&
4843 nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
4844 goto error;
4845
4807 if (!((ifa->flags&IFA_F_PERMANENT) && 4846 if (!((ifa->flags&IFA_F_PERMANENT) &&
4808 (ifa->prefered_lft == INFINITY_LIFE_TIME))) { 4847 (ifa->prefered_lft == INFINITY_LIFE_TIME))) {
4809 preferred = ifa->prefered_lft; 4848 preferred = ifa->prefered_lft;
@@ -4853,7 +4892,7 @@ error:
4853} 4892}
4854 4893
4855static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, 4894static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
4856 u32 portid, u32 seq, int event, u16 flags) 4895 struct inet6_fill_args *args)
4857{ 4896{
4858 struct nlmsghdr *nlh; 4897 struct nlmsghdr *nlh;
4859 u8 scope = RT_SCOPE_UNIVERSE; 4898 u8 scope = RT_SCOPE_UNIVERSE;
@@ -4862,10 +4901,15 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
4862 if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE) 4901 if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE)
4863 scope = RT_SCOPE_SITE; 4902 scope = RT_SCOPE_SITE;
4864 4903
4865 nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags); 4904 nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
4905 sizeof(struct ifaddrmsg), args->flags);
4866 if (!nlh) 4906 if (!nlh)
4867 return -EMSGSIZE; 4907 return -EMSGSIZE;
4868 4908
4909 if (args->netnsid >= 0 &&
4910 nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
4911 return -EMSGSIZE;
4912
4869 put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); 4913 put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
4870 if (nla_put_in6_addr(skb, IFA_MULTICAST, &ifmca->mca_addr) < 0 || 4914 if (nla_put_in6_addr(skb, IFA_MULTICAST, &ifmca->mca_addr) < 0 ||
4871 put_cacheinfo(skb, ifmca->mca_cstamp, ifmca->mca_tstamp, 4915 put_cacheinfo(skb, ifmca->mca_cstamp, ifmca->mca_tstamp,
@@ -4879,7 +4923,7 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
4879} 4923}
4880 4924
4881static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, 4925static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
4882 u32 portid, u32 seq, int event, unsigned int flags) 4926 struct inet6_fill_args *args)
4883{ 4927{
4884 struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt); 4928 struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt);
4885 int ifindex = dev ? dev->ifindex : 1; 4929 int ifindex = dev ? dev->ifindex : 1;
@@ -4889,10 +4933,15 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
4889 if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE) 4933 if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE)
4890 scope = RT_SCOPE_SITE; 4934 scope = RT_SCOPE_SITE;
4891 4935
4892 nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags); 4936 nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
4937 sizeof(struct ifaddrmsg), args->flags);
4893 if (!nlh) 4938 if (!nlh)
4894 return -EMSGSIZE; 4939 return -EMSGSIZE;
4895 4940
4941 if (args->netnsid >= 0 &&
4942 nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
4943 return -EMSGSIZE;
4944
4896 put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); 4945 put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
4897 if (nla_put_in6_addr(skb, IFA_ANYCAST, &ifaca->aca_addr) < 0 || 4946 if (nla_put_in6_addr(skb, IFA_ANYCAST, &ifaca->aca_addr) < 0 ||
4898 put_cacheinfo(skb, ifaca->aca_cstamp, ifaca->aca_tstamp, 4947 put_cacheinfo(skb, ifaca->aca_cstamp, ifaca->aca_tstamp,
@@ -4905,36 +4954,27 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
4905 return 0; 4954 return 0;
4906} 4955}
4907 4956
4908enum addr_type_t {
4909 UNICAST_ADDR,
4910 MULTICAST_ADDR,
4911 ANYCAST_ADDR,
4912};
4913
4914/* called with rcu_read_lock() */ 4957/* called with rcu_read_lock() */
4915static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, 4958static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
4916 struct netlink_callback *cb, enum addr_type_t type, 4959 struct netlink_callback *cb, int s_ip_idx,
4917 int s_ip_idx, int *p_ip_idx) 4960 struct inet6_fill_args *fillargs)
4918{ 4961{
4919 struct ifmcaddr6 *ifmca; 4962 struct ifmcaddr6 *ifmca;
4920 struct ifacaddr6 *ifaca; 4963 struct ifacaddr6 *ifaca;
4964 int ip_idx = 0;
4921 int err = 1; 4965 int err = 1;
4922 int ip_idx = *p_ip_idx;
4923 4966
4924 read_lock_bh(&idev->lock); 4967 read_lock_bh(&idev->lock);
4925 switch (type) { 4968 switch (fillargs->type) {
4926 case UNICAST_ADDR: { 4969 case UNICAST_ADDR: {
4927 struct inet6_ifaddr *ifa; 4970 struct inet6_ifaddr *ifa;
4971 fillargs->event = RTM_NEWADDR;
4928 4972
4929 /* unicast address incl. temp addr */ 4973 /* unicast address incl. temp addr */
4930 list_for_each_entry(ifa, &idev->addr_list, if_list) { 4974 list_for_each_entry(ifa, &idev->addr_list, if_list) {
4931 if (ip_idx < s_ip_idx) 4975 if (ip_idx < s_ip_idx)
4932 goto next; 4976 goto next;
4933 err = inet6_fill_ifaddr(skb, ifa, 4977 err = inet6_fill_ifaddr(skb, ifa, fillargs);
4934 NETLINK_CB(cb->skb).portid,
4935 cb->nlh->nlmsg_seq,
4936 RTM_NEWADDR,
4937 NLM_F_MULTI);
4938 if (err < 0) 4978 if (err < 0)
4939 break; 4979 break;
4940 nl_dump_check_consistent(cb, nlmsg_hdr(skb)); 4980 nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -4944,31 +4984,26 @@ next:
4944 break; 4984 break;
4945 } 4985 }
4946 case MULTICAST_ADDR: 4986 case MULTICAST_ADDR:
4987 fillargs->event = RTM_GETMULTICAST;
4988
4947 /* multicast address */ 4989 /* multicast address */
4948 for (ifmca = idev->mc_list; ifmca; 4990 for (ifmca = idev->mc_list; ifmca;
4949 ifmca = ifmca->next, ip_idx++) { 4991 ifmca = ifmca->next, ip_idx++) {
4950 if (ip_idx < s_ip_idx) 4992 if (ip_idx < s_ip_idx)
4951 continue; 4993 continue;
4952 err = inet6_fill_ifmcaddr(skb, ifmca, 4994 err = inet6_fill_ifmcaddr(skb, ifmca, fillargs);
4953 NETLINK_CB(cb->skb).portid,
4954 cb->nlh->nlmsg_seq,
4955 RTM_GETMULTICAST,
4956 NLM_F_MULTI);
4957 if (err < 0) 4995 if (err < 0)
4958 break; 4996 break;
4959 } 4997 }
4960 break; 4998 break;
4961 case ANYCAST_ADDR: 4999 case ANYCAST_ADDR:
5000 fillargs->event = RTM_GETANYCAST;
4962 /* anycast address */ 5001 /* anycast address */
4963 for (ifaca = idev->ac_list; ifaca; 5002 for (ifaca = idev->ac_list; ifaca;
4964 ifaca = ifaca->aca_next, ip_idx++) { 5003 ifaca = ifaca->aca_next, ip_idx++) {
4965 if (ip_idx < s_ip_idx) 5004 if (ip_idx < s_ip_idx)
4966 continue; 5005 continue;
4967 err = inet6_fill_ifacaddr(skb, ifaca, 5006 err = inet6_fill_ifacaddr(skb, ifaca, fillargs);
4968 NETLINK_CB(cb->skb).portid,
4969 cb->nlh->nlmsg_seq,
4970 RTM_GETANYCAST,
4971 NLM_F_MULTI);
4972 if (err < 0) 5007 if (err < 0)
4973 break; 5008 break;
4974 } 5009 }
@@ -4977,42 +5012,128 @@ next:
4977 break; 5012 break;
4978 } 5013 }
4979 read_unlock_bh(&idev->lock); 5014 read_unlock_bh(&idev->lock);
4980 *p_ip_idx = ip_idx; 5015 cb->args[2] = ip_idx;
4981 return err; 5016 return err;
4982} 5017}
4983 5018
5019static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh,
5020 struct inet6_fill_args *fillargs,
5021 struct net **tgt_net, struct sock *sk,
5022 struct netlink_callback *cb)
5023{
5024 struct netlink_ext_ack *extack = cb->extack;
5025 struct nlattr *tb[IFA_MAX+1];
5026 struct ifaddrmsg *ifm;
5027 int err, i;
5028
5029 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
5030 NL_SET_ERR_MSG_MOD(extack, "Invalid header for address dump request");
5031 return -EINVAL;
5032 }
5033
5034 ifm = nlmsg_data(nlh);
5035 if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
5036 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address dump request");
5037 return -EINVAL;
5038 }
5039
5040 fillargs->ifindex = ifm->ifa_index;
5041 if (fillargs->ifindex) {
5042 cb->answer_flags |= NLM_F_DUMP_FILTERED;
5043 fillargs->flags |= NLM_F_DUMP_FILTERED;
5044 }
5045
5046 err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
5047 ifa_ipv6_policy, extack);
5048 if (err < 0)
5049 return err;
5050
5051 for (i = 0; i <= IFA_MAX; ++i) {
5052 if (!tb[i])
5053 continue;
5054
5055 if (i == IFA_TARGET_NETNSID) {
5056 struct net *net;
5057
5058 fillargs->netnsid = nla_get_s32(tb[i]);
5059 net = rtnl_get_net_ns_capable(sk, fillargs->netnsid);
5060 if (IS_ERR(net)) {
5061 fillargs->netnsid = -1;
5062 NL_SET_ERR_MSG_MOD(extack, "Invalid target network namespace id");
5063 return PTR_ERR(net);
5064 }
5065 *tgt_net = net;
5066 } else {
5067 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in dump request");
5068 return -EINVAL;
5069 }
5070 }
5071
5072 return 0;
5073}
5074
4984static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, 5075static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
4985 enum addr_type_t type) 5076 enum addr_type_t type)
4986{ 5077{
5078 const struct nlmsghdr *nlh = cb->nlh;
5079 struct inet6_fill_args fillargs = {
5080 .portid = NETLINK_CB(cb->skb).portid,
5081 .seq = cb->nlh->nlmsg_seq,
5082 .flags = NLM_F_MULTI,
5083 .netnsid = -1,
5084 .type = type,
5085 };
4987 struct net *net = sock_net(skb->sk); 5086 struct net *net = sock_net(skb->sk);
5087 struct net *tgt_net = net;
5088 int idx, s_idx, s_ip_idx;
4988 int h, s_h; 5089 int h, s_h;
4989 int idx, ip_idx;
4990 int s_idx, s_ip_idx;
4991 struct net_device *dev; 5090 struct net_device *dev;
4992 struct inet6_dev *idev; 5091 struct inet6_dev *idev;
4993 struct hlist_head *head; 5092 struct hlist_head *head;
5093 int err = 0;
4994 5094
4995 s_h = cb->args[0]; 5095 s_h = cb->args[0];
4996 s_idx = idx = cb->args[1]; 5096 s_idx = idx = cb->args[1];
4997 s_ip_idx = ip_idx = cb->args[2]; 5097 s_ip_idx = cb->args[2];
5098
5099 if (cb->strict_check) {
5100 err = inet6_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net,
5101 skb->sk, cb);
5102 if (err < 0)
5103 goto put_tgt_net;
5104
5105 err = 0;
5106 if (fillargs.ifindex) {
5107 dev = __dev_get_by_index(tgt_net, fillargs.ifindex);
5108 if (!dev) {
5109 err = -ENODEV;
5110 goto put_tgt_net;
5111 }
5112 idev = __in6_dev_get(dev);
5113 if (idev) {
5114 err = in6_dump_addrs(idev, skb, cb, s_ip_idx,
5115 &fillargs);
5116 }
5117 goto put_tgt_net;
5118 }
5119 }
4998 5120
4999 rcu_read_lock(); 5121 rcu_read_lock();
5000 cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^ net->dev_base_seq; 5122 cb->seq = atomic_read(&tgt_net->ipv6.dev_addr_genid) ^ tgt_net->dev_base_seq;
5001 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { 5123 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
5002 idx = 0; 5124 idx = 0;
5003 head = &net->dev_index_head[h]; 5125 head = &tgt_net->dev_index_head[h];
5004 hlist_for_each_entry_rcu(dev, head, index_hlist) { 5126 hlist_for_each_entry_rcu(dev, head, index_hlist) {
5005 if (idx < s_idx) 5127 if (idx < s_idx)
5006 goto cont; 5128 goto cont;
5007 if (h > s_h || idx > s_idx) 5129 if (h > s_h || idx > s_idx)
5008 s_ip_idx = 0; 5130 s_ip_idx = 0;
5009 ip_idx = 0;
5010 idev = __in6_dev_get(dev); 5131 idev = __in6_dev_get(dev);
5011 if (!idev) 5132 if (!idev)
5012 goto cont; 5133 goto cont;
5013 5134
5014 if (in6_dump_addrs(idev, skb, cb, type, 5135 if (in6_dump_addrs(idev, skb, cb, s_ip_idx,
5015 s_ip_idx, &ip_idx) < 0) 5136 &fillargs) < 0)
5016 goto done; 5137 goto done;
5017cont: 5138cont:
5018 idx++; 5139 idx++;
@@ -5022,9 +5143,11 @@ done:
5022 rcu_read_unlock(); 5143 rcu_read_unlock();
5023 cb->args[0] = h; 5144 cb->args[0] = h;
5024 cb->args[1] = idx; 5145 cb->args[1] = idx;
5025 cb->args[2] = ip_idx; 5146put_tgt_net:
5147 if (fillargs.netnsid >= 0)
5148 put_net(tgt_net);
5026 5149
5027 return skb->len; 5150 return err < 0 ? err : skb->len;
5028} 5151}
5029 5152
5030static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) 5153static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
@@ -5053,6 +5176,14 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
5053 struct netlink_ext_ack *extack) 5176 struct netlink_ext_ack *extack)
5054{ 5177{
5055 struct net *net = sock_net(in_skb->sk); 5178 struct net *net = sock_net(in_skb->sk);
5179 struct inet6_fill_args fillargs = {
5180 .portid = NETLINK_CB(in_skb).portid,
5181 .seq = nlh->nlmsg_seq,
5182 .event = RTM_NEWADDR,
5183 .flags = 0,
5184 .netnsid = -1,
5185 };
5186 struct net *tgt_net = net;
5056 struct ifaddrmsg *ifm; 5187 struct ifaddrmsg *ifm;
5057 struct nlattr *tb[IFA_MAX+1]; 5188 struct nlattr *tb[IFA_MAX+1];
5058 struct in6_addr *addr = NULL, *peer; 5189 struct in6_addr *addr = NULL, *peer;
@@ -5066,15 +5197,24 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
5066 if (err < 0) 5197 if (err < 0)
5067 return err; 5198 return err;
5068 5199
5200 if (tb[IFA_TARGET_NETNSID]) {
5201 fillargs.netnsid = nla_get_s32(tb[IFA_TARGET_NETNSID]);
5202
5203 tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(in_skb).sk,
5204 fillargs.netnsid);
5205 if (IS_ERR(tgt_net))
5206 return PTR_ERR(tgt_net);
5207 }
5208
5069 addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer); 5209 addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer);
5070 if (!addr) 5210 if (!addr)
5071 return -EINVAL; 5211 return -EINVAL;
5072 5212
5073 ifm = nlmsg_data(nlh); 5213 ifm = nlmsg_data(nlh);
5074 if (ifm->ifa_index) 5214 if (ifm->ifa_index)
5075 dev = dev_get_by_index(net, ifm->ifa_index); 5215 dev = dev_get_by_index(tgt_net, ifm->ifa_index);
5076 5216
5077 ifa = ipv6_get_ifaddr(net, addr, dev, 1); 5217 ifa = ipv6_get_ifaddr(tgt_net, addr, dev, 1);
5078 if (!ifa) { 5218 if (!ifa) {
5079 err = -EADDRNOTAVAIL; 5219 err = -EADDRNOTAVAIL;
5080 goto errout; 5220 goto errout;
@@ -5086,20 +5226,22 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
5086 goto errout_ifa; 5226 goto errout_ifa;
5087 } 5227 }
5088 5228
5089 err = inet6_fill_ifaddr(skb, ifa, NETLINK_CB(in_skb).portid, 5229 err = inet6_fill_ifaddr(skb, ifa, &fillargs);
5090 nlh->nlmsg_seq, RTM_NEWADDR, 0);
5091 if (err < 0) { 5230 if (err < 0) {
5092 /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */ 5231 /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
5093 WARN_ON(err == -EMSGSIZE); 5232 WARN_ON(err == -EMSGSIZE);
5094 kfree_skb(skb); 5233 kfree_skb(skb);
5095 goto errout_ifa; 5234 goto errout_ifa;
5096 } 5235 }
5097 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 5236 err = rtnl_unicast(skb, tgt_net, NETLINK_CB(in_skb).portid);
5098errout_ifa: 5237errout_ifa:
5099 in6_ifa_put(ifa); 5238 in6_ifa_put(ifa);
5100errout: 5239errout:
5101 if (dev) 5240 if (dev)
5102 dev_put(dev); 5241 dev_put(dev);
5242 if (fillargs.netnsid >= 0)
5243 put_net(tgt_net);
5244
5103 return err; 5245 return err;
5104} 5246}
5105 5247
@@ -5107,13 +5249,20 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
5107{ 5249{
5108 struct sk_buff *skb; 5250 struct sk_buff *skb;
5109 struct net *net = dev_net(ifa->idev->dev); 5251 struct net *net = dev_net(ifa->idev->dev);
5252 struct inet6_fill_args fillargs = {
5253 .portid = 0,
5254 .seq = 0,
5255 .event = event,
5256 .flags = 0,
5257 .netnsid = -1,
5258 };
5110 int err = -ENOBUFS; 5259 int err = -ENOBUFS;
5111 5260
5112 skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); 5261 skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
5113 if (!skb) 5262 if (!skb)
5114 goto errout; 5263 goto errout;
5115 5264
5116 err = inet6_fill_ifaddr(skb, ifa, 0, 0, event, 0); 5265 err = inet6_fill_ifaddr(skb, ifa, &fillargs);
5117 if (err < 0) { 5266 if (err < 0) {
5118 /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */ 5267 /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
5119 WARN_ON(err == -EMSGSIZE); 5268 WARN_ON(err == -EMSGSIZE);
@@ -5529,6 +5678,31 @@ nla_put_failure:
5529 return -EMSGSIZE; 5678 return -EMSGSIZE;
5530} 5679}
5531 5680
5681static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh,
5682 struct netlink_ext_ack *extack)
5683{
5684 struct ifinfomsg *ifm;
5685
5686 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
5687 NL_SET_ERR_MSG_MOD(extack, "Invalid header for link dump request");
5688 return -EINVAL;
5689 }
5690
5691 if (nlmsg_attrlen(nlh, sizeof(*ifm))) {
5692 NL_SET_ERR_MSG_MOD(extack, "Invalid data after header");
5693 return -EINVAL;
5694 }
5695
5696 ifm = nlmsg_data(nlh);
5697 if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
5698 ifm->ifi_change || ifm->ifi_index) {
5699 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for dump request");
5700 return -EINVAL;
5701 }
5702
5703 return 0;
5704}
5705
5532static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) 5706static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
5533{ 5707{
5534 struct net *net = sock_net(skb->sk); 5708 struct net *net = sock_net(skb->sk);
@@ -5538,6 +5712,16 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
5538 struct inet6_dev *idev; 5712 struct inet6_dev *idev;
5539 struct hlist_head *head; 5713 struct hlist_head *head;
5540 5714
5715 /* only requests using strict checking can pass data to
5716 * influence the dump
5717 */
5718 if (cb->strict_check) {
5719 int err = inet6_valid_dump_ifinfo(cb->nlh, cb->extack);
5720
5721 if (err < 0)
5722 return err;
5723 }
5724
5541 s_h = cb->args[0]; 5725 s_h = cb->args[0];
5542 s_idx = cb->args[1]; 5726 s_idx = cb->args[1];
5543 5727
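The strict-check helpers added to addrconf.c above follow the same pattern: with cb->strict_check set, header fields that cannot act as dump filters must be zero, and only explicitly supported attributes (here IFA_TARGET_NETNSID) are accepted. A loose userspace sketch of that header validation; the struct only mirrors struct ifaddrmsg and the error codes are illustrative:

#include <stdint.h>
#include <stdio.h>

struct ifaddrmsg_sketch {
	uint8_t  ifa_family;
	uint8_t  ifa_prefixlen;
	uint8_t  ifa_flags;
	uint8_t  ifa_scope;
	uint32_t ifa_index;
};

static int valid_dump_req(const struct ifaddrmsg_sketch *ifm, size_t payload_len)
{
	if (payload_len < sizeof(*ifm))
		return -22;                       /* -EINVAL: truncated header        */
	if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope)
		return -22;                       /* fields that cannot filter a dump */
	return 0;                                 /* ifa_index may select one device  */
}

int main(void)
{
	struct ifaddrmsg_sketch ok  = { .ifa_family = 10 /* AF_INET6 */ };
	struct ifaddrmsg_sketch bad = { .ifa_family = 10, .ifa_scope = 1 };

	printf("%d %d\n", valid_dump_req(&ok, sizeof(ok)),
			  valid_dump_req(&bad, sizeof(bad)));
	return 0;
}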
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index 1d6ced37ad71..0d1ee82ee55b 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -458,20 +458,52 @@ static int ip6addrlbl_fill(struct sk_buff *skb,
458 return 0; 458 return 0;
459} 459}
460 460
461static int ip6addrlbl_valid_dump_req(const struct nlmsghdr *nlh,
462 struct netlink_ext_ack *extack)
463{
464 struct ifaddrlblmsg *ifal;
465
466 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) {
467 NL_SET_ERR_MSG_MOD(extack, "Invalid header for address label dump request");
468 return -EINVAL;
469 }
470
471 ifal = nlmsg_data(nlh);
472 if (ifal->__ifal_reserved || ifal->ifal_prefixlen ||
473 ifal->ifal_flags || ifal->ifal_index || ifal->ifal_seq) {
474 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address label dump request");
475 return -EINVAL;
476 }
477
478 if (nlmsg_attrlen(nlh, sizeof(*ifal))) {
479 NL_SET_ERR_MSG_MOD(extack, "Invalid data after header for address label dump requewst");
480 return -EINVAL;
481 }
482
483 return 0;
484}
485
461static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) 486static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb)
462{ 487{
488 const struct nlmsghdr *nlh = cb->nlh;
463 struct net *net = sock_net(skb->sk); 489 struct net *net = sock_net(skb->sk);
464 struct ip6addrlbl_entry *p; 490 struct ip6addrlbl_entry *p;
465 int idx = 0, s_idx = cb->args[0]; 491 int idx = 0, s_idx = cb->args[0];
466 int err; 492 int err;
467 493
494 if (cb->strict_check) {
495 err = ip6addrlbl_valid_dump_req(nlh, cb->extack);
496 if (err < 0)
497 return err;
498 }
499
468 rcu_read_lock(); 500 rcu_read_lock();
469 hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { 501 hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) {
470 if (idx >= s_idx) { 502 if (idx >= s_idx) {
471 err = ip6addrlbl_fill(skb, p, 503 err = ip6addrlbl_fill(skb, p,
472 net->ipv6.ip6addrlbl_table.seq, 504 net->ipv6.ip6addrlbl_table.seq,
473 NETLINK_CB(cb->skb).portid, 505 NETLINK_CB(cb->skb).portid,
474 cb->nlh->nlmsg_seq, 506 nlh->nlmsg_seq,
475 RTM_NEWADDRLABEL, 507 RTM_NEWADDRLABEL,
476 NLM_F_MULTI); 508 NLM_F_MULTI);
477 if (err < 0) 509 if (err < 0)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 9a4261e50272..3f4d61017a69 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -209,6 +209,7 @@ lookup_protocol:
209 np->hop_limit = -1; 209 np->hop_limit = -1;
210 np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; 210 np->mcast_hops = IPV6_DEFAULT_MCASTHOPS;
211 np->mc_loop = 1; 211 np->mc_loop = 1;
212 np->mc_all = 1;
212 np->pmtudisc = IPV6_PMTUDISC_WANT; 213 np->pmtudisc = IPV6_PMTUDISC_WANT;
213 np->repflow = net->ipv6.sysctl.flowlabel_reflect; 214 np->repflow = net->ipv6.sysctl.flowlabel_reflect;
214 sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; 215 sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
@@ -467,12 +468,10 @@ void inet6_destroy_sock(struct sock *sk)
467 /* Release rx options */ 468 /* Release rx options */
468 469
469 skb = xchg(&np->pktoptions, NULL); 470 skb = xchg(&np->pktoptions, NULL);
470 if (skb) 471 kfree_skb(skb);
471 kfree_skb(skb);
472 472
473 skb = xchg(&np->rxpmtu, NULL); 473 skb = xchg(&np->rxpmtu, NULL);
474 if (skb) 474 kfree_skb(skb);
475 kfree_skb(skb);
476 475
477 /* Free flowlabels */ 476 /* Free flowlabels */
478 fl6_free_socklist(sk); 477 fl6_free_socklist(sk);
@@ -902,6 +901,7 @@ static const struct ipv6_stub ipv6_stub_impl = {
902 901
903static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { 902static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = {
904 .inet6_bind = __inet6_bind, 903 .inet6_bind = __inet6_bind,
904 .udp6_lib_lookup = __udp6_lib_lookup,
905}; 905};
906 906
907static int __init inet6_init(void) 907static int __init inet6_init(void)
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 88a7579c23bd..63b2b66f9dfa 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -601,12 +601,11 @@ static void esp_input_done_esn(struct crypto_async_request *base, int err)
601 601
602static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) 602static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
603{ 603{
604 struct ip_esp_hdr *esph;
605 struct crypto_aead *aead = x->data; 604 struct crypto_aead *aead = x->data;
606 struct aead_request *req; 605 struct aead_request *req;
607 struct sk_buff *trailer; 606 struct sk_buff *trailer;
608 int ivlen = crypto_aead_ivsize(aead); 607 int ivlen = crypto_aead_ivsize(aead);
609 int elen = skb->len - sizeof(*esph) - ivlen; 608 int elen = skb->len - sizeof(struct ip_esp_hdr) - ivlen;
610 int nfrags; 609 int nfrags;
611 int assoclen; 610 int assoclen;
612 int seqhilen; 611 int seqhilen;
@@ -616,7 +615,7 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
616 u8 *iv; 615 u8 *iv;
617 struct scatterlist *sg; 616 struct scatterlist *sg;
618 617
619 if (!pskb_may_pull(skb, sizeof(*esph) + ivlen)) { 618 if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + ivlen)) {
620 ret = -EINVAL; 619 ret = -EINVAL;
621 goto out; 620 goto out;
622 } 621 }
@@ -626,7 +625,7 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
626 goto out; 625 goto out;
627 } 626 }
628 627
629 assoclen = sizeof(*esph); 628 assoclen = sizeof(struct ip_esp_hdr);
630 seqhilen = 0; 629 seqhilen = 0;
631 630
632 if (x->props.flags & XFRM_STATE_ESN) { 631 if (x->props.flags & XFRM_STATE_ESN) {
diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c
index 547515e8450a..377717045f8f 100644
--- a/net/ipv6/ip6_checksum.c
+++ b/net/ipv6/ip6_checksum.c
@@ -88,8 +88,24 @@ int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto)
88 * Note, we are only interested in != 0 or == 0, thus the 88 * Note, we are only interested in != 0 or == 0, thus the
89 * force to int. 89 * force to int.
90 */ 90 */
91 return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, 91 err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
92 ip6_compute_pseudo); 92 ip6_compute_pseudo);
93 if (err)
94 return err;
95
96 if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) {
97 /* If SW calculated the value, we know it's bad */
98 if (skb->csum_complete_sw)
99 return 1;
100
101 /* HW says the value is bad. Let's validate that.
102 * skb->csum is no longer the full packet checksum,
103	 * so don't treat it as such.
104 */
105 skb_checksum_complete_unset(skb);
106 }
107
108 return 0;
93} 109}
94EXPORT_SYMBOL(udp6_csum_init); 110EXPORT_SYMBOL(udp6_csum_init);
95 111
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index cbe46175bb59..1b8bc008b53b 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -29,6 +29,7 @@
29#include <linux/list.h> 29#include <linux/list.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31 31
32#include <net/ip.h>
32#include <net/ipv6.h> 33#include <net/ipv6.h>
33#include <net/ndisc.h> 34#include <net/ndisc.h>
34#include <net/addrconf.h> 35#include <net/addrconf.h>
@@ -46,6 +47,7 @@ struct fib6_cleaner {
46 int (*func)(struct fib6_info *, void *arg); 47 int (*func)(struct fib6_info *, void *arg);
47 int sernum; 48 int sernum;
48 void *arg; 49 void *arg;
50 bool skip_notify;
49}; 51};
50 52
51#ifdef CONFIG_IPV6_SUBTREES 53#ifdef CONFIG_IPV6_SUBTREES
@@ -160,8 +162,6 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
160 } 162 }
161 163
162 INIT_LIST_HEAD(&f6i->fib6_siblings); 164 INIT_LIST_HEAD(&f6i->fib6_siblings);
163 f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
164
165 atomic_inc(&f6i->fib6_ref); 165 atomic_inc(&f6i->fib6_ref);
166 166
167 return f6i; 167 return f6i;
@@ -171,7 +171,6 @@ void fib6_info_destroy_rcu(struct rcu_head *head)
171{ 171{
172 struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); 172 struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
173 struct rt6_exception_bucket *bucket; 173 struct rt6_exception_bucket *bucket;
174 struct dst_metrics *m;
175 174
176 WARN_ON(f6i->fib6_node); 175 WARN_ON(f6i->fib6_node);
177 176
@@ -205,9 +204,7 @@ void fib6_info_destroy_rcu(struct rcu_head *head)
205 if (f6i->fib6_nh.nh_dev) 204 if (f6i->fib6_nh.nh_dev)
206 dev_put(f6i->fib6_nh.nh_dev); 205 dev_put(f6i->fib6_nh.nh_dev);
207 206
208 m = f6i->fib6_metrics; 207 ip_fib_metrics_put(f6i->fib6_metrics);
209 if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
210 kfree(m);
211 208
212 kfree(f6i); 209 kfree(f6i);
213} 210}
@@ -570,17 +567,31 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
570 567
571static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) 568static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
572{ 569{
570 const struct nlmsghdr *nlh = cb->nlh;
573 struct net *net = sock_net(skb->sk); 571 struct net *net = sock_net(skb->sk);
572 struct rt6_rtnl_dump_arg arg = {};
574 unsigned int h, s_h; 573 unsigned int h, s_h;
575 unsigned int e = 0, s_e; 574 unsigned int e = 0, s_e;
576 struct rt6_rtnl_dump_arg arg;
577 struct fib6_walker *w; 575 struct fib6_walker *w;
578 struct fib6_table *tb; 576 struct fib6_table *tb;
579 struct hlist_head *head; 577 struct hlist_head *head;
580 int res = 0; 578 int res = 0;
581 579
582 s_h = cb->args[0]; 580 if (cb->strict_check) {
583 s_e = cb->args[1]; 581 int err;
582
583 err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb);
584 if (err < 0)
585 return err;
586 } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
587 struct rtmsg *rtm = nlmsg_data(nlh);
588
589 arg.filter.flags = rtm->rtm_flags & (RTM_F_PREFIX|RTM_F_CLONED);
590 }
591
592 /* fib entries are never clones */
593 if (arg.filter.flags & RTM_F_CLONED)
594 return skb->len;
584 595
585 w = (void *)cb->args[2]; 596 w = (void *)cb->args[2];
586 if (!w) { 597 if (!w) {
@@ -606,6 +617,23 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
606 arg.net = net; 617 arg.net = net;
607 w->args = &arg; 618 w->args = &arg;
608 619
620 if (arg.filter.table_id) {
621 tb = fib6_get_table(net, arg.filter.table_id);
622 if (!tb) {
623 if (arg.filter.dump_all_families)
624 return skb->len;
625
626 NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
627 return -ENOENT;
628 }
629
630 res = fib6_dump_table(tb, skb, cb);
631 goto out;
632 }
633
634 s_h = cb->args[0];
635 s_e = cb->args[1];
636
609 rcu_read_lock(); 637 rcu_read_lock();
610 for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { 638 for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
611 e = 0; 639 e = 0;
@@ -615,16 +643,16 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
615 goto next; 643 goto next;
616 res = fib6_dump_table(tb, skb, cb); 644 res = fib6_dump_table(tb, skb, cb);
617 if (res != 0) 645 if (res != 0)
618 goto out; 646 goto out_unlock;
619next: 647next:
620 e++; 648 e++;
621 } 649 }
622 } 650 }
623out: 651out_unlock:
624 rcu_read_unlock(); 652 rcu_read_unlock();
625 cb->args[1] = e; 653 cb->args[1] = e;
626 cb->args[0] = h; 654 cb->args[0] = h;
627 655out:
628 res = res < 0 ? res : skb->len; 656 res = res < 0 ? res : skb->len;
629 if (res <= 0) 657 if (res <= 0)
630 fib6_dump_end(cb); 658 fib6_dump_end(cb);
@@ -1954,6 +1982,7 @@ static int fib6_clean_node(struct fib6_walker *w)
1954 struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); 1982 struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
1955 struct nl_info info = { 1983 struct nl_info info = {
1956 .nl_net = c->net, 1984 .nl_net = c->net,
1985 .skip_notify = c->skip_notify,
1957 }; 1986 };
1958 1987
1959 if (c->sernum != FIB6_NO_SERNUM_CHANGE && 1988 if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
@@ -2005,7 +2034,7 @@ static int fib6_clean_node(struct fib6_walker *w)
2005 2034
2006static void fib6_clean_tree(struct net *net, struct fib6_node *root, 2035static void fib6_clean_tree(struct net *net, struct fib6_node *root,
2007 int (*func)(struct fib6_info *, void *arg), 2036 int (*func)(struct fib6_info *, void *arg),
2008 int sernum, void *arg) 2037 int sernum, void *arg, bool skip_notify)
2009{ 2038{
2010 struct fib6_cleaner c; 2039 struct fib6_cleaner c;
2011 2040
@@ -2017,13 +2046,14 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
2017 c.sernum = sernum; 2046 c.sernum = sernum;
2018 c.arg = arg; 2047 c.arg = arg;
2019 c.net = net; 2048 c.net = net;
2049 c.skip_notify = skip_notify;
2020 2050
2021 fib6_walk(net, &c.w); 2051 fib6_walk(net, &c.w);
2022} 2052}
2023 2053
2024static void __fib6_clean_all(struct net *net, 2054static void __fib6_clean_all(struct net *net,
2025 int (*func)(struct fib6_info *, void *), 2055 int (*func)(struct fib6_info *, void *),
2026 int sernum, void *arg) 2056 int sernum, void *arg, bool skip_notify)
2027{ 2057{
2028 struct fib6_table *table; 2058 struct fib6_table *table;
2029 struct hlist_head *head; 2059 struct hlist_head *head;
@@ -2035,7 +2065,7 @@ static void __fib6_clean_all(struct net *net,
2035 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 2065 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2036 spin_lock_bh(&table->tb6_lock); 2066 spin_lock_bh(&table->tb6_lock);
2037 fib6_clean_tree(net, &table->tb6_root, 2067 fib6_clean_tree(net, &table->tb6_root,
2038 func, sernum, arg); 2068 func, sernum, arg, skip_notify);
2039 spin_unlock_bh(&table->tb6_lock); 2069 spin_unlock_bh(&table->tb6_lock);
2040 } 2070 }
2041 } 2071 }
@@ -2045,14 +2075,21 @@ static void __fib6_clean_all(struct net *net,
2045void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), 2075void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
2046 void *arg) 2076 void *arg)
2047{ 2077{
2048 __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg); 2078 __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false);
2079}
2080
2081void fib6_clean_all_skip_notify(struct net *net,
2082 int (*func)(struct fib6_info *, void *),
2083 void *arg)
2084{
2085 __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true);
2049} 2086}
2050 2087
2051static void fib6_flush_trees(struct net *net) 2088static void fib6_flush_trees(struct net *net)
2052{ 2089{
2053 int new_sernum = fib6_new_sernum(net); 2090 int new_sernum = fib6_new_sernum(net);
2054 2091
2055 __fib6_clean_all(net, NULL, new_sernum, NULL); 2092 __fib6_clean_all(net, NULL, new_sernum, NULL, false);
2056} 2093}
2057 2094
2058/* 2095/*
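A minimal sketch of the strict-check pattern inet6_dump_fib() adopts above (the handler name and the table walk are placeholders; ip_valid_fib_dump_req() and struct fib_dump_filter are used exactly as in the patch):

	static int example_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
	{
		struct fib_dump_filter filter = {};
		int err;

		if (cb->strict_check) {
			/* new-style dump requests get full header/attribute checks */
			err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh,
						    &filter, cb);
			if (err < 0)
				return err;
		}

		if (filter.table_id) {
			/* dump a single table, or fail with -ENOENT plus extack */
		}

		/* ... otherwise walk all tables, honouring filter.flags ... */
		return skb->len;
	}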
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index e493b041d4ac..515adbdba1d2 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -427,35 +427,17 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
427 u8 type, u8 code, int offset, __be32 info) 427 u8 type, u8 code, int offset, __be32 info)
428{ 428{
429 struct net *net = dev_net(skb->dev); 429 struct net *net = dev_net(skb->dev);
430 const struct gre_base_hdr *greh;
431 const struct ipv6hdr *ipv6h; 430 const struct ipv6hdr *ipv6h;
432 int grehlen = sizeof(*greh); 431 struct tnl_ptk_info tpi;
433 struct ip6_tnl *t; 432 struct ip6_tnl *t;
434 int key_off = 0;
435 __be16 flags;
436 __be32 key;
437 433
438 if (!pskb_may_pull(skb, offset + grehlen)) 434 if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IPV6),
439 return; 435 offset) < 0)
440 greh = (const struct gre_base_hdr *)(skb->data + offset);
441 flags = greh->flags;
442 if (flags & (GRE_VERSION | GRE_ROUTING))
443 return; 436 return;
444 if (flags & GRE_CSUM)
445 grehlen += 4;
446 if (flags & GRE_KEY) {
447 key_off = grehlen + offset;
448 grehlen += 4;
449 }
450 437
451 if (!pskb_may_pull(skb, offset + grehlen))
452 return;
453 ipv6h = (const struct ipv6hdr *)skb->data; 438 ipv6h = (const struct ipv6hdr *)skb->data;
454 greh = (const struct gre_base_hdr *)(skb->data + offset);
455 key = key_off ? *(__be32 *)(skb->data + key_off) : 0;
456
457 t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr, 439 t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr,
458 key, greh->protocol); 440 tpi.key, tpi.proto);
459 if (!t) 441 if (!t)
460 return; 442 return;
461 443
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 6242682be876..96577e742afd 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -178,7 +178,8 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
178 */ 178 */
179 if ((ipv6_addr_loopback(&hdr->saddr) || 179 if ((ipv6_addr_loopback(&hdr->saddr) ||
180 ipv6_addr_loopback(&hdr->daddr)) && 180 ipv6_addr_loopback(&hdr->daddr)) &&
181 !(dev->flags & IFF_LOOPBACK)) 181 !(dev->flags & IFF_LOOPBACK) &&
182 !netif_is_l3_master(dev))
182 goto err; 183 goto err;
183 184
184 /* RFC4291 Errata ID: 3480 185 /* RFC4291 Errata ID: 3480
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index f9f8f554d141..89e0d5118afe 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -725,7 +725,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
725 725
726 skb = frag; 726 skb = frag;
727 frag = skb->next; 727 frag = skb->next;
728 skb->next = NULL; 728 skb_mark_not_on_list(skb);
729 } 729 }
730 730
731 kfree(tmp_hdr); 731 kfree(tmp_hdr);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index d0b7e0249c13..e2ea691e42c6 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -85,7 +85,8 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id);
85static void ip6mr_free_table(struct mr_table *mrt); 85static void ip6mr_free_table(struct mr_table *mrt);
86 86
87static void ip6_mr_forward(struct net *net, struct mr_table *mrt, 87static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
88 struct sk_buff *skb, struct mfc6_cache *cache); 88 struct net_device *dev, struct sk_buff *skb,
89 struct mfc6_cache *cache);
89static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, 90static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
90 mifi_t mifi, int assert); 91 mifi_t mifi, int assert);
91static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, 92static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
@@ -138,6 +139,9 @@ static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
138 .flags = FIB_LOOKUP_NOREF, 139 .flags = FIB_LOOKUP_NOREF,
139 }; 140 };
140 141
142 /* update flow if oif or iif point to device enslaved to l3mdev */
143 l3mdev_update_flow(net, flowi6_to_flowi(flp6));
144
141 err = fib_rules_lookup(net->ipv6.mr6_rules_ops, 145 err = fib_rules_lookup(net->ipv6.mr6_rules_ops,
142 flowi6_to_flowi(flp6), 0, &arg); 146 flowi6_to_flowi(flp6), 0, &arg);
143 if (err < 0) 147 if (err < 0)
@@ -164,7 +168,9 @@ static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp,
164 return -EINVAL; 168 return -EINVAL;
165 } 169 }
166 170
167 mrt = ip6mr_get_table(rule->fr_net, rule->table); 171 arg->table = fib_rule_get_table(rule, arg);
172
173 mrt = ip6mr_get_table(rule->fr_net, arg->table);
168 if (!mrt) 174 if (!mrt)
169 return -EAGAIN; 175 return -EAGAIN;
170 res->mrt = mrt; 176 res->mrt = mrt;
@@ -1014,7 +1020,7 @@ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt,
1014 } 1020 }
1015 rtnl_unicast(skb, net, NETLINK_CB(skb).portid); 1021 rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
1016 } else 1022 } else
1017 ip6_mr_forward(net, mrt, skb, c); 1023 ip6_mr_forward(net, mrt, skb->dev, skb, c);
1018 } 1024 }
1019} 1025}
1020 1026
@@ -1120,7 +1126,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
1120 1126
1121/* Queue a packet for resolution. It gets locked cache entry! */ 1127/* Queue a packet for resolution. It gets locked cache entry! */
1122static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, 1128static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi,
1123 struct sk_buff *skb) 1129 struct sk_buff *skb, struct net_device *dev)
1124{ 1130{
1125 struct mfc6_cache *c; 1131 struct mfc6_cache *c;
1126 bool found = false; 1132 bool found = false;
@@ -1180,6 +1186,10 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi,
1180 kfree_skb(skb); 1186 kfree_skb(skb);
1181 err = -ENOBUFS; 1187 err = -ENOBUFS;
1182 } else { 1188 } else {
1189 if (dev) {
1190 skb->dev = dev;
1191 skb->skb_iif = dev->ifindex;
1192 }
1183 skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); 1193 skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
1184 err = 0; 1194 err = 0;
1185 } 1195 }
@@ -2043,11 +2053,12 @@ static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev)
2043} 2053}
2044 2054
2045static void ip6_mr_forward(struct net *net, struct mr_table *mrt, 2055static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
2046 struct sk_buff *skb, struct mfc6_cache *c) 2056 struct net_device *dev, struct sk_buff *skb,
2057 struct mfc6_cache *c)
2047{ 2058{
2048 int psend = -1; 2059 int psend = -1;
2049 int vif, ct; 2060 int vif, ct;
2050 int true_vifi = ip6mr_find_vif(mrt, skb->dev); 2061 int true_vifi = ip6mr_find_vif(mrt, dev);
2051 2062
2052 vif = c->_c.mfc_parent; 2063 vif = c->_c.mfc_parent;
2053 c->_c.mfc_un.res.pkt++; 2064 c->_c.mfc_un.res.pkt++;
@@ -2073,7 +2084,7 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
2073 /* 2084 /*
2074 * Wrong interface: drop packet and (maybe) send PIM assert. 2085 * Wrong interface: drop packet and (maybe) send PIM assert.
2075 */ 2086 */
2076 if (mrt->vif_table[vif].dev != skb->dev) { 2087 if (mrt->vif_table[vif].dev != dev) {
2077 c->_c.mfc_un.res.wrong_if++; 2088 c->_c.mfc_un.res.wrong_if++;
2078 2089
2079 if (true_vifi >= 0 && mrt->mroute_do_assert && 2090 if (true_vifi >= 0 && mrt->mroute_do_assert &&
@@ -2154,6 +2165,19 @@ int ip6_mr_input(struct sk_buff *skb)
2154 .flowi6_mark = skb->mark, 2165 .flowi6_mark = skb->mark,
2155 }; 2166 };
2156 int err; 2167 int err;
2168 struct net_device *dev;
2169
2170 /* skb->dev passed in is the master dev for vrfs.
2171 * Get the proper interface that does have a vif associated with it.
2172 */
2173 dev = skb->dev;
2174 if (netif_is_l3_master(skb->dev)) {
2175 dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
2176 if (!dev) {
2177 kfree_skb(skb);
2178 return -ENODEV;
2179 }
2180 }
2157 2181
2158 err = ip6mr_fib_lookup(net, &fl6, &mrt); 2182 err = ip6mr_fib_lookup(net, &fl6, &mrt);
2159 if (err < 0) { 2183 if (err < 0) {
@@ -2165,7 +2189,7 @@ int ip6_mr_input(struct sk_buff *skb)
2165 cache = ip6mr_cache_find(mrt, 2189 cache = ip6mr_cache_find(mrt,
2166 &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); 2190 &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr);
2167 if (!cache) { 2191 if (!cache) {
2168 int vif = ip6mr_find_vif(mrt, skb->dev); 2192 int vif = ip6mr_find_vif(mrt, dev);
2169 2193
2170 if (vif >= 0) 2194 if (vif >= 0)
2171 cache = ip6mr_cache_find_any(mrt, 2195 cache = ip6mr_cache_find_any(mrt,
@@ -2179,9 +2203,9 @@ int ip6_mr_input(struct sk_buff *skb)
2179 if (!cache) { 2203 if (!cache) {
2180 int vif; 2204 int vif;
2181 2205
2182 vif = ip6mr_find_vif(mrt, skb->dev); 2206 vif = ip6mr_find_vif(mrt, dev);
2183 if (vif >= 0) { 2207 if (vif >= 0) {
2184 int err = ip6mr_cache_unresolved(mrt, vif, skb); 2208 int err = ip6mr_cache_unresolved(mrt, vif, skb, dev);
2185 read_unlock(&mrt_lock); 2209 read_unlock(&mrt_lock);
2186 2210
2187 return err; 2211 return err;
@@ -2191,7 +2215,7 @@ int ip6_mr_input(struct sk_buff *skb)
2191 return -ENODEV; 2215 return -ENODEV;
2192 } 2216 }
2193 2217
2194 ip6_mr_forward(net, mrt, skb, cache); 2218 ip6_mr_forward(net, mrt, dev, skb, cache);
2195 2219
2196 read_unlock(&mrt_lock); 2220 read_unlock(&mrt_lock);
2197 2221
@@ -2257,7 +2281,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
2257 iph->saddr = rt->rt6i_src.addr; 2281 iph->saddr = rt->rt6i_src.addr;
2258 iph->daddr = rt->rt6i_dst.addr; 2282 iph->daddr = rt->rt6i_dst.addr;
2259 2283
2260 err = ip6mr_cache_unresolved(mrt, vif, skb2); 2284 err = ip6mr_cache_unresolved(mrt, vif, skb2, dev);
2261 read_unlock(&mrt_lock); 2285 read_unlock(&mrt_lock);
2262 2286
2263 return err; 2287 return err;
@@ -2433,6 +2457,33 @@ errout:
2433 2457
2434static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) 2458static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2435{ 2459{
2460 const struct nlmsghdr *nlh = cb->nlh;
2461 struct fib_dump_filter filter = {};
2462 int err;
2463
2464 if (cb->strict_check) {
2465 err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh,
2466 &filter, cb);
2467 if (err < 0)
2468 return err;
2469 }
2470
2471 if (filter.table_id) {
2472 struct mr_table *mrt;
2473
2474 mrt = ip6mr_get_table(sock_net(skb->sk), filter.table_id);
2475 if (!mrt) {
2476 if (filter.dump_all_families)
2477 return skb->len;
2478
2479 NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist");
2480 return -ENOENT;
2481 }
2482 err = mr_table_dump(mrt, skb, cb, _ip6mr_fill_mroute,
2483 &mfc_unres_lock, &filter);
2484 return skb->len ? : err;
2485 }
2486
2436 return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter, 2487 return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter,
2437 _ip6mr_fill_mroute, &mfc_unres_lock); 2488 _ip6mr_fill_mroute, &mfc_unres_lock, &filter);
2438} 2489}
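The ip6mr changes above all flow from one VRF detail: on an l3mdev, skb->dev is the master device, not the interface that actually owns a vif. A hedged sketch of the recovery step (error handling trimmed, as in the patch):

	struct net_device *dev = skb->dev;

	if (netif_is_l3_master(skb->dev)) {
		/* skb->dev is the VRF master; IPCB(skb)->iif still carries
		 * the ifindex of the enslaved device the packet arrived on.
		 */
		dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
		if (!dev) {
			kfree_skb(skb);
			return -ENODEV;
		}
	}
	/* 'dev' (not skb->dev) then feeds ip6mr_find_vif() and the
	 * wrong-interface check in ip6_mr_forward().
	 */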
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index c0cac9cc3a28..381ce38940ae 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -674,6 +674,13 @@ done:
674 retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); 674 retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr);
675 break; 675 break;
676 } 676 }
677 case IPV6_MULTICAST_ALL:
678 if (optlen < sizeof(int))
679 goto e_inval;
680 np->mc_all = valbool;
681 retv = 0;
682 break;
683
677 case MCAST_JOIN_GROUP: 684 case MCAST_JOIN_GROUP:
678 case MCAST_LEAVE_GROUP: 685 case MCAST_LEAVE_GROUP:
679 { 686 {
@@ -1266,6 +1273,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
1266 val = np->mcast_oif; 1273 val = np->mcast_oif;
1267 break; 1274 break;
1268 1275
1276 case IPV6_MULTICAST_ALL:
1277 val = np->mc_all;
1278 break;
1279
1269 case IPV6_UNICAST_IF: 1280 case IPV6_UNICAST_IF:
1270 val = (__force int)htonl((__u32) np->ucast_oif); 1281 val = (__force int)htonl((__u32) np->ucast_oif);
1271 break; 1282 break;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index dbab62e3f0d7..21f6deb2aec9 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -636,7 +636,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
636 } 636 }
637 if (!mc) { 637 if (!mc) {
638 rcu_read_unlock(); 638 rcu_read_unlock();
639 return true; 639 return np->mc_all;
640 } 640 }
641 read_lock(&mc->sflock); 641 read_lock(&mc->sflock);
642 psl = mc->sflist; 642 psl = mc->sflist;
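For context, a small userspace sketch of the new socket option (mirroring the existing IPv4 IP_MULTICAST_ALL); the IPV6_MULTICAST_ALL value below comes from the matching uapi header update and the function name is illustrative:

	#include <netinet/in.h>
	#include <sys/socket.h>

	#ifndef IPV6_MULTICAST_ALL
	#define IPV6_MULTICAST_ALL 29
	#endif

	static int opt_out_of_unjoined_groups(int fd)
	{
		int val = 0;	/* 0: deliver only groups this socket has joined */

		return setsockopt(fd, IPPROTO_IPV6, IPV6_MULTICAST_ALL,
				  &val, sizeof(val));
	}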
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 0ec273997d1d..659ecf4e4b3c 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1533,7 +1533,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb)
1533 1533
1534 if (!ndopts.nd_opts_rh) { 1534 if (!ndopts.nd_opts_rh) {
1535 ip6_redirect_no_header(skb, dev_net(skb->dev), 1535 ip6_redirect_no_header(skb, dev_net(skb->dev),
1536 skb->dev->ifindex, 0); 1536 skb->dev->ifindex);
1537 return; 1537 return;
1538 } 1538 }
1539 1539
@@ -1732,10 +1732,9 @@ int ndisc_rcv(struct sk_buff *skb)
1732 return 0; 1732 return 0;
1733 } 1733 }
1734 1734
1735 memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
1736
1737 switch (msg->icmph.icmp6_type) { 1735 switch (msg->icmph.icmp6_type) {
1738 case NDISC_NEIGHBOUR_SOLICITATION: 1736 case NDISC_NEIGHBOUR_SOLICITATION:
1737 memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
1739 ndisc_recv_ns(skb); 1738 ndisc_recv_ns(skb);
1740 break; 1739 break;
1741 1740
@@ -1784,6 +1783,8 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event,
1784 change_info = ptr; 1783 change_info = ptr;
1785 if (change_info->flags_changed & IFF_NOARP) 1784 if (change_info->flags_changed & IFF_NOARP)
1786 neigh_changeaddr(&nd_tbl, dev); 1785 neigh_changeaddr(&nd_tbl, dev);
1786 if (!netif_carrier_ok(dev))
1787 neigh_carrier_down(&nd_tbl, dev);
1787 break; 1788 break;
1788 case NETDEV_DOWN: 1789 case NETDEV_DOWN:
1789 neigh_ifdown(&nd_tbl, dev); 1790 neigh_ifdown(&nd_tbl, dev);
diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c
index 8b147440fbdc..af737b47b9b5 100644
--- a/net/ipv6/netfilter/ip6t_ipv6header.c
+++ b/net/ipv6/netfilter/ip6t_ipv6header.c
@@ -65,7 +65,10 @@ ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par)
65 } 65 }
66 66
67 hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); 67 hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
68 BUG_ON(hp == NULL); 68 if (!hp) {
69 par->hotdrop = true;
70 return false;
71 }
69 72
70 /* Calculate the header length */ 73 /* Calculate the header length */
71 if (nexthdr == NEXTHDR_FRAGMENT) 74 if (nexthdr == NEXTHDR_FRAGMENT)
diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c
index 2c99b94eeca3..21bf6bf04323 100644
--- a/net/ipv6/netfilter/ip6t_rt.c
+++ b/net/ipv6/netfilter/ip6t_rt.c
@@ -137,7 +137,10 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par)
137 sizeof(_addr), 137 sizeof(_addr),
138 &_addr); 138 &_addr);
139 139
140 BUG_ON(ap == NULL); 140 if (ap == NULL) {
141 par->hotdrop = true;
142 return false;
143 }
141 144
142 if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) { 145 if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) {
143 pr_debug("i=%d temp=%d;\n", i, temp); 146 pr_debug("i=%d temp=%d;\n", i, temp);
@@ -166,7 +169,10 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par)
166 + temp * sizeof(_addr), 169 + temp * sizeof(_addr),
167 sizeof(_addr), 170 sizeof(_addr),
168 &_addr); 171 &_addr);
169 BUG_ON(ap == NULL); 172 if (ap == NULL) {
173 par->hotdrop = true;
174 return false;
175 }
170 176
171 if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp])) 177 if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp]))
172 break; 178 break;
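Both xt match fixes above follow the same rule: a failed skb_header_pointer() on a malformed packet is a reason to drop, never to BUG(). The pattern, in isolation (variable names roughly as in ip6t_rt.c):

	struct in6_addr _addr;
	const struct in6_addr *ap;

	ap = skb_header_pointer(skb, ptr, sizeof(_addr), &_addr);
	if (!ap) {
		par->hotdrop = true;	/* drop this packet... */
		return false;		/* ...and report "no match" */
	}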
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 8f68a518d9db..b8ac369f98ad 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -450,7 +450,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic
450 sub_frag_mem_limit(fq->q.net, head->truesize); 450 sub_frag_mem_limit(fq->q.net, head->truesize);
451 451
452 head->ignore_df = 1; 452 head->ignore_df = 1;
453 head->next = NULL; 453 skb_mark_not_on_list(head);
454 head->dev = dev; 454 head->dev = dev;
455 head->tstamp = fq->q.stamp; 455 head->tstamp = fq->q.stamp;
456 ipv6_hdr(head)->payload_len = htons(payload_len); 456 ipv6_hdr(head)->payload_len = htons(payload_len);
diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
index e6eb7cf9b54f..3e4bf2286abe 100644
--- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
@@ -87,18 +87,30 @@ static struct notifier_block masq_dev_notifier = {
87struct masq_dev_work { 87struct masq_dev_work {
88 struct work_struct work; 88 struct work_struct work;
89 struct net *net; 89 struct net *net;
90 struct in6_addr addr;
90 int ifindex; 91 int ifindex;
91}; 92};
92 93
94static int inet_cmp(struct nf_conn *ct, void *work)
95{
96 struct masq_dev_work *w = (struct masq_dev_work *)work;
97 struct nf_conntrack_tuple *tuple;
98
99 if (!device_cmp(ct, (void *)(long)w->ifindex))
100 return 0;
101
102 tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
103
104 return ipv6_addr_equal(&w->addr, &tuple->dst.u3.in6);
105}
106
93static void iterate_cleanup_work(struct work_struct *work) 107static void iterate_cleanup_work(struct work_struct *work)
94{ 108{
95 struct masq_dev_work *w; 109 struct masq_dev_work *w;
96 long index;
97 110
98 w = container_of(work, struct masq_dev_work, work); 111 w = container_of(work, struct masq_dev_work, work);
99 112
100 index = w->ifindex; 113 nf_ct_iterate_cleanup_net(w->net, inet_cmp, (void *)w, 0, 0);
101 nf_ct_iterate_cleanup_net(w->net, device_cmp, (void *)index, 0, 0);
102 114
103 put_net(w->net); 115 put_net(w->net);
104 kfree(w); 116 kfree(w);
@@ -147,6 +159,7 @@ static int masq_inet_event(struct notifier_block *this,
147 INIT_WORK(&w->work, iterate_cleanup_work); 159 INIT_WORK(&w->work, iterate_cleanup_work);
148 w->ifindex = dev->ifindex; 160 w->ifindex = dev->ifindex;
149 w->net = net; 161 w->net = net;
162 w->addr = ifa->addr;
150 schedule_work(&w->work); 163 schedule_work(&w->work);
151 164
152 return NOTIFY_DONE; 165 return NOTIFY_DONE;
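The masquerade rework above also illustrates the nf_ct_iterate_cleanup_net() callback contract: the cookie is an arbitrary pointer, and a non-zero return kills that conntrack entry. Condensed sketch (inet_cmp and masq_dev_work as in the patch):

	static int inet_cmp(struct nf_conn *ct, void *data)
	{
		const struct masq_dev_work *w = data;
		const struct nf_conntrack_tuple *t;

		/* only entries masqueraded via the interface losing the address */
		if (!device_cmp(ct, (void *)(long)w->ifindex))
			return 0;

		t = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;

		/* non-zero: this entry is NATed to the removed address, kill it */
		return ipv6_addr_equal(&w->addr, &t->dst.u3.in6);
	}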
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 5c5b4f79296e..5c3c92713096 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -145,7 +145,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
145 */ 145 */
146 if (end < fq->q.len || 146 if (end < fq->q.len ||
147 ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) 147 ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len))
148 goto err; 148 goto discard_fq;
149 fq->q.flags |= INET_FRAG_LAST_IN; 149 fq->q.flags |= INET_FRAG_LAST_IN;
150 fq->q.len = end; 150 fq->q.len = end;
151 } else { 151 } else {
@@ -162,20 +162,20 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
162 if (end > fq->q.len) { 162 if (end > fq->q.len) {
163 /* Some bits beyond end -> corruption. */ 163 /* Some bits beyond end -> corruption. */
164 if (fq->q.flags & INET_FRAG_LAST_IN) 164 if (fq->q.flags & INET_FRAG_LAST_IN)
165 goto err; 165 goto discard_fq;
166 fq->q.len = end; 166 fq->q.len = end;
167 } 167 }
168 } 168 }
169 169
170 if (end == offset) 170 if (end == offset)
171 goto err; 171 goto discard_fq;
172 172
173 /* Point into the IP datagram 'data' part. */ 173 /* Point into the IP datagram 'data' part. */
174 if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) 174 if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data))
175 goto err; 175 goto discard_fq;
176 176
177 if (pskb_trim_rcsum(skb, end - offset)) 177 if (pskb_trim_rcsum(skb, end - offset))
178 goto err; 178 goto discard_fq;
179 179
180 /* Find out which fragments are in front and at the back of us 180 /* Find out which fragments are in front and at the back of us
181 * in the chain of fragments so far. We must know where to put 181 * in the chain of fragments so far. We must know where to put
@@ -388,7 +388,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
388 } 388 }
389 sub_frag_mem_limit(fq->q.net, sum_truesize); 389 sub_frag_mem_limit(fq->q.net, sum_truesize);
390 390
391 head->next = NULL; 391 skb_mark_not_on_list(head);
392 head->dev = dev; 392 head->dev = dev;
393 head->tstamp = fq->q.stamp; 393 head->tstamp = fq->q.stamp;
394 ipv6_hdr(head)->payload_len = htons(payload_len); 394 ipv6_hdr(head)->payload_len = htons(payload_len);
@@ -418,6 +418,7 @@ out_fail:
418 rcu_read_lock(); 418 rcu_read_lock();
419 __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); 419 __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
420 rcu_read_unlock(); 420 rcu_read_unlock();
421 inet_frag_kill(&fq->q);
421 return -1; 422 return -1;
422} 423}
423 424
@@ -553,7 +554,6 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net)
553 554
554 table[0].data = &net->ipv6.frags.high_thresh; 555 table[0].data = &net->ipv6.frags.high_thresh;
555 table[0].extra1 = &net->ipv6.frags.low_thresh; 556 table[0].extra1 = &net->ipv6.frags.low_thresh;
556 table[0].extra2 = &init_net.ipv6.frags.high_thresh;
557 table[1].data = &net->ipv6.frags.low_thresh; 557 table[1].data = &net->ipv6.frags.low_thresh;
558 table[1].extra2 = &net->ipv6.frags.high_thresh; 558 table[1].extra2 = &net->ipv6.frags.high_thresh;
559 table[2].data = &net->ipv6.frags.timeout; 559 table[2].data = &net->ipv6.frags.timeout;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index abcb5ae77319..2a7423c39456 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -364,14 +364,11 @@ EXPORT_SYMBOL(ip6_dst_alloc);
364 364
365static void ip6_dst_destroy(struct dst_entry *dst) 365static void ip6_dst_destroy(struct dst_entry *dst)
366{ 366{
367 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
368 struct rt6_info *rt = (struct rt6_info *)dst; 367 struct rt6_info *rt = (struct rt6_info *)dst;
369 struct fib6_info *from; 368 struct fib6_info *from;
370 struct inet6_dev *idev; 369 struct inet6_dev *idev;
371 370
372 if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) 371 ip_dst_metrics_put(dst);
373 kfree(p);
374
375 rt6_uncached_list_del(rt); 372 rt6_uncached_list_del(rt);
376 373
377 idev = rt->rt6i_idev; 374 idev = rt->rt6i_idev;
@@ -978,11 +975,7 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
978{ 975{
979 rt->rt6i_flags &= ~RTF_EXPIRES; 976 rt->rt6i_flags &= ~RTF_EXPIRES;
980 rcu_assign_pointer(rt->from, from); 977 rcu_assign_pointer(rt->from, from);
981 dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); 978 ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
982 if (from->fib6_metrics != &dst_default_metrics) {
983 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
984 refcount_inc(&from->fib6_metrics->refcnt);
985 }
986} 979}
987 980
988/* Caller must already hold reference to @ort */ 981/* Caller must already hold reference to @ort */
@@ -1000,7 +993,6 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
1000#ifdef CONFIG_IPV6_SUBTREES 993#ifdef CONFIG_IPV6_SUBTREES
1001 rt->rt6i_src = ort->fib6_src; 994 rt->rt6i_src = ort->fib6_src;
1002#endif 995#endif
1003 rt->rt6i_prefsrc = ort->fib6_prefsrc;
1004} 996}
1005 997
1006static struct fib6_node* fib6_backtrack(struct fib6_node *fn, 998static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
@@ -1454,11 +1446,6 @@ static int rt6_insert_exception(struct rt6_info *nrt,
1454 if (ort->fib6_src.plen) 1446 if (ort->fib6_src.plen)
1455 src_key = &nrt->rt6i_src.addr; 1447 src_key = &nrt->rt6i_src.addr;
1456#endif 1448#endif
1457
1458 /* Update rt6i_prefsrc as it could be changed
1459 * in rt6_remove_prefsrc()
1460 */
1461 nrt->rt6i_prefsrc = ort->fib6_prefsrc;
1462 /* rt6_mtu_change() might lower mtu on ort. 1449 /* rt6_mtu_change() might lower mtu on ort.
1463 * Only insert this exception route if its mtu 1450 * Only insert this exception route if its mtu
1464 * is less than ort's mtu value. 1451 * is less than ort's mtu value.
@@ -1640,25 +1627,6 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1640 rcu_read_unlock(); 1627 rcu_read_unlock();
1641} 1628}
1642 1629
1643static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1644{
1645 struct rt6_exception_bucket *bucket;
1646 struct rt6_exception *rt6_ex;
1647 int i;
1648
1649 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1650 lockdep_is_held(&rt6_exception_lock));
1651
1652 if (bucket) {
1653 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1654 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1655 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1656 }
1657 bucket++;
1658 }
1659 }
1660}
1661
1662static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, 1630static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1663 struct rt6_info *rt, int mtu) 1631 struct rt6_info *rt, int mtu)
1664{ 1632{
@@ -2103,7 +2071,8 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2103{ 2071{
2104 bool any_src; 2072 bool any_src;
2105 2073
2106 if (rt6_need_strict(&fl6->daddr)) { 2074 if (ipv6_addr_type(&fl6->daddr) &
2075 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2107 struct dst_entry *dst; 2076 struct dst_entry *dst;
2108 2077
2109 dst = l3mdev_link_scope_lookup(net, fl6); 2078 dst = l3mdev_link_scope_lookup(net, fl6);
@@ -2373,15 +2342,14 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2373{ 2342{
2374 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2343 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2375 struct dst_entry *dst; 2344 struct dst_entry *dst;
2376 struct flowi6 fl6; 2345 struct flowi6 fl6 = {
2377 2346 .flowi6_oif = oif,
2378 memset(&fl6, 0, sizeof(fl6)); 2347 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2379 fl6.flowi6_oif = oif; 2348 .daddr = iph->daddr,
2380 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); 2349 .saddr = iph->saddr,
2381 fl6.daddr = iph->daddr; 2350 .flowlabel = ip6_flowinfo(iph),
2382 fl6.saddr = iph->saddr; 2351 .flowi6_uid = uid,
2383 fl6.flowlabel = ip6_flowinfo(iph); 2352 };
2384 fl6.flowi6_uid = uid;
2385 2353
2386 dst = ip6_route_output(net, NULL, &fl6); 2354 dst = ip6_route_output(net, NULL, &fl6);
2387 if (!dst->error) 2355 if (!dst->error)
@@ -2532,16 +2500,15 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2532{ 2500{
2533 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2501 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2534 struct dst_entry *dst; 2502 struct dst_entry *dst;
2535 struct flowi6 fl6; 2503 struct flowi6 fl6 = {
2536 2504 .flowi6_iif = LOOPBACK_IFINDEX,
2537 memset(&fl6, 0, sizeof(fl6)); 2505 .flowi6_oif = oif,
2538 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2506 .flowi6_mark = mark,
2539 fl6.flowi6_oif = oif; 2507 .daddr = iph->daddr,
2540 fl6.flowi6_mark = mark; 2508 .saddr = iph->saddr,
2541 fl6.daddr = iph->daddr; 2509 .flowlabel = ip6_flowinfo(iph),
2542 fl6.saddr = iph->saddr; 2510 .flowi6_uid = uid,
2543 fl6.flowlabel = ip6_flowinfo(iph); 2511 };
2544 fl6.flowi6_uid = uid;
2545 2512
2546 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 2513 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2547 rt6_do_redirect(dst, NULL, skb); 2514 rt6_do_redirect(dst, NULL, skb);
@@ -2549,21 +2516,18 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2549} 2516}
2550EXPORT_SYMBOL_GPL(ip6_redirect); 2517EXPORT_SYMBOL_GPL(ip6_redirect);
2551 2518
2552void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, 2519void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2553 u32 mark)
2554{ 2520{
2555 const struct ipv6hdr *iph = ipv6_hdr(skb); 2521 const struct ipv6hdr *iph = ipv6_hdr(skb);
2556 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2522 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2557 struct dst_entry *dst; 2523 struct dst_entry *dst;
2558 struct flowi6 fl6; 2524 struct flowi6 fl6 = {
2559 2525 .flowi6_iif = LOOPBACK_IFINDEX,
2560 memset(&fl6, 0, sizeof(fl6)); 2526 .flowi6_oif = oif,
2561 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2527 .daddr = msg->dest,
2562 fl6.flowi6_oif = oif; 2528 .saddr = iph->daddr,
2563 fl6.flowi6_mark = mark; 2529 .flowi6_uid = sock_net_uid(net, NULL),
2564 fl6.daddr = msg->dest; 2530 };
2565 fl6.saddr = iph->daddr;
2566 fl6.flowi6_uid = sock_net_uid(net, NULL);
2567 2531
2568 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 2532 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2569 rt6_do_redirect(dst, NULL, skb); 2533 rt6_do_redirect(dst, NULL, skb);
@@ -2734,24 +2698,6 @@ out:
2734 return entries > rt_max_size; 2698 return entries > rt_max_size;
2735} 2699}
2736 2700
2737static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2738 struct fib6_config *cfg)
2739{
2740 struct dst_metrics *p;
2741
2742 if (!cfg->fc_mx)
2743 return 0;
2744
2745 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2746 if (unlikely(!p))
2747 return -ENOMEM;
2748
2749 refcount_set(&p->refcnt, 1);
2750 rt->fib6_metrics = p;
2751
2752 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2753}
2754
2755static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2701static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2756 struct fib6_config *cfg, 2702 struct fib6_config *cfg,
2757 const struct in6_addr *gw_addr, 2703 const struct in6_addr *gw_addr,
@@ -2799,6 +2745,8 @@ static int ip6_route_check_nh_onlink(struct net *net,
2799 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); 2745 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2800 if (grt) { 2746 if (grt) {
2801 if (!grt->dst.error && 2747 if (!grt->dst.error &&
2748 /* ignore match if it is the default route */
2749 grt->from && !ipv6_addr_any(&grt->from->fib6_dst.addr) &&
2802 (grt->rt6i_flags & flags || dev != grt->dst.dev)) { 2750 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2803 NL_SET_ERR_MSG(extack, 2751 NL_SET_ERR_MSG(extack,
2804 "Nexthop has invalid gateway or device mismatch"); 2752 "Nexthop has invalid gateway or device mismatch");
@@ -3027,13 +2975,17 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3027 if (!rt) 2975 if (!rt)
3028 goto out; 2976 goto out;
3029 2977
2978 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len);
2979 if (IS_ERR(rt->fib6_metrics)) {
2980 err = PTR_ERR(rt->fib6_metrics);
2981 /* Do not leave garbage there. */
2982 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
2983 goto out;
2984 }
2985
3030 if (cfg->fc_flags & RTF_ADDRCONF) 2986 if (cfg->fc_flags & RTF_ADDRCONF)
3031 rt->dst_nocount = true; 2987 rt->dst_nocount = true;
3032 2988
3033 err = ip6_convert_metrics(net, rt, cfg);
3034 if (err < 0)
3035 goto out;
3036
3037 if (cfg->fc_flags & RTF_EXPIRES) 2989 if (cfg->fc_flags & RTF_EXPIRES)
3038 fib6_set_expires(rt, jiffies + 2990 fib6_set_expires(rt, jiffies +
3039 clock_t_to_jiffies(cfg->fc_expires)); 2991 clock_t_to_jiffies(cfg->fc_expires));
@@ -3142,8 +3094,6 @@ install_route:
3142 rt->fib6_nh.nh_dev = dev; 3094 rt->fib6_nh.nh_dev = dev;
3143 rt->fib6_table = table; 3095 rt->fib6_table = table;
3144 3096
3145 cfg->fc_nlinfo.nl_net = dev_net(dev);
3146
3147 if (idev) 3097 if (idev)
3148 in6_dev_put(idev); 3098 in6_dev_put(idev);
3149 3099
@@ -3635,23 +3585,23 @@ static void rtmsg_to_fib6_config(struct net *net,
3635 struct in6_rtmsg *rtmsg, 3585 struct in6_rtmsg *rtmsg,
3636 struct fib6_config *cfg) 3586 struct fib6_config *cfg)
3637{ 3587{
3638 memset(cfg, 0, sizeof(*cfg)); 3588 *cfg = (struct fib6_config){
3589 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3590 : RT6_TABLE_MAIN,
3591 .fc_ifindex = rtmsg->rtmsg_ifindex,
3592 .fc_metric = rtmsg->rtmsg_metric,
3593 .fc_expires = rtmsg->rtmsg_info,
3594 .fc_dst_len = rtmsg->rtmsg_dst_len,
3595 .fc_src_len = rtmsg->rtmsg_src_len,
3596 .fc_flags = rtmsg->rtmsg_flags,
3597 .fc_type = rtmsg->rtmsg_type,
3639 3598
3640 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 3599 .fc_nlinfo.nl_net = net,
3641 : RT6_TABLE_MAIN;
3642 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3643 cfg->fc_metric = rtmsg->rtmsg_metric;
3644 cfg->fc_expires = rtmsg->rtmsg_info;
3645 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3646 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3647 cfg->fc_flags = rtmsg->rtmsg_flags;
3648 cfg->fc_type = rtmsg->rtmsg_type;
3649
3650 cfg->fc_nlinfo.nl_net = net;
3651 3600
3652 cfg->fc_dst = rtmsg->rtmsg_dst; 3601 .fc_dst = rtmsg->rtmsg_dst,
3653 cfg->fc_src = rtmsg->rtmsg_src; 3602 .fc_src = rtmsg->rtmsg_src,
3654 cfg->fc_gateway = rtmsg->rtmsg_gateway; 3603 .fc_gateway = rtmsg->rtmsg_gateway,
3604 };
3655} 3605}
3656 3606
3657int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3607int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
@@ -3758,6 +3708,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net,
3758 if (!f6i) 3708 if (!f6i)
3759 return ERR_PTR(-ENOMEM); 3709 return ERR_PTR(-ENOMEM);
3760 3710
3711 f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0);
3761 f6i->dst_nocount = true; 3712 f6i->dst_nocount = true;
3762 f6i->dst_host = true; 3713 f6i->dst_host = true;
3763 f6i->fib6_protocol = RTPROT_KERNEL; 3714 f6i->fib6_protocol = RTPROT_KERNEL;
@@ -3800,8 +3751,6 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3800 spin_lock_bh(&rt6_exception_lock); 3751 spin_lock_bh(&rt6_exception_lock);
3801 /* remove prefsrc entry */ 3752 /* remove prefsrc entry */
3802 rt->fib6_prefsrc.plen = 0; 3753 rt->fib6_prefsrc.plen = 0;
3803 /* need to update cache as well */
3804 rt6_exceptions_remove_prefsrc(rt);
3805 spin_unlock_bh(&rt6_exception_lock); 3754 spin_unlock_bh(&rt6_exception_lock);
3806 } 3755 }
3807 return 0; 3756 return 0;
@@ -4079,8 +4028,12 @@ void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4079 .event = event, 4028 .event = event,
4080 }, 4029 },
4081 }; 4030 };
4031 struct net *net = dev_net(dev);
4082 4032
4083 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); 4033 if (net->ipv6.sysctl.skip_notify_on_dev_down)
4034 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4035 else
4036 fib6_clean_all(net, fib6_ifdown, &arg);
4084} 4037}
4085 4038
4086void rt6_disable_ip(struct net_device *dev, unsigned long event) 4039void rt6_disable_ip(struct net_device *dev, unsigned long event)
@@ -4170,20 +4123,25 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4170 int err; 4123 int err;
4171 4124
4172 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4125 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4173 NULL); 4126 extack);
4174 if (err < 0) 4127 if (err < 0)
4175 goto errout; 4128 goto errout;
4176 4129
4177 err = -EINVAL; 4130 err = -EINVAL;
4178 rtm = nlmsg_data(nlh); 4131 rtm = nlmsg_data(nlh);
4179 memset(cfg, 0, sizeof(*cfg));
4180 4132
4181 cfg->fc_table = rtm->rtm_table; 4133 *cfg = (struct fib6_config){
4182 cfg->fc_dst_len = rtm->rtm_dst_len; 4134 .fc_table = rtm->rtm_table,
4183 cfg->fc_src_len = rtm->rtm_src_len; 4135 .fc_dst_len = rtm->rtm_dst_len,
4184 cfg->fc_flags = RTF_UP; 4136 .fc_src_len = rtm->rtm_src_len,
4185 cfg->fc_protocol = rtm->rtm_protocol; 4137 .fc_flags = RTF_UP,
4186 cfg->fc_type = rtm->rtm_type; 4138 .fc_protocol = rtm->rtm_protocol,
4139 .fc_type = rtm->rtm_type,
4140
4141 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4142 .fc_nlinfo.nlh = nlh,
4143 .fc_nlinfo.nl_net = sock_net(skb->sk),
4144 };
4187 4145
4188 if (rtm->rtm_type == RTN_UNREACHABLE || 4146 if (rtm->rtm_type == RTN_UNREACHABLE ||
4189 rtm->rtm_type == RTN_BLACKHOLE || 4147 rtm->rtm_type == RTN_BLACKHOLE ||
@@ -4199,10 +4157,6 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4199 4157
4200 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4158 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4201 4159
4202 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4203 cfg->fc_nlinfo.nlh = nlh;
4204 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4205
4206 if (tb[RTA_GATEWAY]) { 4160 if (tb[RTA_GATEWAY]) {
4207 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4161 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4208 cfg->fc_flags |= RTF_GATEWAY; 4162 cfg->fc_flags |= RTF_GATEWAY;
@@ -4815,28 +4769,52 @@ nla_put_failure:
4815 return -EMSGSIZE; 4769 return -EMSGSIZE;
4816} 4770}
4817 4771
4772static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4773 const struct net_device *dev)
4774{
4775 if (f6i->fib6_nh.nh_dev == dev)
4776 return true;
4777
4778 if (f6i->fib6_nsiblings) {
4779 struct fib6_info *sibling, *next_sibling;
4780
4781 list_for_each_entry_safe(sibling, next_sibling,
4782 &f6i->fib6_siblings, fib6_siblings) {
4783 if (sibling->fib6_nh.nh_dev == dev)
4784 return true;
4785 }
4786 }
4787
4788 return false;
4789}
4790
4818int rt6_dump_route(struct fib6_info *rt, void *p_arg) 4791int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4819{ 4792{
4820 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4793 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4794 struct fib_dump_filter *filter = &arg->filter;
4795 unsigned int flags = NLM_F_MULTI;
4821 struct net *net = arg->net; 4796 struct net *net = arg->net;
4822 4797
4823 if (rt == net->ipv6.fib6_null_entry) 4798 if (rt == net->ipv6.fib6_null_entry)
4824 return 0; 4799 return 0;
4825 4800
4826 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 4801 if ((filter->flags & RTM_F_PREFIX) &&
4827 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); 4802 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4828 4803 /* success since this is not a prefix route */
4829 /* user wants prefix routes only */ 4804 return 1;
4830 if (rtm->rtm_flags & RTM_F_PREFIX && 4805 }
4831 !(rt->fib6_flags & RTF_PREFIX_RT)) { 4806 if (filter->filter_set) {
4832 /* success since this is not a prefix route */ 4807 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4808 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4809 (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4833 return 1; 4810 return 1;
4834 } 4811 }
4812 flags |= NLM_F_DUMP_FILTERED;
4835 } 4813 }
4836 4814
4837 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, 4815 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4838 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, 4816 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4839 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI); 4817 arg->cb->nlh->nlmsg_seq, flags);
4840} 4818}
4841 4819
4842static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4820static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
@@ -4850,7 +4828,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4850 struct rt6_info *rt; 4828 struct rt6_info *rt;
4851 struct sk_buff *skb; 4829 struct sk_buff *skb;
4852 struct rtmsg *rtm; 4830 struct rtmsg *rtm;
4853 struct flowi6 fl6; 4831 struct flowi6 fl6 = {};
4854 bool fibmatch; 4832 bool fibmatch;
4855 4833
4856 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4834 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
@@ -4859,7 +4837,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4859 goto errout; 4837 goto errout;
4860 4838
4861 err = -EINVAL; 4839 err = -EINVAL;
4862 memset(&fl6, 0, sizeof(fl6));
4863 rtm = nlmsg_data(nlh); 4840 rtm = nlmsg_data(nlh);
4864 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4841 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4865 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4842 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
@@ -5084,7 +5061,10 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5084 return 0; 5061 return 0;
5085} 5062}
5086 5063
5087struct ctl_table ipv6_route_table_template[] = { 5064static int zero;
5065static int one = 1;
5066
5067static struct ctl_table ipv6_route_table_template[] = {
5088 { 5068 {
5089 .procname = "flush", 5069 .procname = "flush",
5090 .data = &init_net.ipv6.sysctl.flush_delay, 5070 .data = &init_net.ipv6.sysctl.flush_delay,
@@ -5155,6 +5135,15 @@ struct ctl_table ipv6_route_table_template[] = {
5155 .mode = 0644, 5135 .mode = 0644,
5156 .proc_handler = proc_dointvec_ms_jiffies, 5136 .proc_handler = proc_dointvec_ms_jiffies,
5157 }, 5137 },
5138 {
5139 .procname = "skip_notify_on_dev_down",
5140 .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down,
5141 .maxlen = sizeof(int),
5142 .mode = 0644,
5143 .proc_handler = proc_dointvec,
5144 .extra1 = &zero,
5145 .extra2 = &one,
5146 },
5158 { } 5147 { }
5159}; 5148};
5160 5149
@@ -5178,6 +5167,7 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5178 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 5167 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5179 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 5168 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5180 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 5169 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5170 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5181 5171
5182 /* Don't export sysctls to unprivileged users */ 5172 /* Don't export sysctls to unprivileged users */
5183 if (net->user_ns != &init_user_ns) 5173 if (net->user_ns != &init_user_ns)
@@ -5242,6 +5232,7 @@ static int __net_init ip6_route_net_init(struct net *net)
5242 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 5232 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5243 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 5233 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5244 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 5234 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5235 net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5245 5236
5246 net->ipv6.ip6_rt_gc_expire = 30*HZ; 5237 net->ipv6.ip6_rt_gc_expire = 30*HZ;
5247 5238
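Several of the route.c hunks above convert memset()-plus-assignment sequences into C99 designated initializers; the idiom relies on the compiler zeroing every unnamed field. A trivial sketch (values illustrative):

	struct flowi6 fl6 = {
		.flowi6_oif  = oif,
		.flowi6_mark = mark,
		.daddr       = iph->daddr,
		.saddr       = iph->saddr,
		.flowi6_uid  = uid,
	};	/* all other members are implicitly zero, so no memset() needed */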
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index e9400ffa7875..51c9f75f34b9 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -534,13 +534,13 @@ static int ipip6_err(struct sk_buff *skb, u32 info)
534 534
535 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { 535 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
536 ipv4_update_pmtu(skb, dev_net(skb->dev), info, 536 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
537 t->parms.link, 0, iph->protocol, 0); 537 t->parms.link, iph->protocol);
538 err = 0; 538 err = 0;
539 goto out; 539 goto out;
540 } 540 }
541 if (type == ICMP_REDIRECT) { 541 if (type == ICMP_REDIRECT) {
542 ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, 542 ipv4_redirect(skb, dev_net(skb->dev), t->parms.link,
543 iph->protocol, 0); 543 iph->protocol);
544 err = 0; 544 err = 0;
545 goto out; 545 goto out;
546 } 546 }
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index b36694b6716e..d2d97d07ef27 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -478,7 +478,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
478 struct net *net = dev_net(skb->dev); 478 struct net *net = dev_net(skb->dev);
479 479
480 sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, 480 sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
481 inet6_iif(skb), 0, udptable, skb); 481 inet6_iif(skb), inet6_sdif(skb), udptable, skb);
482 if (!sk) { 482 if (!sk) {
483 __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), 483 __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
484 ICMP6_MIB_INERRORS); 484 ICMP6_MIB_INERRORS);
@@ -548,7 +548,7 @@ static __inline__ void udpv6_err(struct sk_buff *skb,
548 __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); 548 __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
549} 549}
550 550
551static DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); 551DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
552void udpv6_encap_enable(void) 552void udpv6_encap_enable(void)
553{ 553{
554 static_branch_enable(&udpv6_encap_needed_key); 554 static_branch_enable(&udpv6_encap_needed_key);
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 95dee9ca8d22..1b8e161ac527 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -119,7 +119,7 @@ static struct sk_buff *udp6_gro_receive(struct list_head *head,
119{ 119{
120 struct udphdr *uh = udp_gro_udphdr(skb); 120 struct udphdr *uh = udp_gro_udphdr(skb);
121 121
122 if (unlikely(!uh)) 122 if (unlikely(!uh) || !static_branch_unlikely(&udpv6_encap_needed_key))
123 goto flush; 123 goto flush;
124 124
125 /* Don't bother verifying checksum if we're going to flush anyway. */ 125 /* Don't bother verifying checksum if we're going to flush anyway. */
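The udp.c/udp_offload.c pair above works because a static key defined in one file can be tested in another once the 'static' qualifier is dropped (with a matching extern declaration in a shared header). A sketch of the static-branch idiom involved; the extern line is an assumption about the header change, which is not shown here:

	/* udp.c: definition, starts "false" so the branch is a NOP by default */
	DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);

	void udpv6_encap_enable(void)
	{
		static_branch_enable(&udpv6_encap_needed_key);	/* patch branch sites */
	}

	/* header (assumed): extern struct static_key_false udpv6_encap_needed_key; */

	/* udp_offload.c: GRO bails out early unless some socket enabled encap */
	if (unlikely(!uh) || !static_branch_unlikely(&udpv6_encap_needed_key))
		goto flush;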
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index e2f16a0173a9..0bed4cc20603 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -48,7 +48,7 @@ static struct iucv_interface *pr_iucv;
48static const u8 iprm_shutdown[8] = 48static const u8 iprm_shutdown[8] =
49 {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}; 49 {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01};
50 50
51#define TRGCLS_SIZE (sizeof(((struct iucv_message *)0)->class)) 51#define TRGCLS_SIZE FIELD_SIZEOF(struct iucv_message, class)
52 52
53#define __iucv_sock_wait(sk, condition, timeo, ret) \ 53#define __iucv_sock_wait(sk, condition, timeo, ret) \
54do { \ 54do { \
@@ -320,13 +320,9 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock,
320 struct sk_buff *nskb; 320 struct sk_buff *nskb;
321 int err, confirm_recv = 0; 321 int err, confirm_recv = 0;
322 322
323 memset(skb->head, 0, ETH_HLEN); 323 phs_hdr = skb_push(skb, sizeof(*phs_hdr));
324 phs_hdr = skb_push(skb, sizeof(struct af_iucv_trans_hdr)); 324 memset(phs_hdr, 0, sizeof(*phs_hdr));
325 skb_reset_mac_header(skb);
326 skb_reset_network_header(skb); 325 skb_reset_network_header(skb);
327 skb_push(skb, ETH_HLEN);
328 skb_reset_mac_header(skb);
329 memset(phs_hdr, 0, sizeof(struct af_iucv_trans_hdr));
330 326
331 phs_hdr->magic = ETH_P_AF_IUCV; 327 phs_hdr->magic = ETH_P_AF_IUCV;
332 phs_hdr->version = 1; 328 phs_hdr->version = 1;
@@ -350,6 +346,9 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock,
350 if (imsg) 346 if (imsg)
351 memcpy(&phs_hdr->iucv_hdr, imsg, sizeof(struct iucv_message)); 347 memcpy(&phs_hdr->iucv_hdr, imsg, sizeof(struct iucv_message));
352 348
349 skb_push(skb, ETH_HLEN);
350 memset(skb->data, 0, ETH_HLEN);
351
353 skb->dev = iucv->hs_dev; 352 skb->dev = iucv->hs_dev;
354 if (!skb->dev) { 353 if (!skb->dev) {
355 err = -ENODEV; 354 err = -ENODEV;
@@ -1505,7 +1504,7 @@ __poll_t iucv_sock_poll(struct file *file, struct socket *sock,
1505 struct sock *sk = sock->sk; 1504 struct sock *sk = sock->sk;
1506 __poll_t mask = 0; 1505 __poll_t mask = 0;
1507 1506
1508 sock_poll_wait(file, wait); 1507 sock_poll_wait(file, sock, wait);
1509 1508
1510 if (sk->sk_state == IUCV_LISTEN) 1509 if (sk->sk_state == IUCV_LISTEN)
1511 return iucv_accept_poll(sk); 1510 return iucv_accept_poll(sk);
@@ -1943,8 +1942,7 @@ static void iucv_callback_shutdown(struct iucv_path *path, u8 ipuser[16])
1943/***************** HiperSockets transport callbacks ********************/ 1942/***************** HiperSockets transport callbacks ********************/
1944static void afiucv_swap_src_dest(struct sk_buff *skb) 1943static void afiucv_swap_src_dest(struct sk_buff *skb)
1945{ 1944{
1946 struct af_iucv_trans_hdr *trans_hdr = 1945 struct af_iucv_trans_hdr *trans_hdr = iucv_trans_hdr(skb);
1947 (struct af_iucv_trans_hdr *)skb->data;
1948 char tmpID[8]; 1946 char tmpID[8];
1949 char tmpName[8]; 1947 char tmpName[8];
1950 1948
@@ -1967,13 +1965,12 @@ static void afiucv_swap_src_dest(struct sk_buff *skb)
1967 **/ 1965 **/
1968static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb) 1966static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb)
1969{ 1967{
1968 struct af_iucv_trans_hdr *trans_hdr = iucv_trans_hdr(skb);
1970 struct sock *nsk; 1969 struct sock *nsk;
1971 struct iucv_sock *iucv, *niucv; 1970 struct iucv_sock *iucv, *niucv;
1972 struct af_iucv_trans_hdr *trans_hdr;
1973 int err; 1971 int err;
1974 1972
1975 iucv = iucv_sk(sk); 1973 iucv = iucv_sk(sk);
1976 trans_hdr = (struct af_iucv_trans_hdr *)skb->data;
1977 if (!iucv) { 1974 if (!iucv) {
1978 /* no sock - connection refused */ 1975 /* no sock - connection refused */
1979 afiucv_swap_src_dest(skb); 1976 afiucv_swap_src_dest(skb);
@@ -2034,15 +2031,13 @@ out:
2034static int afiucv_hs_callback_synack(struct sock *sk, struct sk_buff *skb) 2031static int afiucv_hs_callback_synack(struct sock *sk, struct sk_buff *skb)
2035{ 2032{
2036 struct iucv_sock *iucv = iucv_sk(sk); 2033 struct iucv_sock *iucv = iucv_sk(sk);
2037 struct af_iucv_trans_hdr *trans_hdr =
2038 (struct af_iucv_trans_hdr *)skb->data;
2039 2034
2040 if (!iucv) 2035 if (!iucv)
2041 goto out; 2036 goto out;
2042 if (sk->sk_state != IUCV_BOUND) 2037 if (sk->sk_state != IUCV_BOUND)
2043 goto out; 2038 goto out;
2044 bh_lock_sock(sk); 2039 bh_lock_sock(sk);
2045 iucv->msglimit_peer = trans_hdr->window; 2040 iucv->msglimit_peer = iucv_trans_hdr(skb)->window;
2046 sk->sk_state = IUCV_CONNECTED; 2041 sk->sk_state = IUCV_CONNECTED;
2047 sk->sk_state_change(sk); 2042 sk->sk_state_change(sk);
2048 bh_unlock_sock(sk); 2043 bh_unlock_sock(sk);
@@ -2098,8 +2093,6 @@ out:
2098static int afiucv_hs_callback_win(struct sock *sk, struct sk_buff *skb) 2093static int afiucv_hs_callback_win(struct sock *sk, struct sk_buff *skb)
2099{ 2094{
2100 struct iucv_sock *iucv = iucv_sk(sk); 2095 struct iucv_sock *iucv = iucv_sk(sk);
2101 struct af_iucv_trans_hdr *trans_hdr =
2102 (struct af_iucv_trans_hdr *)skb->data;
2103 2096
2104 if (!iucv) 2097 if (!iucv)
2105 return NET_RX_SUCCESS; 2098 return NET_RX_SUCCESS;
@@ -2107,7 +2100,7 @@ static int afiucv_hs_callback_win(struct sock *sk, struct sk_buff *skb)
2107 if (sk->sk_state != IUCV_CONNECTED) 2100 if (sk->sk_state != IUCV_CONNECTED)
2108 return NET_RX_SUCCESS; 2101 return NET_RX_SUCCESS;
2109 2102
2110 atomic_sub(trans_hdr->window, &iucv->msg_sent); 2103 atomic_sub(iucv_trans_hdr(skb)->window, &iucv->msg_sent);
2111 iucv_sock_wake_msglim(sk); 2104 iucv_sock_wake_msglim(sk);
2112 return NET_RX_SUCCESS; 2105 return NET_RX_SUCCESS;
2113} 2106}
@@ -2170,22 +2163,13 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev,
2170 int err = NET_RX_SUCCESS; 2163 int err = NET_RX_SUCCESS;
2171 char nullstring[8]; 2164 char nullstring[8];
2172 2165
2173 if (skb->len < (ETH_HLEN + sizeof(struct af_iucv_trans_hdr))) { 2166 if (!pskb_may_pull(skb, sizeof(*trans_hdr))) {
2174 WARN_ONCE(1, "AF_IUCV too short skb, len=%d, min=%d", 2167 WARN_ONCE(1, "AF_IUCV failed to receive skb, len=%u", skb->len);
2175 (int)skb->len,
2176 (int)(ETH_HLEN + sizeof(struct af_iucv_trans_hdr)));
2177 kfree_skb(skb); 2168 kfree_skb(skb);
2178 return NET_RX_SUCCESS; 2169 return NET_RX_SUCCESS;
2179 } 2170 }
2180 if (skb_headlen(skb) < (ETH_HLEN + sizeof(struct af_iucv_trans_hdr))) 2171
2181 if (skb_linearize(skb)) { 2172 trans_hdr = iucv_trans_hdr(skb);
2182 WARN_ONCE(1, "AF_IUCV skb_linearize failed, len=%d",
2183 (int)skb->len);
2184 kfree_skb(skb);
2185 return NET_RX_SUCCESS;
2186 }
2187 skb_pull(skb, ETH_HLEN);
2188 trans_hdr = (struct af_iucv_trans_hdr *)skb->data;
2189 EBCASC(trans_hdr->destAppName, sizeof(trans_hdr->destAppName)); 2173 EBCASC(trans_hdr->destAppName, sizeof(trans_hdr->destAppName));
2190 EBCASC(trans_hdr->destUserID, sizeof(trans_hdr->destUserID)); 2174 EBCASC(trans_hdr->destUserID, sizeof(trans_hdr->destUserID));
2191 EBCASC(trans_hdr->srcAppName, sizeof(trans_hdr->srcAppName)); 2175 EBCASC(trans_hdr->srcAppName, sizeof(trans_hdr->srcAppName));
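For context (not part of the commit): the af_iucv hunks above drop the open-coded (struct af_iucv_trans_hdr *)skb->data casts in favour of an iucv_trans_hdr() accessor, and replace the manual length check plus skb_linearize() with pskb_may_pull(). The accessor's body is not shown in this diff; the sketch below assumes a plausible definition and only illustrates the pull-before-dereference pattern.

/* Sketch under assumptions -- the accessor body is a guess, not taken
 * from this diff; only the pskb_may_pull() pattern is the point here. */
static inline struct af_iucv_trans_hdr *iucv_trans_hdr(struct sk_buff *skb)
{
	return (struct af_iucv_trans_hdr *)skb_network_header(skb);
}

static int example_hs_rcv(struct sk_buff *skb)
{
	struct af_iucv_trans_hdr *trans_hdr;

	/* pskb_may_pull() guarantees at least this many bytes are in the
	 * linear head (pulling from fragments if necessary), so the header
	 * can be dereferenced directly afterwards. */
	if (!pskb_may_pull(skb, sizeof(*trans_hdr)))
		return -EINVAL;

	trans_hdr = iucv_trans_hdr(skb);
	return trans_hdr->window;	/* field used by the callbacks above */
}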
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 1beeea9549fa..b99e73a7e7e0 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -730,7 +730,6 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
730 struct sk_buff *skb = NULL; 730 struct sk_buff *skb = NULL;
731 struct sock *sk = sock->sk; 731 struct sock *sk = sock->sk;
732 struct llc_sock *llc = llc_sk(sk); 732 struct llc_sock *llc = llc_sk(sk);
733 unsigned long cpu_flags;
734 size_t copied = 0; 733 size_t copied = 0;
735 u32 peek_seq = 0; 734 u32 peek_seq = 0;
736 u32 *seq, skb_len; 735 u32 *seq, skb_len;
@@ -855,9 +854,8 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
855 goto copy_uaddr; 854 goto copy_uaddr;
856 855
857 if (!(flags & MSG_PEEK)) { 856 if (!(flags & MSG_PEEK)) {
858 spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); 857 skb_unlink(skb, &sk->sk_receive_queue);
859 sk_eat_skb(sk, skb); 858 kfree_skb(skb);
860 spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);
861 *seq = 0; 859 *seq = 0;
862 } 860 }
863 861
@@ -878,9 +876,8 @@ copy_uaddr:
878 llc_cmsg_rcv(msg, skb); 876 llc_cmsg_rcv(msg, skb);
879 877
880 if (!(flags & MSG_PEEK)) { 878 if (!(flags & MSG_PEEK)) {
881 spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); 879 skb_unlink(skb, &sk->sk_receive_queue);
882 sk_eat_skb(sk, skb); 880 kfree_skb(skb);
883 spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);
884 *seq = 0; 881 *seq = 0;
885 } 882 }
886 883
diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c
index 260b3dc1b4a2..64d4bef04e73 100644
--- a/net/llc/llc_core.c
+++ b/net/llc/llc_core.c
@@ -127,9 +127,7 @@ void llc_sap_close(struct llc_sap *sap)
127 list_del_rcu(&sap->node); 127 list_del_rcu(&sap->node);
128 spin_unlock_bh(&llc_sap_list_lock); 128 spin_unlock_bh(&llc_sap_list_lock);
129 129
130 synchronize_rcu(); 130 kfree_rcu(sap, rcu);
131
132 kfree(sap);
133} 131}
134 132
135static struct packet_type llc_packet_type __read_mostly = { 133static struct packet_type llc_packet_type __read_mostly = {
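For context (not part of the commit): llc_sap_close() above trades a blocking synchronize_rcu() + kfree() for kfree_rcu(), which queues the free and lets the RCU grace period elapse asynchronously. kfree_rcu() needs a struct rcu_head embedded in the freed object (named "rcu" here, matching the hunk). A minimal sketch with a hypothetical structure:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_sap {
	int data;
	struct rcu_head rcu;	/* storage kfree_rcu() uses to queue the callback */
};

static void example_close(struct example_sap *sap)
{
	/* Old pattern: block until every pre-existing RCU reader is done,
	 * then free synchronously:
	 *	synchronize_rcu();
	 *	kfree(sap);
	 * New pattern: return immediately; the object is freed once a
	 * grace period has elapsed. */
	kfree_rcu(sap, rcu);
}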
diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
index 76e30f4797fb..f869e35d0974 100644
--- a/net/mac80211/Kconfig
+++ b/net/mac80211/Kconfig
@@ -27,20 +27,6 @@ config MAC80211_RC_MINSTREL
27 ---help--- 27 ---help---
28 This option enables the 'minstrel' TX rate control algorithm 28 This option enables the 'minstrel' TX rate control algorithm
29 29
30config MAC80211_RC_MINSTREL_HT
31 bool "Minstrel 802.11n support" if EXPERT
32 depends on MAC80211_RC_MINSTREL
33 default y
34 ---help---
35 This option enables the 'minstrel_ht' TX rate control algorithm
36
37config MAC80211_RC_MINSTREL_VHT
38 bool "Minstrel 802.11ac support" if EXPERT
39 depends on MAC80211_RC_MINSTREL_HT
40 default n
41 ---help---
42 This option enables VHT in the 'minstrel_ht' TX rate control algorithm
43
44choice 30choice
45 prompt "Default rate control algorithm" 31 prompt "Default rate control algorithm"
46 depends on MAC80211_HAS_RC 32 depends on MAC80211_HAS_RC
@@ -62,8 +48,7 @@ endchoice
62 48
63config MAC80211_RC_DEFAULT 49config MAC80211_RC_DEFAULT
64 string 50 string
65 default "minstrel_ht" if MAC80211_RC_DEFAULT_MINSTREL && MAC80211_RC_MINSTREL_HT 51 default "minstrel_ht" if MAC80211_RC_DEFAULT_MINSTREL
66 default "minstrel" if MAC80211_RC_DEFAULT_MINSTREL
67 default "" 52 default ""
68 53
69endif 54endif
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index bb707789ef2b..4f03ebe732fa 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -53,13 +53,14 @@ mac80211-$(CONFIG_PM) += pm.o
53 53
54CFLAGS_trace.o := -I$(src) 54CFLAGS_trace.o := -I$(src)
55 55
56rc80211_minstrel-y := rc80211_minstrel.o 56rc80211_minstrel-y := \
57rc80211_minstrel-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_debugfs.o 57 rc80211_minstrel.o \
58 rc80211_minstrel_ht.o
58 59
59rc80211_minstrel_ht-y := rc80211_minstrel_ht.o 60rc80211_minstrel-$(CONFIG_MAC80211_DEBUGFS) += \
60rc80211_minstrel_ht-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_ht_debugfs.o 61 rc80211_minstrel_debugfs.o \
62 rc80211_minstrel_ht_debugfs.o
61 63
62mac80211-$(CONFIG_MAC80211_RC_MINSTREL) += $(rc80211_minstrel-y) 64mac80211-$(CONFIG_MAC80211_RC_MINSTREL) += $(rc80211_minstrel-y)
63mac80211-$(CONFIG_MAC80211_RC_MINSTREL_HT) += $(rc80211_minstrel_ht-y)
64 65
65ccflags-y += -DDEBUG 66ccflags-y += -DDEBUG
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 5d22eda8a6b1..51622333d460 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -158,12 +158,10 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
158 if (ret) 158 if (ret)
159 return ret; 159 return ret;
160 160
161 if (type == NL80211_IFTYPE_AP_VLAN && 161 if (type == NL80211_IFTYPE_AP_VLAN && params->use_4addr == 0) {
162 params && params->use_4addr == 0) {
163 RCU_INIT_POINTER(sdata->u.vlan.sta, NULL); 162 RCU_INIT_POINTER(sdata->u.vlan.sta, NULL);
164 ieee80211_check_fast_rx_iface(sdata); 163 ieee80211_check_fast_rx_iface(sdata);
165 } else if (type == NL80211_IFTYPE_STATION && 164 } else if (type == NL80211_IFTYPE_STATION && params->use_4addr >= 0) {
166 params && params->use_4addr >= 0) {
167 sdata->u.mgd.use_4addr = params->use_4addr; 165 sdata->u.mgd.use_4addr = params->use_4addr;
168 } 166 }
169 167
@@ -792,6 +790,48 @@ static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata,
792 return 0; 790 return 0;
793} 791}
794 792
793static int ieee80211_set_ftm_responder_params(
794 struct ieee80211_sub_if_data *sdata,
795 const u8 *lci, size_t lci_len,
796 const u8 *civicloc, size_t civicloc_len)
797{
798 struct ieee80211_ftm_responder_params *new, *old;
799 struct ieee80211_bss_conf *bss_conf;
800 u8 *pos;
801 int len;
802
803 if ((!lci || !lci_len) && (!civicloc || !civicloc_len))
804 return 1;
805
806 bss_conf = &sdata->vif.bss_conf;
807 old = bss_conf->ftmr_params;
808 len = lci_len + civicloc_len;
809
810 new = kzalloc(sizeof(*new) + len, GFP_KERNEL);
811 if (!new)
812 return -ENOMEM;
813
814 pos = (u8 *)(new + 1);
815 if (lci_len) {
816 new->lci_len = lci_len;
817 new->lci = pos;
818 memcpy(pos, lci, lci_len);
819 pos += lci_len;
820 }
821
822 if (civicloc_len) {
823 new->civicloc_len = civicloc_len;
824 new->civicloc = pos;
825 memcpy(pos, civicloc, civicloc_len);
826 pos += civicloc_len;
827 }
828
829 bss_conf->ftmr_params = new;
830 kfree(old);
831
832 return 0;
833}
834
795static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, 835static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
796 struct cfg80211_beacon_data *params, 836 struct cfg80211_beacon_data *params,
797 const struct ieee80211_csa_settings *csa) 837 const struct ieee80211_csa_settings *csa)
@@ -865,6 +905,20 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
865 if (err == 0) 905 if (err == 0)
866 changed |= BSS_CHANGED_AP_PROBE_RESP; 906 changed |= BSS_CHANGED_AP_PROBE_RESP;
867 907
908 if (params->ftm_responder != -1) {
909 sdata->vif.bss_conf.ftm_responder = params->ftm_responder;
910 err = ieee80211_set_ftm_responder_params(sdata,
911 params->lci,
912 params->lci_len,
913 params->civicloc,
914 params->civicloc_len);
915
916 if (err < 0)
917 return err;
918
919 changed |= BSS_CHANGED_FTM_RESPONDER;
920 }
921
868 rcu_assign_pointer(sdata->u.ap.beacon, new); 922 rcu_assign_pointer(sdata->u.ap.beacon, new);
869 923
870 if (old) 924 if (old)
@@ -911,6 +965,9 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
911 965
912 sdata->vif.bss_conf.beacon_int = params->beacon_interval; 966 sdata->vif.bss_conf.beacon_int = params->beacon_interval;
913 967
968 if (params->he_cap)
969 sdata->vif.bss_conf.he_support = true;
970
914 mutex_lock(&local->mtx); 971 mutex_lock(&local->mtx);
915 err = ieee80211_vif_use_channel(sdata, &params->chandef, 972 err = ieee80211_vif_use_channel(sdata, &params->chandef,
916 IEEE80211_CHANCTX_SHARED); 973 IEEE80211_CHANCTX_SHARED);
@@ -1062,6 +1119,9 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev)
1062 kfree_rcu(old_probe_resp, rcu_head); 1119 kfree_rcu(old_probe_resp, rcu_head);
1063 sdata->u.ap.driver_smps_mode = IEEE80211_SMPS_OFF; 1120 sdata->u.ap.driver_smps_mode = IEEE80211_SMPS_OFF;
1064 1121
1122 kfree(sdata->vif.bss_conf.ftmr_params);
1123 sdata->vif.bss_conf.ftmr_params = NULL;
1124
1065 __sta_info_flush(sdata, true); 1125 __sta_info_flush(sdata, true);
1066 ieee80211_free_keys(sdata, true); 1126 ieee80211_free_keys(sdata, true);
1067 1127
@@ -1092,50 +1152,6 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev)
1092 return 0; 1152 return 0;
1093} 1153}
1094 1154
1095/* Layer 2 Update frame (802.2 Type 1 LLC XID Update response) */
1096struct iapp_layer2_update {
1097 u8 da[ETH_ALEN]; /* broadcast */
1098 u8 sa[ETH_ALEN]; /* STA addr */
1099 __be16 len; /* 6 */
1100 u8 dsap; /* 0 */
1101 u8 ssap; /* 0 */
1102 u8 control;
1103 u8 xid_info[3];
1104} __packed;
1105
1106static void ieee80211_send_layer2_update(struct sta_info *sta)
1107{
1108 struct iapp_layer2_update *msg;
1109 struct sk_buff *skb;
1110
1111 /* Send Level 2 Update Frame to update forwarding tables in layer 2
1112 * bridge devices */
1113
1114 skb = dev_alloc_skb(sizeof(*msg));
1115 if (!skb)
1116 return;
1117 msg = skb_put(skb, sizeof(*msg));
1118
1119 /* 802.2 Type 1 Logical Link Control (LLC) Exchange Identifier (XID)
1120 * Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1 */
1121
1122 eth_broadcast_addr(msg->da);
1123 memcpy(msg->sa, sta->sta.addr, ETH_ALEN);
1124 msg->len = htons(6);
1125 msg->dsap = 0;
1126 msg->ssap = 0x01; /* NULL LSAP, CR Bit: Response */
1127 msg->control = 0xaf; /* XID response lsb.1111F101.
1128 * F=0 (no poll command; unsolicited frame) */
1129 msg->xid_info[0] = 0x81; /* XID format identifier */
1130 msg->xid_info[1] = 1; /* LLC types/classes: Type 1 LLC */
1131 msg->xid_info[2] = 0; /* XID sender's receive window size (RW) */
1132
1133 skb->dev = sta->sdata->dev;
1134 skb->protocol = eth_type_trans(skb, sta->sdata->dev);
1135 memset(skb->cb, 0, sizeof(skb->cb));
1136 netif_rx_ni(skb);
1137}
1138
1139static int sta_apply_auth_flags(struct ieee80211_local *local, 1155static int sta_apply_auth_flags(struct ieee80211_local *local,
1140 struct sta_info *sta, 1156 struct sta_info *sta,
1141 u32 mask, u32 set) 1157 u32 mask, u32 set)
@@ -1499,7 +1515,7 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
1499 } 1515 }
1500 1516
1501 if (layer2_update) 1517 if (layer2_update)
1502 ieee80211_send_layer2_update(sta); 1518 cfg80211_send_layer2_update(sta->sdata->dev, sta->sta.addr);
1503 1519
1504 rcu_read_unlock(); 1520 rcu_read_unlock();
1505 1521
@@ -1601,7 +1617,7 @@ static int ieee80211_change_station(struct wiphy *wiphy,
1601 if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) 1617 if (test_sta_flag(sta, WLAN_STA_AUTHORIZED))
1602 ieee80211_vif_inc_num_mcast(sta->sdata); 1618 ieee80211_vif_inc_num_mcast(sta->sdata);
1603 1619
1604 ieee80211_send_layer2_update(sta); 1620 cfg80211_send_layer2_update(sta->sdata->dev, sta->sta.addr);
1605 } 1621 }
1606 1622
1607 err = sta_apply_parameters(local, sta, params); 1623 err = sta_apply_parameters(local, sta, params);
@@ -2918,6 +2934,20 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon)
2918 memcpy(pos, beacon->probe_resp, beacon->probe_resp_len); 2934 memcpy(pos, beacon->probe_resp, beacon->probe_resp_len);
2919 pos += beacon->probe_resp_len; 2935 pos += beacon->probe_resp_len;
2920 } 2936 }
2937 if (beacon->ftm_responder)
2938 new_beacon->ftm_responder = beacon->ftm_responder;
2939 if (beacon->lci) {
2940 new_beacon->lci_len = beacon->lci_len;
2941 new_beacon->lci = pos;
2942 memcpy(pos, beacon->lci, beacon->lci_len);
2943 pos += beacon->lci_len;
2944 }
2945 if (beacon->civicloc) {
2946 new_beacon->civicloc_len = beacon->civicloc_len;
2947 new_beacon->civicloc = pos;
2948 memcpy(pos, beacon->civicloc, beacon->civicloc_len);
2949 pos += beacon->civicloc_len;
2950 }
2921 2951
2922 return new_beacon; 2952 return new_beacon;
2923} 2953}
@@ -3808,6 +3838,17 @@ out:
3808 return ret; 3838 return ret;
3809} 3839}
3810 3840
3841static int
3842ieee80211_get_ftm_responder_stats(struct wiphy *wiphy,
3843 struct net_device *dev,
3844 struct cfg80211_ftm_responder_stats *ftm_stats)
3845{
3846 struct ieee80211_local *local = wiphy_priv(wiphy);
3847 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
3848
3849 return drv_get_ftm_responder_stats(local, sdata, ftm_stats);
3850}
3851
3811const struct cfg80211_ops mac80211_config_ops = { 3852const struct cfg80211_ops mac80211_config_ops = {
3812 .add_virtual_intf = ieee80211_add_iface, 3853 .add_virtual_intf = ieee80211_add_iface,
3813 .del_virtual_intf = ieee80211_del_iface, 3854 .del_virtual_intf = ieee80211_del_iface,
@@ -3902,4 +3943,5 @@ const struct cfg80211_ops mac80211_config_ops = {
3902 .set_multicast_to_unicast = ieee80211_set_multicast_to_unicast, 3943 .set_multicast_to_unicast = ieee80211_set_multicast_to_unicast,
3903 .tx_control_port = ieee80211_tx_control_port, 3944 .tx_control_port = ieee80211_tx_control_port,
3904 .get_txq_stats = ieee80211_get_txq_stats, 3945 .get_txq_stats = ieee80211_get_txq_stats,
3946 .get_ftm_responder_stats = ieee80211_get_ftm_responder_stats,
3905}; 3947};
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index b5adf3625d16..3fe541e358f3 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -3,6 +3,7 @@
3 * 3 *
4 * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> 4 * Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
5 * Copyright 2013-2014 Intel Mobile Communications GmbH 5 * Copyright 2013-2014 Intel Mobile Communications GmbH
6 * Copyright (C) 2018 Intel Corporation
6 * 7 *
7 * GPLv2 8 * GPLv2
8 * 9 *
@@ -214,6 +215,9 @@ static const char *hw_flag_names[] = {
214 FLAG(SUPPORTS_TDLS_BUFFER_STA), 215 FLAG(SUPPORTS_TDLS_BUFFER_STA),
215 FLAG(DEAUTH_NEED_MGD_TX_PREP), 216 FLAG(DEAUTH_NEED_MGD_TX_PREP),
216 FLAG(DOESNT_SUPPORT_QOS_NDP), 217 FLAG(DOESNT_SUPPORT_QOS_NDP),
218 FLAG(BUFF_MMPDU_TXQ),
219 FLAG(SUPPORTS_VHT_EXT_NSS_BW),
220 FLAG(STA_MMPDU_TXQ),
217#undef FLAG 221#undef FLAG
218}; 222};
219 223
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 4105081dc1df..af5185a836e5 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -4,6 +4,7 @@
4 * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> 4 * Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
5 * Copyright 2013-2014 Intel Mobile Communications GmbH 5 * Copyright 2013-2014 Intel Mobile Communications GmbH
6 * Copyright(c) 2016 Intel Deutschland GmbH 6 * Copyright(c) 2016 Intel Deutschland GmbH
7 * Copyright (C) 2018 Intel Corporation
7 * 8 *
8 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as 10 * it under the terms of the GNU General Public License version 2 as
@@ -140,7 +141,7 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
140{ 141{
141 struct sta_info *sta = file->private_data; 142 struct sta_info *sta = file->private_data;
142 struct ieee80211_local *local = sta->local; 143 struct ieee80211_local *local = sta->local;
143 size_t bufsz = AQM_TXQ_ENTRY_LEN*(IEEE80211_NUM_TIDS+1); 144 size_t bufsz = AQM_TXQ_ENTRY_LEN * (IEEE80211_NUM_TIDS + 2);
144 char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf; 145 char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf;
145 struct txq_info *txqi; 146 struct txq_info *txqi;
146 ssize_t rv; 147 ssize_t rv;
@@ -162,7 +163,9 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
162 bufsz+buf-p, 163 bufsz+buf-p,
163 "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets flags\n"); 164 "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets flags\n");
164 165
165 for (i = 0; i < IEEE80211_NUM_TIDS; i++) { 166 for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
167 if (!sta->sta.txq[i])
168 continue;
166 txqi = to_txq_info(sta->sta.txq[i]); 169 txqi = to_txq_info(sta->sta.txq[i]);
167 p += scnprintf(p, bufsz+buf-p, 170 p += scnprintf(p, bufsz+buf-p,
168 "%d %d %u %u %u %u %u %u %u %u %u 0x%lx(%s%s%s)\n", 171 "%d %d %u %u %u %u %u %u %u %u %u 0x%lx(%s%s%s)\n",
@@ -487,12 +490,368 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf,
487 p += scnprintf(p, sizeof(buf)+buf-p, 490 p += scnprintf(p, sizeof(buf)+buf-p,
488 "MCS TX highest: %d Mbps\n", 491 "MCS TX highest: %d Mbps\n",
489 le16_to_cpu(vhtc->vht_mcs.tx_highest)); 492 le16_to_cpu(vhtc->vht_mcs.tx_highest));
493#undef PFLAG
490 } 494 }
491 495
492 return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); 496 return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
493} 497}
494STA_OPS(vht_capa); 498STA_OPS(vht_capa);
495 499
500static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf,
501 size_t count, loff_t *ppos)
502{
503 char *buf, *p;
504 size_t buf_sz = PAGE_SIZE;
505 struct sta_info *sta = file->private_data;
506 struct ieee80211_sta_he_cap *hec = &sta->sta.he_cap;
507 struct ieee80211_he_mcs_nss_supp *nss = &hec->he_mcs_nss_supp;
508 u8 ppe_size;
509 u8 *cap;
510 int i;
511 ssize_t ret;
512
513 buf = kmalloc(buf_sz, GFP_KERNEL);
514 if (!buf)
515 return -ENOMEM;
516 p = buf;
517
518 p += scnprintf(p, buf_sz + buf - p, "HE %ssupported\n",
519 hec->has_he ? "" : "not ");
520 if (!hec->has_he)
521 goto out;
522
523 cap = hec->he_cap_elem.mac_cap_info;
524 p += scnprintf(p, buf_sz + buf - p,
525 "MAC-CAP: %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x\n",
526 cap[0], cap[1], cap[2], cap[3], cap[4], cap[5]);
527
528#define PRINT(fmt, ...) \
529 p += scnprintf(p, buf_sz + buf - p, "\t\t" fmt "\n", \
530 ##__VA_ARGS__)
531
532#define PFLAG(t, n, a, b) \
533 do { \
534 if (cap[n] & IEEE80211_HE_##t##_CAP##n##_##a) \
535 PRINT("%s", b); \
536 } while (0)
537
538#define PFLAG_RANGE(t, i, n, s, m, off, fmt) \
539 do { \
540 u8 msk = IEEE80211_HE_##t##_CAP##i##_##n##_MASK; \
541 u8 idx = ((cap[i] & msk) >> (ffs(msk) - 1)) + off; \
542 PRINT(fmt, (s << idx) + (m * idx)); \
543 } while (0)
544
545#define PFLAG_RANGE_DEFAULT(t, i, n, s, m, off, fmt, a, b) \
546 do { \
547 if (cap[i] == IEEE80211_HE_##t ##_CAP##i##_##n##_##a) { \
548 PRINT("%s", b); \
549 break; \
550 } \
551 PFLAG_RANGE(t, i, n, s, m, off, fmt); \
552 } while (0)
553
554 PFLAG(MAC, 0, HTC_HE, "HTC-HE");
555 PFLAG(MAC, 0, TWT_REQ, "TWT-REQ");
556 PFLAG(MAC, 0, TWT_RES, "TWT-RES");
557 PFLAG_RANGE_DEFAULT(MAC, 0, DYNAMIC_FRAG, 0, 1, 0,
558 "DYNAMIC-FRAG-LEVEL-%d", NOT_SUPP, "NOT-SUPP");
559 PFLAG_RANGE_DEFAULT(MAC, 0, MAX_NUM_FRAG_MSDU, 1, 0, 0,
560 "MAX-NUM-FRAG-MSDU-%d", UNLIMITED, "UNLIMITED");
561
562 PFLAG_RANGE_DEFAULT(MAC, 1, MIN_FRAG_SIZE, 128, 0, -1,
563 "MIN-FRAG-SIZE-%d", UNLIMITED, "UNLIMITED");
564 PFLAG_RANGE_DEFAULT(MAC, 1, TF_MAC_PAD_DUR, 0, 8, 0,
565 "TF-MAC-PAD-DUR-%dUS", MASK, "UNKNOWN");
566 PFLAG_RANGE(MAC, 1, MULTI_TID_AGG_RX_QOS, 0, 1, 1,
567 "MULTI-TID-AGG-RX-QOS-%d");
568
569 if (cap[0] & IEEE80211_HE_MAC_CAP0_HTC_HE) {
570 switch (((cap[2] << 1) | (cap[1] >> 7)) & 0x3) {
571 case 0:
572 PRINT("LINK-ADAPTATION-NO-FEEDBACK");
573 break;
574 case 1:
575 PRINT("LINK-ADAPTATION-RESERVED");
576 break;
577 case 2:
578 PRINT("LINK-ADAPTATION-UNSOLICITED-FEEDBACK");
579 break;
580 case 3:
581 PRINT("LINK-ADAPTATION-BOTH");
582 break;
583 }
584 }
585
586 PFLAG(MAC, 2, ALL_ACK, "ALL-ACK");
587 PFLAG(MAC, 2, TRS, "TRS");
588 PFLAG(MAC, 2, BSR, "BSR");
589 PFLAG(MAC, 2, BCAST_TWT, "BCAST-TWT");
590 PFLAG(MAC, 2, 32BIT_BA_BITMAP, "32BIT-BA-BITMAP");
591 PFLAG(MAC, 2, MU_CASCADING, "MU-CASCADING");
592 PFLAG(MAC, 2, ACK_EN, "ACK-EN");
593
594 PFLAG(MAC, 3, OMI_CONTROL, "OMI-CONTROL");
595 PFLAG(MAC, 3, OFDMA_RA, "OFDMA-RA");
596
597 switch (cap[3] & IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK) {
598 case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_USE_VHT:
599 PRINT("MAX-AMPDU-LEN-EXP-USE-VHT");
600 break;
601 case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_1:
602 PRINT("MAX-AMPDU-LEN-EXP-VHT-1");
603 break;
604 case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2:
605 PRINT("MAX-AMPDU-LEN-EXP-VHT-2");
606 break;
607 case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_RESERVED:
608 PRINT("MAX-AMPDU-LEN-EXP-RESERVED");
609 break;
610 }
611
612 PFLAG(MAC, 3, AMSDU_FRAG, "AMSDU-FRAG");
613 PFLAG(MAC, 3, FLEX_TWT_SCHED, "FLEX-TWT-SCHED");
614 PFLAG(MAC, 3, RX_CTRL_FRAME_TO_MULTIBSS, "RX-CTRL-FRAME-TO-MULTIBSS");
615
616 PFLAG(MAC, 4, BSRP_BQRP_A_MPDU_AGG, "BSRP-BQRP-A-MPDU-AGG");
617 PFLAG(MAC, 4, QTP, "QTP");
618 PFLAG(MAC, 4, BQR, "BQR");
619 PFLAG(MAC, 4, SRP_RESP, "SRP-RESP");
620 PFLAG(MAC, 4, NDP_FB_REP, "NDP-FB-REP");
621 PFLAG(MAC, 4, OPS, "OPS");
622 PFLAG(MAC, 4, AMDSU_IN_AMPDU, "AMSDU-IN-AMPDU");
623
624 PRINT("MULTI-TID-AGG-TX-QOS-%d", ((cap[5] << 1) | (cap[4] >> 7)) & 0x7);
625
626 PFLAG(MAC, 5, SUBCHAN_SELECVITE_TRANSMISSION,
627 "SUBCHAN-SELECVITE-TRANSMISSION");
628 PFLAG(MAC, 5, UL_2x996_TONE_RU, "UL-2x996-TONE-RU");
629 PFLAG(MAC, 5, OM_CTRL_UL_MU_DATA_DIS_RX, "OM-CTRL-UL-MU-DATA-DIS-RX");
630
631 cap = hec->he_cap_elem.phy_cap_info;
632 p += scnprintf(p, buf_sz + buf - p,
633 "PHY CAP: %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x\n",
634 cap[0], cap[1], cap[2], cap[3], cap[4], cap[5], cap[6],
635 cap[7], cap[8], cap[9], cap[10]);
636
637 PFLAG(PHY, 0, CHANNEL_WIDTH_SET_40MHZ_IN_2G,
638 "CHANNEL-WIDTH-SET-40MHZ-IN-2G");
639 PFLAG(PHY, 0, CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G,
640 "CHANNEL-WIDTH-SET-40MHZ-80MHZ-IN-5G");
641 PFLAG(PHY, 0, CHANNEL_WIDTH_SET_160MHZ_IN_5G,
642 "CHANNEL-WIDTH-SET-160MHZ-IN-5G");
643 PFLAG(PHY, 0, CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G,
644 "CHANNEL-WIDTH-SET-80PLUS80-MHZ-IN-5G");
645 PFLAG(PHY, 0, CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G,
646 "CHANNEL-WIDTH-SET-RU-MAPPING-IN-2G");
647 PFLAG(PHY, 0, CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G,
648 "CHANNEL-WIDTH-SET-RU-MAPPING-IN-5G");
649
650 switch (cap[1] & IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK) {
651 case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_20MHZ:
652 PRINT("PREAMBLE-PUNC-RX-80MHZ-ONLY-SECOND-20MHZ");
653 break;
654 case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_40MHZ:
655 PRINT("PREAMBLE-PUNC-RX-80MHZ-ONLY-SECOND-40MHZ");
656 break;
657 case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_20MHZ:
658 PRINT("PREAMBLE-PUNC-RX-160MHZ-ONLY-SECOND-20MHZ");
659 break;
660 case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_40MHZ:
661 PRINT("PREAMBLE-PUNC-RX-160MHZ-ONLY-SECOND-40MHZ");
662 break;
663 }
664
665 PFLAG(PHY, 1, DEVICE_CLASS_A,
666 "IEEE80211-HE-PHY-CAP1-DEVICE-CLASS-A");
667 PFLAG(PHY, 1, LDPC_CODING_IN_PAYLOAD,
668 "LDPC-CODING-IN-PAYLOAD");
669 PFLAG(PHY, 1, HE_LTF_AND_GI_FOR_HE_PPDUS_0_8US,
670 "HY-CAP1-HE-LTF-AND-GI-FOR-HE-PPDUS-0-8US");
671 PRINT("MIDAMBLE-RX-MAX-NSTS-%d", ((cap[2] << 1) | (cap[1] >> 7)) & 0x3);
672
673 PFLAG(PHY, 2, NDP_4x_LTF_AND_3_2US, "NDP-4X-LTF-AND-3-2US");
674 PFLAG(PHY, 2, STBC_TX_UNDER_80MHZ, "STBC-TX-UNDER-80MHZ");
675 PFLAG(PHY, 2, STBC_RX_UNDER_80MHZ, "STBC-RX-UNDER-80MHZ");
676 PFLAG(PHY, 2, DOPPLER_TX, "DOPPLER-TX");
677 PFLAG(PHY, 2, DOPPLER_RX, "DOPPLER-RX");
678 PFLAG(PHY, 2, UL_MU_FULL_MU_MIMO, "UL-MU-FULL-MU-MIMO");
679 PFLAG(PHY, 2, UL_MU_PARTIAL_MU_MIMO, "UL-MU-PARTIAL-MU-MIMO");
680
681 switch (cap[3] & IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK) {
682 case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_NO_DCM:
683 PRINT("DCM-MAX-CONST-TX-NO-DCM");
684 break;
685 case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_BPSK:
686 PRINT("DCM-MAX-CONST-TX-BPSK");
687 break;
688 case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_QPSK:
689 PRINT("DCM-MAX-CONST-TX-QPSK");
690 break;
691 case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_16_QAM:
692 PRINT("DCM-MAX-CONST-TX-16-QAM");
693 break;
694 }
695
696 PFLAG(PHY, 3, DCM_MAX_TX_NSS_1, "DCM-MAX-TX-NSS-1");
697 PFLAG(PHY, 3, DCM_MAX_TX_NSS_2, "DCM-MAX-TX-NSS-2");
698
699 switch (cap[3] & IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK) {
700 case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_NO_DCM:
701 PRINT("DCM-MAX-CONST-RX-NO-DCM");
702 break;
703 case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_BPSK:
704 PRINT("DCM-MAX-CONST-RX-BPSK");
705 break;
706 case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_QPSK:
707 PRINT("DCM-MAX-CONST-RX-QPSK");
708 break;
709 case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_16_QAM:
710 PRINT("DCM-MAX-CONST-RX-16-QAM");
711 break;
712 }
713
714 PFLAG(PHY, 3, DCM_MAX_RX_NSS_1, "DCM-MAX-RX-NSS-1");
715 PFLAG(PHY, 3, DCM_MAX_RX_NSS_2, "DCM-MAX-RX-NSS-2");
716 PFLAG(PHY, 3, RX_HE_MU_PPDU_FROM_NON_AP_STA,
717 "RX-HE-MU-PPDU-FROM-NON-AP-STA");
718 PFLAG(PHY, 3, SU_BEAMFORMER, "SU-BEAMFORMER");
719
720 PFLAG(PHY, 4, SU_BEAMFORMEE, "SU-BEAMFORMEE");
721 PFLAG(PHY, 4, MU_BEAMFORMER, "MU-BEAMFORMER");
722
723 PFLAG_RANGE(PHY, 4, BEAMFORMEE_MAX_STS_UNDER_80MHZ, 0, 1, 4,
724 "BEAMFORMEE-MAX-STS-UNDER-%d");
725 PFLAG_RANGE(PHY, 4, BEAMFORMEE_MAX_STS_ABOVE_80MHZ, 0, 1, 4,
726 "BEAMFORMEE-MAX-STS-ABOVE-%d");
727
728 PFLAG_RANGE(PHY, 5, BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ, 0, 1, 1,
729 "NUM-SND-DIM-UNDER-80MHZ-%d");
730 PFLAG_RANGE(PHY, 5, BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ, 0, 1, 1,
731 "NUM-SND-DIM-ABOVE-80MHZ-%d");
732 PFLAG(PHY, 5, NG16_SU_FEEDBACK, "NG16-SU-FEEDBACK");
733 PFLAG(PHY, 5, NG16_MU_FEEDBACK, "NG16-MU-FEEDBACK");
734
735 PFLAG(PHY, 6, CODEBOOK_SIZE_42_SU, "CODEBOOK-SIZE-42-SU");
736 PFLAG(PHY, 6, CODEBOOK_SIZE_75_MU, "CODEBOOK-SIZE-75-MU");
737 PFLAG(PHY, 6, TRIG_SU_BEAMFORMER_FB, "TRIG-SU-BEAMFORMER-FB");
738 PFLAG(PHY, 6, TRIG_MU_BEAMFORMER_FB, "TRIG-MU-BEAMFORMER-FB");
739 PFLAG(PHY, 6, TRIG_CQI_FB, "TRIG-CQI-FB");
740 PFLAG(PHY, 6, PARTIAL_BW_EXT_RANGE, "PARTIAL-BW-EXT-RANGE");
741 PFLAG(PHY, 6, PARTIAL_BANDWIDTH_DL_MUMIMO,
742 "PARTIAL-BANDWIDTH-DL-MUMIMO");
743 PFLAG(PHY, 6, PPE_THRESHOLD_PRESENT, "PPE-THRESHOLD-PRESENT");
744
745 PFLAG(PHY, 7, SRP_BASED_SR, "SRP-BASED-SR");
746 PFLAG(PHY, 7, POWER_BOOST_FACTOR_AR, "POWER-BOOST-FACTOR-AR");
747 PFLAG(PHY, 7, HE_SU_MU_PPDU_4XLTF_AND_08_US_GI,
748 "HE-SU-MU-PPDU-4XLTF-AND-08-US-GI");
749 PFLAG_RANGE(PHY, 7, MAX_NC, 0, 1, 1, "MAX-NC-%d");
750 PFLAG(PHY, 7, STBC_TX_ABOVE_80MHZ, "STBC-TX-ABOVE-80MHZ");
751 PFLAG(PHY, 7, STBC_RX_ABOVE_80MHZ, "STBC-RX-ABOVE-80MHZ");
752
753 PFLAG(PHY, 8, HE_ER_SU_PPDU_4XLTF_AND_08_US_GI,
754 "HE-ER-SU-PPDU-4XLTF-AND-08-US-GI");
755 PFLAG(PHY, 8, 20MHZ_IN_40MHZ_HE_PPDU_IN_2G,
756 "20MHZ-IN-40MHZ-HE-PPDU-IN-2G");
757 PFLAG(PHY, 8, 20MHZ_IN_160MHZ_HE_PPDU, "20MHZ-IN-160MHZ-HE-PPDU");
758 PFLAG(PHY, 8, 80MHZ_IN_160MHZ_HE_PPDU, "80MHZ-IN-160MHZ-HE-PPDU");
759 PFLAG(PHY, 8, HE_ER_SU_1XLTF_AND_08_US_GI,
760 "HE-ER-SU-1XLTF-AND-08-US-GI");
761 PFLAG(PHY, 8, MIDAMBLE_RX_TX_2X_AND_1XLTF,
762 "MIDAMBLE-RX-TX-2X-AND-1XLTF");
763
764 switch (cap[8] & IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_MASK) {
765 case IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_20MHZ:
766 PRINT("DDCM-MAX-BW-20MHZ");
767 break;
768 case IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_40MHZ:
769 PRINT("DCM-MAX-BW-40MHZ");
770 break;
771 case IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_80MHZ:
772 PRINT("DCM-MAX-BW-80MHZ");
773 break;
774 case IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_160_OR_80P80_MHZ:
775 PRINT("DCM-MAX-BW-160-OR-80P80-MHZ");
776 break;
777 }
778
779 PFLAG(PHY, 9, LONGER_THAN_16_SIGB_OFDM_SYM,
780 "LONGER-THAN-16-SIGB-OFDM-SYM");
781 PFLAG(PHY, 9, NON_TRIGGERED_CQI_FEEDBACK,
782 "NON-TRIGGERED-CQI-FEEDBACK");
783 PFLAG(PHY, 9, TX_1024_QAM_LESS_THAN_242_TONE_RU,
784 "TX-1024-QAM-LESS-THAN-242-TONE-RU");
785 PFLAG(PHY, 9, RX_1024_QAM_LESS_THAN_242_TONE_RU,
786 "RX-1024-QAM-LESS-THAN-242-TONE-RU");
787 PFLAG(PHY, 9, RX_FULL_BW_SU_USING_MU_WITH_COMP_SIGB,
788 "RX-FULL-BW-SU-USING-MU-WITH-COMP-SIGB");
789 PFLAG(PHY, 9, RX_FULL_BW_SU_USING_MU_WITH_NON_COMP_SIGB,
790 "RX-FULL-BW-SU-USING-MU-WITH-NON-COMP-SIGB");
791
792#undef PFLAG_RANGE_DEFAULT
793#undef PFLAG_RANGE
794#undef PFLAG
795
796#define PRINT_NSS_SUPP(f, n) \
797 do { \
798 int i; \
799 u16 v = le16_to_cpu(nss->f); \
800 p += scnprintf(p, buf_sz + buf - p, n ": %#.4x\n", v); \
801 for (i = 0; i < 8; i += 2) { \
802 switch ((v >> i) & 0x3) { \
803 case 0: \
804 PRINT(n "-%d-SUPPORT-0-7", i / 2); \
805 break; \
806 case 1: \
807 PRINT(n "-%d-SUPPORT-0-9", i / 2); \
808 break; \
809 case 2: \
810 PRINT(n "-%d-SUPPORT-0-11", i / 2); \
811 break; \
812 case 3: \
813 PRINT(n "-%d-NOT-SUPPORTED", i / 2); \
814 break; \
815 } \
816 } \
817 } while (0)
818
819 PRINT_NSS_SUPP(rx_mcs_80, "RX-MCS-80");
820 PRINT_NSS_SUPP(tx_mcs_80, "TX-MCS-80");
821
822 if (cap[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) {
823 PRINT_NSS_SUPP(rx_mcs_160, "RX-MCS-160");
824 PRINT_NSS_SUPP(tx_mcs_160, "TX-MCS-160");
825 }
826
827 if (cap[0] &
828 IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) {
829 PRINT_NSS_SUPP(rx_mcs_80p80, "RX-MCS-80P80");
830 PRINT_NSS_SUPP(tx_mcs_80p80, "TX-MCS-80P80");
831 }
832
833#undef PRINT_NSS_SUPP
834#undef PRINT
835
836 if (!(cap[6] & IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT))
837 goto out;
838
839 p += scnprintf(p, buf_sz + buf - p, "PPE-THRESHOLDS: %#.2x",
840 hec->ppe_thres[0]);
841
842 ppe_size = ieee80211_he_ppe_size(hec->ppe_thres[0], cap);
843 for (i = 1; i < ppe_size; i++) {
844 p += scnprintf(p, buf_sz + buf - p, " %#.2x",
845 hec->ppe_thres[i]);
846 }
847 p += scnprintf(p, buf_sz + buf - p, "\n");
848
849out:
850 ret = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
851 kfree(buf);
852 return ret;
853}
854STA_OPS(he_capa);
496 855
497#define DEBUGFS_ADD(name) \ 856#define DEBUGFS_ADD(name) \
498 debugfs_create_file(#name, 0400, \ 857 debugfs_create_file(#name, 0400, \
@@ -538,6 +897,7 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
538 DEBUGFS_ADD(agg_status); 897 DEBUGFS_ADD(agg_status);
539 DEBUGFS_ADD(ht_capa); 898 DEBUGFS_ADD(ht_capa);
540 DEBUGFS_ADD(vht_capa); 899 DEBUGFS_ADD(vht_capa);
900 DEBUGFS_ADD(he_capa);
541 901
542 DEBUGFS_ADD_COUNTER(rx_duplicates, rx_stats.num_duplicates); 902 DEBUGFS_ADD_COUNTER(rx_duplicates, rx_stats.num_duplicates);
543 DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments); 903 DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments);
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 8f6998091d26..0b1747a2313d 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1173,6 +1173,32 @@ static inline void drv_wake_tx_queue(struct ieee80211_local *local,
1173 local->ops->wake_tx_queue(&local->hw, &txq->txq); 1173 local->ops->wake_tx_queue(&local->hw, &txq->txq);
1174} 1174}
1175 1175
1176static inline int drv_can_aggregate_in_amsdu(struct ieee80211_local *local,
1177 struct sk_buff *head,
1178 struct sk_buff *skb)
1179{
1180 if (!local->ops->can_aggregate_in_amsdu)
1181 return true;
1182
1183 return local->ops->can_aggregate_in_amsdu(&local->hw, head, skb);
1184}
1185
1186static inline int
1187drv_get_ftm_responder_stats(struct ieee80211_local *local,
1188 struct ieee80211_sub_if_data *sdata,
1189 struct cfg80211_ftm_responder_stats *ftm_stats)
1190{
1191 u32 ret = -EOPNOTSUPP;
1192
1193 if (local->ops->get_ftm_responder_stats)
1194 ret = local->ops->get_ftm_responder_stats(&local->hw,
1195 &sdata->vif,
1196 ftm_stats);
1197 trace_drv_get_ftm_responder_stats(local, sdata, ftm_stats);
1198
1199 return ret;
1200}
1201
1176static inline int drv_start_nan(struct ieee80211_local *local, 1202static inline int drv_start_nan(struct ieee80211_local *local,
1177 struct ieee80211_sub_if_data *sdata, 1203 struct ieee80211_sub_if_data *sdata,
1178 struct cfg80211_nan_conf *conf) 1204 struct cfg80211_nan_conf *conf)
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index f0f5fedb8caa..0d704e8d7078 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -1070,7 +1070,9 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata,
1070 struct ieee80211_vht_cap cap_ie; 1070 struct ieee80211_vht_cap cap_ie;
1071 struct ieee80211_sta_vht_cap cap = sta->sta.vht_cap; 1071 struct ieee80211_sta_vht_cap cap = sta->sta.vht_cap;
1072 1072
1073 ieee80211_chandef_vht_oper(elems->vht_operation, 1073 ieee80211_chandef_vht_oper(&local->hw,
1074 elems->vht_operation,
1075 elems->ht_operation,
1074 &chandef); 1076 &chandef);
1075 memcpy(&cap_ie, elems->vht_cap_elem, sizeof(cap_ie)); 1077 memcpy(&cap_ie, elems->vht_cap_elem, sizeof(cap_ie));
1076 ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, 1078 ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 172aeae21ae9..10a05062e4a0 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -377,6 +377,7 @@ struct ieee80211_mgd_auth_data {
377 u8 key[WLAN_KEY_LEN_WEP104]; 377 u8 key[WLAN_KEY_LEN_WEP104];
378 u8 key_len, key_idx; 378 u8 key_len, key_idx;
379 bool done; 379 bool done;
380 bool peer_confirmed;
380 bool timeout_started; 381 bool timeout_started;
381 382
382 u16 sae_trans, sae_status; 383 u16 sae_trans, sae_status;
@@ -818,6 +819,7 @@ enum txq_info_flags {
818 IEEE80211_TXQ_STOP, 819 IEEE80211_TXQ_STOP,
819 IEEE80211_TXQ_AMPDU, 820 IEEE80211_TXQ_AMPDU,
820 IEEE80211_TXQ_NO_AMSDU, 821 IEEE80211_TXQ_NO_AMSDU,
822 IEEE80211_TXQ_STOP_NETIF_TX,
821}; 823};
822 824
823/** 825/**
@@ -1198,6 +1200,9 @@ struct ieee80211_local {
1198 /* number of RX chains the hardware has */ 1200 /* number of RX chains the hardware has */
1199 u8 rx_chains; 1201 u8 rx_chains;
1200 1202
1203 /* bitmap of which sbands were copied */
1204 u8 sband_allocated;
1205
1201 int tx_headroom; /* required headroom for hardware/radiotap */ 1206 int tx_headroom; /* required headroom for hardware/radiotap */
1202 1207
1203 /* Tasklet and skb queue to process calls from IRQ mode. All frames 1208 /* Tasklet and skb queue to process calls from IRQ mode. All frames
@@ -1226,6 +1231,7 @@ struct ieee80211_local {
1226 1231
1227 struct sk_buff_head pending[IEEE80211_MAX_QUEUES]; 1232 struct sk_buff_head pending[IEEE80211_MAX_QUEUES];
1228 struct tasklet_struct tx_pending_tasklet; 1233 struct tasklet_struct tx_pending_tasklet;
1234 struct tasklet_struct wake_txqs_tasklet;
1229 1235
1230 atomic_t agg_queue_stop[IEEE80211_MAX_QUEUES]; 1236 atomic_t agg_queue_stop[IEEE80211_MAX_QUEUES];
1231 1237
@@ -2038,6 +2044,7 @@ void ieee80211_txq_remove_vlan(struct ieee80211_local *local,
2038 struct ieee80211_sub_if_data *sdata); 2044 struct ieee80211_sub_if_data *sdata);
2039void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats, 2045void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats,
2040 struct txq_info *txqi); 2046 struct txq_info *txqi);
2047void ieee80211_wake_txqs(unsigned long data);
2041void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, 2048void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
2042 u16 transaction, u16 auth_alg, u16 status, 2049 u16 transaction, u16 auth_alg, u16 status,
2043 const u8 *extra, size_t extra_len, const u8 *bssid, 2050 const u8 *extra, size_t extra_len, const u8 *bssid,
@@ -2106,7 +2113,9 @@ u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo);
2106/* channel management */ 2113/* channel management */
2107bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, 2114bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper,
2108 struct cfg80211_chan_def *chandef); 2115 struct cfg80211_chan_def *chandef);
2109bool ieee80211_chandef_vht_oper(const struct ieee80211_vht_operation *oper, 2116bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw,
2117 const struct ieee80211_vht_operation *oper,
2118 const struct ieee80211_ht_operation *htop,
2110 struct cfg80211_chan_def *chandef); 2119 struct cfg80211_chan_def *chandef);
2111u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c); 2120u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c);
2112 2121
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index c054ac85793c..4700718e010f 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -248,6 +248,7 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key)
248 (key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) 248 (key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM)))
249 increment_tailroom_need_count(sdata); 249 increment_tailroom_need_count(sdata);
250 250
251 key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE;
251 ret = drv_set_key(key->local, DISABLE_KEY, sdata, 252 ret = drv_set_key(key->local, DISABLE_KEY, sdata,
252 sta ? &sta->sta : NULL, &key->conf); 253 sta ? &sta->sta : NULL, &key->conf);
253 254
@@ -256,8 +257,65 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key)
256 "failed to remove key (%d, %pM) from hardware (%d)\n", 257 "failed to remove key (%d, %pM) from hardware (%d)\n",
257 key->conf.keyidx, 258 key->conf.keyidx,
258 sta ? sta->sta.addr : bcast_addr, ret); 259 sta ? sta->sta.addr : bcast_addr, ret);
260}
259 261
260 key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; 262static int ieee80211_hw_key_replace(struct ieee80211_key *old_key,
263 struct ieee80211_key *new_key,
264 bool ptk0rekey)
265{
266 struct ieee80211_sub_if_data *sdata;
267 struct ieee80211_local *local;
268 struct sta_info *sta;
269 int ret;
270
271 /* Aggregation sessions are OK when running on SW crypto.
272 * A broken remote STA may cause issues not observed with HW
273 * crypto, though.
274 */
275 if (!(old_key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
276 return 0;
277
278 assert_key_lock(old_key->local);
279 sta = old_key->sta;
280
281 /* PTK only using key ID 0 needs special handling on rekey */
282 if (new_key && sta && ptk0rekey) {
283 local = old_key->local;
284 sdata = old_key->sdata;
285
286 /* Stop TX till we are on the new key */
287 old_key->flags |= KEY_FLAG_TAINTED;
288 ieee80211_clear_fast_xmit(sta);
289
290 /* Aggregation sessions during rekey are complicated due to the
291 * reorder buffer and retransmits. Side step that by blocking
292 * aggregation during rekey and tear down running sessions.
293 */
294 if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) {
295 set_sta_flag(sta, WLAN_STA_BLOCK_BA);
296 ieee80211_sta_tear_down_BA_sessions(sta,
297 AGG_STOP_LOCAL_REQUEST);
298 }
299
300 if (!wiphy_ext_feature_isset(local->hw.wiphy,
301 NL80211_EXT_FEATURE_CAN_REPLACE_PTK0)) {
302 pr_warn_ratelimited("Rekeying PTK for STA %pM but driver can't safely do that.",
303 sta->sta.addr);
304 /* Flushing the driver queues *may* help prevent
305 * the clear text leaks and freezes.
306 */
307 ieee80211_flush_queues(local, sdata, false);
308 }
309 }
310
311 ieee80211_key_disable_hw_accel(old_key);
312
313 if (new_key)
314 ret = ieee80211_key_enable_hw_accel(new_key);
315 else
316 ret = 0;
317
318 return ret;
261} 319}
262 320
263static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, 321static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata,
@@ -316,38 +374,57 @@ void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata,
316} 374}
317 375
318 376
319static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, 377static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
320 struct sta_info *sta, 378 struct sta_info *sta,
321 bool pairwise, 379 bool pairwise,
322 struct ieee80211_key *old, 380 struct ieee80211_key *old,
323 struct ieee80211_key *new) 381 struct ieee80211_key *new)
324{ 382{
325 int idx; 383 int idx;
384 int ret;
326 bool defunikey, defmultikey, defmgmtkey; 385 bool defunikey, defmultikey, defmgmtkey;
327 386
328 /* caller must provide at least one old/new */ 387 /* caller must provide at least one old/new */
329 if (WARN_ON(!new && !old)) 388 if (WARN_ON(!new && !old))
330 return; 389 return 0;
331 390
332 if (new) 391 if (new)
333 list_add_tail_rcu(&new->list, &sdata->key_list); 392 list_add_tail_rcu(&new->list, &sdata->key_list);
334 393
335 WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx); 394 WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx);
336 395
337 if (old) 396 if (old) {
338 idx = old->conf.keyidx; 397 idx = old->conf.keyidx;
339 else 398 /* TODO: proper implement and test "Extended Key ID for
399 * Individually Addressed Frames" from IEEE 802.11-2016.
400 * Till then always assume only key ID 0 is used for
401 * pairwise keys.*/
402 ret = ieee80211_hw_key_replace(old, new, pairwise);
403 } else {
404 /* new must be provided in case old is not */
340 idx = new->conf.keyidx; 405 idx = new->conf.keyidx;
406 if (!new->local->wowlan)
407 ret = ieee80211_key_enable_hw_accel(new);
408 else
409 ret = 0;
410 }
411
412 if (ret)
413 return ret;
341 414
342 if (sta) { 415 if (sta) {
343 if (pairwise) { 416 if (pairwise) {
344 rcu_assign_pointer(sta->ptk[idx], new); 417 rcu_assign_pointer(sta->ptk[idx], new);
345 sta->ptk_idx = idx; 418 sta->ptk_idx = idx;
346 ieee80211_check_fast_xmit(sta); 419 if (new) {
420 clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
421 ieee80211_check_fast_xmit(sta);
422 }
347 } else { 423 } else {
348 rcu_assign_pointer(sta->gtk[idx], new); 424 rcu_assign_pointer(sta->gtk[idx], new);
349 } 425 }
350 ieee80211_check_fast_rx(sta); 426 if (new)
427 ieee80211_check_fast_rx(sta);
351 } else { 428 } else {
352 defunikey = old && 429 defunikey = old &&
353 old == key_mtx_dereference(sdata->local, 430 old == key_mtx_dereference(sdata->local,
@@ -380,6 +457,8 @@ static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
380 457
381 if (old) 458 if (old)
382 list_del_rcu(&old->list); 459 list_del_rcu(&old->list);
460
461 return 0;
383} 462}
384 463
385struct ieee80211_key * 464struct ieee80211_key *
@@ -575,9 +654,6 @@ static void ieee80211_key_free_common(struct ieee80211_key *key)
575static void __ieee80211_key_destroy(struct ieee80211_key *key, 654static void __ieee80211_key_destroy(struct ieee80211_key *key,
576 bool delay_tailroom) 655 bool delay_tailroom)
577{ 656{
578 if (key->local)
579 ieee80211_key_disable_hw_accel(key);
580
581 if (key->local) { 657 if (key->local) {
582 struct ieee80211_sub_if_data *sdata = key->sdata; 658 struct ieee80211_sub_if_data *sdata = key->sdata;
583 659
@@ -654,7 +730,6 @@ int ieee80211_key_link(struct ieee80211_key *key,
654 struct ieee80211_sub_if_data *sdata, 730 struct ieee80211_sub_if_data *sdata,
655 struct sta_info *sta) 731 struct sta_info *sta)
656{ 732{
657 struct ieee80211_local *local = sdata->local;
658 struct ieee80211_key *old_key; 733 struct ieee80211_key *old_key;
659 int idx = key->conf.keyidx; 734 int idx = key->conf.keyidx;
660 bool pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE; 735 bool pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE;
@@ -691,17 +766,13 @@ int ieee80211_key_link(struct ieee80211_key *key,
691 766
692 increment_tailroom_need_count(sdata); 767 increment_tailroom_need_count(sdata);
693 768
694 ieee80211_key_replace(sdata, sta, pairwise, old_key, key); 769 ret = ieee80211_key_replace(sdata, sta, pairwise, old_key, key);
695 ieee80211_key_destroy(old_key, delay_tailroom);
696
697 ieee80211_debugfs_key_add(key);
698 770
699 if (!local->wowlan) { 771 if (!ret) {
700 ret = ieee80211_key_enable_hw_accel(key); 772 ieee80211_debugfs_key_add(key);
701 if (ret) 773 ieee80211_key_destroy(old_key, delay_tailroom);
702 ieee80211_key_free(key, delay_tailroom);
703 } else { 774 } else {
704 ret = 0; 775 ieee80211_key_free(key, delay_tailroom);
705 } 776 }
706 777
707 out: 778 out:
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 513627896204..83e71e6b2ebe 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -4,6 +4,7 @@
4 * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> 4 * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
5 * Copyright 2013-2014 Intel Mobile Communications GmbH 5 * Copyright 2013-2014 Intel Mobile Communications GmbH
6 * Copyright (C) 2017 Intel Deutschland GmbH 6 * Copyright (C) 2017 Intel Deutschland GmbH
7 * Copyright (C) 2018 Intel Corporation
7 * 8 *
8 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as 10 * it under the terms of the GNU General Public License version 2 as
@@ -610,6 +611,18 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
610 local->ops = ops; 611 local->ops = ops;
611 local->use_chanctx = use_chanctx; 612 local->use_chanctx = use_chanctx;
612 613
614 /*
615 * We need a bit of data queued to build aggregates properly, so
616 * instruct the TCP stack to allow more than a single ms of data
617 * to be queued in the stack. The value is a bit-shift of 1
618 * second, so 8 is ~4ms of queued data. Only affects local TCP
619 * sockets.
620 * This is the default, anyhow - drivers may need to override it
621 * for local reasons (longer buffers, longer completion time, or
622 * similar).
623 */
624 local->hw.tx_sk_pacing_shift = 8;
625
613 /* set up some defaults */ 626 /* set up some defaults */
614 local->hw.queues = 1; 627 local->hw.queues = 1;
615 local->hw.max_rates = 1; 628 local->hw.max_rates = 1;
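For context (not part of the commit): the comment block above picks hw.tx_sk_pacing_shift = 8. Assuming the usual TCP small-queues behaviour of capping in-flight data at roughly pacing_rate >> sk_pacing_shift (that code is not shown in this diff), a shift of 8 permits about 1/256 s (~3.9 ms) of data at the current pacing rate:

/* Rough sketch; assumes limit = pacing_rate >> shift as applied by the
 * TCP stack, which is outside this diff. */
static unsigned long pacing_budget_bytes(unsigned long pacing_rate_Bps, u8 shift)
{
	/* e.g. 300 Mbit/s ~= 37500000 B/s; >> 8 gives ~146 kB, ~3.9 ms of data */
	return pacing_rate_Bps >> shift;
}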
@@ -684,6 +697,10 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
684 tasklet_init(&local->tx_pending_tasklet, ieee80211_tx_pending, 697 tasklet_init(&local->tx_pending_tasklet, ieee80211_tx_pending,
685 (unsigned long)local); 698 (unsigned long)local);
686 699
700 if (ops->wake_tx_queue)
701 tasklet_init(&local->wake_txqs_tasklet, ieee80211_wake_txqs,
702 (unsigned long)local);
703
687 tasklet_init(&local->tasklet, 704 tasklet_init(&local->tasklet,
688 ieee80211_tasklet_handler, 705 ieee80211_tasklet_handler,
689 (unsigned long) local); 706 (unsigned long) local);
@@ -1154,6 +1171,53 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
1154 goto fail_rate; 1171 goto fail_rate;
1155 } 1172 }
1156 1173
1174 if (local->rate_ctrl) {
1175 clear_bit(IEEE80211_HW_SUPPORTS_VHT_EXT_NSS_BW, hw->flags);
1176 if (local->rate_ctrl->ops->capa & RATE_CTRL_CAPA_VHT_EXT_NSS_BW)
1177 ieee80211_hw_set(hw, SUPPORTS_VHT_EXT_NSS_BW);
1178 }
1179
1180 /*
1181 * If the VHT capabilities don't have IEEE80211_VHT_EXT_NSS_BW_CAPABLE,
1182 * or have it when we don't, copy the sband structure and set/clear it.
1183 * This is necessary because rate scaling algorithms could be switched
1184 * and have different support values.
1185 * Print a message so that in the common case the reallocation can be
1186 * avoided.
1187 */
1188 BUILD_BUG_ON(NUM_NL80211_BANDS > 8 * sizeof(local->sband_allocated));
1189 for (band = 0; band < NUM_NL80211_BANDS; band++) {
1190 struct ieee80211_supported_band *sband;
1191 bool local_cap, ie_cap;
1192
1193 local_cap = ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW);
1194
1195 sband = local->hw.wiphy->bands[band];
1196 if (!sband || !sband->vht_cap.vht_supported)
1197 continue;
1198
1199 ie_cap = !!(sband->vht_cap.vht_mcs.tx_highest &
1200 cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE));
1201
1202 if (local_cap == ie_cap)
1203 continue;
1204
1205 sband = kmemdup(sband, sizeof(*sband), GFP_KERNEL);
1206 if (!sband) {
1207 result = -ENOMEM;
1208 goto fail_rate;
1209 }
1210
1211 wiphy_dbg(hw->wiphy, "copying sband (band %d) due to VHT EXT NSS BW flag\n",
1212 band);
1213
1214 sband->vht_cap.vht_mcs.tx_highest ^=
1215 cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE);
1216
1217 local->hw.wiphy->bands[band] = sband;
1218 local->sband_allocated |= BIT(band);
1219 }
1220
1157 /* add one default STA interface if supported */ 1221 /* add one default STA interface if supported */
1158 if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION) && 1222 if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION) &&
1159 !ieee80211_hw_check(hw, NO_AUTO_VIF)) { 1223 !ieee80211_hw_check(hw, NO_AUTO_VIF)) {
@@ -1272,6 +1336,7 @@ static int ieee80211_free_ack_frame(int id, void *p, void *data)
1272void ieee80211_free_hw(struct ieee80211_hw *hw) 1336void ieee80211_free_hw(struct ieee80211_hw *hw)
1273{ 1337{
1274 struct ieee80211_local *local = hw_to_local(hw); 1338 struct ieee80211_local *local = hw_to_local(hw);
1339 enum nl80211_band band;
1275 1340
1276 mutex_destroy(&local->iflist_mtx); 1341 mutex_destroy(&local->iflist_mtx);
1277 mutex_destroy(&local->mtx); 1342 mutex_destroy(&local->mtx);
@@ -1287,6 +1352,12 @@ void ieee80211_free_hw(struct ieee80211_hw *hw)
1287 1352
1288 ieee80211_free_led_names(local); 1353 ieee80211_free_led_names(local);
1289 1354
1355 for (band = 0; band < NUM_NL80211_BANDS; band++) {
1356 if (!(local->sband_allocated & BIT(band)))
1357 continue;
1358 kfree(local->hw.wiphy->bands[band]);
1359 }
1360
1290 wiphy_free(local->hw.wiphy); 1361 wiphy_free(local->hw.wiphy);
1291} 1362}
1292EXPORT_SYMBOL(ieee80211_free_hw); 1363EXPORT_SYMBOL(ieee80211_free_hw);
@@ -1304,18 +1375,12 @@ static int __init ieee80211_init(void)
1304 if (ret) 1375 if (ret)
1305 return ret; 1376 return ret;
1306 1377
1307 ret = rc80211_minstrel_ht_init();
1308 if (ret)
1309 goto err_minstrel;
1310
1311 ret = ieee80211_iface_init(); 1378 ret = ieee80211_iface_init();
1312 if (ret) 1379 if (ret)
1313 goto err_netdev; 1380 goto err_netdev;
1314 1381
1315 return 0; 1382 return 0;
1316 err_netdev: 1383 err_netdev:
1317 rc80211_minstrel_ht_exit();
1318 err_minstrel:
1319 rc80211_minstrel_exit(); 1384 rc80211_minstrel_exit();
1320 1385
1321 return ret; 1386 return ret;
@@ -1323,7 +1388,6 @@ static int __init ieee80211_init(void)
1323 1388
1324static void __exit ieee80211_exit(void) 1389static void __exit ieee80211_exit(void)
1325{ 1390{
1326 rc80211_minstrel_ht_exit();
1327 rc80211_minstrel_exit(); 1391 rc80211_minstrel_exit();
1328 1392
1329 ieee80211s_stop(); 1393 ieee80211s_stop();
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index d51da26e9c18..8bad414c52ad 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (c) 2008, 2009 open80211s Ltd. 2 * Copyright (c) 2008, 2009 open80211s Ltd.
3 * Copyright (C) 2018 Intel Corporation
3 * Authors: Luis Carlos Cobo <luisca@cozybit.com> 4 * Authors: Luis Carlos Cobo <luisca@cozybit.com>
4 * Javier Cardona <javier@cozybit.com> 5 * Javier Cardona <javier@cozybit.com>
5 * 6 *
@@ -98,7 +99,9 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata,
98 cfg80211_chandef_create(&sta_chan_def, sdata->vif.bss_conf.chandef.chan, 99 cfg80211_chandef_create(&sta_chan_def, sdata->vif.bss_conf.chandef.chan,
99 NL80211_CHAN_NO_HT); 100 NL80211_CHAN_NO_HT);
100 ieee80211_chandef_ht_oper(ie->ht_operation, &sta_chan_def); 101 ieee80211_chandef_ht_oper(ie->ht_operation, &sta_chan_def);
101 ieee80211_chandef_vht_oper(ie->vht_operation, &sta_chan_def); 102 ieee80211_chandef_vht_oper(&sdata->local->hw,
103 ie->vht_operation, ie->ht_operation,
104 &sta_chan_def);
102 105
103 if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chandef, 106 if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chandef,
104 &sta_chan_def)) 107 &sta_chan_def))
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 3dbecae4be73..d2bc8d57c87e 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -220,7 +220,8 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
220 memcpy(&he_oper_vht_cap, he_oper->optional, 3); 220 memcpy(&he_oper_vht_cap, he_oper->optional, 3);
221 he_oper_vht_cap.basic_mcs_set = cpu_to_le16(0); 221 he_oper_vht_cap.basic_mcs_set = cpu_to_le16(0);
222 222
223 if (!ieee80211_chandef_vht_oper(&he_oper_vht_cap, 223 if (!ieee80211_chandef_vht_oper(&sdata->local->hw,
224 &he_oper_vht_cap, ht_oper,
224 &vht_chandef)) { 225 &vht_chandef)) {
225 if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) 226 if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE))
226 sdata_info(sdata, 227 sdata_info(sdata,
@@ -228,7 +229,8 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
228 ret = IEEE80211_STA_DISABLE_HE; 229 ret = IEEE80211_STA_DISABLE_HE;
229 goto out; 230 goto out;
230 } 231 }
231 } else if (!ieee80211_chandef_vht_oper(vht_oper, &vht_chandef)) { 232 } else if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_oper,
233 ht_oper, &vht_chandef)) {
232 if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) 234 if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
233 sdata_info(sdata, 235 sdata_info(sdata,
234 "AP VHT information is invalid, disable VHT\n"); 236 "AP VHT information is invalid, disable VHT\n");
@@ -2759,13 +2761,40 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
2759 auth_data->key_idx, tx_flags); 2761 auth_data->key_idx, tx_flags);
2760} 2762}
2761 2763
2764static bool ieee80211_mark_sta_auth(struct ieee80211_sub_if_data *sdata,
2765 const u8 *bssid)
2766{
2767 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
2768 struct sta_info *sta;
2769
2770 sdata_info(sdata, "authenticated\n");
2771 ifmgd->auth_data->done = true;
2772 ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_WAIT_ASSOC;
2773 ifmgd->auth_data->timeout_started = true;
2774 run_again(sdata, ifmgd->auth_data->timeout);
2775
2776 /* move station state to auth */
2777 mutex_lock(&sdata->local->sta_mtx);
2778 sta = sta_info_get(sdata, bssid);
2779 if (!sta) {
2780 WARN_ONCE(1, "%s: STA %pM not found", sdata->name, bssid);
2781 return false;
2782 }
2783 if (sta_info_move_state(sta, IEEE80211_STA_AUTH)) {
2784 sdata_info(sdata, "failed moving %pM to auth\n", bssid);
2785 return false;
2786 }
2787 mutex_unlock(&sdata->local->sta_mtx);
2788
2789 return true;
2790}
2791
2762static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, 2792static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
2763 struct ieee80211_mgmt *mgmt, size_t len) 2793 struct ieee80211_mgmt *mgmt, size_t len)
2764{ 2794{
2765 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; 2795 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
2766 u8 bssid[ETH_ALEN]; 2796 u8 bssid[ETH_ALEN];
2767 u16 auth_alg, auth_transaction, status_code; 2797 u16 auth_alg, auth_transaction, status_code;
2768 struct sta_info *sta;
2769 struct ieee80211_event event = { 2798 struct ieee80211_event event = {
2770 .type = MLME_EVENT, 2799 .type = MLME_EVENT,
2771 .u.mlme.data = AUTH_EVENT, 2800 .u.mlme.data = AUTH_EVENT,
@@ -2789,7 +2818,11 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
2789 status_code = le16_to_cpu(mgmt->u.auth.status_code); 2818 status_code = le16_to_cpu(mgmt->u.auth.status_code);
2790 2819
2791 if (auth_alg != ifmgd->auth_data->algorithm || 2820 if (auth_alg != ifmgd->auth_data->algorithm ||
2792 auth_transaction != ifmgd->auth_data->expected_transaction) { 2821 (auth_alg != WLAN_AUTH_SAE &&
2822 auth_transaction != ifmgd->auth_data->expected_transaction) ||
2823 (auth_alg == WLAN_AUTH_SAE &&
2824 (auth_transaction < ifmgd->auth_data->expected_transaction ||
2825 auth_transaction > 2))) {
2793 sdata_info(sdata, "%pM unexpected authentication state: alg %d (expected %d) transact %d (expected %d)\n", 2826 sdata_info(sdata, "%pM unexpected authentication state: alg %d (expected %d) transact %d (expected %d)\n",
2794 mgmt->sa, auth_alg, ifmgd->auth_data->algorithm, 2827 mgmt->sa, auth_alg, ifmgd->auth_data->algorithm,
2795 auth_transaction, 2828 auth_transaction,
@@ -2832,35 +2865,17 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
2832 2865
2833 event.u.mlme.status = MLME_SUCCESS; 2866 event.u.mlme.status = MLME_SUCCESS;
2834 drv_event_callback(sdata->local, sdata, &event); 2867 drv_event_callback(sdata->local, sdata, &event);
2835 sdata_info(sdata, "authenticated\n"); 2868 if (ifmgd->auth_data->algorithm != WLAN_AUTH_SAE ||
2836 ifmgd->auth_data->done = true; 2869 (auth_transaction == 2 &&
2837 ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_WAIT_ASSOC; 2870 ifmgd->auth_data->expected_transaction == 2)) {
2838 ifmgd->auth_data->timeout_started = true; 2871 if (!ieee80211_mark_sta_auth(sdata, bssid))
2839 run_again(sdata, ifmgd->auth_data->timeout); 2872 goto out_err;
2840 2873 } else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE &&
2841 if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE && 2874 auth_transaction == 2) {
2842 ifmgd->auth_data->expected_transaction != 2) { 2875 sdata_info(sdata, "SAE peer confirmed\n");
2843 /* 2876 ifmgd->auth_data->peer_confirmed = true;
2844 * Report auth frame to user space for processing since another
2845 * round of Authentication frames is still needed.
2846 */
2847 cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len);
2848 return;
2849 } 2877 }
2850 2878
2851 /* move station state to auth */
2852 mutex_lock(&sdata->local->sta_mtx);
2853 sta = sta_info_get(sdata, bssid);
2854 if (!sta) {
2855 WARN_ONCE(1, "%s: STA %pM not found", sdata->name, bssid);
2856 goto out_err;
2857 }
2858 if (sta_info_move_state(sta, IEEE80211_STA_AUTH)) {
2859 sdata_info(sdata, "failed moving %pM to auth\n", bssid);
2860 goto out_err;
2861 }
2862 mutex_unlock(&sdata->local->sta_mtx);
2863
2864 cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); 2879 cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len);
2865 return; 2880 return;
2866 out_err: 2881 out_err:
@@ -3237,19 +3252,16 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
3237 } 3252 }
3238 3253
3239 if (bss_conf->he_support) { 3254 if (bss_conf->he_support) {
3240 u32 he_oper_params = 3255 bss_conf->bss_color =
3241 le32_to_cpu(elems.he_operation->he_oper_params); 3256 le32_get_bits(elems.he_operation->he_oper_params,
3257 IEEE80211_HE_OPERATION_BSS_COLOR_MASK);
3242 3258
3243 bss_conf->bss_color = he_oper_params &
3244 IEEE80211_HE_OPERATION_BSS_COLOR_MASK;
3245 bss_conf->htc_trig_based_pkt_ext = 3259 bss_conf->htc_trig_based_pkt_ext =
3246 (he_oper_params & 3260 le32_get_bits(elems.he_operation->he_oper_params,
3247 IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK) << 3261 IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK);
3248 IEEE80211_HE_OPERATION_DFLT_PE_DURATION_OFFSET;
3249 bss_conf->frame_time_rts_th = 3262 bss_conf->frame_time_rts_th =
3250 (he_oper_params & 3263 le32_get_bits(elems.he_operation->he_oper_params,
3251 IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK) << 3264 IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK);
3252 IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET;
3253 3265
3254 bss_conf->multi_sta_back_32bit = 3266 bss_conf->multi_sta_back_32bit =
3255 sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & 3267 sta->sta.he_cap.he_cap_elem.mac_cap_info[2] &
@@ -4879,6 +4891,7 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
4879 struct ieee80211_mgd_auth_data *auth_data; 4891 struct ieee80211_mgd_auth_data *auth_data;
4880 u16 auth_alg; 4892 u16 auth_alg;
4881 int err; 4893 int err;
4894 bool cont_auth;
4882 4895
4883 /* prepare auth data structure */ 4896 /* prepare auth data structure */
4884 4897
@@ -4913,6 +4926,9 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
4913 return -EOPNOTSUPP; 4926 return -EOPNOTSUPP;
4914 } 4927 }
4915 4928
4929 if (ifmgd->assoc_data)
4930 return -EBUSY;
4931
4916 auth_data = kzalloc(sizeof(*auth_data) + req->auth_data_len + 4932 auth_data = kzalloc(sizeof(*auth_data) + req->auth_data_len +
4917 req->ie_len, GFP_KERNEL); 4933 req->ie_len, GFP_KERNEL);
4918 if (!auth_data) 4934 if (!auth_data)
@@ -4932,6 +4948,13 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
4932 auth_data->data_len += req->auth_data_len - 4; 4948 auth_data->data_len += req->auth_data_len - 4;
4933 } 4949 }
4934 4950
4951 /* Check if continuing authentication or trying to authenticate with the
4952 * same BSS that we were in the process of authenticating with and avoid
4953 * removal and re-addition of the STA entry in
4954 * ieee80211_prep_connection().
4955 */
4956 cont_auth = ifmgd->auth_data && req->bss == ifmgd->auth_data->bss;
4957
4935 if (req->ie && req->ie_len) { 4958 if (req->ie && req->ie_len) {
4936 memcpy(&auth_data->data[auth_data->data_len], 4959 memcpy(&auth_data->data[auth_data->data_len],
4937 req->ie, req->ie_len); 4960 req->ie, req->ie_len);
@@ -4948,18 +4971,26 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
4948 4971
4949 /* try to authenticate/probe */ 4972 /* try to authenticate/probe */
4950 4973
4951 if ((ifmgd->auth_data && !ifmgd->auth_data->done) || 4974 if (ifmgd->auth_data) {
4952 ifmgd->assoc_data) { 4975 if (cont_auth && req->auth_type == NL80211_AUTHTYPE_SAE) {
4953 err = -EBUSY; 4976 auth_data->peer_confirmed =
4954 goto err_free; 4977 ifmgd->auth_data->peer_confirmed;
4978 }
4979 ieee80211_destroy_auth_data(sdata, cont_auth);
4955 } 4980 }
4956 4981
4957 if (ifmgd->auth_data)
4958 ieee80211_destroy_auth_data(sdata, false);
4959
4960 /* prep auth_data so we don't go into idle on disassoc */ 4982 /* prep auth_data so we don't go into idle on disassoc */
4961 ifmgd->auth_data = auth_data; 4983 ifmgd->auth_data = auth_data;
4962 4984
4985 /* If this is continuation of an ongoing SAE authentication exchange
4986 * (i.e., request to send SAE Confirm) and the peer has already
4987 * confirmed, mark authentication completed since we are about to send
4988 * out SAE Confirm.
4989 */
4990 if (cont_auth && req->auth_type == NL80211_AUTHTYPE_SAE &&
4991 auth_data->peer_confirmed && auth_data->sae_trans == 2)
4992 ieee80211_mark_sta_auth(sdata, req->bss->bssid);
4993
4963 if (ifmgd->associated) { 4994 if (ifmgd->associated) {
4964 u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; 4995 u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN];
4965 4996
@@ -4977,7 +5008,7 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
4977 5008
4978 sdata_info(sdata, "authenticate with %pM\n", req->bss->bssid); 5009 sdata_info(sdata, "authenticate with %pM\n", req->bss->bssid);
4979 5010
4980 err = ieee80211_prep_connection(sdata, req->bss, false, false); 5011 err = ieee80211_prep_connection(sdata, req->bss, cont_auth, false);
4981 if (err) 5012 if (err)
4982 goto err_clear; 5013 goto err_clear;
4983 5014
@@ -4998,7 +5029,6 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
4998 mutex_lock(&sdata->local->mtx); 5029 mutex_lock(&sdata->local->mtx);
4999 ieee80211_vif_release_channel(sdata); 5030 ieee80211_vif_release_channel(sdata);
5000 mutex_unlock(&sdata->local->mtx); 5031 mutex_unlock(&sdata->local->mtx);
5001 err_free:
5002 kfree(auth_data); 5032 kfree(auth_data);
5003 return err; 5033 return err;
5004} 5034}
diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h
index 8212bfeb71d6..d59198191a79 100644
--- a/net/mac80211/rate.h
+++ b/net/mac80211/rate.h
@@ -95,18 +95,5 @@ static inline void rc80211_minstrel_exit(void)
95} 95}
96#endif 96#endif
97 97
98#ifdef CONFIG_MAC80211_RC_MINSTREL_HT
99int rc80211_minstrel_ht_init(void);
100void rc80211_minstrel_ht_exit(void);
101#else
102static inline int rc80211_minstrel_ht_init(void)
103{
104 return 0;
105}
106static inline void rc80211_minstrel_ht_exit(void)
107{
108}
109#endif
110
111 98
112#endif /* IEEE80211_RATE_H */ 99#endif /* IEEE80211_RATE_H */
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index 07fb219327d6..a34e9c2ca626 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -167,12 +167,6 @@ minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs)
167 if (unlikely(!mrs->att_hist)) { 167 if (unlikely(!mrs->att_hist)) {
168 mrs->prob_ewma = cur_prob; 168 mrs->prob_ewma = cur_prob;
169 } else { 169 } else {
170 /* update exponential weighted moving variance */
171 mrs->prob_ewmv = minstrel_ewmv(mrs->prob_ewmv,
172 cur_prob,
173 mrs->prob_ewma,
174 EWMA_LEVEL);
175
176 /* update exponential weighted moving average */ 170 /* update exponential weighted moving average */
177 mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma, 171 mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma,
178 cur_prob, 172 cur_prob,
@@ -572,141 +566,6 @@ minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband,
572 minstrel_update_rates(mp, mi); 566 minstrel_update_rates(mp, mi);
573} 567}
574 568
575static void *
576minstrel_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp)
577{
578 struct ieee80211_supported_band *sband;
579 struct minstrel_sta_info *mi;
580 struct minstrel_priv *mp = priv;
581 struct ieee80211_hw *hw = mp->hw;
582 int max_rates = 0;
583 int i;
584
585 mi = kzalloc(sizeof(struct minstrel_sta_info), gfp);
586 if (!mi)
587 return NULL;
588
589 for (i = 0; i < NUM_NL80211_BANDS; i++) {
590 sband = hw->wiphy->bands[i];
591 if (sband && sband->n_bitrates > max_rates)
592 max_rates = sband->n_bitrates;
593 }
594
595 mi->r = kcalloc(max_rates, sizeof(struct minstrel_rate), gfp);
596 if (!mi->r)
597 goto error;
598
599 mi->sample_table = kmalloc_array(max_rates, SAMPLE_COLUMNS, gfp);
600 if (!mi->sample_table)
601 goto error1;
602
603 mi->last_stats_update = jiffies;
604 return mi;
605
606error1:
607 kfree(mi->r);
608error:
609 kfree(mi);
610 return NULL;
611}
612
613static void
614minstrel_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta)
615{
616 struct minstrel_sta_info *mi = priv_sta;
617
618 kfree(mi->sample_table);
619 kfree(mi->r);
620 kfree(mi);
621}
622
623static void
624minstrel_init_cck_rates(struct minstrel_priv *mp)
625{
626 static const int bitrates[4] = { 10, 20, 55, 110 };
627 struct ieee80211_supported_band *sband;
628 u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef);
629 int i, j;
630
631 sband = mp->hw->wiphy->bands[NL80211_BAND_2GHZ];
632 if (!sband)
633 return;
634
635 for (i = 0, j = 0; i < sband->n_bitrates; i++) {
636 struct ieee80211_rate *rate = &sband->bitrates[i];
637
638 if (rate->flags & IEEE80211_RATE_ERP_G)
639 continue;
640
641 if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
642 continue;
643
644 for (j = 0; j < ARRAY_SIZE(bitrates); j++) {
645 if (rate->bitrate != bitrates[j])
646 continue;
647
648 mp->cck_rates[j] = i;
649 break;
650 }
651 }
652}
653
654static void *
655minstrel_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir)
656{
657 struct minstrel_priv *mp;
658
659 mp = kzalloc(sizeof(struct minstrel_priv), GFP_ATOMIC);
660 if (!mp)
661 return NULL;
662
663 /* contention window settings
664 * Just an approximation. Using the per-queue values would complicate
665 * the calculations and is probably unnecessary */
666 mp->cw_min = 15;
667 mp->cw_max = 1023;
668
669 /* number of packets (in %) to use for sampling other rates
670 * sample less often for non-mrr packets, because the overhead
671 * is much higher than with mrr */
672 mp->lookaround_rate = 5;
673 mp->lookaround_rate_mrr = 10;
674
675 /* maximum time that the hw is allowed to stay in one MRR segment */
676 mp->segment_size = 6000;
677
678 if (hw->max_rate_tries > 0)
679 mp->max_retry = hw->max_rate_tries;
680 else
681 /* safe default, does not necessarily have to match hw properties */
682 mp->max_retry = 7;
683
684 if (hw->max_rates >= 4)
685 mp->has_mrr = true;
686
687 mp->hw = hw;
688 mp->update_interval = 100;
689
690#ifdef CONFIG_MAC80211_DEBUGFS
691 mp->fixed_rate_idx = (u32) -1;
692 mp->dbg_fixed_rate = debugfs_create_u32("fixed_rate_idx",
693 0666, debugfsdir, &mp->fixed_rate_idx);
694#endif
695
696 minstrel_init_cck_rates(mp);
697
698 return mp;
699}
700
701static void
702minstrel_free(void *priv)
703{
704#ifdef CONFIG_MAC80211_DEBUGFS
705 debugfs_remove(((struct minstrel_priv *)priv)->dbg_fixed_rate);
706#endif
707 kfree(priv);
708}
709
710static u32 minstrel_get_expected_throughput(void *priv_sta) 569static u32 minstrel_get_expected_throughput(void *priv_sta)
711{ 570{
712 struct minstrel_sta_info *mi = priv_sta; 571 struct minstrel_sta_info *mi = priv_sta;
@@ -725,29 +584,8 @@ static u32 minstrel_get_expected_throughput(void *priv_sta)
725} 584}
726 585
727const struct rate_control_ops mac80211_minstrel = { 586const struct rate_control_ops mac80211_minstrel = {
728 .name = "minstrel",
729 .tx_status_ext = minstrel_tx_status, 587 .tx_status_ext = minstrel_tx_status,
730 .get_rate = minstrel_get_rate, 588 .get_rate = minstrel_get_rate,
731 .rate_init = minstrel_rate_init, 589 .rate_init = minstrel_rate_init,
732 .alloc = minstrel_alloc,
733 .free = minstrel_free,
734 .alloc_sta = minstrel_alloc_sta,
735 .free_sta = minstrel_free_sta,
736#ifdef CONFIG_MAC80211_DEBUGFS
737 .add_sta_debugfs = minstrel_add_sta_debugfs,
738 .remove_sta_debugfs = minstrel_remove_sta_debugfs,
739#endif
740 .get_expected_throughput = minstrel_get_expected_throughput, 590 .get_expected_throughput = minstrel_get_expected_throughput,
741}; 591};
742
743int __init
744rc80211_minstrel_init(void)
745{
746 return ieee80211_rate_control_register(&mac80211_minstrel);
747}
748
749void
750rc80211_minstrel_exit(void)
751{
752 ieee80211_rate_control_unregister(&mac80211_minstrel);
753}
diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h
index be6c3f35f48b..23ec953e3a24 100644
--- a/net/mac80211/rc80211_minstrel.h
+++ b/net/mac80211/rc80211_minstrel.h
@@ -35,19 +35,6 @@ minstrel_ewma(int old, int new, int weight)
35 return old + incr; 35 return old + incr;
36} 36}
37 37
38/*
39 * Perform EWMV (Exponentially Weighted Moving Variance) calculation
40 */
41static inline int
42minstrel_ewmv(int old_ewmv, int cur_prob, int prob_ewma, int weight)
43{
44 int diff, incr;
45
46 diff = cur_prob - prob_ewma;
47 incr = (EWMA_DIV - weight) * diff / EWMA_DIV;
48 return weight * (old_ewmv + MINSTREL_TRUNC(diff * incr)) / EWMA_DIV;
49}
50
51struct minstrel_rate_stats { 38struct minstrel_rate_stats {
52 /* current / last sampling period attempts/success counters */ 39 /* current / last sampling period attempts/success counters */
53 u16 attempts, last_attempts; 40 u16 attempts, last_attempts;
@@ -56,11 +43,8 @@ struct minstrel_rate_stats {
56 /* total attempts/success counters */ 43 /* total attempts/success counters */
57 u32 att_hist, succ_hist; 44 u32 att_hist, succ_hist;
58 45
59 /* statistis of packet delivery probability 46 /* prob_ewma - exponential weighted moving average of prob */
60 * prob_ewma - exponential weighted moving average of prob
61 * prob_ewmsd - exp. weighted moving standard deviation of prob */
62 u16 prob_ewma; 47 u16 prob_ewma;
63 u16 prob_ewmv;
64 48
65 /* maximum retry counts */ 49 /* maximum retry counts */
66 u8 retry_count; 50 u8 retry_count;
@@ -109,11 +93,6 @@ struct minstrel_sta_info {
109 93
110 /* sampling table */ 94 /* sampling table */
111 u8 *sample_table; 95 u8 *sample_table;
112
113#ifdef CONFIG_MAC80211_DEBUGFS
114 struct dentry *dbg_stats;
115 struct dentry *dbg_stats_csv;
116#endif
117}; 96};
118 97
119struct minstrel_priv { 98struct minstrel_priv {
@@ -137,7 +116,6 @@ struct minstrel_priv {
137 * - setting will be applied on next update 116 * - setting will be applied on next update
138 */ 117 */
139 u32 fixed_rate_idx; 118 u32 fixed_rate_idx;
140 struct dentry *dbg_fixed_rate;
141#endif 119#endif
142}; 120};
143 121
@@ -146,17 +124,8 @@ struct minstrel_debugfs_info {
146 char buf[]; 124 char buf[];
147}; 125};
148 126
149/* Get EWMSD (Exponentially Weighted Moving Standard Deviation) * 10 */
150static inline int
151minstrel_get_ewmsd10(struct minstrel_rate_stats *mrs)
152{
153 unsigned int ewmv = mrs->prob_ewmv;
154 return int_sqrt(MINSTREL_TRUNC(ewmv * 1000 * 1000));
155}
156
157extern const struct rate_control_ops mac80211_minstrel; 127extern const struct rate_control_ops mac80211_minstrel;
158void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); 128void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir);
159void minstrel_remove_sta_debugfs(void *priv, void *priv_sta);
160 129
161/* Recalculate success probabilities and counters for a given rate using EWMA */ 130/* Recalculate success probabilities and counters for a given rate using EWMA */
162void minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs); 131void minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs);
@@ -165,7 +134,5 @@ int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma);
165/* debugfs */ 134/* debugfs */
166int minstrel_stats_open(struct inode *inode, struct file *file); 135int minstrel_stats_open(struct inode *inode, struct file *file);
167int minstrel_stats_csv_open(struct inode *inode, struct file *file); 136int minstrel_stats_csv_open(struct inode *inode, struct file *file);
168ssize_t minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *ppos);
169int minstrel_stats_release(struct inode *inode, struct file *file);
170 137
171#endif 138#endif
diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c
index 9ad7d63d3e5b..c8afd85b51a0 100644
--- a/net/mac80211/rc80211_minstrel_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_debugfs.c
@@ -54,22 +54,6 @@
54#include <net/mac80211.h> 54#include <net/mac80211.h>
55#include "rc80211_minstrel.h" 55#include "rc80211_minstrel.h"
56 56
57ssize_t
58minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *ppos)
59{
60 struct minstrel_debugfs_info *ms;
61
62 ms = file->private_data;
63 return simple_read_from_buffer(buf, len, ppos, ms->buf, ms->len);
64}
65
66int
67minstrel_stats_release(struct inode *inode, struct file *file)
68{
69 kfree(file->private_data);
70 return 0;
71}
72
73int 57int
74minstrel_stats_open(struct inode *inode, struct file *file) 58minstrel_stats_open(struct inode *inode, struct file *file)
75{ 59{
@@ -86,14 +70,13 @@ minstrel_stats_open(struct inode *inode, struct file *file)
86 p = ms->buf; 70 p = ms->buf;
87 p += sprintf(p, "\n"); 71 p += sprintf(p, "\n");
88 p += sprintf(p, 72 p += sprintf(p,
89 "best __________rate_________ ________statistics________ ____last_____ ______sum-of________\n"); 73 "best __________rate_________ ____statistics___ ____last_____ ______sum-of________\n");
90 p += sprintf(p, 74 p += sprintf(p,
91 "rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [retry|suc|att] [#success | #attempts]\n"); 75 "rate [name idx airtime max_tp] [avg(tp) avg(prob)] [retry|suc|att] [#success | #attempts]\n");
92 76
93 for (i = 0; i < mi->n_rates; i++) { 77 for (i = 0; i < mi->n_rates; i++) {
94 struct minstrel_rate *mr = &mi->r[i]; 78 struct minstrel_rate *mr = &mi->r[i];
95 struct minstrel_rate_stats *mrs = &mi->r[i].stats; 79 struct minstrel_rate_stats *mrs = &mi->r[i].stats;
96 unsigned int prob_ewmsd;
97 80
98 *(p++) = (i == mi->max_tp_rate[0]) ? 'A' : ' '; 81 *(p++) = (i == mi->max_tp_rate[0]) ? 'A' : ' ';
99 *(p++) = (i == mi->max_tp_rate[1]) ? 'B' : ' '; 82 *(p++) = (i == mi->max_tp_rate[1]) ? 'B' : ' ';
@@ -109,15 +92,13 @@ minstrel_stats_open(struct inode *inode, struct file *file)
109 tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); 92 tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
110 tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); 93 tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
111 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); 94 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
112 prob_ewmsd = minstrel_get_ewmsd10(mrs);
113 95
114 p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" 96 p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u"
115 " %3u %3u %-3u " 97 " %3u %3u %-3u "
116 "%9llu %-9llu\n", 98 "%9llu %-9llu\n",
117 tp_max / 10, tp_max % 10, 99 tp_max / 10, tp_max % 10,
118 tp_avg / 10, tp_avg % 10, 100 tp_avg / 10, tp_avg % 10,
119 eprob / 10, eprob % 10, 101 eprob / 10, eprob % 10,
120 prob_ewmsd / 10, prob_ewmsd % 10,
121 mrs->retry_count, 102 mrs->retry_count,
122 mrs->last_success, 103 mrs->last_success,
123 mrs->last_attempts, 104 mrs->last_attempts,
@@ -135,14 +116,6 @@ minstrel_stats_open(struct inode *inode, struct file *file)
135 return 0; 116 return 0;
136} 117}
137 118
138static const struct file_operations minstrel_stat_fops = {
139 .owner = THIS_MODULE,
140 .open = minstrel_stats_open,
141 .read = minstrel_stats_read,
142 .release = minstrel_stats_release,
143 .llseek = default_llseek,
144};
145
146int 119int
147minstrel_stats_csv_open(struct inode *inode, struct file *file) 120minstrel_stats_csv_open(struct inode *inode, struct file *file)
148{ 121{
@@ -161,7 +134,6 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
161 for (i = 0; i < mi->n_rates; i++) { 134 for (i = 0; i < mi->n_rates; i++) {
162 struct minstrel_rate *mr = &mi->r[i]; 135 struct minstrel_rate *mr = &mi->r[i];
163 struct minstrel_rate_stats *mrs = &mi->r[i].stats; 136 struct minstrel_rate_stats *mrs = &mi->r[i].stats;
164 unsigned int prob_ewmsd;
165 137
166 p += sprintf(p, "%s" ,((i == mi->max_tp_rate[0]) ? "A" : "")); 138 p += sprintf(p, "%s" ,((i == mi->max_tp_rate[0]) ? "A" : ""));
167 p += sprintf(p, "%s" ,((i == mi->max_tp_rate[1]) ? "B" : "")); 139 p += sprintf(p, "%s" ,((i == mi->max_tp_rate[1]) ? "B" : ""));
@@ -177,14 +149,12 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
177 tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); 149 tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
178 tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); 150 tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
179 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); 151 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
180 prob_ewmsd = minstrel_get_ewmsd10(mrs);
181 152
182 p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u," 153 p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u,%u,%u,"
183 "%llu,%llu,%d,%d\n", 154 "%llu,%llu,%d,%d\n",
184 tp_max / 10, tp_max % 10, 155 tp_max / 10, tp_max % 10,
185 tp_avg / 10, tp_avg % 10, 156 tp_avg / 10, tp_avg % 10,
186 eprob / 10, eprob % 10, 157 eprob / 10, eprob % 10,
187 prob_ewmsd / 10, prob_ewmsd % 10,
188 mrs->retry_count, 158 mrs->retry_count,
189 mrs->last_success, 159 mrs->last_success,
190 mrs->last_attempts, 160 mrs->last_attempts,
@@ -200,33 +170,3 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
200 170
201 return 0; 171 return 0;
202} 172}
203
204static const struct file_operations minstrel_stat_csv_fops = {
205 .owner = THIS_MODULE,
206 .open = minstrel_stats_csv_open,
207 .read = minstrel_stats_read,
208 .release = minstrel_stats_release,
209 .llseek = default_llseek,
210};
211
212void
213minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir)
214{
215 struct minstrel_sta_info *mi = priv_sta;
216
217 mi->dbg_stats = debugfs_create_file("rc_stats", 0444, dir, mi,
218 &minstrel_stat_fops);
219
220 mi->dbg_stats_csv = debugfs_create_file("rc_stats_csv", 0444, dir, mi,
221 &minstrel_stat_csv_fops);
222}
223
224void
225minstrel_remove_sta_debugfs(void *priv, void *priv_sta)
226{
227 struct minstrel_sta_info *mi = priv_sta;
228
229 debugfs_remove(mi->dbg_stats);
230
231 debugfs_remove(mi->dbg_stats_csv);
232}
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 67ebdeaffbbc..f466ec37d161 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -52,22 +52,23 @@
52 _streams - 1 52 _streams - 1
53 53
54/* MCS rate information for an MCS group */ 54/* MCS rate information for an MCS group */
55#define MCS_GROUP(_streams, _sgi, _ht40) \ 55#define MCS_GROUP(_streams, _sgi, _ht40, _s) \
56 [GROUP_IDX(_streams, _sgi, _ht40)] = { \ 56 [GROUP_IDX(_streams, _sgi, _ht40)] = { \
57 .streams = _streams, \ 57 .streams = _streams, \
58 .shift = _s, \
58 .flags = \ 59 .flags = \
59 IEEE80211_TX_RC_MCS | \ 60 IEEE80211_TX_RC_MCS | \
60 (_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \ 61 (_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \
61 (_ht40 ? IEEE80211_TX_RC_40_MHZ_WIDTH : 0), \ 62 (_ht40 ? IEEE80211_TX_RC_40_MHZ_WIDTH : 0), \
62 .duration = { \ 63 .duration = { \
63 MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26), \ 64 MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26) >> _s, \
64 MCS_DURATION(_streams, _sgi, _ht40 ? 108 : 52), \ 65 MCS_DURATION(_streams, _sgi, _ht40 ? 108 : 52) >> _s, \
65 MCS_DURATION(_streams, _sgi, _ht40 ? 162 : 78), \ 66 MCS_DURATION(_streams, _sgi, _ht40 ? 162 : 78) >> _s, \
66 MCS_DURATION(_streams, _sgi, _ht40 ? 216 : 104), \ 67 MCS_DURATION(_streams, _sgi, _ht40 ? 216 : 104) >> _s, \
67 MCS_DURATION(_streams, _sgi, _ht40 ? 324 : 156), \ 68 MCS_DURATION(_streams, _sgi, _ht40 ? 324 : 156) >> _s, \
68 MCS_DURATION(_streams, _sgi, _ht40 ? 432 : 208), \ 69 MCS_DURATION(_streams, _sgi, _ht40 ? 432 : 208) >> _s, \
69 MCS_DURATION(_streams, _sgi, _ht40 ? 486 : 234), \ 70 MCS_DURATION(_streams, _sgi, _ht40 ? 486 : 234) >> _s, \
70 MCS_DURATION(_streams, _sgi, _ht40 ? 540 : 260) \ 71 MCS_DURATION(_streams, _sgi, _ht40 ? 540 : 260) >> _s \
71 } \ 72 } \
72} 73}
73 74
@@ -80,9 +81,10 @@
80#define BW2VBPS(_bw, r3, r2, r1) \ 81#define BW2VBPS(_bw, r3, r2, r1) \
81 (_bw == BW_80 ? r3 : _bw == BW_40 ? r2 : r1) 82 (_bw == BW_80 ? r3 : _bw == BW_40 ? r2 : r1)
82 83
83#define VHT_GROUP(_streams, _sgi, _bw) \ 84#define VHT_GROUP(_streams, _sgi, _bw, _s) \
84 [VHT_GROUP_IDX(_streams, _sgi, _bw)] = { \ 85 [VHT_GROUP_IDX(_streams, _sgi, _bw)] = { \
85 .streams = _streams, \ 86 .streams = _streams, \
87 .shift = _s, \
86 .flags = \ 88 .flags = \
87 IEEE80211_TX_RC_VHT_MCS | \ 89 IEEE80211_TX_RC_VHT_MCS | \
88 (_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \ 90 (_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \
@@ -90,25 +92,25 @@
90 _bw == BW_40 ? IEEE80211_TX_RC_40_MHZ_WIDTH : 0), \ 92 _bw == BW_40 ? IEEE80211_TX_RC_40_MHZ_WIDTH : 0), \
91 .duration = { \ 93 .duration = { \
92 MCS_DURATION(_streams, _sgi, \ 94 MCS_DURATION(_streams, _sgi, \
93 BW2VBPS(_bw, 117, 54, 26)), \ 95 BW2VBPS(_bw, 117, 54, 26)) >> _s, \
94 MCS_DURATION(_streams, _sgi, \ 96 MCS_DURATION(_streams, _sgi, \
95 BW2VBPS(_bw, 234, 108, 52)), \ 97 BW2VBPS(_bw, 234, 108, 52)) >> _s, \
96 MCS_DURATION(_streams, _sgi, \ 98 MCS_DURATION(_streams, _sgi, \
97 BW2VBPS(_bw, 351, 162, 78)), \ 99 BW2VBPS(_bw, 351, 162, 78)) >> _s, \
98 MCS_DURATION(_streams, _sgi, \ 100 MCS_DURATION(_streams, _sgi, \
99 BW2VBPS(_bw, 468, 216, 104)), \ 101 BW2VBPS(_bw, 468, 216, 104)) >> _s, \
100 MCS_DURATION(_streams, _sgi, \ 102 MCS_DURATION(_streams, _sgi, \
101 BW2VBPS(_bw, 702, 324, 156)), \ 103 BW2VBPS(_bw, 702, 324, 156)) >> _s, \
102 MCS_DURATION(_streams, _sgi, \ 104 MCS_DURATION(_streams, _sgi, \
103 BW2VBPS(_bw, 936, 432, 208)), \ 105 BW2VBPS(_bw, 936, 432, 208)) >> _s, \
104 MCS_DURATION(_streams, _sgi, \ 106 MCS_DURATION(_streams, _sgi, \
105 BW2VBPS(_bw, 1053, 486, 234)), \ 107 BW2VBPS(_bw, 1053, 486, 234)) >> _s, \
106 MCS_DURATION(_streams, _sgi, \ 108 MCS_DURATION(_streams, _sgi, \
107 BW2VBPS(_bw, 1170, 540, 260)), \ 109 BW2VBPS(_bw, 1170, 540, 260)) >> _s, \
108 MCS_DURATION(_streams, _sgi, \ 110 MCS_DURATION(_streams, _sgi, \
109 BW2VBPS(_bw, 1404, 648, 312)), \ 111 BW2VBPS(_bw, 1404, 648, 312)) >> _s, \
110 MCS_DURATION(_streams, _sgi, \ 112 MCS_DURATION(_streams, _sgi, \
111 BW2VBPS(_bw, 1560, 720, 346)) \ 113 BW2VBPS(_bw, 1560, 720, 346)) >> _s \
112 } \ 114 } \
113} 115}
114 116
@@ -121,28 +123,27 @@
121 (CCK_DURATION((_bitrate > 10 ? 20 : 10), false, 60) + \ 123 (CCK_DURATION((_bitrate > 10 ? 20 : 10), false, 60) + \
122 CCK_DURATION(_bitrate, _short, AVG_PKT_SIZE)) 124 CCK_DURATION(_bitrate, _short, AVG_PKT_SIZE))
123 125
124#define CCK_DURATION_LIST(_short) \ 126#define CCK_DURATION_LIST(_short, _s) \
125 CCK_ACK_DURATION(10, _short), \ 127 CCK_ACK_DURATION(10, _short) >> _s, \
126 CCK_ACK_DURATION(20, _short), \ 128 CCK_ACK_DURATION(20, _short) >> _s, \
127 CCK_ACK_DURATION(55, _short), \ 129 CCK_ACK_DURATION(55, _short) >> _s, \
128 CCK_ACK_DURATION(110, _short) 130 CCK_ACK_DURATION(110, _short) >> _s
129 131
130#define CCK_GROUP \ 132#define CCK_GROUP(_s) \
131 [MINSTREL_CCK_GROUP] = { \ 133 [MINSTREL_CCK_GROUP] = { \
132 .streams = 0, \ 134 .streams = 1, \
133 .flags = 0, \ 135 .flags = 0, \
136 .shift = _s, \
134 .duration = { \ 137 .duration = { \
135 CCK_DURATION_LIST(false), \ 138 CCK_DURATION_LIST(false, _s), \
136 CCK_DURATION_LIST(true) \ 139 CCK_DURATION_LIST(true, _s) \
137 } \ 140 } \
138 } 141 }
139 142
140#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
141static bool minstrel_vht_only = true; 143static bool minstrel_vht_only = true;
142module_param(minstrel_vht_only, bool, 0644); 144module_param(minstrel_vht_only, bool, 0644);
143MODULE_PARM_DESC(minstrel_vht_only, 145MODULE_PARM_DESC(minstrel_vht_only,
144 "Use only VHT rates when VHT is supported by sta."); 146 "Use only VHT rates when VHT is supported by sta.");
145#endif
146 147
147/* 148/*
148 * To enable sufficiently targeted rate sampling, MCS rates are divided into 149 * To enable sufficiently targeted rate sampling, MCS rates are divided into
@@ -153,49 +154,47 @@ MODULE_PARM_DESC(minstrel_vht_only,
153 * BW -> SGI -> #streams 154 * BW -> SGI -> #streams
154 */ 155 */
155const struct mcs_group minstrel_mcs_groups[] = { 156const struct mcs_group minstrel_mcs_groups[] = {
156 MCS_GROUP(1, 0, BW_20), 157 MCS_GROUP(1, 0, BW_20, 5),
157 MCS_GROUP(2, 0, BW_20), 158 MCS_GROUP(2, 0, BW_20, 4),
158 MCS_GROUP(3, 0, BW_20), 159 MCS_GROUP(3, 0, BW_20, 4),
159 160
160 MCS_GROUP(1, 1, BW_20), 161 MCS_GROUP(1, 1, BW_20, 5),
161 MCS_GROUP(2, 1, BW_20), 162 MCS_GROUP(2, 1, BW_20, 4),
162 MCS_GROUP(3, 1, BW_20), 163 MCS_GROUP(3, 1, BW_20, 4),
163 164
164 MCS_GROUP(1, 0, BW_40), 165 MCS_GROUP(1, 0, BW_40, 4),
165 MCS_GROUP(2, 0, BW_40), 166 MCS_GROUP(2, 0, BW_40, 4),
166 MCS_GROUP(3, 0, BW_40), 167 MCS_GROUP(3, 0, BW_40, 4),
167 168
168 MCS_GROUP(1, 1, BW_40), 169 MCS_GROUP(1, 1, BW_40, 4),
169 MCS_GROUP(2, 1, BW_40), 170 MCS_GROUP(2, 1, BW_40, 4),
170 MCS_GROUP(3, 1, BW_40), 171 MCS_GROUP(3, 1, BW_40, 4),
171 172
172 CCK_GROUP, 173 CCK_GROUP(8),
173 174
174#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT 175 VHT_GROUP(1, 0, BW_20, 5),
175 VHT_GROUP(1, 0, BW_20), 176 VHT_GROUP(2, 0, BW_20, 4),
176 VHT_GROUP(2, 0, BW_20), 177 VHT_GROUP(3, 0, BW_20, 4),
177 VHT_GROUP(3, 0, BW_20),
178 178
179 VHT_GROUP(1, 1, BW_20), 179 VHT_GROUP(1, 1, BW_20, 5),
180 VHT_GROUP(2, 1, BW_20), 180 VHT_GROUP(2, 1, BW_20, 4),
181 VHT_GROUP(3, 1, BW_20), 181 VHT_GROUP(3, 1, BW_20, 4),
182 182
183 VHT_GROUP(1, 0, BW_40), 183 VHT_GROUP(1, 0, BW_40, 4),
184 VHT_GROUP(2, 0, BW_40), 184 VHT_GROUP(2, 0, BW_40, 4),
185 VHT_GROUP(3, 0, BW_40), 185 VHT_GROUP(3, 0, BW_40, 4),
186 186
187 VHT_GROUP(1, 1, BW_40), 187 VHT_GROUP(1, 1, BW_40, 4),
188 VHT_GROUP(2, 1, BW_40), 188 VHT_GROUP(2, 1, BW_40, 4),
189 VHT_GROUP(3, 1, BW_40), 189 VHT_GROUP(3, 1, BW_40, 4),
190 190
191 VHT_GROUP(1, 0, BW_80), 191 VHT_GROUP(1, 0, BW_80, 4),
192 VHT_GROUP(2, 0, BW_80), 192 VHT_GROUP(2, 0, BW_80, 4),
193 VHT_GROUP(3, 0, BW_80), 193 VHT_GROUP(3, 0, BW_80, 4),
194 194
195 VHT_GROUP(1, 1, BW_80), 195 VHT_GROUP(1, 1, BW_80, 4),
196 VHT_GROUP(2, 1, BW_80), 196 VHT_GROUP(2, 1, BW_80, 4),
197 VHT_GROUP(3, 1, BW_80), 197 VHT_GROUP(3, 1, BW_80, 4),
198#endif
199}; 198};
200 199
201static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] __read_mostly; 200static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] __read_mostly;
@@ -282,7 +281,8 @@ minstrel_ht_get_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
282 break; 281 break;
283 282
284 /* short preamble */ 283 /* short preamble */
285 if (!(mi->supported[group] & BIT(idx))) 284 if ((mi->supported[group] & BIT(idx + 4)) &&
285 (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE))
286 idx += 4; 286 idx += 4;
287 } 287 }
288 return &mi->groups[group].rates[idx]; 288 return &mi->groups[group].rates[idx];
@@ -311,7 +311,8 @@ minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate,
311 if (group != MINSTREL_CCK_GROUP) 311 if (group != MINSTREL_CCK_GROUP)
312 nsecs = 1000 * mi->overhead / MINSTREL_TRUNC(mi->avg_ampdu_len); 312 nsecs = 1000 * mi->overhead / MINSTREL_TRUNC(mi->avg_ampdu_len);
313 313
314 nsecs += minstrel_mcs_groups[group].duration[rate]; 314 nsecs += minstrel_mcs_groups[group].duration[rate] <<
315 minstrel_mcs_groups[group].shift;
315 316
316 /* 317 /*
317 * For the throughput calculation, limit the probability value to 90% to 318 * For the throughput calculation, limit the probability value to 90% to
@@ -759,12 +760,19 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband,
759 minstrel_ht_update_rates(mp, mi); 760 minstrel_ht_update_rates(mp, mi);
760} 761}
761 762
763static inline int
764minstrel_get_duration(int index)
765{
766 const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES];
767 unsigned int duration = group->duration[index % MCS_GROUP_RATES];
768 return duration << group->shift;
769}
770
762static void 771static void
763minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, 772minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
764 int index) 773 int index)
765{ 774{
766 struct minstrel_rate_stats *mrs; 775 struct minstrel_rate_stats *mrs;
767 const struct mcs_group *group;
768 unsigned int tx_time, tx_time_rtscts, tx_time_data; 776 unsigned int tx_time, tx_time_rtscts, tx_time_data;
769 unsigned int cw = mp->cw_min; 777 unsigned int cw = mp->cw_min;
770 unsigned int ctime = 0; 778 unsigned int ctime = 0;
@@ -783,8 +791,7 @@ minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
783 mrs->retry_count_rtscts = 2; 791 mrs->retry_count_rtscts = 2;
784 mrs->retry_updated = true; 792 mrs->retry_updated = true;
785 793
786 group = &minstrel_mcs_groups[index / MCS_GROUP_RATES]; 794 tx_time_data = minstrel_get_duration(index) * ampdu_len / 1000;
787 tx_time_data = group->duration[index % MCS_GROUP_RATES] * ampdu_len / 1000;
788 795
789 /* Contention time for first 2 tries */ 796 /* Contention time for first 2 tries */
790 ctime = (t_slot * cw) >> 1; 797 ctime = (t_slot * cw) >> 1;
@@ -878,20 +885,24 @@ minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi)
878 int group = mi->max_prob_rate / MCS_GROUP_RATES; 885 int group = mi->max_prob_rate / MCS_GROUP_RATES;
879 const struct mcs_group *g = &minstrel_mcs_groups[group]; 886 const struct mcs_group *g = &minstrel_mcs_groups[group];
880 int rate = mi->max_prob_rate % MCS_GROUP_RATES; 887 int rate = mi->max_prob_rate % MCS_GROUP_RATES;
888 unsigned int duration;
881 889
882 /* Disable A-MSDU if max_prob_rate is bad */ 890 /* Disable A-MSDU if max_prob_rate is bad */
883 if (mi->groups[group].rates[rate].prob_ewma < MINSTREL_FRAC(50, 100)) 891 if (mi->groups[group].rates[rate].prob_ewma < MINSTREL_FRAC(50, 100))
884 return 1; 892 return 1;
885 893
894 duration = g->duration[rate];
895 duration <<= g->shift;
896
886 /* If the rate is slower than single-stream MCS1, make A-MSDU limit small */ 897 /* If the rate is slower than single-stream MCS1, make A-MSDU limit small */
887 if (g->duration[rate] > MCS_DURATION(1, 0, 52)) 898 if (duration > MCS_DURATION(1, 0, 52))
888 return 500; 899 return 500;
889 900
890 /* 901 /*
891 * If the rate is slower than single-stream MCS4, limit A-MSDU to usual 902 * If the rate is slower than single-stream MCS4, limit A-MSDU to usual
892 * data packet size 903 * data packet size
893 */ 904 */
894 if (g->duration[rate] > MCS_DURATION(1, 0, 104)) 905 if (duration > MCS_DURATION(1, 0, 104))
895 return 1600; 906 return 1600;
896 907
897 /* 908 /*
@@ -899,7 +910,7 @@ minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi)
899 * rate success probability is less than 75%, limit A-MSDU to twice the usual 910 * rate success probability is less than 75%, limit A-MSDU to twice the usual
900 * data packet size 911 * data packet size
901 */ 912 */
902 if (g->duration[rate] > MCS_DURATION(1, 0, 260) || 913 if (duration > MCS_DURATION(1, 0, 260) ||
903 (minstrel_ht_get_prob_ewma(mi, mi->max_tp_rate[0]) < 914 (minstrel_ht_get_prob_ewma(mi, mi->max_tp_rate[0]) <
904 MINSTREL_FRAC(75, 100))) 915 MINSTREL_FRAC(75, 100)))
905 return 3200; 916 return 3200;
@@ -946,13 +957,6 @@ minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
946 rate_control_set_rates(mp->hw, mi->sta, rates); 957 rate_control_set_rates(mp->hw, mi->sta, rates);
947} 958}
948 959
949static inline int
950minstrel_get_duration(int index)
951{
952 const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES];
953 return group->duration[index % MCS_GROUP_RATES];
954}
955
956static int 960static int
957minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) 961minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
958{ 962{
@@ -1000,10 +1004,13 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
1000 return -1; 1004 return -1;
1001 1005
1002 /* 1006 /*
1003 * Do not sample if the probability is already higher than 95% 1007 * Do not sample if the probability is already higher than 95%,
1004 * to avoid wasting airtime. 1008 * or if the rate is 3 times slower than the current max probability
1009 * rate, to avoid wasting airtime.
1005 */ 1010 */
1006 if (mrs->prob_ewma > MINSTREL_FRAC(95, 100)) 1011 sample_dur = minstrel_get_duration(sample_idx);
1012 if (mrs->prob_ewma > MINSTREL_FRAC(95, 100) ||
1013 minstrel_get_duration(mi->max_prob_rate) * 3 < sample_dur)
1007 return -1; 1014 return -1;
1008 1015
1009 /* 1016 /*
@@ -1013,7 +1020,6 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
1013 1020
1014 cur_max_tp_streams = minstrel_mcs_groups[tp_rate1 / 1021 cur_max_tp_streams = minstrel_mcs_groups[tp_rate1 /
1015 MCS_GROUP_RATES].streams; 1022 MCS_GROUP_RATES].streams;
1016 sample_dur = minstrel_get_duration(sample_idx);
1017 if (sample_dur >= minstrel_get_duration(tp_rate2) && 1023 if (sample_dur >= minstrel_get_duration(tp_rate2) &&
1018 (cur_max_tp_streams - 1 < 1024 (cur_max_tp_streams - 1 <
1019 minstrel_mcs_groups[sample_group].streams || 1025 minstrel_mcs_groups[sample_group].streams ||
@@ -1077,18 +1083,23 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta,
1077 return; 1083 return;
1078 1084
1079 sample_group = &minstrel_mcs_groups[sample_idx / MCS_GROUP_RATES]; 1085 sample_group = &minstrel_mcs_groups[sample_idx / MCS_GROUP_RATES];
1086 sample_idx %= MCS_GROUP_RATES;
1087
1088 if (sample_group == &minstrel_mcs_groups[MINSTREL_CCK_GROUP] &&
1089 (sample_idx >= 4) != txrc->short_preamble)
1090 return;
1091
1080 info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE; 1092 info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE;
1081 rate->count = 1; 1093 rate->count = 1;
1082 1094
1083 if (sample_idx / MCS_GROUP_RATES == MINSTREL_CCK_GROUP) { 1095 if (sample_group == &minstrel_mcs_groups[MINSTREL_CCK_GROUP]) {
1084 int idx = sample_idx % ARRAY_SIZE(mp->cck_rates); 1096 int idx = sample_idx % ARRAY_SIZE(mp->cck_rates);
1085 rate->idx = mp->cck_rates[idx]; 1097 rate->idx = mp->cck_rates[idx];
1086 } else if (sample_group->flags & IEEE80211_TX_RC_VHT_MCS) { 1098 } else if (sample_group->flags & IEEE80211_TX_RC_VHT_MCS) {
1087 ieee80211_rate_set_vht(rate, sample_idx % MCS_GROUP_RATES, 1099 ieee80211_rate_set_vht(rate, sample_idx % MCS_GROUP_RATES,
1088 sample_group->streams); 1100 sample_group->streams);
1089 } else { 1101 } else {
1090 rate->idx = sample_idx % MCS_GROUP_RATES + 1102 rate->idx = sample_idx + (sample_group->streams - 1) * 8;
1091 (sample_group->streams - 1) * 8;
1092 } 1103 }
1093 1104
1094 rate->flags = sample_group->flags; 1105 rate->flags = sample_group->flags;
@@ -1130,14 +1141,14 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
1130 struct minstrel_ht_sta_priv *msp = priv_sta; 1141 struct minstrel_ht_sta_priv *msp = priv_sta;
1131 struct minstrel_ht_sta *mi = &msp->ht; 1142 struct minstrel_ht_sta *mi = &msp->ht;
1132 struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs; 1143 struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs;
1133 u16 sta_cap = sta->ht_cap.cap; 1144 u16 ht_cap = sta->ht_cap.cap;
1134 struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap; 1145 struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap;
1135 struct sta_info *sinfo = container_of(sta, struct sta_info, sta);
1136 int use_vht; 1146 int use_vht;
1137 int n_supported = 0; 1147 int n_supported = 0;
1138 int ack_dur; 1148 int ack_dur;
1139 int stbc; 1149 int stbc;
1140 int i; 1150 int i;
1151 bool ldpc;
1141 1152
1142 /* fall back to the old minstrel for legacy stations */ 1153 /* fall back to the old minstrel for legacy stations */
1143 if (!sta->ht_cap.ht_supported) 1154 if (!sta->ht_cap.ht_supported)
@@ -1145,12 +1156,10 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
1145 1156
1146 BUILD_BUG_ON(ARRAY_SIZE(minstrel_mcs_groups) != MINSTREL_GROUPS_NB); 1157 BUILD_BUG_ON(ARRAY_SIZE(minstrel_mcs_groups) != MINSTREL_GROUPS_NB);
1147 1158
1148#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
1149 if (vht_cap->vht_supported) 1159 if (vht_cap->vht_supported)
1150 use_vht = vht_cap->vht_mcs.tx_mcs_map != cpu_to_le16(~0); 1160 use_vht = vht_cap->vht_mcs.tx_mcs_map != cpu_to_le16(~0);
1151 else 1161 else
1152#endif 1162 use_vht = 0;
1153 use_vht = 0;
1154 1163
1155 msp->is_ht = true; 1164 msp->is_ht = true;
1156 memset(mi, 0, sizeof(*mi)); 1165 memset(mi, 0, sizeof(*mi));
@@ -1175,16 +1184,22 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
1175 } 1184 }
1176 mi->sample_tries = 4; 1185 mi->sample_tries = 4;
1177 1186
1178 /* TODO tx_flags for vht - ATM the RC API is not fine-grained enough */
1179 if (!use_vht) { 1187 if (!use_vht) {
1180 stbc = (sta_cap & IEEE80211_HT_CAP_RX_STBC) >> 1188 stbc = (ht_cap & IEEE80211_HT_CAP_RX_STBC) >>
1181 IEEE80211_HT_CAP_RX_STBC_SHIFT; 1189 IEEE80211_HT_CAP_RX_STBC_SHIFT;
1182 mi->tx_flags |= stbc << IEEE80211_TX_CTL_STBC_SHIFT;
1183 1190
1184 if (sta_cap & IEEE80211_HT_CAP_LDPC_CODING) 1191 ldpc = ht_cap & IEEE80211_HT_CAP_LDPC_CODING;
1185 mi->tx_flags |= IEEE80211_TX_CTL_LDPC; 1192 } else {
1193 stbc = (vht_cap->cap & IEEE80211_VHT_CAP_RXSTBC_MASK) >>
1194 IEEE80211_VHT_CAP_RXSTBC_SHIFT;
1195
1196 ldpc = vht_cap->cap & IEEE80211_VHT_CAP_RXLDPC;
1186 } 1197 }
1187 1198
1199 mi->tx_flags |= stbc << IEEE80211_TX_CTL_STBC_SHIFT;
1200 if (ldpc)
1201 mi->tx_flags |= IEEE80211_TX_CTL_LDPC;
1202
1188 for (i = 0; i < ARRAY_SIZE(mi->groups); i++) { 1203 for (i = 0; i < ARRAY_SIZE(mi->groups); i++) {
1189 u32 gflags = minstrel_mcs_groups[i].flags; 1204 u32 gflags = minstrel_mcs_groups[i].flags;
1190 int bw, nss; 1205 int bw, nss;
@@ -1197,10 +1212,10 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
1197 1212
1198 if (gflags & IEEE80211_TX_RC_SHORT_GI) { 1213 if (gflags & IEEE80211_TX_RC_SHORT_GI) {
1199 if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH) { 1214 if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH) {
1200 if (!(sta_cap & IEEE80211_HT_CAP_SGI_40)) 1215 if (!(ht_cap & IEEE80211_HT_CAP_SGI_40))
1201 continue; 1216 continue;
1202 } else { 1217 } else {
1203 if (!(sta_cap & IEEE80211_HT_CAP_SGI_20)) 1218 if (!(ht_cap & IEEE80211_HT_CAP_SGI_20))
1204 continue; 1219 continue;
1205 } 1220 }
1206 } 1221 }
@@ -1217,10 +1232,9 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
1217 1232
1218 /* HT rate */ 1233 /* HT rate */
1219 if (gflags & IEEE80211_TX_RC_MCS) { 1234 if (gflags & IEEE80211_TX_RC_MCS) {
1220#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
1221 if (use_vht && minstrel_vht_only) 1235 if (use_vht && minstrel_vht_only)
1222 continue; 1236 continue;
1223#endif 1237
1224 mi->supported[i] = mcs->rx_mask[nss - 1]; 1238 mi->supported[i] = mcs->rx_mask[nss - 1];
1225 if (mi->supported[i]) 1239 if (mi->supported[i])
1226 n_supported++; 1240 n_supported++;
@@ -1258,8 +1272,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
1258 if (!n_supported) 1272 if (!n_supported)
1259 goto use_legacy; 1273 goto use_legacy;
1260 1274
1261 if (test_sta_flag(sinfo, WLAN_STA_SHORT_PREAMBLE)) 1275 mi->supported[MINSTREL_CCK_GROUP] |= mi->cck_supported_short << 4;
1262 mi->cck_supported_short |= mi->cck_supported_short << 4;
1263 1276
1264 /* create an initial rate table with the lowest supported rates */ 1277 /* create an initial rate table with the lowest supported rates */
1265 minstrel_ht_update_stats(mp, mi); 1278 minstrel_ht_update_stats(mp, mi);
@@ -1340,16 +1353,88 @@ minstrel_ht_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta)
1340 kfree(msp); 1353 kfree(msp);
1341} 1354}
1342 1355
1356static void
1357minstrel_ht_init_cck_rates(struct minstrel_priv *mp)
1358{
1359 static const int bitrates[4] = { 10, 20, 55, 110 };
1360 struct ieee80211_supported_band *sband;
1361 u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef);
1362 int i, j;
1363
1364 sband = mp->hw->wiphy->bands[NL80211_BAND_2GHZ];
1365 if (!sband)
1366 return;
1367
1368 for (i = 0; i < sband->n_bitrates; i++) {
1369 struct ieee80211_rate *rate = &sband->bitrates[i];
1370
1371 if (rate->flags & IEEE80211_RATE_ERP_G)
1372 continue;
1373
1374 if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
1375 continue;
1376
1377 for (j = 0; j < ARRAY_SIZE(bitrates); j++) {
1378 if (rate->bitrate != bitrates[j])
1379 continue;
1380
1381 mp->cck_rates[j] = i;
1382 break;
1383 }
1384 }
1385}
1386
1343static void * 1387static void *
1344minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir) 1388minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir)
1345{ 1389{
1346 return mac80211_minstrel.alloc(hw, debugfsdir); 1390 struct minstrel_priv *mp;
1391
1392 mp = kzalloc(sizeof(struct minstrel_priv), GFP_ATOMIC);
1393 if (!mp)
1394 return NULL;
1395
1396 /* contention window settings
1397 * Just an approximation. Using the per-queue values would complicate
1398 * the calculations and is probably unnecessary */
1399 mp->cw_min = 15;
1400 mp->cw_max = 1023;
1401
1402 /* number of packets (in %) to use for sampling other rates
1403 * sample less often for non-mrr packets, because the overhead
1404 * is much higher than with mrr */
1405 mp->lookaround_rate = 5;
1406 mp->lookaround_rate_mrr = 10;
1407
1408 /* maximum time that the hw is allowed to stay in one MRR segment */
1409 mp->segment_size = 6000;
1410
1411 if (hw->max_rate_tries > 0)
1412 mp->max_retry = hw->max_rate_tries;
1413 else
1414 /* safe default, does not necessarily have to match hw properties */
1415 mp->max_retry = 7;
1416
1417 if (hw->max_rates >= 4)
1418 mp->has_mrr = true;
1419
1420 mp->hw = hw;
1421 mp->update_interval = 100;
1422
1423#ifdef CONFIG_MAC80211_DEBUGFS
1424 mp->fixed_rate_idx = (u32) -1;
1425 debugfs_create_u32("fixed_rate_idx", S_IRUGO | S_IWUGO, debugfsdir,
1426 &mp->fixed_rate_idx);
1427#endif
1428
1429 minstrel_ht_init_cck_rates(mp);
1430
1431 return mp;
1347} 1432}
1348 1433
1349static void 1434static void
1350minstrel_ht_free(void *priv) 1435minstrel_ht_free(void *priv)
1351{ 1436{
1352 mac80211_minstrel.free(priv); 1437 kfree(priv);
1353} 1438}
1354 1439
1355static u32 minstrel_ht_get_expected_throughput(void *priv_sta) 1440static u32 minstrel_ht_get_expected_throughput(void *priv_sta)
@@ -1384,7 +1469,6 @@ static const struct rate_control_ops mac80211_minstrel_ht = {
1384 .free = minstrel_ht_free, 1469 .free = minstrel_ht_free,
1385#ifdef CONFIG_MAC80211_DEBUGFS 1470#ifdef CONFIG_MAC80211_DEBUGFS
1386 .add_sta_debugfs = minstrel_ht_add_sta_debugfs, 1471 .add_sta_debugfs = minstrel_ht_add_sta_debugfs,
1387 .remove_sta_debugfs = minstrel_ht_remove_sta_debugfs,
1388#endif 1472#endif
1389 .get_expected_throughput = minstrel_ht_get_expected_throughput, 1473 .get_expected_throughput = minstrel_ht_get_expected_throughput,
1390}; 1474};
@@ -1409,14 +1493,14 @@ static void __init init_sample_table(void)
1409} 1493}
1410 1494
1411int __init 1495int __init
1412rc80211_minstrel_ht_init(void) 1496rc80211_minstrel_init(void)
1413{ 1497{
1414 init_sample_table(); 1498 init_sample_table();
1415 return ieee80211_rate_control_register(&mac80211_minstrel_ht); 1499 return ieee80211_rate_control_register(&mac80211_minstrel_ht);
1416} 1500}
1417 1501
1418void 1502void
1419rc80211_minstrel_ht_exit(void) 1503rc80211_minstrel_exit(void)
1420{ 1504{
1421 ieee80211_rate_control_unregister(&mac80211_minstrel_ht); 1505 ieee80211_rate_control_unregister(&mac80211_minstrel_ht);
1422} 1506}
diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h
index de1646c42e82..26b7a3244b47 100644
--- a/net/mac80211/rc80211_minstrel_ht.h
+++ b/net/mac80211/rc80211_minstrel_ht.h
@@ -15,11 +15,7 @@
15 */ 15 */
16#define MINSTREL_MAX_STREAMS 3 16#define MINSTREL_MAX_STREAMS 3
17#define MINSTREL_HT_STREAM_GROUPS 4 /* BW(=2) * SGI(=2) */ 17#define MINSTREL_HT_STREAM_GROUPS 4 /* BW(=2) * SGI(=2) */
18#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
19#define MINSTREL_VHT_STREAM_GROUPS 6 /* BW(=3) * SGI(=2) */ 18#define MINSTREL_VHT_STREAM_GROUPS 6 /* BW(=3) * SGI(=2) */
20#else
21#define MINSTREL_VHT_STREAM_GROUPS 0
22#endif
23 19
24#define MINSTREL_HT_GROUPS_NB (MINSTREL_MAX_STREAMS * \ 20#define MINSTREL_HT_GROUPS_NB (MINSTREL_MAX_STREAMS * \
25 MINSTREL_HT_STREAM_GROUPS) 21 MINSTREL_HT_STREAM_GROUPS)
@@ -34,16 +30,13 @@
34#define MINSTREL_CCK_GROUP (MINSTREL_HT_GROUP_0 + MINSTREL_HT_GROUPS_NB) 30#define MINSTREL_CCK_GROUP (MINSTREL_HT_GROUP_0 + MINSTREL_HT_GROUPS_NB)
35#define MINSTREL_VHT_GROUP_0 (MINSTREL_CCK_GROUP + 1) 31#define MINSTREL_VHT_GROUP_0 (MINSTREL_CCK_GROUP + 1)
36 32
37#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
38#define MCS_GROUP_RATES 10 33#define MCS_GROUP_RATES 10
39#else
40#define MCS_GROUP_RATES 8
41#endif
42 34
43struct mcs_group { 35struct mcs_group {
44 u32 flags; 36 u16 flags;
45 unsigned int streams; 37 u8 streams;
46 unsigned int duration[MCS_GROUP_RATES]; 38 u8 shift;
39 u16 duration[MCS_GROUP_RATES];
47}; 40};
48 41
49extern const struct mcs_group minstrel_mcs_groups[]; 42extern const struct mcs_group minstrel_mcs_groups[];
@@ -110,17 +103,12 @@ struct minstrel_ht_sta_priv {
110 struct minstrel_ht_sta ht; 103 struct minstrel_ht_sta ht;
111 struct minstrel_sta_info legacy; 104 struct minstrel_sta_info legacy;
112 }; 105 };
113#ifdef CONFIG_MAC80211_DEBUGFS
114 struct dentry *dbg_stats;
115 struct dentry *dbg_stats_csv;
116#endif
117 void *ratelist; 106 void *ratelist;
118 void *sample_table; 107 void *sample_table;
119 bool is_ht; 108 bool is_ht;
120}; 109};
121 110
122void minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); 111void minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir);
123void minstrel_ht_remove_sta_debugfs(void *priv, void *priv_sta);
124int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, 112int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate,
125 int prob_ewma); 113 int prob_ewma);
126 114
diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c
index bfcc03152dc6..57820a5f2c16 100644
--- a/net/mac80211/rc80211_minstrel_ht_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c
@@ -15,6 +15,22 @@
15#include "rc80211_minstrel.h" 15#include "rc80211_minstrel.h"
16#include "rc80211_minstrel_ht.h" 16#include "rc80211_minstrel_ht.h"
17 17
18static ssize_t
19minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *ppos)
20{
21 struct minstrel_debugfs_info *ms;
22
23 ms = file->private_data;
24 return simple_read_from_buffer(buf, len, ppos, ms->buf, ms->len);
25}
26
27static int
28minstrel_stats_release(struct inode *inode, struct file *file)
29{
30 kfree(file->private_data);
31 return 0;
32}
33
18static char * 34static char *
19minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) 35minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
20{ 36{
@@ -41,7 +57,7 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
41 struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j]; 57 struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j];
42 static const int bitrates[4] = { 10, 20, 55, 110 }; 58 static const int bitrates[4] = { 10, 20, 55, 110 };
43 int idx = i * MCS_GROUP_RATES + j; 59 int idx = i * MCS_GROUP_RATES + j;
44 unsigned int prob_ewmsd; 60 unsigned int duration;
45 61
46 if (!(mi->supported[i] & BIT(j))) 62 if (!(mi->supported[i] & BIT(j)))
47 continue; 63 continue;
@@ -79,21 +95,21 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
79 p += sprintf(p, " %3u ", idx); 95 p += sprintf(p, " %3u ", idx);
80 96
81 /* tx_time[rate(i)] in usec */ 97 /* tx_time[rate(i)] in usec */
82 tx_time = DIV_ROUND_CLOSEST(mg->duration[j], 1000); 98 duration = mg->duration[j];
99 duration <<= mg->shift;
100 tx_time = DIV_ROUND_CLOSEST(duration, 1000);
83 p += sprintf(p, "%6u ", tx_time); 101 p += sprintf(p, "%6u ", tx_time);
84 102
85 tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); 103 tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
86 tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); 104 tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
87 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); 105 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
88 prob_ewmsd = minstrel_get_ewmsd10(mrs);
89 106
90 p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" 107 p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u"
91 " %3u %3u %-3u " 108 " %3u %3u %-3u "
92 "%9llu %-9llu\n", 109 "%9llu %-9llu\n",
93 tp_max / 10, tp_max % 10, 110 tp_max / 10, tp_max % 10,
94 tp_avg / 10, tp_avg % 10, 111 tp_avg / 10, tp_avg % 10,
95 eprob / 10, eprob % 10, 112 eprob / 10, eprob % 10,
96 prob_ewmsd / 10, prob_ewmsd % 10,
97 mrs->retry_count, 113 mrs->retry_count,
98 mrs->last_success, 114 mrs->last_success,
99 mrs->last_attempts, 115 mrs->last_attempts,
@@ -130,9 +146,9 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file)
130 146
131 p += sprintf(p, "\n"); 147 p += sprintf(p, "\n");
132 p += sprintf(p, 148 p += sprintf(p,
133 " best ____________rate__________ ________statistics________ _____last____ ______sum-of________\n"); 149 " best ____________rate__________ ____statistics___ _____last____ ______sum-of________\n");
134 p += sprintf(p, 150 p += sprintf(p,
135 "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [retry|suc|att] [#success | #attempts]\n"); 151 "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob)] [retry|suc|att] [#success | #attempts]\n");
136 152
137 p = minstrel_ht_stats_dump(mi, MINSTREL_CCK_GROUP, p); 153 p = minstrel_ht_stats_dump(mi, MINSTREL_CCK_GROUP, p);
138 for (i = 0; i < MINSTREL_CCK_GROUP; i++) 154 for (i = 0; i < MINSTREL_CCK_GROUP; i++)
@@ -187,7 +203,7 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
187 struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j]; 203 struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j];
188 static const int bitrates[4] = { 10, 20, 55, 110 }; 204 static const int bitrates[4] = { 10, 20, 55, 110 };
189 int idx = i * MCS_GROUP_RATES + j; 205 int idx = i * MCS_GROUP_RATES + j;
190 unsigned int prob_ewmsd; 206 unsigned int duration;
191 207
192 if (!(mi->supported[i] & BIT(j))) 208 if (!(mi->supported[i] & BIT(j)))
193 continue; 209 continue;
@@ -222,20 +238,21 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
222 } 238 }
223 239
224 p += sprintf(p, "%u,", idx); 240 p += sprintf(p, "%u,", idx);
225 tx_time = DIV_ROUND_CLOSEST(mg->duration[j], 1000); 241
242 duration = mg->duration[j];
243 duration <<= mg->shift;
244 tx_time = DIV_ROUND_CLOSEST(duration, 1000);
226 p += sprintf(p, "%u,", tx_time); 245 p += sprintf(p, "%u,", tx_time);
227 246
228 tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); 247 tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
229 tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); 248 tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
230 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); 249 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
231 prob_ewmsd = minstrel_get_ewmsd10(mrs);
232 250
233 p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u," 251 p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u,%u,"
234 "%u,%llu,%llu,", 252 "%u,%llu,%llu,",
235 tp_max / 10, tp_max % 10, 253 tp_max / 10, tp_max % 10,
236 tp_avg / 10, tp_avg % 10, 254 tp_avg / 10, tp_avg % 10,
237 eprob / 10, eprob % 10, 255 eprob / 10, eprob % 10,
238 prob_ewmsd / 10, prob_ewmsd % 10,
239 mrs->retry_count, 256 mrs->retry_count,
240 mrs->last_success, 257 mrs->last_success,
241 mrs->last_attempts, 258 mrs->last_attempts,
@@ -303,17 +320,8 @@ minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir)
303{ 320{
304 struct minstrel_ht_sta_priv *msp = priv_sta; 321 struct minstrel_ht_sta_priv *msp = priv_sta;
305 322
306 msp->dbg_stats = debugfs_create_file("rc_stats", 0444, dir, msp, 323 debugfs_create_file("rc_stats", 0444, dir, msp,
307 &minstrel_ht_stat_fops); 324 &minstrel_ht_stat_fops);
308 msp->dbg_stats_csv = debugfs_create_file("rc_stats_csv", 0444, dir, msp, 325 debugfs_create_file("rc_stats_csv", 0444, dir, msp,
309 &minstrel_ht_stat_csv_fops); 326 &minstrel_ht_stat_csv_fops);
310}
311
312void
313minstrel_ht_remove_sta_debugfs(void *priv, void *priv_sta)
314{
315 struct minstrel_ht_sta_priv *msp = priv_sta;
316
317 debugfs_remove(msp->dbg_stats);
318 debugfs_remove(msp->dbg_stats_csv);
319} 327}
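
The minstrel_ht hunk above stops storing the rc_stats dentries because the per-station debugfs directory is torn down recursively when the station goes away, so the individual debugfs_remove() calls become unnecessary. A minimal kernel-context sketch of that pattern (hypothetical function names, not part of this patch):

#include <linux/debugfs.h>

/* Create files under the station directory without keeping the returned
 * dentries; one recursive removal of the parent cleans everything up.
 */
static void example_add_sta_debugfs(struct dentry *sta_dir, void *priv,
                                    const struct file_operations *fops)
{
        /* return value deliberately ignored; nothing to store */
        debugfs_create_file("rc_stats", 0444, sta_dir, priv, fops);
}

static void example_remove_sta_debugfs(struct dentry *sta_dir)
{
        /* one call replaces the per-file debugfs_remove() pairs */
        debugfs_remove_recursive(sta_dir);
}
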
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 96611d5dfadb..3bd3b5769797 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -115,7 +115,8 @@ static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len,
115 115
116 if (status->flag & (RX_FLAG_FAILED_FCS_CRC | 116 if (status->flag & (RX_FLAG_FAILED_FCS_CRC |
117 RX_FLAG_FAILED_PLCP_CRC | 117 RX_FLAG_FAILED_PLCP_CRC |
118 RX_FLAG_ONLY_MONITOR)) 118 RX_FLAG_ONLY_MONITOR |
119 RX_FLAG_NO_PSDU))
119 return true; 120 return true;
120 121
121 if (unlikely(skb->len < 16 + present_fcs_len + rtap_space)) 122 if (unlikely(skb->len < 16 + present_fcs_len + rtap_space))
@@ -189,6 +190,15 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
189 BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he_mu) != 12); 190 BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he_mu) != 12);
190 } 191 }
191 192
193 if (status->flag & RX_FLAG_NO_PSDU)
194 len += 1;
195
196 if (status->flag & RX_FLAG_RADIOTAP_LSIG) {
197 len = ALIGN(len, 2);
198 len += 4;
199 BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_lsig) != 4);
200 }
201
192 if (status->chains) { 202 if (status->chains) {
193 /* antenna and antenna signal fields */ 203 /* antenna and antenna signal fields */
194 len += 2 * hweight8(status->chains); 204 len += 2 * hweight8(status->chains);
@@ -279,6 +289,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
279 struct ieee80211_vendor_radiotap rtap = {}; 289 struct ieee80211_vendor_radiotap rtap = {};
280 struct ieee80211_radiotap_he he = {}; 290 struct ieee80211_radiotap_he he = {};
281 struct ieee80211_radiotap_he_mu he_mu = {}; 291 struct ieee80211_radiotap_he_mu he_mu = {};
292 struct ieee80211_radiotap_lsig lsig = {};
282 293
283 if (status->flag & RX_FLAG_RADIOTAP_HE) { 294 if (status->flag & RX_FLAG_RADIOTAP_HE) {
284 he = *(struct ieee80211_radiotap_he *)skb->data; 295 he = *(struct ieee80211_radiotap_he *)skb->data;
@@ -291,6 +302,11 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
291 skb_pull(skb, sizeof(he_mu)); 302 skb_pull(skb, sizeof(he_mu));
292 } 303 }
293 304
305 if (status->flag & RX_FLAG_RADIOTAP_LSIG) {
306 lsig = *(struct ieee80211_radiotap_lsig *)skb->data;
307 skb_pull(skb, sizeof(lsig));
308 }
309
294 if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) { 310 if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) {
295 rtap = *(struct ieee80211_vendor_radiotap *)skb->data; 311 rtap = *(struct ieee80211_vendor_radiotap *)skb->data;
296 /* rtap.len and rtap.pad are undone immediately */ 312 /* rtap.len and rtap.pad are undone immediately */
@@ -549,7 +565,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
549 565
550 if (status->encoding == RX_ENC_HE && 566 if (status->encoding == RX_ENC_HE &&
551 status->flag & RX_FLAG_RADIOTAP_HE) { 567 status->flag & RX_FLAG_RADIOTAP_HE) {
552#define HE_PREP(f, val) cpu_to_le16(FIELD_PREP(IEEE80211_RADIOTAP_HE_##f, val)) 568#define HE_PREP(f, val) le16_encode_bits(val, IEEE80211_RADIOTAP_HE_##f)
553 569
554 if (status->enc_flags & RX_ENC_FLAG_STBC_MASK) { 570 if (status->enc_flags & RX_ENC_FLAG_STBC_MASK) {
555 he.data6 |= HE_PREP(DATA6_NSTS, 571 he.data6 |= HE_PREP(DATA6_NSTS,
@@ -630,6 +646,21 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
630 pos += sizeof(he_mu); 646 pos += sizeof(he_mu);
631 } 647 }
632 648
649 if (status->flag & RX_FLAG_NO_PSDU) {
650 rthdr->it_present |=
651 cpu_to_le32(1 << IEEE80211_RADIOTAP_ZERO_LEN_PSDU);
652 *pos++ = status->zero_length_psdu_type;
653 }
654
655 if (status->flag & RX_FLAG_RADIOTAP_LSIG) {
656 /* ensure 2 byte alignment */
657 while ((pos - (u8 *)rthdr) & 1)
658 pos++;
659 rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_LSIG);
660 memcpy(pos, &lsig, sizeof(lsig));
661 pos += sizeof(lsig);
662 }
663
633 for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) { 664 for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) {
634 *pos++ = status->chain_signal[chain]; 665 *pos++ = status->chain_signal[chain];
635 *pos++ = chain; 666 *pos++ = chain;
@@ -1505,7 +1536,7 @@ static void sta_ps_start(struct sta_info *sta)
1505 if (!sta->sta.txq[0]) 1536 if (!sta->sta.txq[0])
1506 return; 1537 return;
1507 1538
1508 for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { 1539 for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) {
1509 if (txq_has_queue(sta->sta.txq[tid])) 1540 if (txq_has_queue(sta->sta.txq[tid]))
1510 set_bit(tid, &sta->txq_buffered_tids); 1541 set_bit(tid, &sta->txq_buffered_tids);
1511 else 1542 else
@@ -2046,6 +2077,7 @@ ieee80211_reassemble_find(struct ieee80211_sub_if_data *sdata,
2046 idx = sdata->fragment_next; 2077 idx = sdata->fragment_next;
2047 for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++) { 2078 for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++) {
2048 struct ieee80211_hdr *f_hdr; 2079 struct ieee80211_hdr *f_hdr;
2080 struct sk_buff *f_skb;
2049 2081
2050 idx--; 2082 idx--;
2051 if (idx < 0) 2083 if (idx < 0)
@@ -2057,7 +2089,8 @@ ieee80211_reassemble_find(struct ieee80211_sub_if_data *sdata,
2057 entry->last_frag + 1 != frag) 2089 entry->last_frag + 1 != frag)
2058 continue; 2090 continue;
2059 2091
2060 f_hdr = (struct ieee80211_hdr *)entry->skb_list.next->data; 2092 f_skb = __skb_peek(&entry->skb_list);
2093 f_hdr = (struct ieee80211_hdr *) f_skb->data;
2061 2094
2062 /* 2095 /*
2063 * Check ftype and addresses are equal, else check next fragment 2096 * Check ftype and addresses are equal, else check next fragment
@@ -2314,7 +2347,7 @@ __ieee80211_data_to_8023(struct ieee80211_rx_data *rx, bool *port_control)
2314 2347
2315 if (!sdata->u.mgd.use_4addr) 2348 if (!sdata->u.mgd.use_4addr)
2316 return -1; 2349 return -1;
2317 else 2350 else if (!ether_addr_equal(hdr->addr1, sdata->vif.addr))
2318 check_port_control = true; 2351 check_port_control = true;
2319 } 2352 }
2320 2353
@@ -2425,8 +2458,9 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
2425 if (!xmit_skb) 2458 if (!xmit_skb)
2426 net_info_ratelimited("%s: failed to clone multicast frame\n", 2459 net_info_ratelimited("%s: failed to clone multicast frame\n",
2427 dev->name); 2460 dev->name);
2428 } else if (!is_multicast_ether_addr(ehdr->h_dest)) { 2461 } else if (!is_multicast_ether_addr(ehdr->h_dest) &&
2429 dsta = sta_info_get(sdata, skb->data); 2462 !ether_addr_equal(ehdr->h_dest, ehdr->h_source)) {
2463 dsta = sta_info_get(sdata, ehdr->h_dest);
2430 if (dsta) { 2464 if (dsta) {
2431 /* 2465 /*
2432 * The destination station is associated to 2466 * The destination station is associated to
@@ -4207,11 +4241,10 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
4207 4241
4208 if (fast_rx->internal_forward) { 4242 if (fast_rx->internal_forward) {
4209 struct sk_buff *xmit_skb = NULL; 4243 struct sk_buff *xmit_skb = NULL;
4210 bool multicast = is_multicast_ether_addr(skb->data); 4244 if (is_multicast_ether_addr(addrs.da)) {
4211
4212 if (multicast) {
4213 xmit_skb = skb_copy(skb, GFP_ATOMIC); 4245 xmit_skb = skb_copy(skb, GFP_ATOMIC);
4214 } else if (sta_info_get(rx->sdata, skb->data)) { 4246 } else if (!ether_addr_equal(addrs.da, addrs.sa) &&
4247 sta_info_get(rx->sdata, addrs.da)) {
4215 xmit_skb = skb; 4248 xmit_skb = skb;
4216 skb = NULL; 4249 skb = NULL;
4217 } 4250 }
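
The L-SIG radiotap additions above pad the write cursor to 2-byte alignment before copying the 4-byte field into the header. A self-contained user-space sketch of that alignment idiom (buffer contents and offsets are illustrative only):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        uint8_t buf[32] = {0};
        uint8_t *start = buf;
        uint8_t *pos = buf + 5;         /* deliberately odd offset */
        uint32_t lsig = 0x11223344;     /* stand-in for the 4-byte L-SIG field */

        /* same idiom as the patch: advance until the offset is even */
        while ((pos - start) & 1)
                pos++;

        printf("writing 4-byte field at offset %ld\n", (long)(pos - start));
        memcpy(pos, &lsig, sizeof(lsig));
        pos += sizeof(lsig);

        return 0;
}
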
diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c
index 029334835747..4e4902bdbef8 100644
--- a/net/mac80211/spectmgmt.c
+++ b/net/mac80211/spectmgmt.c
@@ -144,6 +144,7 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
144 wide_bw_chansw_ie->new_center_freq_seg1, 144 wide_bw_chansw_ie->new_center_freq_seg1,
145 /* .basic_mcs_set doesn't matter */ 145 /* .basic_mcs_set doesn't matter */
146 }; 146 };
147 struct ieee80211_ht_operation ht_oper = {};
147 148
148 /* default, for the case of IEEE80211_VHT_CHANWIDTH_USE_HT, 149 /* default, for the case of IEEE80211_VHT_CHANWIDTH_USE_HT,
149 * to the previously parsed chandef 150 * to the previously parsed chandef
@@ -151,7 +152,9 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
151 new_vht_chandef = csa_ie->chandef; 152 new_vht_chandef = csa_ie->chandef;
152 153
153 /* ignore if parsing fails */ 154 /* ignore if parsing fails */
154 if (!ieee80211_chandef_vht_oper(&vht_oper, &new_vht_chandef)) 155 if (!ieee80211_chandef_vht_oper(&sdata->local->hw,
156 &vht_oper, &ht_oper,
157 &new_vht_chandef))
155 new_vht_chandef.chan = NULL; 158 new_vht_chandef.chan = NULL;
156 159
157 if (sta_flags & IEEE80211_STA_DISABLE_80P80MHZ && 160 if (sta_flags & IEEE80211_STA_DISABLE_80P80MHZ &&
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index f34202242d24..fb8c2252ac0e 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -113,7 +113,12 @@ static void __cleanup_single_sta(struct sta_info *sta)
113 113
114 if (sta->sta.txq[0]) { 114 if (sta->sta.txq[0]) {
115 for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { 115 for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
116 struct txq_info *txqi = to_txq_info(sta->sta.txq[i]); 116 struct txq_info *txqi;
117
118 if (!sta->sta.txq[i])
119 continue;
120
121 txqi = to_txq_info(sta->sta.txq[i]);
117 122
118 spin_lock_bh(&fq->lock); 123 spin_lock_bh(&fq->lock);
119 ieee80211_txq_purge(local, txqi); 124 ieee80211_txq_purge(local, txqi);
@@ -374,6 +379,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
374 for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { 379 for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
375 struct txq_info *txq = txq_data + i * size; 380 struct txq_info *txq = txq_data + i * size;
376 381
382 /* might not do anything for the bufferable MMPDU TXQ */
377 ieee80211_txq_init(sdata, sta, txq, i); 383 ieee80211_txq_init(sdata, sta, txq, i);
378 } 384 }
379 } 385 }
@@ -1239,13 +1245,11 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
1239 if (!ieee80211_hw_check(&local->hw, AP_LINK_PS)) 1245 if (!ieee80211_hw_check(&local->hw, AP_LINK_PS))
1240 drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta); 1246 drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta);
1241 1247
1242 if (sta->sta.txq[0]) { 1248 for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
1243 for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { 1249 if (!sta->sta.txq[i] || !txq_has_queue(sta->sta.txq[i]))
1244 if (!txq_has_queue(sta->sta.txq[i])) 1250 continue;
1245 continue;
1246 1251
1247 drv_wake_tx_queue(local, to_txq_info(sta->sta.txq[i])); 1252 drv_wake_tx_queue(local, to_txq_info(sta->sta.txq[i]));
1248 }
1249 } 1253 }
1250 1254
1251 skb_queue_head_init(&pending); 1255 skb_queue_head_init(&pending);
@@ -1683,7 +1687,8 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
1683 return; 1687 return;
1684 1688
1685 for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { 1689 for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
1686 if (!(driver_release_tids & BIT(tid)) || 1690 if (!sta->sta.txq[tid] ||
1691 !(driver_release_tids & BIT(tid)) ||
1687 txq_has_queue(sta->sta.txq[tid])) 1692 txq_has_queue(sta->sta.txq[tid]))
1688 continue; 1693 continue;
1689 1694
@@ -2323,13 +2328,13 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
2323 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL); 2328 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL);
2324 } 2329 }
2325 2330
2326 if (ieee80211_hw_check(&sta->local->hw, REPORTS_TX_ACK_STATUS) && 2331 if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG)) &&
2327 !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG))) { 2332 sta->status_stats.ack_signal_filled) {
2328 sinfo->avg_ack_signal = 2333 sinfo->avg_ack_signal =
2329 -(s8)ewma_avg_signal_read( 2334 -(s8)ewma_avg_signal_read(
2330 &sta->status_stats.avg_ack_signal); 2335 &sta->status_stats.avg_ack_signal);
2331 sinfo->filled |= 2336 sinfo->filled |=
2332 BIT_ULL(NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG); 2337 BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG);
2333 } 2338 }
2334} 2339}
2335 2340
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 91d7c0cd1882..aa4afbf0abaf 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -987,6 +987,25 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
987} 987}
988EXPORT_SYMBOL(ieee80211_tx_status_ext); 988EXPORT_SYMBOL(ieee80211_tx_status_ext);
989 989
990void ieee80211_tx_rate_update(struct ieee80211_hw *hw,
991 struct ieee80211_sta *pubsta,
992 struct ieee80211_tx_info *info)
993{
994 struct ieee80211_local *local = hw_to_local(hw);
995 struct ieee80211_supported_band *sband = hw->wiphy->bands[info->band];
996 struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
997 struct ieee80211_tx_status status = {
998 .info = info,
999 .sta = pubsta,
1000 };
1001
1002 rate_control_tx_status(local, sband, &status);
1003
1004 if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL))
1005 sta->tx_stats.last_rate = info->status.rates[0];
1006}
1007EXPORT_SYMBOL(ieee80211_tx_rate_update);
1008
990void ieee80211_report_low_ack(struct ieee80211_sta *pubsta, u32 num_packets) 1009void ieee80211_report_low_ack(struct ieee80211_sta *pubsta, u32 num_packets)
991{ 1010{
992 struct sta_info *sta = container_of(pubsta, struct sta_info, sta); 1011 struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
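
The newly exported ieee80211_tx_rate_update() lets a driver with hardware rate control feed the rate actually used for a station back into rate-control statistics without reporting a full TX status. A hypothetical driver-side call, sketched under the assumption that firmware only reports a rate index (names and fields below are not from this patch):

#include <net/mac80211.h>

/* Hypothetical driver code: firmware told us which rate it used for 'sta'. */
static void example_report_rate(struct ieee80211_hw *hw,
                                struct ieee80211_sta *sta,
                                enum nl80211_band band, s8 rate_idx)
{
        struct ieee80211_tx_info info = {};

        info.band = band;
        info.status.rates[0].idx = rate_idx;
        info.status.rates[0].count = 1;

        ieee80211_tx_rate_update(hw, sta, &info);
}
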
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 0ab69a1964f8..588c51a67c89 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -2600,6 +2600,29 @@ TRACE_EVENT(drv_wake_tx_queue,
2600 ) 2600 )
2601); 2601);
2602 2602
2603TRACE_EVENT(drv_get_ftm_responder_stats,
2604 TP_PROTO(struct ieee80211_local *local,
2605 struct ieee80211_sub_if_data *sdata,
2606 struct cfg80211_ftm_responder_stats *ftm_stats),
2607
2608 TP_ARGS(local, sdata, ftm_stats),
2609
2610 TP_STRUCT__entry(
2611 LOCAL_ENTRY
2612 VIF_ENTRY
2613 ),
2614
2615 TP_fast_assign(
2616 LOCAL_ASSIGN;
2617 VIF_ASSIGN;
2618 ),
2619
2620 TP_printk(
2621 LOCAL_PR_FMT VIF_PR_FMT,
2622 LOCAL_PR_ARG, VIF_PR_ARG
2623 )
2624);
2625
2603#endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ 2626#endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */
2604 2627
2605#undef TRACE_INCLUDE_PATH 2628#undef TRACE_INCLUDE_PATH
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 25ba24bef8f5..e0ccee23fbcd 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1253,10 +1253,18 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
1253 (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) 1253 (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE))
1254 return NULL; 1254 return NULL;
1255 1255
1256 if (!ieee80211_is_data_present(hdr->frame_control)) 1256 if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) {
1257 return NULL; 1257 if ((!ieee80211_is_mgmt(hdr->frame_control) ||
1258 1258 ieee80211_is_bufferable_mmpdu(hdr->frame_control) ||
1259 if (sta) { 1259 vif->type == NL80211_IFTYPE_STATION) &&
1260 sta && sta->uploaded) {
1261 /*
1262 * This will be NULL if the driver didn't set the
1263 * opt-in hardware flag.
1264 */
1265 txq = sta->sta.txq[IEEE80211_NUM_TIDS];
1266 }
1267 } else if (sta) {
1260 u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; 1268 u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK;
1261 1269
1262 if (!sta->uploaded) 1270 if (!sta->uploaded)
@@ -1444,16 +1452,33 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
1444 1452
1445 txqi->txq.vif = &sdata->vif; 1453 txqi->txq.vif = &sdata->vif;
1446 1454
1447 if (sta) { 1455 if (!sta) {
1448 txqi->txq.sta = &sta->sta;
1449 sta->sta.txq[tid] = &txqi->txq;
1450 txqi->txq.tid = tid;
1451 txqi->txq.ac = ieee80211_ac_from_tid(tid);
1452 } else {
1453 sdata->vif.txq = &txqi->txq; 1456 sdata->vif.txq = &txqi->txq;
1454 txqi->txq.tid = 0; 1457 txqi->txq.tid = 0;
1455 txqi->txq.ac = IEEE80211_AC_BE; 1458 txqi->txq.ac = IEEE80211_AC_BE;
1459
1460 return;
1461 }
1462
1463 if (tid == IEEE80211_NUM_TIDS) {
1464 if (sdata->vif.type == NL80211_IFTYPE_STATION) {
1465 /* Drivers need to opt in to the management MPDU TXQ */
1466 if (!ieee80211_hw_check(&sdata->local->hw,
1467 STA_MMPDU_TXQ))
1468 return;
1469 } else if (!ieee80211_hw_check(&sdata->local->hw,
1470 BUFF_MMPDU_TXQ)) {
1471 /* Drivers need to opt in to the bufferable MMPDU TXQ */
1472 return;
1473 }
1474 txqi->txq.ac = IEEE80211_AC_VO;
1475 } else {
1476 txqi->txq.ac = ieee80211_ac_from_tid(tid);
1456 } 1477 }
1478
1479 txqi->txq.sta = &sta->sta;
1480 txqi->txq.tid = tid;
1481 sta->sta.txq[tid] = &txqi->txq;
1457} 1482}
1458 1483
1459void ieee80211_txq_purge(struct ieee80211_local *local, 1484void ieee80211_txq_purge(struct ieee80211_local *local,
@@ -2955,6 +2980,10 @@ void ieee80211_check_fast_xmit(struct sta_info *sta)
2955 if (!(build.key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) 2980 if (!(build.key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
2956 goto out; 2981 goto out;
2957 2982
2983 /* Key is being removed */
2984 if (build.key->flags & KEY_FLAG_TAINTED)
2985 goto out;
2986
2958 switch (build.key->conf.cipher) { 2987 switch (build.key->conf.cipher) {
2959 case WLAN_CIPHER_SUITE_CCMP: 2988 case WLAN_CIPHER_SUITE_CCMP:
2960 case WLAN_CIPHER_SUITE_CCMP_256: 2989 case WLAN_CIPHER_SUITE_CCMP_256:
@@ -3200,6 +3229,10 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
3200 max_amsdu_len = min_t(int, max_amsdu_len, 3229 max_amsdu_len = min_t(int, max_amsdu_len,
3201 sta->sta.max_rc_amsdu_len); 3230 sta->sta.max_rc_amsdu_len);
3202 3231
3232 if (sta->sta.max_tid_amsdu_len[tid])
3233 max_amsdu_len = min_t(int, max_amsdu_len,
3234 sta->sta.max_tid_amsdu_len[tid]);
3235
3203 spin_lock_bh(&fq->lock); 3236 spin_lock_bh(&fq->lock);
3204 3237
3205 /* TODO: Ideally aggregation should be done on dequeue to remain 3238 /* TODO: Ideally aggregation should be done on dequeue to remain
@@ -3232,6 +3265,9 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
3232 if (max_frags && nfrags > max_frags) 3265 if (max_frags && nfrags > max_frags)
3233 goto out; 3266 goto out;
3234 3267
3268 if (!drv_can_aggregate_in_amsdu(local, head, skb))
3269 goto out;
3270
3235 if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head)) 3271 if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head))
3236 goto out; 3272 goto out;
3237 3273
@@ -3476,13 +3512,19 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
3476 struct ieee80211_tx_info *info; 3512 struct ieee80211_tx_info *info;
3477 struct ieee80211_tx_data tx; 3513 struct ieee80211_tx_data tx;
3478 ieee80211_tx_result r; 3514 ieee80211_tx_result r;
3479 struct ieee80211_vif *vif; 3515 struct ieee80211_vif *vif = txq->vif;
3480 3516
3481 spin_lock_bh(&fq->lock); 3517 spin_lock_bh(&fq->lock);
3482 3518
3483 if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags)) 3519 if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags) ||
3520 test_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txqi->flags))
3484 goto out; 3521 goto out;
3485 3522
3523 if (vif->txqs_stopped[ieee80211_ac_from_tid(txq->tid)]) {
3524 set_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txqi->flags);
3525 goto out;
3526 }
3527
3486 /* Make sure fragments stay together. */ 3528 /* Make sure fragments stay together. */
3487 skb = __skb_dequeue(&txqi->frags); 3529 skb = __skb_dequeue(&txqi->frags);
3488 if (skb) 3530 if (skb)
@@ -3577,6 +3619,7 @@ begin:
3577 } 3619 }
3578 3620
3579 IEEE80211_SKB_CB(skb)->control.vif = vif; 3621 IEEE80211_SKB_CB(skb)->control.vif = vif;
3622
3580out: 3623out:
3581 spin_unlock_bh(&fq->lock); 3624 spin_unlock_bh(&fq->lock);
3582 3625
@@ -3605,13 +3648,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
3605 if (!IS_ERR_OR_NULL(sta)) { 3648 if (!IS_ERR_OR_NULL(sta)) {
3606 struct ieee80211_fast_tx *fast_tx; 3649 struct ieee80211_fast_tx *fast_tx;
3607 3650
3608 /* We need a bit of data queued to build aggregates properly, so 3651 sk_pacing_shift_update(skb->sk, sdata->local->hw.tx_sk_pacing_shift);
3609 * instruct the TCP stack to allow more than a single ms of data
3610 * to be queued in the stack. The value is a bit-shift of 1
3611 * second, so 8 is ~4ms of queued data. Only affects local TCP
3612 * sockets.
3613 */
3614 sk_pacing_shift_update(skb->sk, 8);
3615 3652
3616 fast_tx = rcu_dereference(sta->fast_tx); 3653 fast_tx = rcu_dereference(sta->fast_tx);
3617 3654
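
The removed comment explained that the pacing shift is a power-of-two fraction of one second of locally queued TCP data; with the hard-coded 8 replaced by hw.tx_sk_pacing_shift, drivers can tune that budget. A small standalone check of the arithmetic (shift 8 comes out to roughly 3.9 ms, matching the old "~4ms" comment):

#include <stdio.h>

int main(void)
{
        /* sk_pacing_shift s allows about (1 second >> s) of queued TCP data;
         * print the budget in microseconds for a few shift values.
         */
        for (int shift = 6; shift <= 10; shift++)
                printf("shift %2d -> ~%u us of queued data\n",
                       shift, 1000000u >> shift);
        return 0;
}
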
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 716cd6442d86..bec424316ea4 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -240,6 +240,102 @@ __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw,
240} 240}
241EXPORT_SYMBOL(ieee80211_ctstoself_duration); 241EXPORT_SYMBOL(ieee80211_ctstoself_duration);
242 242
243static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac)
244{
245 struct ieee80211_local *local = sdata->local;
246 struct ieee80211_vif *vif = &sdata->vif;
247 struct fq *fq = &local->fq;
248 struct ps_data *ps = NULL;
249 struct txq_info *txqi;
250 struct sta_info *sta;
251 int i;
252
253 spin_lock_bh(&fq->lock);
254
255 if (sdata->vif.type == NL80211_IFTYPE_AP)
256 ps = &sdata->bss->ps;
257
258 sdata->vif.txqs_stopped[ac] = false;
259
260 list_for_each_entry_rcu(sta, &local->sta_list, list) {
261 if (sdata != sta->sdata)
262 continue;
263
264 for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
265 struct ieee80211_txq *txq = sta->sta.txq[i];
266
267 if (!txq)
268 continue;
269
270 txqi = to_txq_info(txq);
271
272 if (ac != txq->ac)
273 continue;
274
275 if (!test_and_clear_bit(IEEE80211_TXQ_STOP_NETIF_TX,
276 &txqi->flags))
277 continue;
278
279 spin_unlock_bh(&fq->lock);
280 drv_wake_tx_queue(local, txqi);
281 spin_lock_bh(&fq->lock);
282 }
283 }
284
285 if (!vif->txq)
286 goto out;
287
288 txqi = to_txq_info(vif->txq);
289
290 if (!test_and_clear_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txqi->flags) ||
291 (ps && atomic_read(&ps->num_sta_ps)) || ac != vif->txq->ac)
292 goto out;
293
294 spin_unlock_bh(&fq->lock);
295
296 drv_wake_tx_queue(local, txqi);
297 return;
298out:
299 spin_unlock_bh(&fq->lock);
300}
301
302void ieee80211_wake_txqs(unsigned long data)
303{
304 struct ieee80211_local *local = (struct ieee80211_local *)data;
305 struct ieee80211_sub_if_data *sdata;
306 int n_acs = IEEE80211_NUM_ACS;
307 unsigned long flags;
308 int i;
309
310 rcu_read_lock();
311 spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
312
313 if (local->hw.queues < IEEE80211_NUM_ACS)
314 n_acs = 1;
315
316 for (i = 0; i < local->hw.queues; i++) {
317 if (local->queue_stop_reasons[i])
318 continue;
319
320 spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
321 list_for_each_entry_rcu(sdata, &local->interfaces, list) {
322 int ac;
323
324 for (ac = 0; ac < n_acs; ac++) {
325 int ac_queue = sdata->vif.hw_queue[ac];
326
327 if (ac_queue == i ||
328 sdata->vif.cab_queue == i)
329 __ieee80211_wake_txqs(sdata, ac);
330 }
331 }
332 spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
333 }
334
335 spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
336 rcu_read_unlock();
337}
338
243void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue) 339void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue)
244{ 340{
245 struct ieee80211_sub_if_data *sdata; 341 struct ieee80211_sub_if_data *sdata;
@@ -308,6 +404,9 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
308 rcu_read_unlock(); 404 rcu_read_unlock();
309 } else 405 } else
310 tasklet_schedule(&local->tx_pending_tasklet); 406 tasklet_schedule(&local->tx_pending_tasklet);
407
408 if (local->ops->wake_tx_queue)
409 tasklet_schedule(&local->wake_txqs_tasklet);
311} 410}
312 411
313void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, 412void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
@@ -351,9 +450,6 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
351 if (__test_and_set_bit(reason, &local->queue_stop_reasons[queue])) 450 if (__test_and_set_bit(reason, &local->queue_stop_reasons[queue]))
352 return; 451 return;
353 452
354 if (local->ops->wake_tx_queue)
355 return;
356
357 if (local->hw.queues < IEEE80211_NUM_ACS) 453 if (local->hw.queues < IEEE80211_NUM_ACS)
358 n_acs = 1; 454 n_acs = 1;
359 455
@@ -366,8 +462,15 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
366 462
367 for (ac = 0; ac < n_acs; ac++) { 463 for (ac = 0; ac < n_acs; ac++) {
368 if (sdata->vif.hw_queue[ac] == queue || 464 if (sdata->vif.hw_queue[ac] == queue ||
369 sdata->vif.cab_queue == queue) 465 sdata->vif.cab_queue == queue) {
370 netif_stop_subqueue(sdata->dev, ac); 466 if (!local->ops->wake_tx_queue) {
467 netif_stop_subqueue(sdata->dev, ac);
468 continue;
469 }
470 spin_lock(&local->fq.lock);
471 sdata->vif.txqs_stopped[ac] = true;
472 spin_unlock(&local->fq.lock);
473 }
371 } 474 }
372 } 475 }
373 rcu_read_unlock(); 476 rcu_read_unlock();
@@ -2075,6 +2178,11 @@ int ieee80211_reconfig(struct ieee80211_local *local)
2075 case NL80211_IFTYPE_AP: 2178 case NL80211_IFTYPE_AP:
2076 changed |= BSS_CHANGED_SSID | BSS_CHANGED_P2P_PS; 2179 changed |= BSS_CHANGED_SSID | BSS_CHANGED_P2P_PS;
2077 2180
2181 if (sdata->vif.bss_conf.ftm_responder == 1 &&
2182 wiphy_ext_feature_isset(sdata->local->hw.wiphy,
2183 NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER))
2184 changed |= BSS_CHANGED_FTM_RESPONDER;
2185
2078 if (sdata->vif.type == NL80211_IFTYPE_AP) { 2186 if (sdata->vif.type == NL80211_IFTYPE_AP) {
2079 changed |= BSS_CHANGED_AP_PROBE_RESP; 2187 changed |= BSS_CHANGED_AP_PROBE_RESP;
2080 2188
@@ -2657,49 +2765,65 @@ bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper,
2657 return true; 2765 return true;
2658} 2766}
2659 2767
2660bool ieee80211_chandef_vht_oper(const struct ieee80211_vht_operation *oper, 2768bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw,
2769 const struct ieee80211_vht_operation *oper,
2770 const struct ieee80211_ht_operation *htop,
2661 struct cfg80211_chan_def *chandef) 2771 struct cfg80211_chan_def *chandef)
2662{ 2772{
2663 struct cfg80211_chan_def new = *chandef; 2773 struct cfg80211_chan_def new = *chandef;
2664 int cf1, cf2; 2774 int cf0, cf1;
2775 int ccfs0, ccfs1, ccfs2;
2776 int ccf0, ccf1;
2665 2777
2666 if (!oper) 2778 if (!oper || !htop)
2667 return false; 2779 return false;
2668 2780
2669 cf1 = ieee80211_channel_to_frequency(oper->center_freq_seg0_idx, 2781 ccfs0 = oper->center_freq_seg0_idx;
2670 chandef->chan->band); 2782 ccfs1 = oper->center_freq_seg1_idx;
2671 cf2 = ieee80211_channel_to_frequency(oper->center_freq_seg1_idx, 2783 ccfs2 = (le16_to_cpu(htop->operation_mode) &
2672 chandef->chan->band); 2784 IEEE80211_HT_OP_MODE_CCFS2_MASK)
2785 >> IEEE80211_HT_OP_MODE_CCFS2_SHIFT;
2786
2787 /* when parsing (and we know how to) CCFS1 and CCFS2 are equivalent */
2788 ccf0 = ccfs0;
2789 ccf1 = ccfs1;
2790 if (!ccfs1 && ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW))
2791 ccf1 = ccfs2;
2792
2793 cf0 = ieee80211_channel_to_frequency(ccf0, chandef->chan->band);
2794 cf1 = ieee80211_channel_to_frequency(ccf1, chandef->chan->band);
2673 2795
2674 switch (oper->chan_width) { 2796 switch (oper->chan_width) {
2675 case IEEE80211_VHT_CHANWIDTH_USE_HT: 2797 case IEEE80211_VHT_CHANWIDTH_USE_HT:
2798 /* just use HT information directly */
2676 break; 2799 break;
2677 case IEEE80211_VHT_CHANWIDTH_80MHZ: 2800 case IEEE80211_VHT_CHANWIDTH_80MHZ:
2678 new.width = NL80211_CHAN_WIDTH_80; 2801 new.width = NL80211_CHAN_WIDTH_80;
2679 new.center_freq1 = cf1; 2802 new.center_freq1 = cf0;
2680 /* If needed, adjust based on the newer interop workaround. */ 2803 /* If needed, adjust based on the newer interop workaround. */
2681 if (oper->center_freq_seg1_idx) { 2804 if (ccf1) {
2682 unsigned int diff; 2805 unsigned int diff;
2683 2806
2684 diff = abs(oper->center_freq_seg1_idx - 2807 diff = abs(ccf1 - ccf0);
2685 oper->center_freq_seg0_idx);
2686 if (diff == 8) { 2808 if (diff == 8) {
2687 new.width = NL80211_CHAN_WIDTH_160; 2809 new.width = NL80211_CHAN_WIDTH_160;
2688 new.center_freq1 = cf2; 2810 new.center_freq1 = cf1;
2689 } else if (diff > 8) { 2811 } else if (diff > 8) {
2690 new.width = NL80211_CHAN_WIDTH_80P80; 2812 new.width = NL80211_CHAN_WIDTH_80P80;
2691 new.center_freq2 = cf2; 2813 new.center_freq2 = cf1;
2692 } 2814 }
2693 } 2815 }
2694 break; 2816 break;
2695 case IEEE80211_VHT_CHANWIDTH_160MHZ: 2817 case IEEE80211_VHT_CHANWIDTH_160MHZ:
2818 /* deprecated encoding */
2696 new.width = NL80211_CHAN_WIDTH_160; 2819 new.width = NL80211_CHAN_WIDTH_160;
2697 new.center_freq1 = cf1; 2820 new.center_freq1 = cf0;
2698 break; 2821 break;
2699 case IEEE80211_VHT_CHANWIDTH_80P80MHZ: 2822 case IEEE80211_VHT_CHANWIDTH_80P80MHZ:
2823 /* deprecated encoding */
2700 new.width = NL80211_CHAN_WIDTH_80P80; 2824 new.width = NL80211_CHAN_WIDTH_80P80;
2701 new.center_freq1 = cf1; 2825 new.center_freq1 = cf0;
2702 new.center_freq2 = cf2; 2826 new.center_freq2 = cf1;
2703 break; 2827 break;
2704 default: 2828 default:
2705 return false; 2829 return false;
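
ieee80211_chandef_vht_oper() above now takes the second segment index from CCFS1 or, when the hardware supports VHT_EXT_NSS_BW, from the HT operation's CCFS2, and classifies 160 MHz versus 80+80 MHz by the distance between the two indices. A standalone worked example, assuming the usual 5 GHz mapping freq = 5000 + 5 * index:

#include <stdio.h>
#include <stdlib.h>

static int chan_to_freq_5ghz(int idx)
{
        return 5000 + 5 * idx;  /* assumption: 5 GHz band mapping */
}

int main(void)
{
        /* example 80 MHz VHT operation: segment 0 at channel 42, second
         * segment (CCFS1 or CCFS2) at channel 50
         */
        int ccf0 = 42, ccf1 = 50;
        int diff = abs(ccf1 - ccf0);

        if (!ccf1)
                printf("plain 80 MHz at %d MHz\n", chan_to_freq_5ghz(ccf0));
        else if (diff == 8)
                printf("160 MHz centered at %d MHz\n", chan_to_freq_5ghz(ccf1));
        else if (diff > 8)
                printf("80+80 MHz at %d and %d MHz\n",
                       chan_to_freq_5ghz(ccf0), chan_to_freq_5ghz(ccf1));
        return 0;
}
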
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index 259325cbcc31..006d82e4a397 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -3,6 +3,7 @@
3 * 3 *
4 * Portions of this file 4 * Portions of this file
5 * Copyright(c) 2015 - 2016 Intel Deutschland GmbH 5 * Copyright(c) 2015 - 2016 Intel Deutschland GmbH
6 * Copyright (C) 2018 Intel Corporation
6 * 7 *
7 * This program is free software; you can redistribute it and/or modify 8 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
@@ -231,6 +232,13 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
231 memcpy(&vht_cap->vht_mcs, &vht_cap_ie->supp_mcs, 232 memcpy(&vht_cap->vht_mcs, &vht_cap_ie->supp_mcs,
232 sizeof(struct ieee80211_vht_mcs_info)); 233 sizeof(struct ieee80211_vht_mcs_info));
233 234
235 /* copy EXT_NSS_BW Support value or remove the capability */
236 if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_VHT_EXT_NSS_BW))
237 vht_cap->cap |= (cap_info & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK);
238 else
239 vht_cap->vht_mcs.tx_highest &=
240 ~cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE);
241
234 /* but also restrict MCSes */ 242 /* but also restrict MCSes */
235 for (i = 0; i < 8; i++) { 243 for (i = 0; i < 8; i++) {
236 u16 own_rx, own_tx, peer_rx, peer_tx; 244 u16 own_rx, own_tx, peer_rx, peer_tx;
@@ -294,6 +302,18 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
294 break; 302 break;
295 default: 303 default:
296 sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; 304 sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80;
305
306 if (!(vht_cap->vht_mcs.tx_highest &
307 cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE)))
308 break;
309
310 /*
311 * If this is non-zero, then it does support 160 MHz after all,
312 * in one form or the other. We don't distinguish here (or even
313 * above) between 160 and 80+80 yet.
314 */
315 if (cap_info & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK)
316 sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160;
297 } 317 }
298 318
299 sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta); 319 sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta);
diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c
index 2fb703d70803..7e29f88dbf6a 100644
--- a/net/mac802154/llsec.c
+++ b/net/mac802154/llsec.c
@@ -146,18 +146,18 @@ llsec_key_alloc(const struct ieee802154_llsec_key *template)
146 goto err_tfm; 146 goto err_tfm;
147 } 147 }
148 148
149 key->tfm0 = crypto_alloc_skcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC); 149 key->tfm0 = crypto_alloc_sync_skcipher("ctr(aes)", 0, 0);
150 if (IS_ERR(key->tfm0)) 150 if (IS_ERR(key->tfm0))
151 goto err_tfm; 151 goto err_tfm;
152 152
153 if (crypto_skcipher_setkey(key->tfm0, template->key, 153 if (crypto_sync_skcipher_setkey(key->tfm0, template->key,
154 IEEE802154_LLSEC_KEY_SIZE)) 154 IEEE802154_LLSEC_KEY_SIZE))
155 goto err_tfm0; 155 goto err_tfm0;
156 156
157 return key; 157 return key;
158 158
159err_tfm0: 159err_tfm0:
160 crypto_free_skcipher(key->tfm0); 160 crypto_free_sync_skcipher(key->tfm0);
161err_tfm: 161err_tfm:
162 for (i = 0; i < ARRAY_SIZE(key->tfm); i++) 162 for (i = 0; i < ARRAY_SIZE(key->tfm); i++)
163 if (key->tfm[i]) 163 if (key->tfm[i])
@@ -177,7 +177,7 @@ static void llsec_key_release(struct kref *ref)
177 for (i = 0; i < ARRAY_SIZE(key->tfm); i++) 177 for (i = 0; i < ARRAY_SIZE(key->tfm); i++)
178 crypto_free_aead(key->tfm[i]); 178 crypto_free_aead(key->tfm[i]);
179 179
180 crypto_free_skcipher(key->tfm0); 180 crypto_free_sync_skcipher(key->tfm0);
181 kzfree(key); 181 kzfree(key);
182} 182}
183 183
@@ -622,7 +622,7 @@ llsec_do_encrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec,
622{ 622{
623 u8 iv[16]; 623 u8 iv[16];
624 struct scatterlist src; 624 struct scatterlist src;
625 SKCIPHER_REQUEST_ON_STACK(req, key->tfm0); 625 SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->tfm0);
626 int err, datalen; 626 int err, datalen;
627 unsigned char *data; 627 unsigned char *data;
628 628
@@ -632,7 +632,7 @@ llsec_do_encrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec,
632 datalen = skb_tail_pointer(skb) - data; 632 datalen = skb_tail_pointer(skb) - data;
633 sg_init_one(&src, data, datalen); 633 sg_init_one(&src, data, datalen);
634 634
635 skcipher_request_set_tfm(req, key->tfm0); 635 skcipher_request_set_sync_tfm(req, key->tfm0);
636 skcipher_request_set_callback(req, 0, NULL, NULL); 636 skcipher_request_set_callback(req, 0, NULL, NULL);
637 skcipher_request_set_crypt(req, &src, &src, datalen, iv); 637 skcipher_request_set_crypt(req, &src, &src, datalen, iv);
638 err = crypto_skcipher_encrypt(req); 638 err = crypto_skcipher_encrypt(req);
@@ -840,7 +840,7 @@ llsec_do_decrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec,
840 unsigned char *data; 840 unsigned char *data;
841 int datalen; 841 int datalen;
842 struct scatterlist src; 842 struct scatterlist src;
843 SKCIPHER_REQUEST_ON_STACK(req, key->tfm0); 843 SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->tfm0);
844 int err; 844 int err;
845 845
846 llsec_geniv(iv, dev_addr, &hdr->sec); 846 llsec_geniv(iv, dev_addr, &hdr->sec);
@@ -849,7 +849,7 @@ llsec_do_decrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec,
849 849
850 sg_init_one(&src, data, datalen); 850 sg_init_one(&src, data, datalen);
851 851
852 skcipher_request_set_tfm(req, key->tfm0); 852 skcipher_request_set_sync_tfm(req, key->tfm0);
853 skcipher_request_set_callback(req, 0, NULL, NULL); 853 skcipher_request_set_callback(req, 0, NULL, NULL);
854 skcipher_request_set_crypt(req, &src, &src, datalen, iv); 854 skcipher_request_set_crypt(req, &src, &src, datalen, iv);
855 855
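
The llsec conversion above moves the CTR(AES) cipher to the synchronous skcipher API so the request can safely live on the stack. A kernel-context sketch of that call sequence (not user-space runnable; error handling trimmed, names hypothetical):

#include <linux/err.h>
#include <linux/scatterlist.h>
#include <crypto/skcipher.h>

static int example_ctr_aes(const u8 *key, unsigned int keylen,
                           struct scatterlist *sg, unsigned int len, u8 *iv)
{
        struct crypto_sync_skcipher *tfm;
        int err;

        tfm = crypto_alloc_sync_skcipher("ctr(aes)", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        err = crypto_sync_skcipher_setkey(tfm, key, keylen);
        if (!err) {
                SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);

                skcipher_request_set_sync_tfm(req, tfm);
                skcipher_request_set_callback(req, 0, NULL, NULL);
                skcipher_request_set_crypt(req, sg, sg, len, iv);
                err = crypto_skcipher_encrypt(req);
                skcipher_request_zero(req);
        }

        crypto_free_sync_skcipher(tfm);
        return err;
}
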
diff --git a/net/mac802154/llsec.h b/net/mac802154/llsec.h
index 6f3b658e3279..8be46d74dc39 100644
--- a/net/mac802154/llsec.h
+++ b/net/mac802154/llsec.h
@@ -29,7 +29,7 @@ struct mac802154_llsec_key {
29 29
30 /* one tfm for each authsize (4/8/16) */ 30 /* one tfm for each authsize (4/8/16) */
31 struct crypto_aead *tfm[3]; 31 struct crypto_aead *tfm[3];
32 struct crypto_skcipher *tfm0; 32 struct crypto_sync_skcipher *tfm0;
33 33
34 struct kref ref; 34 struct kref ref;
35}; 35};
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 8fbe6cdbe255..7d55d4c04088 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1223,7 +1223,7 @@ static int mpls_netconf_get_devconf(struct sk_buff *in_skb,
1223 int err; 1223 int err;
1224 1224
1225 err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, 1225 err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX,
1226 devconf_mpls_policy, NULL); 1226 devconf_mpls_policy, extack);
1227 if (err < 0) 1227 if (err < 0)
1228 goto errout; 1228 goto errout;
1229 1229
@@ -1263,6 +1263,7 @@ errout:
1263static int mpls_netconf_dump_devconf(struct sk_buff *skb, 1263static int mpls_netconf_dump_devconf(struct sk_buff *skb,
1264 struct netlink_callback *cb) 1264 struct netlink_callback *cb)
1265{ 1265{
1266 const struct nlmsghdr *nlh = cb->nlh;
1266 struct net *net = sock_net(skb->sk); 1267 struct net *net = sock_net(skb->sk);
1267 struct hlist_head *head; 1268 struct hlist_head *head;
1268 struct net_device *dev; 1269 struct net_device *dev;
@@ -1270,6 +1271,21 @@ static int mpls_netconf_dump_devconf(struct sk_buff *skb,
1270 int idx, s_idx; 1271 int idx, s_idx;
1271 int h, s_h; 1272 int h, s_h;
1272 1273
1274 if (cb->strict_check) {
1275 struct netlink_ext_ack *extack = cb->extack;
1276 struct netconfmsg *ncm;
1277
1278 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) {
1279 NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf dump request");
1280 return -EINVAL;
1281 }
1282
1283 if (nlmsg_attrlen(nlh, sizeof(*ncm))) {
1284 NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in netconf dump request");
1285 return -EINVAL;
1286 }
1287 }
1288
1273 s_h = cb->args[0]; 1289 s_h = cb->args[0];
1274 s_idx = idx = cb->args[1]; 1290 s_idx = idx = cb->args[1];
1275 1291
@@ -1286,7 +1302,7 @@ static int mpls_netconf_dump_devconf(struct sk_buff *skb,
1286 goto cont; 1302 goto cont;
1287 if (mpls_netconf_fill_devconf(skb, mdev, 1303 if (mpls_netconf_fill_devconf(skb, mdev,
1288 NETLINK_CB(cb->skb).portid, 1304 NETLINK_CB(cb->skb).portid,
1289 cb->nlh->nlmsg_seq, 1305 nlh->nlmsg_seq,
1290 RTM_NEWNETCONF, 1306 RTM_NEWNETCONF,
1291 NLM_F_MULTI, 1307 NLM_F_MULTI,
1292 NETCONFA_ALL) < 0) { 1308 NETCONFA_ALL) < 0) {
@@ -2015,30 +2031,140 @@ nla_put_failure:
2015 return -EMSGSIZE; 2031 return -EMSGSIZE;
2016} 2032}
2017 2033
2034#if IS_ENABLED(CONFIG_INET)
2035static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
2036 struct fib_dump_filter *filter,
2037 struct netlink_callback *cb)
2038{
2039 return ip_valid_fib_dump_req(net, nlh, filter, cb);
2040}
2041#else
2042static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
2043 struct fib_dump_filter *filter,
2044 struct netlink_callback *cb)
2045{
2046 struct netlink_ext_ack *extack = cb->extack;
2047 struct nlattr *tb[RTA_MAX + 1];
2048 struct rtmsg *rtm;
2049 int err, i;
2050
2051 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
2052 NL_SET_ERR_MSG_MOD(extack, "Invalid header for FIB dump request");
2053 return -EINVAL;
2054 }
2055
2056 rtm = nlmsg_data(nlh);
2057 if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos ||
2058 rtm->rtm_table || rtm->rtm_scope || rtm->rtm_type ||
2059 rtm->rtm_flags) {
2060 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for FIB dump request");
2061 return -EINVAL;
2062 }
2063
2064 if (rtm->rtm_protocol) {
2065 filter->protocol = rtm->rtm_protocol;
2066 filter->filter_set = 1;
2067 cb->answer_flags = NLM_F_DUMP_FILTERED;
2068 }
2069
2070 err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
2071 rtm_mpls_policy, extack);
2072 if (err < 0)
2073 return err;
2074
2075 for (i = 0; i <= RTA_MAX; ++i) {
2076 int ifindex;
2077
2078 if (i == RTA_OIF) {
2079 ifindex = nla_get_u32(tb[i]);
2080 filter->dev = __dev_get_by_index(net, ifindex);
2081 if (!filter->dev)
2082 return -ENODEV;
2083 filter->filter_set = 1;
2084 } else if (tb[i]) {
2085 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in dump request");
2086 return -EINVAL;
2087 }
2088 }
2089
2090 return 0;
2091}
2092#endif
2093
2094static bool mpls_rt_uses_dev(struct mpls_route *rt,
2095 const struct net_device *dev)
2096{
2097 struct net_device *nh_dev;
2098
2099 if (rt->rt_nhn == 1) {
2100 struct mpls_nh *nh = rt->rt_nh;
2101
2102 nh_dev = rtnl_dereference(nh->nh_dev);
2103 if (dev == nh_dev)
2104 return true;
2105 } else {
2106 for_nexthops(rt) {
2107 nh_dev = rtnl_dereference(nh->nh_dev);
2108 if (nh_dev == dev)
2109 return true;
2110 } endfor_nexthops(rt);
2111 }
2112
2113 return false;
2114}
2115
2018static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) 2116static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
2019{ 2117{
2118 const struct nlmsghdr *nlh = cb->nlh;
2020 struct net *net = sock_net(skb->sk); 2119 struct net *net = sock_net(skb->sk);
2021 struct mpls_route __rcu **platform_label; 2120 struct mpls_route __rcu **platform_label;
2121 struct fib_dump_filter filter = {};
2122 unsigned int flags = NLM_F_MULTI;
2022 size_t platform_labels; 2123 size_t platform_labels;
2023 unsigned int index; 2124 unsigned int index;
2024 2125
2025 ASSERT_RTNL(); 2126 ASSERT_RTNL();
2026 2127
2128 if (cb->strict_check) {
2129 int err;
2130
2131 err = mpls_valid_fib_dump_req(net, nlh, &filter, cb);
2132 if (err < 0)
2133 return err;
2134
2135 /* for MPLS, there is only 1 table with fixed type and flags.
2136 * If either are set in the filter then return nothing.
2137 */
2138 if ((filter.table_id && filter.table_id != RT_TABLE_MAIN) ||
2139 (filter.rt_type && filter.rt_type != RTN_UNICAST) ||
2140 filter.flags)
2141 return skb->len;
2142 }
2143
2027 index = cb->args[0]; 2144 index = cb->args[0];
2028 if (index < MPLS_LABEL_FIRST_UNRESERVED) 2145 if (index < MPLS_LABEL_FIRST_UNRESERVED)
2029 index = MPLS_LABEL_FIRST_UNRESERVED; 2146 index = MPLS_LABEL_FIRST_UNRESERVED;
2030 2147
2031 platform_label = rtnl_dereference(net->mpls.platform_label); 2148 platform_label = rtnl_dereference(net->mpls.platform_label);
2032 platform_labels = net->mpls.platform_labels; 2149 platform_labels = net->mpls.platform_labels;
2150
2151 if (filter.filter_set)
2152 flags |= NLM_F_DUMP_FILTERED;
2153
2033 for (; index < platform_labels; index++) { 2154 for (; index < platform_labels; index++) {
2034 struct mpls_route *rt; 2155 struct mpls_route *rt;
2156
2035 rt = rtnl_dereference(platform_label[index]); 2157 rt = rtnl_dereference(platform_label[index]);
2036 if (!rt) 2158 if (!rt)
2037 continue; 2159 continue;
2038 2160
2161 if ((filter.dev && !mpls_rt_uses_dev(rt, filter.dev)) ||
2162 (filter.protocol && rt->rt_protocol != filter.protocol))
2163 continue;
2164
2039 if (mpls_dump_route(skb, NETLINK_CB(cb->skb).portid, 2165 if (mpls_dump_route(skb, NETLINK_CB(cb->skb).portid,
2040 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 2166 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2041 index, rt, NLM_F_MULTI) < 0) 2167 index, rt, flags) < 0)
2042 break; 2168 break;
2043 } 2169 }
2044 cb->args[0] = index; 2170 cb->args[0] = index;
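
mpls_valid_fib_dump_req() above implements the strict_check contract: reject a dump request whose header is too short or whose unused header fields are non-zero, and treat rtm_protocol as a filter. A simplified standalone sketch of that idea (the struct below is a stand-in, not the real rtmsg/netlink layout):

#include <stdio.h>

struct dump_hdr {
        unsigned char family;
        unsigned char protocol;         /* the only field used as a filter */
        unsigned char table;
        unsigned char flags;
};

static int validate_dump_req(const struct dump_hdr *h, unsigned int len,
                             unsigned char *protocol_filter)
{
        if (len < sizeof(*h))
                return -1;              /* invalid header for dump request */
        if (h->table || h->flags)
                return -1;              /* unused fields must be zero */
        *protocol_filter = h->protocol; /* dump becomes NLM_F_DUMP_FILTERED */
        return 0;
}

int main(void)
{
        struct dump_hdr ok = { .family = 28, .protocol = 186 };
        struct dump_hdr bad = { .family = 28, .table = 254 };
        unsigned char proto = 0;

        printf("ok:  %d\n", validate_dump_req(&ok, sizeof(ok), &proto));
        printf("bad: %d\n", validate_dump_req(&bad, sizeof(bad), &proto));
        return 0;
}
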
diff --git a/net/ncsi/Kconfig b/net/ncsi/Kconfig
index 08a8a6031fd7..7f2b46108a24 100644
--- a/net/ncsi/Kconfig
+++ b/net/ncsi/Kconfig
@@ -10,3 +10,9 @@ config NET_NCSI
10 support. Enable this only if your system connects to a network 10 support. Enable this only if your system connects to a network
11 device via NCSI and the ethernet driver you're using supports 11 device via NCSI and the ethernet driver you're using supports
12 the protocol explicitly. 12 the protocol explicitly.
13config NCSI_OEM_CMD_GET_MAC
14 bool "Get NCSI OEM MAC Address"
15 depends on NET_NCSI
16 ---help---
17 This allows to get MAC address from NCSI firmware and set them back to
18 controller.
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 8055e3965cef..1dae77c54009 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -68,6 +68,17 @@ enum {
68 NCSI_MODE_MAX 68 NCSI_MODE_MAX
69}; 69};
70 70
71/* OEM Vendor Manufacture ID */
72#define NCSI_OEM_MFR_MLX_ID 0x8119
73#define NCSI_OEM_MFR_BCM_ID 0x113d
74/* Broadcom specific OEM Command */
75#define NCSI_OEM_BCM_CMD_GMA 0x01 /* CMD ID for Get MAC */
76/* OEM Command payload lengths*/
77#define NCSI_OEM_BCM_CMD_GMA_LEN 12
78/* Mac address offset in OEM response */
79#define BCM_MAC_ADDR_OFFSET 28
80
81
71struct ncsi_channel_version { 82struct ncsi_channel_version {
72 u32 version; /* Supported BCD encoded NCSI version */ 83 u32 version; /* Supported BCD encoded NCSI version */
73 u32 alpha2; /* Supported BCD encoded NCSI version */ 84 u32 alpha2; /* Supported BCD encoded NCSI version */
@@ -171,6 +182,8 @@ struct ncsi_package;
171#define NCSI_RESERVED_CHANNEL 0x1f 182#define NCSI_RESERVED_CHANNEL 0x1f
172#define NCSI_CHANNEL_INDEX(c) ((c) & ((1 << NCSI_PACKAGE_SHIFT) - 1)) 183#define NCSI_CHANNEL_INDEX(c) ((c) & ((1 << NCSI_PACKAGE_SHIFT) - 1))
173#define NCSI_TO_CHANNEL(p, c) (((p) << NCSI_PACKAGE_SHIFT) | (c)) 184#define NCSI_TO_CHANNEL(p, c) (((p) << NCSI_PACKAGE_SHIFT) | (c))
185#define NCSI_MAX_PACKAGE 8
186#define NCSI_MAX_CHANNEL 32
174 187
175struct ncsi_channel { 188struct ncsi_channel {
176 unsigned char id; 189 unsigned char id;
@@ -216,11 +229,15 @@ struct ncsi_request {
216 bool used; /* Request that has been assigned */ 229 bool used; /* Request that has been assigned */
217 unsigned int flags; /* NCSI request property */ 230 unsigned int flags; /* NCSI request property */
218#define NCSI_REQ_FLAG_EVENT_DRIVEN 1 231#define NCSI_REQ_FLAG_EVENT_DRIVEN 1
232#define NCSI_REQ_FLAG_NETLINK_DRIVEN 2
219 struct ncsi_dev_priv *ndp; /* Associated NCSI device */ 233 struct ncsi_dev_priv *ndp; /* Associated NCSI device */
220 struct sk_buff *cmd; /* Associated NCSI command packet */ 234 struct sk_buff *cmd; /* Associated NCSI command packet */
221 struct sk_buff *rsp; /* Associated NCSI response packet */ 235 struct sk_buff *rsp; /* Associated NCSI response packet */
222 struct timer_list timer; /* Timer on waiting for response */ 236 struct timer_list timer; /* Timer on waiting for response */
223 bool enabled; /* Time has been enabled or not */ 237 bool enabled; /* Time has been enabled or not */
238 u32 snd_seq; /* netlink sending sequence number */
239 u32 snd_portid; /* netlink portid of sender */
240 struct nlmsghdr nlhdr; /* netlink message header */
224}; 241};
225 242
226enum { 243enum {
@@ -236,6 +253,7 @@ enum {
236 ncsi_dev_state_probe_dp, 253 ncsi_dev_state_probe_dp,
237 ncsi_dev_state_config_sp = 0x0301, 254 ncsi_dev_state_config_sp = 0x0301,
238 ncsi_dev_state_config_cis, 255 ncsi_dev_state_config_cis,
256 ncsi_dev_state_config_oem_gma,
239 ncsi_dev_state_config_clear_vids, 257 ncsi_dev_state_config_clear_vids,
240 ncsi_dev_state_config_svf, 258 ncsi_dev_state_config_svf,
241 ncsi_dev_state_config_ev, 259 ncsi_dev_state_config_ev,
@@ -269,6 +287,7 @@ struct ncsi_dev_priv {
269#define NCSI_DEV_PROBED 1 /* Finalized NCSI topology */ 287#define NCSI_DEV_PROBED 1 /* Finalized NCSI topology */
270#define NCSI_DEV_HWA 2 /* Enabled HW arbitration */ 288#define NCSI_DEV_HWA 2 /* Enabled HW arbitration */
271#define NCSI_DEV_RESHUFFLE 4 289#define NCSI_DEV_RESHUFFLE 4
290 unsigned int gma_flag; /* OEM GMA flag */
272 spinlock_t lock; /* Protect the NCSI device */ 291 spinlock_t lock; /* Protect the NCSI device */
273#if IS_ENABLED(CONFIG_IPV6) 292#if IS_ENABLED(CONFIG_IPV6)
274 unsigned int inet6_addr_num; /* Number of IPv6 addresses */ 293 unsigned int inet6_addr_num; /* Number of IPv6 addresses */
@@ -305,6 +324,8 @@ struct ncsi_cmd_arg {
305 unsigned short words[8]; 324 unsigned short words[8];
306 unsigned int dwords[4]; 325 unsigned int dwords[4];
307 }; 326 };
327 unsigned char *data; /* NCSI OEM data */
328 struct genl_info *info; /* Netlink information */
308}; 329};
309 330
310extern struct list_head ncsi_dev_list; 331extern struct list_head ncsi_dev_list;
diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c
index 7567ca63aae2..356af474e43c 100644
--- a/net/ncsi/ncsi-cmd.c
+++ b/net/ncsi/ncsi-cmd.c
@@ -17,6 +17,7 @@
17#include <net/ncsi.h> 17#include <net/ncsi.h>
18#include <net/net_namespace.h> 18#include <net/net_namespace.h>
19#include <net/sock.h> 19#include <net/sock.h>
20#include <net/genetlink.h>
20 21
21#include "internal.h" 22#include "internal.h"
22#include "ncsi-pkt.h" 23#include "ncsi-pkt.h"
@@ -211,6 +212,25 @@ static int ncsi_cmd_handler_snfc(struct sk_buff *skb,
211 return 0; 212 return 0;
212} 213}
213 214
215static int ncsi_cmd_handler_oem(struct sk_buff *skb,
216 struct ncsi_cmd_arg *nca)
217{
218 struct ncsi_cmd_oem_pkt *cmd;
219 unsigned int len;
220
221 len = sizeof(struct ncsi_cmd_pkt_hdr) + 4;
222 if (nca->payload < 26)
223 len += 26;
224 else
225 len += nca->payload;
226
227 cmd = skb_put_zero(skb, len);
228 memcpy(&cmd->mfr_id, nca->data, nca->payload);
229 ncsi_cmd_build_header(&cmd->cmd.common, nca);
230
231 return 0;
232}
233
214static struct ncsi_cmd_handler { 234static struct ncsi_cmd_handler {
215 unsigned char type; 235 unsigned char type;
216 int payload; 236 int payload;
@@ -244,7 +264,7 @@ static struct ncsi_cmd_handler {
244 { NCSI_PKT_CMD_GNS, 0, ncsi_cmd_handler_default }, 264 { NCSI_PKT_CMD_GNS, 0, ncsi_cmd_handler_default },
245 { NCSI_PKT_CMD_GNPTS, 0, ncsi_cmd_handler_default }, 265 { NCSI_PKT_CMD_GNPTS, 0, ncsi_cmd_handler_default },
246 { NCSI_PKT_CMD_GPS, 0, ncsi_cmd_handler_default }, 266 { NCSI_PKT_CMD_GPS, 0, ncsi_cmd_handler_default },
247 { NCSI_PKT_CMD_OEM, 0, NULL }, 267 { NCSI_PKT_CMD_OEM, -1, ncsi_cmd_handler_oem },
248 { NCSI_PKT_CMD_PLDM, 0, NULL }, 268 { NCSI_PKT_CMD_PLDM, 0, NULL },
249 { NCSI_PKT_CMD_GPUUID, 0, ncsi_cmd_handler_default } 269 { NCSI_PKT_CMD_GPUUID, 0, ncsi_cmd_handler_default }
250}; 270};
@@ -316,12 +336,24 @@ int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca)
316 return -ENOENT; 336 return -ENOENT;
317 } 337 }
318 338
319 /* Get packet payload length and allocate the request */ 339 /* Get packet payload length and allocate the request
320 nca->payload = nch->payload; 340 * It is expected that if length set as negative in
341 * handler structure means caller is initializing it
342 * and setting length in nca before calling xmit function
343 */
344 if (nch->payload >= 0)
345 nca->payload = nch->payload;
321 nr = ncsi_alloc_command(nca); 346 nr = ncsi_alloc_command(nca);
322 if (!nr) 347 if (!nr)
323 return -ENOMEM; 348 return -ENOMEM;
324 349
350 /* track netlink information */
351 if (nca->req_flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) {
352 nr->snd_seq = nca->info->snd_seq;
353 nr->snd_portid = nca->info->snd_portid;
354 nr->nlhdr = *nca->info->nlhdr;
355 }
356
325 /* Prepare the packet */ 357 /* Prepare the packet */
326 nca->id = nr->id; 358 nca->id = nr->id;
327 ret = nch->handler(nr->cmd, nca); 359 ret = nch->handler(nr->cmd, nca);
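
ncsi_cmd_handler_oem() above sizes the packet as the command header plus 4 bytes plus the payload, padding the payload up to 26 bytes for short OEM commands. A standalone sketch of that length rule (the 16-byte header size is an illustrative assumption, not the real struct size):

#include <stdio.h>

static unsigned int oem_cmd_len(unsigned int hdr_len, unsigned int payload)
{
        unsigned int len = hdr_len + 4;         /* mirrors the "+ 4" above */

        len += (payload < 26) ? 26 : payload;   /* pad short OEM payloads */
        return len;
}

int main(void)
{
        printf("payload 12 -> %u bytes\n", oem_cmd_len(16, 12)); /* padded */
        printf("payload 40 -> %u bytes\n", oem_cmd_len(16, 40));
        return 0;
}
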
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index 091284760d21..bfc43b28c7a6 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -19,6 +19,7 @@
19#include <net/addrconf.h> 19#include <net/addrconf.h>
20#include <net/ipv6.h> 20#include <net/ipv6.h>
21#include <net/if_inet6.h> 21#include <net/if_inet6.h>
22#include <net/genetlink.h>
22 23
23#include "internal.h" 24#include "internal.h"
24#include "ncsi-pkt.h" 25#include "ncsi-pkt.h"
@@ -406,6 +407,9 @@ static void ncsi_request_timeout(struct timer_list *t)
406{ 407{
407 struct ncsi_request *nr = from_timer(nr, t, timer); 408 struct ncsi_request *nr = from_timer(nr, t, timer);
408 struct ncsi_dev_priv *ndp = nr->ndp; 409 struct ncsi_dev_priv *ndp = nr->ndp;
410 struct ncsi_cmd_pkt *cmd;
411 struct ncsi_package *np;
412 struct ncsi_channel *nc;
409 unsigned long flags; 413 unsigned long flags;
410 414
411 /* If the request already had associated response, 415 /* If the request already had associated response,
@@ -419,6 +423,18 @@ static void ncsi_request_timeout(struct timer_list *t)
419 } 423 }
420 spin_unlock_irqrestore(&ndp->lock, flags); 424 spin_unlock_irqrestore(&ndp->lock, flags);
421 425
426 if (nr->flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) {
427 if (nr->cmd) {
428 /* Find the package */
429 cmd = (struct ncsi_cmd_pkt *)
430 skb_network_header(nr->cmd);
431 ncsi_find_package_and_channel(ndp,
432 cmd->cmd.common.channel,
433 &np, &nc);
434 ncsi_send_netlink_timeout(nr, np, nc);
435 }
436 }
437
422 /* Release the request */ 438 /* Release the request */
423 ncsi_free_request(nr); 439 ncsi_free_request(nr);
424} 440}
@@ -635,6 +651,72 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
635 return 0; 651 return 0;
636} 652}
637 653
654#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC)
655
656/* NCSI OEM Command APIs */
657static int ncsi_oem_gma_handler_bcm(struct ncsi_cmd_arg *nca)
658{
659 unsigned char data[NCSI_OEM_BCM_CMD_GMA_LEN];
660 int ret = 0;
661
662 nca->payload = NCSI_OEM_BCM_CMD_GMA_LEN;
663
664 memset(data, 0, NCSI_OEM_BCM_CMD_GMA_LEN);
665 *(unsigned int *)data = ntohl(NCSI_OEM_MFR_BCM_ID);
666 data[5] = NCSI_OEM_BCM_CMD_GMA;
667
668 nca->data = data;
669
670 ret = ncsi_xmit_cmd(nca);
671 if (ret)
672 netdev_err(nca->ndp->ndev.dev,
673 "NCSI: Failed to transmit cmd 0x%x during configure\n",
674 nca->type);
675 return ret;
676}
677
678/* OEM Command handlers initialization */
679static struct ncsi_oem_gma_handler {
680 unsigned int mfr_id;
681 int (*handler)(struct ncsi_cmd_arg *nca);
682} ncsi_oem_gma_handlers[] = {
683 { NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm }
684};
685
686static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id)
687{
688 struct ncsi_oem_gma_handler *nch = NULL;
689 int i;
690
691 /* This function should only be called once, return if flag set */
692 if (nca->ndp->gma_flag == 1)
693 return -1;
694
695 /* Find gma handler for given manufacturer id */
696 for (i = 0; i < ARRAY_SIZE(ncsi_oem_gma_handlers); i++) {
697 if (ncsi_oem_gma_handlers[i].mfr_id == mf_id) {
698 if (ncsi_oem_gma_handlers[i].handler)
699 nch = &ncsi_oem_gma_handlers[i];
700 break;
701 }
702 }
703
704 if (!nch) {
705 netdev_err(nca->ndp->ndev.dev,
706 "NCSI: No GMA handler available for MFR-ID (0x%x)\n",
707 mf_id);
708 return -1;
709 }
710
711 /* Set the flag for GMA command which should only be called once */
712 nca->ndp->gma_flag = 1;
713
714 /* Get Mac address from NCSI device */
715 return nch->handler(nca);
716}
717
718#endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */
719
638static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) 720static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
639{ 721{
640 struct ncsi_dev *nd = &ndp->ndev; 722 struct ncsi_dev *nd = &ndp->ndev;
@@ -685,7 +767,23 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
685 goto error; 767 goto error;
686 } 768 }
687 769
770 nd->state = ncsi_dev_state_config_oem_gma;
771 break;
772 case ncsi_dev_state_config_oem_gma:
688 nd->state = ncsi_dev_state_config_clear_vids; 773 nd->state = ncsi_dev_state_config_clear_vids;
774 ret = -1;
775
776#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC)
777 nca.type = NCSI_PKT_CMD_OEM;
778 nca.package = np->id;
779 nca.channel = nc->id;
780 ndp->pending_req_num = 1;
781 ret = ncsi_gma_handler(&nca, nc->version.mf_id);
782#endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */
783
784 if (ret < 0)
785 schedule_work(&ndp->work);
786
689 break; 787 break;
690 case ncsi_dev_state_config_clear_vids: 788 case ncsi_dev_state_config_clear_vids:
691 case ncsi_dev_state_config_svf: 789 case ncsi_dev_state_config_svf:
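
ncsi_gma_handler() above dispatches the OEM Get-MAC command through a static table keyed by manufacturer ID, so supporting a new vendor only needs a table entry. A standalone sketch of that lookup pattern (handlers are stubs; only the Broadcom ID value matches a constant from the patch):

#include <stdio.h>
#include <stddef.h>

struct gma_handler {
        unsigned int mfr_id;
        int (*handler)(void);
};

static int bcm_gma(void)
{
        return 0;       /* would build and send the OEM Get-MAC command */
}

static const struct gma_handler handlers[] = {
        { 0x113d, bcm_gma },    /* same value as NCSI_OEM_MFR_BCM_ID */
};

static int dispatch_gma(unsigned int mfr_id)
{
        for (size_t i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++)
                if (handlers[i].mfr_id == mfr_id && handlers[i].handler)
                        return handlers[i].handler();
        return -1;      /* no GMA handler for this vendor */
}

int main(void)
{
        printf("Broadcom: %d\n", dispatch_gma(0x113d));
        printf("unknown:  %d\n", dispatch_gma(0xabcd));
        return 0;
}
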
diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
index 45f33d6dedf7..33314381b4f5 100644
--- a/net/ncsi/ncsi-netlink.c
+++ b/net/ncsi/ncsi-netlink.c
@@ -12,7 +12,6 @@
12#include <linux/if_arp.h> 12#include <linux/if_arp.h>
13#include <linux/rtnetlink.h> 13#include <linux/rtnetlink.h>
14#include <linux/etherdevice.h> 14#include <linux/etherdevice.h>
15#include <linux/module.h>
16#include <net/genetlink.h> 15#include <net/genetlink.h>
17#include <net/ncsi.h> 16#include <net/ncsi.h>
18#include <linux/skbuff.h> 17#include <linux/skbuff.h>
@@ -20,6 +19,7 @@
20#include <uapi/linux/ncsi.h> 19#include <uapi/linux/ncsi.h>
21 20
22#include "internal.h" 21#include "internal.h"
22#include "ncsi-pkt.h"
23#include "ncsi-netlink.h" 23#include "ncsi-netlink.h"
24 24
25static struct genl_family ncsi_genl_family; 25static struct genl_family ncsi_genl_family;
@@ -29,6 +29,7 @@ static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = {
29 [NCSI_ATTR_PACKAGE_LIST] = { .type = NLA_NESTED }, 29 [NCSI_ATTR_PACKAGE_LIST] = { .type = NLA_NESTED },
30 [NCSI_ATTR_PACKAGE_ID] = { .type = NLA_U32 }, 30 [NCSI_ATTR_PACKAGE_ID] = { .type = NLA_U32 },
31 [NCSI_ATTR_CHANNEL_ID] = { .type = NLA_U32 }, 31 [NCSI_ATTR_CHANNEL_ID] = { .type = NLA_U32 },
32 [NCSI_ATTR_DATA] = { .type = NLA_BINARY, .len = 2048 },
32}; 33};
33 34
34static struct ncsi_dev_priv *ndp_from_ifindex(struct net *net, u32 ifindex) 35static struct ncsi_dev_priv *ndp_from_ifindex(struct net *net, u32 ifindex)
@@ -366,6 +367,202 @@ static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info)
366 return 0; 367 return 0;
367} 368}
368 369
370static int ncsi_send_cmd_nl(struct sk_buff *msg, struct genl_info *info)
371{
372 struct ncsi_dev_priv *ndp;
373 struct ncsi_pkt_hdr *hdr;
374 struct ncsi_cmd_arg nca;
375 unsigned char *data;
376 u32 package_id;
377 u32 channel_id;
378 int len, ret;
379
380 if (!info || !info->attrs) {
381 ret = -EINVAL;
382 goto out;
383 }
384
385 if (!info->attrs[NCSI_ATTR_IFINDEX]) {
386 ret = -EINVAL;
387 goto out;
388 }
389
390 if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) {
391 ret = -EINVAL;
392 goto out;
393 }
394
395 if (!info->attrs[NCSI_ATTR_CHANNEL_ID]) {
396 ret = -EINVAL;
397 goto out;
398 }
399
400 if (!info->attrs[NCSI_ATTR_DATA]) {
401 ret = -EINVAL;
402 goto out;
403 }
404
405 ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)),
406 nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX]));
407 if (!ndp) {
408 ret = -ENODEV;
409 goto out;
410 }
411
412 package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]);
413 channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]);
414
415 if (package_id >= NCSI_MAX_PACKAGE || channel_id >= NCSI_MAX_CHANNEL) {
416 ret = -ERANGE;
417 goto out_netlink;
418 }
419
420 len = nla_len(info->attrs[NCSI_ATTR_DATA]);
421 if (len < sizeof(struct ncsi_pkt_hdr)) {
422 netdev_info(ndp->ndev.dev, "NCSI: no command to send %u\n",
423 package_id);
424 ret = -EINVAL;
425 goto out_netlink;
426 } else {
427 data = (unsigned char *)nla_data(info->attrs[NCSI_ATTR_DATA]);
428 }
429
430 hdr = (struct ncsi_pkt_hdr *)data;
431
432 nca.ndp = ndp;
433 nca.package = (unsigned char)package_id;
434 nca.channel = (unsigned char)channel_id;
435 nca.type = hdr->type;
436 nca.req_flags = NCSI_REQ_FLAG_NETLINK_DRIVEN;
437 nca.info = info;
438 nca.payload = ntohs(hdr->length);
439 nca.data = data + sizeof(*hdr);
440
441 ret = ncsi_xmit_cmd(&nca);
442out_netlink:
443 if (ret != 0) {
444 netdev_err(ndp->ndev.dev,
445 "NCSI: Error %d sending command\n",
446 ret);
447 ncsi_send_netlink_err(ndp->ndev.dev,
448 info->snd_seq,
449 info->snd_portid,
450 info->nlhdr,
451 ret);
452 }
453out:
454 return ret;
455}
456
457int ncsi_send_netlink_rsp(struct ncsi_request *nr,
458 struct ncsi_package *np,
459 struct ncsi_channel *nc)
460{
461 struct sk_buff *skb;
462 struct net *net;
463 void *hdr;
464 int rc;
465
466 net = dev_net(nr->rsp->dev);
467
468 skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
469 if (!skb)
470 return -ENOMEM;
471
472 hdr = genlmsg_put(skb, nr->snd_portid, nr->snd_seq,
473 &ncsi_genl_family, 0, NCSI_CMD_SEND_CMD);
474 if (!hdr) {
475 kfree_skb(skb);
476 return -EMSGSIZE;
477 }
478
479 nla_put_u32(skb, NCSI_ATTR_IFINDEX, nr->rsp->dev->ifindex);
480 if (np)
481 nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, np->id);
482 if (nc)
483 nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, nc->id);
484 else
485 nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, NCSI_RESERVED_CHANNEL);
486
487 rc = nla_put(skb, NCSI_ATTR_DATA, nr->rsp->len, (void *)nr->rsp->data);
488 if (rc)
489 goto err;
490
491 genlmsg_end(skb, hdr);
492 return genlmsg_unicast(net, skb, nr->snd_portid);
493
494err:
495 kfree_skb(skb);
496 return rc;
497}
498
499int ncsi_send_netlink_timeout(struct ncsi_request *nr,
500 struct ncsi_package *np,
501 struct ncsi_channel *nc)
502{
503 struct sk_buff *skb;
504 struct net *net;
505 void *hdr;
506
507 skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
508 if (!skb)
509 return -ENOMEM;
510
511 hdr = genlmsg_put(skb, nr->snd_portid, nr->snd_seq,
512 &ncsi_genl_family, 0, NCSI_CMD_SEND_CMD);
513 if (!hdr) {
514 kfree_skb(skb);
515 return -EMSGSIZE;
516 }
517
518 net = dev_net(nr->cmd->dev);
519
520 nla_put_u32(skb, NCSI_ATTR_IFINDEX, nr->cmd->dev->ifindex);
521
522 if (np)
523 nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, np->id);
524 else
525 nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID,
526 NCSI_PACKAGE_INDEX((((struct ncsi_pkt_hdr *)
527 nr->cmd->data)->channel)));
528
529 if (nc)
530 nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, nc->id);
531 else
532 nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, NCSI_RESERVED_CHANNEL);
533
534 genlmsg_end(skb, hdr);
535 return genlmsg_unicast(net, skb, nr->snd_portid);
536}
537
538int ncsi_send_netlink_err(struct net_device *dev,
539 u32 snd_seq,
540 u32 snd_portid,
541 struct nlmsghdr *nlhdr,
542 int err)
543{
544 struct nlmsghdr *nlh;
545 struct nlmsgerr *nle;
546 struct sk_buff *skb;
547 struct net *net;
548
549 skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
550 if (!skb)
551 return -ENOMEM;
552
553 net = dev_net(dev);
554
555 nlh = nlmsg_put(skb, snd_portid, snd_seq,
556 NLMSG_ERROR, sizeof(*nle), 0);
557 nle = (struct nlmsgerr *)nlmsg_data(nlh);
558 nle->error = err;
559 memcpy(&nle->msg, nlhdr, sizeof(*nlh));
560
561 nlmsg_end(skb, nlh);
562
563 return nlmsg_unicast(net->genl_sock, skb, snd_portid);
564}
565
369static const struct genl_ops ncsi_ops[] = { 566static const struct genl_ops ncsi_ops[] = {
370 { 567 {
371 .cmd = NCSI_CMD_PKG_INFO, 568 .cmd = NCSI_CMD_PKG_INFO,
@@ -386,6 +583,12 @@ static const struct genl_ops ncsi_ops[] = {
386 .doit = ncsi_clear_interface_nl, 583 .doit = ncsi_clear_interface_nl,
387 .flags = GENL_ADMIN_PERM, 584 .flags = GENL_ADMIN_PERM,
388 }, 585 },
586 {
587 .cmd = NCSI_CMD_SEND_CMD,
588 .policy = ncsi_genl_policy,
589 .doit = ncsi_send_cmd_nl,
590 .flags = GENL_ADMIN_PERM,
591 },
389}; 592};
390 593
391static struct genl_family ncsi_genl_family __ro_after_init = { 594static struct genl_family ncsi_genl_family __ro_after_init = {
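The NCSI_CMD_SEND_CMD op added above lets an administrator pass a raw NCSI control packet down to a chosen package/channel; as the handler shows, the kernel takes only the command type, payload length and payload from the user-supplied header, while addressing comes from the PACKAGE_ID/CHANNEL_ID attributes. A rough userspace sketch of driving it with libnl-genl could look like the following (the "NCSI" family name and the 0x0a Get Link Status opcode are assumptions based on the NCSI uapi header and spec, not something this patch defines; error handling is omitted):

/* Hypothetical userspace sketch (libnl-3): send one raw NCSI command via
 * the new NCSI_CMD_SEND_CMD op.  Family name "NCSI" and the 0x0a opcode
 * are assumptions, not defined by this patch.
 */
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/ncsi.h>
#include <net/if.h>

static int ncsi_send_raw_cmd(const char *ifname, unsigned int package,
			     unsigned int channel)
{
	/* 16-byte NCSI control header (mirrors struct ncsi_pkt_hdr) with no
	 * payload; the kernel reads only the type and length fields from it. */
	unsigned char pkt[16] = { 0 };
	struct nl_sock *sk;
	struct nl_msg *msg;
	int family, err;

	pkt[1] = 0x01;	/* header revision */
	pkt[4] = 0x0a;	/* command type: Get Link Status (assumed opcode) */

	sk = nl_socket_alloc();
	if (!sk)
		return -1;
	genl_connect(sk);
	family = genl_ctrl_resolve(sk, "NCSI");

	msg = nlmsg_alloc();
	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
		    NCSI_CMD_SEND_CMD, 0);
	nla_put_u32(msg, NCSI_ATTR_IFINDEX, if_nametoindex(ifname));
	nla_put_u32(msg, NCSI_ATTR_PACKAGE_ID, package);
	nla_put_u32(msg, NCSI_ATTR_CHANNEL_ID, channel);
	nla_put(msg, NCSI_ATTR_DATA, sizeof(pkt), pkt);

	err = nl_send_auto(sk, msg);
	nlmsg_free(msg);
	nl_socket_free(sk);
	return err < 0 ? err : 0;
}

Since the op is registered with GENL_ADMIN_PERM, the caller needs CAP_NET_ADMIN; the response (or a timeout/error notification) is delivered asynchronously through ncsi_send_netlink_rsp()/ncsi_send_netlink_timeout() added above.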
diff --git a/net/ncsi/ncsi-netlink.h b/net/ncsi/ncsi-netlink.h
index 91a5c256f8c4..c4a46887a932 100644
--- a/net/ncsi/ncsi-netlink.h
+++ b/net/ncsi/ncsi-netlink.h
@@ -14,6 +14,18 @@
14 14
15#include "internal.h" 15#include "internal.h"
16 16
17int ncsi_send_netlink_rsp(struct ncsi_request *nr,
18 struct ncsi_package *np,
19 struct ncsi_channel *nc);
20int ncsi_send_netlink_timeout(struct ncsi_request *nr,
21 struct ncsi_package *np,
22 struct ncsi_channel *nc);
23int ncsi_send_netlink_err(struct net_device *dev,
24 u32 snd_seq,
25 u32 snd_portid,
26 struct nlmsghdr *nlhdr,
27 int err);
28
17int ncsi_init_netlink(struct net_device *dev); 29int ncsi_init_netlink(struct net_device *dev);
18int ncsi_unregister_netlink(struct net_device *dev); 30int ncsi_unregister_netlink(struct net_device *dev);
19 31
diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h
index 91b4b66438df..4d3f06be38bd 100644
--- a/net/ncsi/ncsi-pkt.h
+++ b/net/ncsi/ncsi-pkt.h
@@ -151,6 +151,28 @@ struct ncsi_cmd_snfc_pkt {
151 unsigned char pad[22]; 151 unsigned char pad[22];
152}; 152};
153 153
154/* OEM Request Command as per NCSI Specification */
155struct ncsi_cmd_oem_pkt {
156 struct ncsi_cmd_pkt_hdr cmd; /* Command header */
157 __be32 mfr_id; /* Manufacturer ID */
158 unsigned char data[]; /* OEM Payload Data */
159};
160
161/* OEM Response Packet as per NCSI Specification */
162struct ncsi_rsp_oem_pkt {
163 struct ncsi_rsp_pkt_hdr rsp; /* Response header */
164 __be32 mfr_id; /* Manufacturer ID */
165 unsigned char data[]; /* Payload data */
166};
167
168/* Broadcom Response Data */
169struct ncsi_rsp_oem_bcm_pkt {
170 unsigned char ver; /* Payload Version */
171 unsigned char type; /* OEM Command type */
172 __be16 len; /* Payload Length */
173 unsigned char data[]; /* Cmd specific Data */
174};
175
154/* Get Link Status */ 176/* Get Link Status */
155struct ncsi_rsp_gls_pkt { 177struct ncsi_rsp_gls_pkt {
156 struct ncsi_rsp_pkt_hdr rsp; /* Response header */ 178 struct ncsi_rsp_pkt_hdr rsp; /* Response header */
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index 930c1d3796f0..77e07ba3f493 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -16,9 +16,11 @@
16#include <net/ncsi.h> 16#include <net/ncsi.h>
17#include <net/net_namespace.h> 17#include <net/net_namespace.h>
18#include <net/sock.h> 18#include <net/sock.h>
19#include <net/genetlink.h>
19 20
20#include "internal.h" 21#include "internal.h"
21#include "ncsi-pkt.h" 22#include "ncsi-pkt.h"
23#include "ncsi-netlink.h"
22 24
23static int ncsi_validate_rsp_pkt(struct ncsi_request *nr, 25static int ncsi_validate_rsp_pkt(struct ncsi_request *nr,
24 unsigned short payload) 26 unsigned short payload)
@@ -32,15 +34,25 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr,
32 * before calling this function. 34 * before calling this function.
33 */ 35 */
34 h = (struct ncsi_rsp_pkt_hdr *)skb_network_header(nr->rsp); 36 h = (struct ncsi_rsp_pkt_hdr *)skb_network_header(nr->rsp);
35 if (h->common.revision != NCSI_PKT_REVISION) 37
38 if (h->common.revision != NCSI_PKT_REVISION) {
39 netdev_dbg(nr->ndp->ndev.dev,
40 "NCSI: unsupported header revision\n");
36 return -EINVAL; 41 return -EINVAL;
37 if (ntohs(h->common.length) != payload) 42 }
43 if (ntohs(h->common.length) != payload) {
44 netdev_dbg(nr->ndp->ndev.dev,
45 "NCSI: payload length mismatched\n");
38 return -EINVAL; 46 return -EINVAL;
47 }
39 48
40 /* Check on code and reason */ 49 /* Check on code and reason */
41 if (ntohs(h->code) != NCSI_PKT_RSP_C_COMPLETED || 50 if (ntohs(h->code) != NCSI_PKT_RSP_C_COMPLETED ||
42 ntohs(h->reason) != NCSI_PKT_RSP_R_NO_ERROR) 51 ntohs(h->reason) != NCSI_PKT_RSP_R_NO_ERROR) {
43 return -EINVAL; 52 netdev_dbg(nr->ndp->ndev.dev,
53 "NCSI: non zero response/reason code\n");
54 return -EPERM;
55 }
44 56
45 /* Validate checksum, which might be zeroes if the 57 /* Validate checksum, which might be zeroes if the
46 * sender doesn't support checksum according to NCSI 58 * sender doesn't support checksum according to NCSI
@@ -52,8 +64,11 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr,
52 64
53 checksum = ncsi_calculate_checksum((unsigned char *)h, 65 checksum = ncsi_calculate_checksum((unsigned char *)h,
54 sizeof(*h) + payload - 4); 66 sizeof(*h) + payload - 4);
55 if (*pchecksum != htonl(checksum)) 67
68 if (*pchecksum != htonl(checksum)) {
69 netdev_dbg(nr->ndp->ndev.dev, "NCSI: checksum mismatched\n");
56 return -EINVAL; 70 return -EINVAL;
71 }
57 72
58 return 0; 73 return 0;
59} 74}
@@ -596,6 +611,87 @@ static int ncsi_rsp_handler_snfc(struct ncsi_request *nr)
596 return 0; 611 return 0;
597} 612}
598 613
614/* Response handler for Broadcom command Get Mac Address */
615static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr)
616{
617 struct ncsi_dev_priv *ndp = nr->ndp;
618 struct net_device *ndev = ndp->ndev.dev;
619 const struct net_device_ops *ops = ndev->netdev_ops;
620 struct ncsi_rsp_oem_pkt *rsp;
621 struct sockaddr saddr;
622 int ret = 0;
623
624 /* Get the response header */
625 rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp);
626
627 saddr.sa_family = ndev->type;
628 ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
629 memcpy(saddr.sa_data, &rsp->data[BCM_MAC_ADDR_OFFSET], ETH_ALEN);
630 /* Increase mac address by 1 for BMC's address */
631 saddr.sa_data[ETH_ALEN - 1]++;
632 ret = ops->ndo_set_mac_address(ndev, &saddr);
633 if (ret < 0)
634 netdev_warn(ndev, "NCSI: Writing mac address to device failed\n");
635
636 return ret;
637}
638
639/* Response handler for Broadcom card */
640static int ncsi_rsp_handler_oem_bcm(struct ncsi_request *nr)
641{
642 struct ncsi_rsp_oem_bcm_pkt *bcm;
643 struct ncsi_rsp_oem_pkt *rsp;
644
645 /* Get the response header */
646 rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp);
647 bcm = (struct ncsi_rsp_oem_bcm_pkt *)(rsp->data);
648
649 if (bcm->type == NCSI_OEM_BCM_CMD_GMA)
650 return ncsi_rsp_handler_oem_bcm_gma(nr);
651 return 0;
652}
653
654static struct ncsi_rsp_oem_handler {
655 unsigned int mfr_id;
656 int (*handler)(struct ncsi_request *nr);
657} ncsi_rsp_oem_handlers[] = {
658 { NCSI_OEM_MFR_MLX_ID, NULL },
659 { NCSI_OEM_MFR_BCM_ID, ncsi_rsp_handler_oem_bcm }
660};
661
662/* Response handler for OEM command */
663static int ncsi_rsp_handler_oem(struct ncsi_request *nr)
664{
665 struct ncsi_rsp_oem_handler *nrh = NULL;
666 struct ncsi_rsp_oem_pkt *rsp;
667 unsigned int mfr_id, i;
668
669 /* Get the response header */
670 rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp);
671 mfr_id = ntohl(rsp->mfr_id);
672
673 /* Check the manufacturer ID and find the handler */
674 for (i = 0; i < ARRAY_SIZE(ncsi_rsp_oem_handlers); i++) {
675 if (ncsi_rsp_oem_handlers[i].mfr_id == mfr_id) {
676 if (ncsi_rsp_oem_handlers[i].handler)
677 nrh = &ncsi_rsp_oem_handlers[i];
678 else
679 nrh = NULL;
680
681 break;
682 }
683 }
684
685 if (!nrh) {
686 netdev_err(nr->ndp->ndev.dev, "Received unrecognized OEM packet with MFR-ID (0x%x)\n",
687 mfr_id);
688 return -ENOENT;
689 }
690
691 /* Process the packet */
692 return nrh->handler(nr);
693}
694
599static int ncsi_rsp_handler_gvi(struct ncsi_request *nr) 695static int ncsi_rsp_handler_gvi(struct ncsi_request *nr)
600{ 696{
601 struct ncsi_rsp_gvi_pkt *rsp; 697 struct ncsi_rsp_gvi_pkt *rsp;
@@ -900,6 +996,26 @@ static int ncsi_rsp_handler_gpuuid(struct ncsi_request *nr)
900 return 0; 996 return 0;
901} 997}
902 998
999static int ncsi_rsp_handler_netlink(struct ncsi_request *nr)
1000{
1001 struct ncsi_dev_priv *ndp = nr->ndp;
1002 struct ncsi_rsp_pkt *rsp;
1003 struct ncsi_package *np;
1004 struct ncsi_channel *nc;
1005 int ret;
1006
1007 /* Find the package */
1008 rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
1009 ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
1010 &np, &nc);
1011 if (!np)
1012 return -ENODEV;
1013
1014 ret = ncsi_send_netlink_rsp(nr, np, nc);
1015
1016 return ret;
1017}
1018
903static struct ncsi_rsp_handler { 1019static struct ncsi_rsp_handler {
904 unsigned char type; 1020 unsigned char type;
905 int payload; 1021 int payload;
@@ -932,7 +1048,7 @@ static struct ncsi_rsp_handler {
932 { NCSI_PKT_RSP_GNS, 172, ncsi_rsp_handler_gns }, 1048 { NCSI_PKT_RSP_GNS, 172, ncsi_rsp_handler_gns },
933 { NCSI_PKT_RSP_GNPTS, 172, ncsi_rsp_handler_gnpts }, 1049 { NCSI_PKT_RSP_GNPTS, 172, ncsi_rsp_handler_gnpts },
934 { NCSI_PKT_RSP_GPS, 8, ncsi_rsp_handler_gps }, 1050 { NCSI_PKT_RSP_GPS, 8, ncsi_rsp_handler_gps },
935 { NCSI_PKT_RSP_OEM, 0, NULL }, 1051 { NCSI_PKT_RSP_OEM, -1, ncsi_rsp_handler_oem },
936 { NCSI_PKT_RSP_PLDM, 0, NULL }, 1052 { NCSI_PKT_RSP_PLDM, 0, NULL },
937 { NCSI_PKT_RSP_GPUUID, 20, ncsi_rsp_handler_gpuuid } 1053 { NCSI_PKT_RSP_GPUUID, 20, ncsi_rsp_handler_gpuuid }
938}; 1054};
@@ -1002,6 +1118,17 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev,
1002 netdev_warn(ndp->ndev.dev, 1118 netdev_warn(ndp->ndev.dev,
1003 "NCSI: 'bad' packet ignored for type 0x%x\n", 1119 "NCSI: 'bad' packet ignored for type 0x%x\n",
1004 hdr->type); 1120 hdr->type);
1121
1122 if (nr->flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) {
1123 if (ret == -EPERM)
1124 goto out_netlink;
1125 else
1126 ncsi_send_netlink_err(ndp->ndev.dev,
1127 nr->snd_seq,
1128 nr->snd_portid,
1129 &nr->nlhdr,
1130 ret);
1131 }
1005 goto out; 1132 goto out;
1006 } 1133 }
1007 1134
@@ -1011,6 +1138,17 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev,
1011 netdev_err(ndp->ndev.dev, 1138 netdev_err(ndp->ndev.dev,
1012 "NCSI: Handler for packet type 0x%x returned %d\n", 1139 "NCSI: Handler for packet type 0x%x returned %d\n",
1013 hdr->type, ret); 1140 hdr->type, ret);
1141
1142out_netlink:
1143 if (nr->flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) {
1144 ret = ncsi_rsp_handler_netlink(nr);
1145 if (ret) {
1146 netdev_err(ndp->ndev.dev,
1147 "NCSI: Netlink handler for packet type 0x%x returned %d\n",
1148 hdr->type, ret);
1149 }
1150 }
1151
1014out: 1152out:
1015 ncsi_free_request(nr); 1153 ncsi_free_request(nr);
1016 return ret; 1154 return ret;
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index f61c306de1d0..2ab870ef233a 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -625,6 +625,13 @@ config NFT_FIB_INET
625 The lookup will be delegated to the IPv4 or IPv6 FIB depending 625 The lookup will be delegated to the IPv4 or IPv6 FIB depending
626 on the protocol of the packet. 626 on the protocol of the packet.
627 627
628config NFT_XFRM
629 tristate "Netfilter nf_tables xfrm/IPSec security association matching"
630 depends on XFRM
631 help
632 This option adds an expression that you can use to extract properties
633 of a packet's security association.
634
628config NFT_SOCKET 635config NFT_SOCKET
629 tristate "Netfilter nf_tables socket match support" 636 tristate "Netfilter nf_tables socket match support"
630 depends on IPV6 || IPV6=n 637 depends on IPV6 || IPV6=n
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 16895e045b66..4ddf3ef51ece 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -113,6 +113,7 @@ obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o
113obj-$(CONFIG_NFT_SOCKET) += nft_socket.o 113obj-$(CONFIG_NFT_SOCKET) += nft_socket.o
114obj-$(CONFIG_NFT_OSF) += nft_osf.o 114obj-$(CONFIG_NFT_OSF) += nft_osf.o
115obj-$(CONFIG_NFT_TPROXY) += nft_tproxy.o 115obj-$(CONFIG_NFT_TPROXY) += nft_tproxy.o
116obj-$(CONFIG_NFT_XFRM) += nft_xfrm.o
116 117
117# nf_tables netdev 118# nf_tables netdev
118obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o 119obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 8a33dac4e805..e287da68d5fa 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -15,7 +15,7 @@
15 15
16#define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c) 16#define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c)
17#define ipset_dereference_protected(p, set) \ 17#define ipset_dereference_protected(p, set) \
18 __ipset_dereference_protected(p, spin_is_locked(&(set)->lock)) 18 __ipset_dereference_protected(p, lockdep_is_held(&(set)->lock))
19 19
20#define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1) 20#define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1)
21 21
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 7ca926a03b81..fe9abf3cc10a 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1686,8 +1686,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
1686 skb_reset_network_header(skb); 1686 skb_reset_network_header(skb);
1687 IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", 1687 IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
1688 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); 1688 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
1689 ipv4_update_pmtu(skb, ipvs->net, 1689 ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0);
1690 mtu, 0, 0, 0, 0);
1691 /* Client uses PMTUD? */ 1690 /* Client uses PMTUD? */
1692 if (!(frag_off & htons(IP_DF))) 1691 if (!(frag_off & htons(IP_DF)))
1693 goto ignore_ipip; 1692 goto ignore_ipip;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 62eefea48973..83395bf6dc35 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3234,7 +3234,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
3234 3234
3235 /* Try to find the service for which to dump destinations */ 3235 /* Try to find the service for which to dump destinations */
3236 if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, 3236 if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX,
3237 ip_vs_cmd_policy, NULL)) 3237 ip_vs_cmd_policy, cb->extack))
3238 goto out_err; 3238 goto out_err;
3239 3239
3240 3240
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index a676d5f76bdc..ca1168d67fac 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -379,7 +379,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
379 return false; 379 return false;
380 } 380 }
381 381
382 l4proto = __nf_ct_l4proto_find(l3num, protonum); 382 l4proto = __nf_ct_l4proto_find(protonum);
383 383
384 ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple, 384 ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple,
385 l4proto); 385 l4proto);
@@ -539,7 +539,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
539 nf_ct_tmpl_free(ct); 539 nf_ct_tmpl_free(ct);
540 return; 540 return;
541 } 541 }
542 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); 542 l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
543 if (l4proto->destroy) 543 if (l4proto->destroy)
544 l4proto->destroy(ct); 544 l4proto->destroy(ct);
545 545
@@ -840,7 +840,7 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
840 enum ip_conntrack_info oldinfo; 840 enum ip_conntrack_info oldinfo;
841 struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo); 841 struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
842 842
843 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); 843 l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
844 if (l4proto->allow_clash && 844 if (l4proto->allow_clash &&
845 !nf_ct_is_dying(ct) && 845 !nf_ct_is_dying(ct) &&
846 atomic_inc_not_zero(&ct->ct_general.use)) { 846 atomic_inc_not_zero(&ct->ct_general.use)) {
@@ -1109,7 +1109,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
1109 if (!test_bit(IPS_ASSURED_BIT, &ct->status)) 1109 if (!test_bit(IPS_ASSURED_BIT, &ct->status))
1110 return true; 1110 return true;
1111 1111
1112 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); 1112 l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
1113 if (l4proto->can_early_drop && l4proto->can_early_drop(ct)) 1113 if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
1114 return true; 1114 return true;
1115 1115
@@ -1370,12 +1370,6 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
1370 1370
1371 timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; 1371 timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
1372 1372
1373 if (!l4proto->new(ct, skb, dataoff)) {
1374 nf_conntrack_free(ct);
1375 pr_debug("can't track with proto module\n");
1376 return NULL;
1377 }
1378
1379 if (timeout_ext) 1373 if (timeout_ext)
1380 nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout), 1374 nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
1381 GFP_ATOMIC); 1375 GFP_ATOMIC);
@@ -1436,12 +1430,12 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
1436 1430
1437/* On success, returns 0, sets skb->_nfct | ctinfo */ 1431/* On success, returns 0, sets skb->_nfct | ctinfo */
1438static int 1432static int
1439resolve_normal_ct(struct net *net, struct nf_conn *tmpl, 1433resolve_normal_ct(struct nf_conn *tmpl,
1440 struct sk_buff *skb, 1434 struct sk_buff *skb,
1441 unsigned int dataoff, 1435 unsigned int dataoff,
1442 u_int16_t l3num,
1443 u_int8_t protonum, 1436 u_int8_t protonum,
1444 const struct nf_conntrack_l4proto *l4proto) 1437 const struct nf_conntrack_l4proto *l4proto,
1438 const struct nf_hook_state *state)
1445{ 1439{
1446 const struct nf_conntrack_zone *zone; 1440 const struct nf_conntrack_zone *zone;
1447 struct nf_conntrack_tuple tuple; 1441 struct nf_conntrack_tuple tuple;
@@ -1452,17 +1446,18 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
1452 u32 hash; 1446 u32 hash;
1453 1447
1454 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), 1448 if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
1455 dataoff, l3num, protonum, net, &tuple, l4proto)) { 1449 dataoff, state->pf, protonum, state->net,
1450 &tuple, l4proto)) {
1456 pr_debug("Can't get tuple\n"); 1451 pr_debug("Can't get tuple\n");
1457 return 0; 1452 return 0;
1458 } 1453 }
1459 1454
1460 /* look for tuple match */ 1455 /* look for tuple match */
1461 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); 1456 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1462 hash = hash_conntrack_raw(&tuple, net); 1457 hash = hash_conntrack_raw(&tuple, state->net);
1463 h = __nf_conntrack_find_get(net, zone, &tuple, hash); 1458 h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);
1464 if (!h) { 1459 if (!h) {
1465 h = init_conntrack(net, tmpl, &tuple, l4proto, 1460 h = init_conntrack(state->net, tmpl, &tuple, l4proto,
1466 skb, dataoff, hash); 1461 skb, dataoff, hash);
1467 if (!h) 1462 if (!h)
1468 return 0; 1463 return 0;
@@ -1491,13 +1486,45 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
1491 return 0; 1486 return 0;
1492} 1487}
1493 1488
1489/*
1490 * ICMP packets need special treatment to handle error messages that are
1491 * related to a connection.
1492 *
1493 * Callers need to check whether the skb has a conntrack assigned when this
1494 * helper returns; in that case the skb belongs to an already known connection.
1495 */
1496static unsigned int __cold
1497nf_conntrack_handle_icmp(struct nf_conn *tmpl,
1498 struct sk_buff *skb,
1499 unsigned int dataoff,
1500 u8 protonum,
1501 const struct nf_hook_state *state)
1502{
1503 int ret;
1504
1505 if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP)
1506 ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state);
1507#if IS_ENABLED(CONFIG_IPV6)
1508 else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6)
1509 ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state);
1510#endif
1511 else
1512 return NF_ACCEPT;
1513
1514 if (ret <= 0) {
1515 NF_CT_STAT_INC_ATOMIC(state->net, error);
1516 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
1517 }
1518
1519 return ret;
1520}
1521
1494unsigned int 1522unsigned int
1495nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, 1523nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
1496 struct sk_buff *skb)
1497{ 1524{
1498 const struct nf_conntrack_l4proto *l4proto; 1525 const struct nf_conntrack_l4proto *l4proto;
1499 struct nf_conn *ct, *tmpl;
1500 enum ip_conntrack_info ctinfo; 1526 enum ip_conntrack_info ctinfo;
1527 struct nf_conn *ct, *tmpl;
1501 u_int8_t protonum; 1528 u_int8_t protonum;
1502 int dataoff, ret; 1529 int dataoff, ret;
1503 1530
@@ -1506,32 +1533,28 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1506 /* Previously seen (loopback or untracked)? Ignore. */ 1533 /* Previously seen (loopback or untracked)? Ignore. */
1507 if ((tmpl && !nf_ct_is_template(tmpl)) || 1534 if ((tmpl && !nf_ct_is_template(tmpl)) ||
1508 ctinfo == IP_CT_UNTRACKED) { 1535 ctinfo == IP_CT_UNTRACKED) {
1509 NF_CT_STAT_INC_ATOMIC(net, ignore); 1536 NF_CT_STAT_INC_ATOMIC(state->net, ignore);
1510 return NF_ACCEPT; 1537 return NF_ACCEPT;
1511 } 1538 }
1512 skb->_nfct = 0; 1539 skb->_nfct = 0;
1513 } 1540 }
1514 1541
1515 /* rcu_read_lock()ed by nf_hook_thresh */ 1542 /* rcu_read_lock()ed by nf_hook_thresh */
1516 dataoff = get_l4proto(skb, skb_network_offset(skb), pf, &protonum); 1543 dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
1517 if (dataoff <= 0) { 1544 if (dataoff <= 0) {
1518 pr_debug("not prepared to track yet or error occurred\n"); 1545 pr_debug("not prepared to track yet or error occurred\n");
1519 NF_CT_STAT_INC_ATOMIC(net, error); 1546 NF_CT_STAT_INC_ATOMIC(state->net, error);
1520 NF_CT_STAT_INC_ATOMIC(net, invalid); 1547 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
1521 ret = NF_ACCEPT; 1548 ret = NF_ACCEPT;
1522 goto out; 1549 goto out;
1523 } 1550 }
1524 1551
1525 l4proto = __nf_ct_l4proto_find(pf, protonum); 1552 l4proto = __nf_ct_l4proto_find(protonum);
1526 1553
1527 /* It may be an special packet, error, unclean... 1554 if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) {
1528 * inverse of the return code tells to the netfilter 1555 ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff,
1529 * core what to do with the packet. */ 1556 protonum, state);
1530 if (l4proto->error != NULL) {
1531 ret = l4proto->error(net, tmpl, skb, dataoff, pf, hooknum);
1532 if (ret <= 0) { 1557 if (ret <= 0) {
1533 NF_CT_STAT_INC_ATOMIC(net, error);
1534 NF_CT_STAT_INC_ATOMIC(net, invalid);
1535 ret = -ret; 1558 ret = -ret;
1536 goto out; 1559 goto out;
1537 } 1560 }
@@ -1540,10 +1563,11 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1540 goto out; 1563 goto out;
1541 } 1564 }
1542repeat: 1565repeat:
1543 ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, l4proto); 1566 ret = resolve_normal_ct(tmpl, skb, dataoff,
1567 protonum, l4proto, state);
1544 if (ret < 0) { 1568 if (ret < 0) {
1545 /* Too stressed to deal. */ 1569 /* Too stressed to deal. */
1546 NF_CT_STAT_INC_ATOMIC(net, drop); 1570 NF_CT_STAT_INC_ATOMIC(state->net, drop);
1547 ret = NF_DROP; 1571 ret = NF_DROP;
1548 goto out; 1572 goto out;
1549 } 1573 }
@@ -1551,21 +1575,21 @@ repeat:
1551 ct = nf_ct_get(skb, &ctinfo); 1575 ct = nf_ct_get(skb, &ctinfo);
1552 if (!ct) { 1576 if (!ct) {
1553 /* Not valid part of a connection */ 1577 /* Not valid part of a connection */
1554 NF_CT_STAT_INC_ATOMIC(net, invalid); 1578 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
1555 ret = NF_ACCEPT; 1579 ret = NF_ACCEPT;
1556 goto out; 1580 goto out;
1557 } 1581 }
1558 1582
1559 ret = l4proto->packet(ct, skb, dataoff, ctinfo); 1583 ret = l4proto->packet(ct, skb, dataoff, ctinfo, state);
1560 if (ret <= 0) { 1584 if (ret <= 0) {
1561 /* Invalid: inverse of the return code tells 1585 /* Invalid: inverse of the return code tells
1562 * the netfilter core what to do */ 1586 * the netfilter core what to do */
1563 pr_debug("nf_conntrack_in: Can't track with proto module\n"); 1587 pr_debug("nf_conntrack_in: Can't track with proto module\n");
1564 nf_conntrack_put(&ct->ct_general); 1588 nf_conntrack_put(&ct->ct_general);
1565 skb->_nfct = 0; 1589 skb->_nfct = 0;
1566 NF_CT_STAT_INC_ATOMIC(net, invalid); 1590 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
1567 if (ret == -NF_DROP) 1591 if (ret == -NF_DROP)
1568 NF_CT_STAT_INC_ATOMIC(net, drop); 1592 NF_CT_STAT_INC_ATOMIC(state->net, drop);
1569 /* Special case: TCP tracker reports an attempt to reopen a 1593 /* Special case: TCP tracker reports an attempt to reopen a
1570 * closed/aborted connection. We have to go back and create a 1594 * closed/aborted connection. We have to go back and create a
1571 * fresh conntrack. 1595 * fresh conntrack.
@@ -1594,8 +1618,7 @@ bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1594 1618
1595 rcu_read_lock(); 1619 rcu_read_lock();
1596 ret = nf_ct_invert_tuple(inverse, orig, 1620 ret = nf_ct_invert_tuple(inverse, orig,
1597 __nf_ct_l4proto_find(orig->src.l3num, 1621 __nf_ct_l4proto_find(orig->dst.protonum));
1598 orig->dst.protonum));
1599 rcu_read_unlock(); 1622 rcu_read_unlock();
1600 return ret; 1623 return ret;
1601} 1624}
@@ -1752,7 +1775,7 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
1752 if (dataoff <= 0) 1775 if (dataoff <= 0)
1753 return -1; 1776 return -1;
1754 1777
1755 l4proto = nf_ct_l4proto_find_get(l3num, l4num); 1778 l4proto = nf_ct_l4proto_find_get(l4num);
1756 1779
1757 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, 1780 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
1758 l4num, net, &tuple, l4proto)) 1781 l4num, net, &tuple, l4proto))
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 27b84231db10..3034038bfdf0 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -610,8 +610,7 @@ static int exp_seq_show(struct seq_file *s, void *v)
610 expect->tuple.src.l3num, 610 expect->tuple.src.l3num,
611 expect->tuple.dst.protonum); 611 expect->tuple.dst.protonum);
612 print_tuple(s, &expect->tuple, 612 print_tuple(s, &expect->tuple,
613 __nf_ct_l4proto_find(expect->tuple.src.l3num, 613 __nf_ct_l4proto_find(expect->tuple.dst.protonum));
614 expect->tuple.dst.protonum));
615 614
616 if (expect->flags & NF_CT_EXPECT_PERMANENT) { 615 if (expect->flags & NF_CT_EXPECT_PERMANENT) {
617 seq_puts(s, "PERMANENT"); 616 seq_puts(s, "PERMANENT");
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 036207ecaf16..4ae8e528943a 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -135,8 +135,7 @@ static int ctnetlink_dump_tuples(struct sk_buff *skb,
135 ret = ctnetlink_dump_tuples_ip(skb, tuple); 135 ret = ctnetlink_dump_tuples_ip(skb, tuple);
136 136
137 if (ret >= 0) { 137 if (ret >= 0) {
138 l4proto = __nf_ct_l4proto_find(tuple->src.l3num, 138 l4proto = __nf_ct_l4proto_find(tuple->dst.protonum);
139 tuple->dst.protonum);
140 ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto); 139 ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto);
141 } 140 }
142 rcu_read_unlock(); 141 rcu_read_unlock();
@@ -184,7 +183,7 @@ static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
184 struct nlattr *nest_proto; 183 struct nlattr *nest_proto;
185 int ret; 184 int ret;
186 185
187 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); 186 l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
188 if (!l4proto->to_nlattr) 187 if (!l4proto->to_nlattr)
189 return 0; 188 return 0;
190 189
@@ -592,7 +591,7 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct)
592 len = nla_policy_len(cta_ip_nla_policy, CTA_IP_MAX + 1); 591 len = nla_policy_len(cta_ip_nla_policy, CTA_IP_MAX + 1);
593 len *= 3u; /* ORIG, REPLY, MASTER */ 592 len *= 3u; /* ORIG, REPLY, MASTER */
594 593
595 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); 594 l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
596 len += l4proto->nlattr_size; 595 len += l4proto->nlattr_size;
597 if (l4proto->nlattr_tuple_size) { 596 if (l4proto->nlattr_tuple_size) {
598 len4 = l4proto->nlattr_tuple_size(); 597 len4 = l4proto->nlattr_tuple_size();
@@ -821,6 +820,7 @@ static int ctnetlink_done(struct netlink_callback *cb)
821} 820}
822 821
823struct ctnetlink_filter { 822struct ctnetlink_filter {
823 u8 family;
824 struct { 824 struct {
825 u_int32_t val; 825 u_int32_t val;
826 u_int32_t mask; 826 u_int32_t mask;
@@ -828,31 +828,39 @@ struct ctnetlink_filter {
828}; 828};
829 829
830static struct ctnetlink_filter * 830static struct ctnetlink_filter *
831ctnetlink_alloc_filter(const struct nlattr * const cda[]) 831ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family)
832{ 832{
833#ifdef CONFIG_NF_CONNTRACK_MARK
834 struct ctnetlink_filter *filter; 833 struct ctnetlink_filter *filter;
835 834
835#ifndef CONFIG_NF_CONNTRACK_MARK
836 if (cda[CTA_MARK] && cda[CTA_MARK_MASK])
837 return ERR_PTR(-EOPNOTSUPP);
838#endif
839
836 filter = kzalloc(sizeof(*filter), GFP_KERNEL); 840 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
837 if (filter == NULL) 841 if (filter == NULL)
838 return ERR_PTR(-ENOMEM); 842 return ERR_PTR(-ENOMEM);
839 843
840 filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK])); 844 filter->family = family;
841 filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
842 845
843 return filter; 846#ifdef CONFIG_NF_CONNTRACK_MARK
844#else 847 if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) {
845 return ERR_PTR(-EOPNOTSUPP); 848 filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK]));
849 filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
850 }
846#endif 851#endif
852 return filter;
847} 853}
848 854
849static int ctnetlink_start(struct netlink_callback *cb) 855static int ctnetlink_start(struct netlink_callback *cb)
850{ 856{
851 const struct nlattr * const *cda = cb->data; 857 const struct nlattr * const *cda = cb->data;
852 struct ctnetlink_filter *filter = NULL; 858 struct ctnetlink_filter *filter = NULL;
859 struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
860 u8 family = nfmsg->nfgen_family;
853 861
854 if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) { 862 if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) {
855 filter = ctnetlink_alloc_filter(cda); 863 filter = ctnetlink_alloc_filter(cda, family);
856 if (IS_ERR(filter)) 864 if (IS_ERR(filter))
857 return PTR_ERR(filter); 865 return PTR_ERR(filter);
858 } 866 }
@@ -866,13 +874,24 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
866 struct ctnetlink_filter *filter = data; 874 struct ctnetlink_filter *filter = data;
867 875
868 if (filter == NULL) 876 if (filter == NULL)
869 return 1; 877 goto out;
878
879 /* Match entries of a given L3 protocol number.
880 * If it is not specified, ie. l3proto == 0,
881 * then match everything.
882 */
883 if (filter->family && nf_ct_l3num(ct) != filter->family)
884 goto ignore_entry;
870 885
871#ifdef CONFIG_NF_CONNTRACK_MARK 886#ifdef CONFIG_NF_CONNTRACK_MARK
872 if ((ct->mark & filter->mark.mask) == filter->mark.val) 887 if ((ct->mark & filter->mark.mask) != filter->mark.val)
873 return 1; 888 goto ignore_entry;
874#endif 889#endif
875 890
891out:
892 return 1;
893
894ignore_entry:
876 return 0; 895 return 0;
877} 896}
878 897
@@ -883,8 +902,6 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
883 struct nf_conn *ct, *last; 902 struct nf_conn *ct, *last;
884 struct nf_conntrack_tuple_hash *h; 903 struct nf_conntrack_tuple_hash *h;
885 struct hlist_nulls_node *n; 904 struct hlist_nulls_node *n;
886 struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
887 u_int8_t l3proto = nfmsg->nfgen_family;
888 struct nf_conn *nf_ct_evict[8]; 905 struct nf_conn *nf_ct_evict[8];
889 int res, i; 906 int res, i;
890 spinlock_t *lockp; 907 spinlock_t *lockp;
@@ -923,11 +940,6 @@ restart:
923 if (!net_eq(net, nf_ct_net(ct))) 940 if (!net_eq(net, nf_ct_net(ct)))
924 continue; 941 continue;
925 942
926 /* Dump entries of a given L3 protocol number.
927 * If it is not specified, ie. l3proto == 0,
928 * then dump everything. */
929 if (l3proto && nf_ct_l3num(ct) != l3proto)
930 continue;
931 if (cb->args[1]) { 943 if (cb->args[1]) {
932 if (ct != last) 944 if (ct != last)
933 continue; 945 continue;
@@ -1048,7 +1060,7 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr,
1048 tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]); 1060 tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]);
1049 1061
1050 rcu_read_lock(); 1062 rcu_read_lock();
1051 l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum); 1063 l4proto = __nf_ct_l4proto_find(tuple->dst.protonum);
1052 1064
1053 if (likely(l4proto->nlattr_to_tuple)) { 1065 if (likely(l4proto->nlattr_to_tuple)) {
1054 ret = nla_validate_nested(attr, CTA_PROTO_MAX, 1066 ret = nla_validate_nested(attr, CTA_PROTO_MAX,
@@ -1213,12 +1225,12 @@ static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)
1213 1225
1214static int ctnetlink_flush_conntrack(struct net *net, 1226static int ctnetlink_flush_conntrack(struct net *net,
1215 const struct nlattr * const cda[], 1227 const struct nlattr * const cda[],
1216 u32 portid, int report) 1228 u32 portid, int report, u8 family)
1217{ 1229{
1218 struct ctnetlink_filter *filter = NULL; 1230 struct ctnetlink_filter *filter = NULL;
1219 1231
1220 if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) { 1232 if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) {
1221 filter = ctnetlink_alloc_filter(cda); 1233 filter = ctnetlink_alloc_filter(cda, family);
1222 if (IS_ERR(filter)) 1234 if (IS_ERR(filter))
1223 return PTR_ERR(filter); 1235 return PTR_ERR(filter);
1224 } 1236 }
@@ -1257,7 +1269,7 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
1257 else { 1269 else {
1258 return ctnetlink_flush_conntrack(net, cda, 1270 return ctnetlink_flush_conntrack(net, cda,
1259 NETLINK_CB(skb).portid, 1271 NETLINK_CB(skb).portid,
1260 nlmsg_report(nlh)); 1272 nlmsg_report(nlh), u3);
1261 } 1273 }
1262 1274
1263 if (err < 0) 1275 if (err < 0)
@@ -1696,7 +1708,7 @@ static int ctnetlink_change_protoinfo(struct nf_conn *ct,
1696 return err; 1708 return err;
1697 1709
1698 rcu_read_lock(); 1710 rcu_read_lock();
1699 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); 1711 l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
1700 if (l4proto->from_nlattr) 1712 if (l4proto->from_nlattr)
1701 err = l4proto->from_nlattr(tb, ct); 1713 err = l4proto->from_nlattr(tb, ct);
1702 rcu_read_unlock(); 1714 rcu_read_unlock();
@@ -2656,8 +2668,7 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb,
2656 rcu_read_lock(); 2668 rcu_read_lock();
2657 ret = ctnetlink_dump_tuples_ip(skb, &m); 2669 ret = ctnetlink_dump_tuples_ip(skb, &m);
2658 if (ret >= 0) { 2670 if (ret >= 0) {
2659 l4proto = __nf_ct_l4proto_find(tuple->src.l3num, 2671 l4proto = __nf_ct_l4proto_find(tuple->dst.protonum);
2660 tuple->dst.protonum);
2661 ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto); 2672 ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto);
2662 } 2673 }
2663 rcu_read_unlock(); 2674 rcu_read_unlock();
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 51c5d7eec0a3..40643af7137e 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -43,7 +43,7 @@
43 43
44extern unsigned int nf_conntrack_net_id; 44extern unsigned int nf_conntrack_net_id;
45 45
46static struct nf_conntrack_l4proto __rcu **nf_ct_protos[NFPROTO_NUMPROTO] __read_mostly; 46static struct nf_conntrack_l4proto __rcu *nf_ct_protos[MAX_NF_CT_PROTO + 1] __read_mostly;
47 47
48static DEFINE_MUTEX(nf_ct_proto_mutex); 48static DEFINE_MUTEX(nf_ct_proto_mutex);
49 49
@@ -124,23 +124,21 @@ void nf_ct_l4proto_log_invalid(const struct sk_buff *skb,
124EXPORT_SYMBOL_GPL(nf_ct_l4proto_log_invalid); 124EXPORT_SYMBOL_GPL(nf_ct_l4proto_log_invalid);
125#endif 125#endif
126 126
127const struct nf_conntrack_l4proto * 127const struct nf_conntrack_l4proto *__nf_ct_l4proto_find(u8 l4proto)
128__nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto)
129{ 128{
130 if (unlikely(l3proto >= NFPROTO_NUMPROTO || nf_ct_protos[l3proto] == NULL)) 129 if (unlikely(l4proto >= ARRAY_SIZE(nf_ct_protos)))
131 return &nf_conntrack_l4proto_generic; 130 return &nf_conntrack_l4proto_generic;
132 131
133 return rcu_dereference(nf_ct_protos[l3proto][l4proto]); 132 return rcu_dereference(nf_ct_protos[l4proto]);
134} 133}
135EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find); 134EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find);
136 135
137const struct nf_conntrack_l4proto * 136const struct nf_conntrack_l4proto *nf_ct_l4proto_find_get(u8 l4num)
138nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num)
139{ 137{
140 const struct nf_conntrack_l4proto *p; 138 const struct nf_conntrack_l4proto *p;
141 139
142 rcu_read_lock(); 140 rcu_read_lock();
143 p = __nf_ct_l4proto_find(l3num, l4num); 141 p = __nf_ct_l4proto_find(l4num);
144 if (!try_module_get(p->me)) 142 if (!try_module_get(p->me))
145 p = &nf_conntrack_l4proto_generic; 143 p = &nf_conntrack_l4proto_generic;
146 rcu_read_unlock(); 144 rcu_read_unlock();
@@ -159,8 +157,7 @@ static int kill_l4proto(struct nf_conn *i, void *data)
159{ 157{
160 const struct nf_conntrack_l4proto *l4proto; 158 const struct nf_conntrack_l4proto *l4proto;
161 l4proto = data; 159 l4proto = data;
162 return nf_ct_protonum(i) == l4proto->l4proto && 160 return nf_ct_protonum(i) == l4proto->l4proto;
163 nf_ct_l3num(i) == l4proto->l3proto;
164} 161}
165 162
166static struct nf_proto_net *nf_ct_l4proto_net(struct net *net, 163static struct nf_proto_net *nf_ct_l4proto_net(struct net *net,
@@ -219,48 +216,20 @@ int nf_ct_l4proto_register_one(const struct nf_conntrack_l4proto *l4proto)
219{ 216{
220 int ret = 0; 217 int ret = 0;
221 218
222 if (l4proto->l3proto >= ARRAY_SIZE(nf_ct_protos))
223 return -EBUSY;
224
225 if ((l4proto->to_nlattr && l4proto->nlattr_size == 0) || 219 if ((l4proto->to_nlattr && l4proto->nlattr_size == 0) ||
226 (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size)) 220 (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size))
227 return -EINVAL; 221 return -EINVAL;
228 222
229 mutex_lock(&nf_ct_proto_mutex); 223 mutex_lock(&nf_ct_proto_mutex);
230 if (!nf_ct_protos[l4proto->l3proto]) { 224 if (rcu_dereference_protected(
231 /* l3proto may be loaded latter. */ 225 nf_ct_protos[l4proto->l4proto],
232 struct nf_conntrack_l4proto __rcu **proto_array;
233 int i;
234
235 proto_array =
236 kmalloc_array(MAX_NF_CT_PROTO,
237 sizeof(struct nf_conntrack_l4proto *),
238 GFP_KERNEL);
239 if (proto_array == NULL) {
240 ret = -ENOMEM;
241 goto out_unlock;
242 }
243
244 for (i = 0; i < MAX_NF_CT_PROTO; i++)
245 RCU_INIT_POINTER(proto_array[i],
246 &nf_conntrack_l4proto_generic);
247
248 /* Before making proto_array visible to lockless readers,
249 * we must make sure its content is committed to memory.
250 */
251 smp_wmb();
252
253 nf_ct_protos[l4proto->l3proto] = proto_array;
254 } else if (rcu_dereference_protected(
255 nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
256 lockdep_is_held(&nf_ct_proto_mutex) 226 lockdep_is_held(&nf_ct_proto_mutex)
257 ) != &nf_conntrack_l4proto_generic) { 227 ) != &nf_conntrack_l4proto_generic) {
258 ret = -EBUSY; 228 ret = -EBUSY;
259 goto out_unlock; 229 goto out_unlock;
260 } 230 }
261 231
262 rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], 232 rcu_assign_pointer(nf_ct_protos[l4proto->l4proto], l4proto);
263 l4proto);
264out_unlock: 233out_unlock:
265 mutex_unlock(&nf_ct_proto_mutex); 234 mutex_unlock(&nf_ct_proto_mutex);
266 return ret; 235 return ret;
@@ -274,7 +243,7 @@ int nf_ct_l4proto_pernet_register_one(struct net *net,
274 struct nf_proto_net *pn = NULL; 243 struct nf_proto_net *pn = NULL;
275 244
276 if (l4proto->init_net) { 245 if (l4proto->init_net) {
277 ret = l4proto->init_net(net, l4proto->l3proto); 246 ret = l4proto->init_net(net);
278 if (ret < 0) 247 if (ret < 0)
279 goto out; 248 goto out;
280 } 249 }
@@ -296,13 +265,13 @@ EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register_one);
296static void __nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *l4proto) 265static void __nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *l4proto)
297 266
298{ 267{
299 BUG_ON(l4proto->l3proto >= ARRAY_SIZE(nf_ct_protos)); 268 BUG_ON(l4proto->l4proto >= ARRAY_SIZE(nf_ct_protos));
300 269
301 BUG_ON(rcu_dereference_protected( 270 BUG_ON(rcu_dereference_protected(
302 nf_ct_protos[l4proto->l3proto][l4proto->l4proto], 271 nf_ct_protos[l4proto->l4proto],
303 lockdep_is_held(&nf_ct_proto_mutex) 272 lockdep_is_held(&nf_ct_proto_mutex)
304 ) != l4proto); 273 ) != l4proto);
305 rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], 274 rcu_assign_pointer(nf_ct_protos[l4proto->l4proto],
306 &nf_conntrack_l4proto_generic); 275 &nf_conntrack_l4proto_generic);
307} 276}
308 277
@@ -352,7 +321,7 @@ static int
352nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[], 321nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[],
353 unsigned int num_proto) 322 unsigned int num_proto)
354{ 323{
355 int ret = -EINVAL, ver; 324 int ret = -EINVAL;
356 unsigned int i; 325 unsigned int i;
357 326
358 for (i = 0; i < num_proto; i++) { 327 for (i = 0; i < num_proto; i++) {
@@ -361,9 +330,8 @@ nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[],
361 break; 330 break;
362 } 331 }
363 if (i != num_proto) { 332 if (i != num_proto) {
364 ver = l4proto[i]->l3proto == PF_INET6 ? 6 : 4; 333 pr_err("nf_conntrack: can't register l4 %d proto.\n",
365 pr_err("nf_conntrack_ipv%d: can't register l4 %d proto.\n", 334 l4proto[i]->l4proto);
366 ver, l4proto[i]->l4proto);
367 nf_ct_l4proto_unregister(l4proto, i); 335 nf_ct_l4proto_unregister(l4proto, i);
368 } 336 }
369 return ret; 337 return ret;
@@ -382,9 +350,8 @@ int nf_ct_l4proto_pernet_register(struct net *net,
382 break; 350 break;
383 } 351 }
384 if (i != num_proto) { 352 if (i != num_proto) {
385 pr_err("nf_conntrack_proto_%d %d: pernet registration failed\n", 353 pr_err("nf_conntrack %d: pernet registration failed\n",
386 l4proto[i]->l4proto, 354 l4proto[i]->l4proto);
387 l4proto[i]->l3proto == PF_INET6 ? 6 : 4);
388 nf_ct_l4proto_pernet_unregister(net, l4proto, i); 355 nf_ct_l4proto_pernet_unregister(net, l4proto, i);
389 } 356 }
390 return ret; 357 return ret;
@@ -455,7 +422,7 @@ static unsigned int ipv4_conntrack_in(void *priv,
455 struct sk_buff *skb, 422 struct sk_buff *skb,
456 const struct nf_hook_state *state) 423 const struct nf_hook_state *state)
457{ 424{
458 return nf_conntrack_in(state->net, PF_INET, state->hook, skb); 425 return nf_conntrack_in(skb, state);
459} 426}
460 427
461static unsigned int ipv4_conntrack_local(void *priv, 428static unsigned int ipv4_conntrack_local(void *priv,
@@ -477,7 +444,7 @@ static unsigned int ipv4_conntrack_local(void *priv,
477 return NF_ACCEPT; 444 return NF_ACCEPT;
478 } 445 }
479 446
480 return nf_conntrack_in(state->net, PF_INET, state->hook, skb); 447 return nf_conntrack_in(skb, state);
481} 448}
482 449
483/* Connection tracking may drop packets, but never alters them, so 450/* Connection tracking may drop packets, but never alters them, so
@@ -690,14 +657,14 @@ static unsigned int ipv6_conntrack_in(void *priv,
690 struct sk_buff *skb, 657 struct sk_buff *skb,
691 const struct nf_hook_state *state) 658 const struct nf_hook_state *state)
692{ 659{
693 return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); 660 return nf_conntrack_in(skb, state);
694} 661}
695 662
696static unsigned int ipv6_conntrack_local(void *priv, 663static unsigned int ipv6_conntrack_local(void *priv,
697 struct sk_buff *skb, 664 struct sk_buff *skb,
698 const struct nf_hook_state *state) 665 const struct nf_hook_state *state)
699{ 666{
700 return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); 667 return nf_conntrack_in(skb, state);
701} 668}
702 669
703static unsigned int ipv6_helper(void *priv, 670static unsigned int ipv6_helper(void *priv,
@@ -911,37 +878,26 @@ void nf_ct_netns_put(struct net *net, uint8_t nfproto)
911EXPORT_SYMBOL_GPL(nf_ct_netns_put); 878EXPORT_SYMBOL_GPL(nf_ct_netns_put);
912 879
913static const struct nf_conntrack_l4proto * const builtin_l4proto[] = { 880static const struct nf_conntrack_l4proto * const builtin_l4proto[] = {
914 &nf_conntrack_l4proto_tcp4, 881 &nf_conntrack_l4proto_tcp,
915 &nf_conntrack_l4proto_udp4, 882 &nf_conntrack_l4proto_udp,
916 &nf_conntrack_l4proto_icmp, 883 &nf_conntrack_l4proto_icmp,
917#ifdef CONFIG_NF_CT_PROTO_DCCP 884#ifdef CONFIG_NF_CT_PROTO_DCCP
918 &nf_conntrack_l4proto_dccp4, 885 &nf_conntrack_l4proto_dccp,
919#endif 886#endif
920#ifdef CONFIG_NF_CT_PROTO_SCTP 887#ifdef CONFIG_NF_CT_PROTO_SCTP
921 &nf_conntrack_l4proto_sctp4, 888 &nf_conntrack_l4proto_sctp,
922#endif 889#endif
923#ifdef CONFIG_NF_CT_PROTO_UDPLITE 890#ifdef CONFIG_NF_CT_PROTO_UDPLITE
924 &nf_conntrack_l4proto_udplite4, 891 &nf_conntrack_l4proto_udplite,
925#endif 892#endif
926#if IS_ENABLED(CONFIG_IPV6) 893#if IS_ENABLED(CONFIG_IPV6)
927 &nf_conntrack_l4proto_tcp6,
928 &nf_conntrack_l4proto_udp6,
929 &nf_conntrack_l4proto_icmpv6, 894 &nf_conntrack_l4proto_icmpv6,
930#ifdef CONFIG_NF_CT_PROTO_DCCP
931 &nf_conntrack_l4proto_dccp6,
932#endif
933#ifdef CONFIG_NF_CT_PROTO_SCTP
934 &nf_conntrack_l4proto_sctp6,
935#endif
936#ifdef CONFIG_NF_CT_PROTO_UDPLITE
937 &nf_conntrack_l4proto_udplite6,
938#endif
939#endif /* CONFIG_IPV6 */ 895#endif /* CONFIG_IPV6 */
940}; 896};
941 897
942int nf_conntrack_proto_init(void) 898int nf_conntrack_proto_init(void)
943{ 899{
944 int ret = 0; 900 int ret = 0, i;
945 901
946 ret = nf_register_sockopt(&so_getorigdst); 902 ret = nf_register_sockopt(&so_getorigdst);
947 if (ret < 0) 903 if (ret < 0)
@@ -952,6 +908,11 @@ int nf_conntrack_proto_init(void)
952 if (ret < 0) 908 if (ret < 0)
953 goto cleanup_sockopt; 909 goto cleanup_sockopt;
954#endif 910#endif
911
912 for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++)
913 RCU_INIT_POINTER(nf_ct_protos[i],
914 &nf_conntrack_l4proto_generic);
915
955 ret = nf_ct_l4proto_register(builtin_l4proto, 916 ret = nf_ct_l4proto_register(builtin_l4proto,
956 ARRAY_SIZE(builtin_l4proto)); 917 ARRAY_SIZE(builtin_l4proto));
957 if (ret < 0) 918 if (ret < 0)
@@ -969,17 +930,10 @@ cleanup_sockopt:
969 930
970void nf_conntrack_proto_fini(void) 931void nf_conntrack_proto_fini(void)
971{ 932{
972 unsigned int i;
973
974 nf_unregister_sockopt(&so_getorigdst); 933 nf_unregister_sockopt(&so_getorigdst);
975#if IS_ENABLED(CONFIG_IPV6) 934#if IS_ENABLED(CONFIG_IPV6)
976 nf_unregister_sockopt(&so_getorigdst6); 935 nf_unregister_sockopt(&so_getorigdst6);
977#endif 936#endif
978 /* No need to call nf_ct_l4proto_unregister(), the register
979 * tables are free'd here anyway.
980 */
981 for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++)
982 kfree(nf_ct_protos[i]);
983} 937}
984 938
985int nf_conntrack_proto_pernet_init(struct net *net) 939int nf_conntrack_proto_pernet_init(struct net *net)
@@ -988,8 +942,7 @@ int nf_conntrack_proto_pernet_init(struct net *net)
988 struct nf_proto_net *pn = nf_ct_l4proto_net(net, 942 struct nf_proto_net *pn = nf_ct_l4proto_net(net,
989 &nf_conntrack_l4proto_generic); 943 &nf_conntrack_l4proto_generic);
990 944
991 err = nf_conntrack_l4proto_generic.init_net(net, 945 err = nf_conntrack_l4proto_generic.init_net(net);
992 nf_conntrack_l4proto_generic.l3proto);
993 if (err < 0) 946 if (err < 0)
994 return err; 947 return err;
995 err = nf_ct_l4proto_register_sysctl(net, 948 err = nf_ct_l4proto_register_sysctl(net,
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index f3f91ed2c21a..171e9e122e5f 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -389,18 +389,15 @@ static inline struct nf_dccp_net *dccp_pernet(struct net *net)
389 return &net->ct.nf_ct_proto.dccp; 389 return &net->ct.nf_ct_proto.dccp;
390} 390}
391 391
392static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, 392static noinline bool
393 unsigned int dataoff) 393dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
394 const struct dccp_hdr *dh)
394{ 395{
395 struct net *net = nf_ct_net(ct); 396 struct net *net = nf_ct_net(ct);
396 struct nf_dccp_net *dn; 397 struct nf_dccp_net *dn;
397 struct dccp_hdr _dh, *dh;
398 const char *msg; 398 const char *msg;
399 u_int8_t state; 399 u_int8_t state;
400 400
401 dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
402 BUG_ON(dh == NULL);
403
404 state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; 401 state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE];
405 switch (state) { 402 switch (state) {
406 default: 403 default:
@@ -438,8 +435,51 @@ static u64 dccp_ack_seq(const struct dccp_hdr *dh)
438 ntohl(dhack->dccph_ack_nr_low); 435 ntohl(dhack->dccph_ack_nr_low);
439} 436}
440 437
441static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, 438static bool dccp_error(const struct dccp_hdr *dh,
442 unsigned int dataoff, enum ip_conntrack_info ctinfo) 439 struct sk_buff *skb, unsigned int dataoff,
440 const struct nf_hook_state *state)
441{
442 unsigned int dccp_len = skb->len - dataoff;
443 unsigned int cscov;
444 const char *msg;
445
446 if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) ||
447 dh->dccph_doff * 4 > dccp_len) {
448 msg = "nf_ct_dccp: truncated/malformed packet ";
449 goto out_invalid;
450 }
451
452 cscov = dccp_len;
453 if (dh->dccph_cscov) {
454 cscov = (dh->dccph_cscov - 1) * 4;
455 if (cscov > dccp_len) {
456 msg = "nf_ct_dccp: bad checksum coverage ";
457 goto out_invalid;
458 }
459 }
460
461 if (state->hook == NF_INET_PRE_ROUTING &&
462 state->net->ct.sysctl_checksum &&
463 nf_checksum_partial(skb, state->hook, dataoff, cscov,
464 IPPROTO_DCCP, state->pf)) {
465 msg = "nf_ct_dccp: bad checksum ";
466 goto out_invalid;
467 }
468
469 if (dh->dccph_type >= DCCP_PKT_INVALID) {
470 msg = "nf_ct_dccp: reserved packet type ";
471 goto out_invalid;
472 }
473 return false;
474out_invalid:
475 nf_l4proto_log_invalid(skb, state->net, state->pf,
476 IPPROTO_DCCP, "%s", msg);
477 return true;
478}
479
480static int dccp_packet(struct nf_conn *ct, struct sk_buff *skb,
481 unsigned int dataoff, enum ip_conntrack_info ctinfo,
482 const struct nf_hook_state *state)
443{ 483{
444 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 484 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
445 struct dccp_hdr _dh, *dh; 485 struct dccp_hdr _dh, *dh;
@@ -448,8 +488,15 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
448 unsigned int *timeouts; 488 unsigned int *timeouts;
449 489
450 dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); 490 dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
451 BUG_ON(dh == NULL); 491 if (!dh)
492 return NF_DROP;
493
494 if (dccp_error(dh, skb, dataoff, state))
495 return -NF_ACCEPT;
496
452 type = dh->dccph_type; 497 type = dh->dccph_type;
498 if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh))
499 return -NF_ACCEPT;
453 500
454 if (type == DCCP_PKT_RESET && 501 if (type == DCCP_PKT_RESET &&
455 !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { 502 !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
@@ -527,55 +574,6 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
527 return NF_ACCEPT; 574 return NF_ACCEPT;
528} 575}
529 576
530static int dccp_error(struct net *net, struct nf_conn *tmpl,
531 struct sk_buff *skb, unsigned int dataoff,
532 u_int8_t pf, unsigned int hooknum)
533{
534 struct dccp_hdr _dh, *dh;
535 unsigned int dccp_len = skb->len - dataoff;
536 unsigned int cscov;
537 const char *msg;
538
539 dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
540 if (dh == NULL) {
541 msg = "nf_ct_dccp: short packet ";
542 goto out_invalid;
543 }
544
545 if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) ||
546 dh->dccph_doff * 4 > dccp_len) {
547 msg = "nf_ct_dccp: truncated/malformed packet ";
548 goto out_invalid;
549 }
550
551 cscov = dccp_len;
552 if (dh->dccph_cscov) {
553 cscov = (dh->dccph_cscov - 1) * 4;
554 if (cscov > dccp_len) {
555 msg = "nf_ct_dccp: bad checksum coverage ";
556 goto out_invalid;
557 }
558 }
559
560 if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
561 nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_DCCP,
562 pf)) {
563 msg = "nf_ct_dccp: bad checksum ";
564 goto out_invalid;
565 }
566
567 if (dh->dccph_type >= DCCP_PKT_INVALID) {
568 msg = "nf_ct_dccp: reserved packet type ";
569 goto out_invalid;
570 }
571
572 return NF_ACCEPT;
573
574out_invalid:
575 nf_l4proto_log_invalid(skb, net, pf, IPPROTO_DCCP, "%s", msg);
576 return -NF_ACCEPT;
577}
578
579static bool dccp_can_early_drop(const struct nf_conn *ct) 577static bool dccp_can_early_drop(const struct nf_conn *ct)
580{ 578{
581 switch (ct->proto.dccp.state) { 579 switch (ct->proto.dccp.state) {
@@ -814,7 +812,7 @@ static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn,
814 return 0; 812 return 0;
815} 813}
816 814
817static int dccp_init_net(struct net *net, u_int16_t proto) 815static int dccp_init_net(struct net *net)
818{ 816{
819 struct nf_dccp_net *dn = dccp_pernet(net); 817 struct nf_dccp_net *dn = dccp_pernet(net);
820 struct nf_proto_net *pn = &dn->pn; 818 struct nf_proto_net *pn = &dn->pn;
@@ -844,45 +842,9 @@ static struct nf_proto_net *dccp_get_net_proto(struct net *net)
844 return &net->ct.nf_ct_proto.dccp.pn; 842 return &net->ct.nf_ct_proto.dccp.pn;
845} 843}
846 844
847const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = { 845const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp = {
848 .l3proto = AF_INET,
849 .l4proto = IPPROTO_DCCP,
850 .new = dccp_new,
851 .packet = dccp_packet,
852 .error = dccp_error,
853 .can_early_drop = dccp_can_early_drop,
854#ifdef CONFIG_NF_CONNTRACK_PROCFS
855 .print_conntrack = dccp_print_conntrack,
856#endif
857#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
858 .nlattr_size = DCCP_NLATTR_SIZE,
859 .to_nlattr = dccp_to_nlattr,
860 .from_nlattr = nlattr_to_dccp,
861 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
862 .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
863 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
864 .nla_policy = nf_ct_port_nla_policy,
865#endif
866#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
867 .ctnl_timeout = {
868 .nlattr_to_obj = dccp_timeout_nlattr_to_obj,
869 .obj_to_nlattr = dccp_timeout_obj_to_nlattr,
870 .nlattr_max = CTA_TIMEOUT_DCCP_MAX,
871 .obj_size = sizeof(unsigned int) * CT_DCCP_MAX,
872 .nla_policy = dccp_timeout_nla_policy,
873 },
874#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
875 .init_net = dccp_init_net,
876 .get_net_proto = dccp_get_net_proto,
877};
878EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4);
879
880const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = {
881 .l3proto = AF_INET6,
882 .l4proto = IPPROTO_DCCP, 846 .l4proto = IPPROTO_DCCP,
883 .new = dccp_new,
884 .packet = dccp_packet, 847 .packet = dccp_packet,
885 .error = dccp_error,
886 .can_early_drop = dccp_can_early_drop, 848 .can_early_drop = dccp_can_early_drop,
887#ifdef CONFIG_NF_CONNTRACK_PROCFS 849#ifdef CONFIG_NF_CONNTRACK_PROCFS
888 .print_conntrack = dccp_print_conntrack, 850 .print_conntrack = dccp_print_conntrack,
@@ -908,4 +870,3 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = {
908 .init_net = dccp_init_net, 870 .init_net = dccp_init_net,
909 .get_net_proto = dccp_get_net_proto, 871 .get_net_proto = dccp_get_net_proto,
910}; 872};
911EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp6);
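
[Editor's note] The DCCP conversion above shows the pattern repeated for every tracker in this series: the separate ->error() and ->new() callbacks are removed, the IPv4/IPv6 l4proto pair collapses into a single nf_conntrack_l4proto_dccp without .l3proto, and dccp_packet() now receives the nf_hook_state and does validation plus first-packet setup itself. A minimal sketch of that consolidated shape, using hypothetical proto_* names and only helpers visible in this diff:

	static int proto_packet(struct nf_conn *ct, struct sk_buff *skb,
				unsigned int dataoff, enum ip_conntrack_info ctinfo,
				const struct nf_hook_state *state)
	{
		/* formerly the ->error() callback: malformed or unverifiable
		 * packets are logged via nf_l4proto_log_invalid() and reported
		 * as invalid instead of being tracked */
		if (proto_error(skb, dataoff, state))
			return -NF_ACCEPT;

		/* formerly the ->new() callback: runs once, only while the
		 * conntrack entry is still unconfirmed */
		if (!nf_ct_is_confirmed(ct) && !proto_new(ct, skb, dataoff))
			return -NF_ACCEPT;

		/* ...per-protocol state machine, then refresh the timeout... */
		return NF_ACCEPT;
	}
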
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index 1df3244ecd07..e10e867e0b55 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -44,12 +44,19 @@ static bool generic_pkt_to_tuple(const struct sk_buff *skb,
44 44
45/* Returns verdict for packet, or -1 for invalid. */ 45/* Returns verdict for packet, or -1 for invalid. */
46static int generic_packet(struct nf_conn *ct, 46static int generic_packet(struct nf_conn *ct,
47 const struct sk_buff *skb, 47 struct sk_buff *skb,
48 unsigned int dataoff, 48 unsigned int dataoff,
49 enum ip_conntrack_info ctinfo) 49 enum ip_conntrack_info ctinfo,
50 const struct nf_hook_state *state)
50{ 51{
51 const unsigned int *timeout = nf_ct_timeout_lookup(ct); 52 const unsigned int *timeout = nf_ct_timeout_lookup(ct);
52 53
54 if (!nf_generic_should_process(nf_ct_protonum(ct))) {
55 pr_warn_once("conntrack: generic helper won't handle protocol %d. Please consider loading the specific helper module.\n",
56 nf_ct_protonum(ct));
57 return -NF_ACCEPT;
58 }
59
53 if (!timeout) 60 if (!timeout)
54 timeout = &generic_pernet(nf_ct_net(ct))->timeout; 61 timeout = &generic_pernet(nf_ct_net(ct))->timeout;
55 62
@@ -57,19 +64,6 @@ static int generic_packet(struct nf_conn *ct,
57 return NF_ACCEPT; 64 return NF_ACCEPT;
58} 65}
59 66
60/* Called when a new connection for this protocol found. */
61static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb,
62 unsigned int dataoff)
63{
64 bool ret;
65
66 ret = nf_generic_should_process(nf_ct_protonum(ct));
67 if (!ret)
68 pr_warn_once("conntrack: generic helper won't handle protocol %d. Please consider loading the specific helper module.\n",
69 nf_ct_protonum(ct));
70 return ret;
71}
72
73#ifdef CONFIG_NF_CONNTRACK_TIMEOUT 67#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
74 68
75#include <linux/netfilter/nfnetlink.h> 69#include <linux/netfilter/nfnetlink.h>
@@ -142,7 +136,7 @@ static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn,
142 return 0; 136 return 0;
143} 137}
144 138
145static int generic_init_net(struct net *net, u_int16_t proto) 139static int generic_init_net(struct net *net)
146{ 140{
147 struct nf_generic_net *gn = generic_pernet(net); 141 struct nf_generic_net *gn = generic_pernet(net);
148 struct nf_proto_net *pn = &gn->pn; 142 struct nf_proto_net *pn = &gn->pn;
@@ -159,11 +153,9 @@ static struct nf_proto_net *generic_get_net_proto(struct net *net)
159 153
160const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic = 154const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic =
161{ 155{
162 .l3proto = PF_UNSPEC,
163 .l4proto = 255, 156 .l4proto = 255,
164 .pkt_to_tuple = generic_pkt_to_tuple, 157 .pkt_to_tuple = generic_pkt_to_tuple,
165 .packet = generic_packet, 158 .packet = generic_packet,
166 .new = generic_new,
167#ifdef CONFIG_NF_CONNTRACK_TIMEOUT 159#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
168 .ctnl_timeout = { 160 .ctnl_timeout = {
169 .nlattr_to_obj = generic_timeout_nlattr_to_obj, 161 .nlattr_to_obj = generic_timeout_nlattr_to_obj,
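
[Editor's note] In the generic tracker the old generic_new() guard moves straight into generic_packet(): unsupported protocols are warned about once and reported as invalid. A hedged sketch of how the caller is assumed to interpret such a verdict, based on the long-standing nf_conntrack_in() convention rather than on anything in this hunk:

	ret = l4proto->packet(ct, skb, dataoff, ctinfo, state);
	if (ret <= 0)
		ret = -ret;	/* e.g. -NF_ACCEPT: drop the conntrack entry,
				 * but still let the packet itself pass */
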
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 650eb4fba2c5..9b48dc8b4b88 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -233,10 +233,26 @@ static unsigned int *gre_get_timeouts(struct net *net)
233 233
234/* Returns verdict for packet, and may modify conntrack */ 234/* Returns verdict for packet, and may modify conntrack */
235static int gre_packet(struct nf_conn *ct, 235static int gre_packet(struct nf_conn *ct,
236 const struct sk_buff *skb, 236 struct sk_buff *skb,
237 unsigned int dataoff, 237 unsigned int dataoff,
238 enum ip_conntrack_info ctinfo) 238 enum ip_conntrack_info ctinfo,
239 const struct nf_hook_state *state)
239{ 240{
241 if (state->pf != NFPROTO_IPV4)
242 return -NF_ACCEPT;
243
244 if (!nf_ct_is_confirmed(ct)) {
245 unsigned int *timeouts = nf_ct_timeout_lookup(ct);
246
247 if (!timeouts)
248 timeouts = gre_get_timeouts(nf_ct_net(ct));
249
250 /* initialize to sane value. Ideally a conntrack helper
251 * (e.g. in case of pptp) is increasing them */
252 ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED];
253 ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED];
254 }
255
240 /* If we've seen traffic both ways, this is a GRE connection. 256 /* If we've seen traffic both ways, this is a GRE connection.
241 * Extend timeout. */ 257 * Extend timeout. */
242 if (ct->status & IPS_SEEN_REPLY) { 258 if (ct->status & IPS_SEEN_REPLY) {
@@ -252,26 +268,6 @@ static int gre_packet(struct nf_conn *ct,
252 return NF_ACCEPT; 268 return NF_ACCEPT;
253} 269}
254 270
255/* Called when a new connection for this protocol found. */
256static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb,
257 unsigned int dataoff)
258{
259 unsigned int *timeouts = nf_ct_timeout_lookup(ct);
260
261 if (!timeouts)
262 timeouts = gre_get_timeouts(nf_ct_net(ct));
263
264 pr_debug(": ");
265 nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
266
267 /* initialize to sane value. Ideally a conntrack helper
268 * (e.g. in case of pptp) is increasing them */
269 ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED];
270 ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED];
271
272 return true;
273}
274
275/* Called when a conntrack entry has already been removed from the hashes 271/* Called when a conntrack entry has already been removed from the hashes
276 * and is about to be deleted from memory */ 272 * and is about to be deleted from memory */
277static void gre_destroy(struct nf_conn *ct) 273static void gre_destroy(struct nf_conn *ct)
@@ -336,7 +332,7 @@ gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = {
336}; 332};
337#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ 333#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
338 334
339static int gre_init_net(struct net *net, u_int16_t proto) 335static int gre_init_net(struct net *net)
340{ 336{
341 struct netns_proto_gre *net_gre = gre_pernet(net); 337 struct netns_proto_gre *net_gre = gre_pernet(net);
342 int i; 338 int i;
@@ -351,14 +347,12 @@ static int gre_init_net(struct net *net, u_int16_t proto)
351 347
352/* protocol helper struct */ 348/* protocol helper struct */
353static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = { 349static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = {
354 .l3proto = AF_INET,
355 .l4proto = IPPROTO_GRE, 350 .l4proto = IPPROTO_GRE,
356 .pkt_to_tuple = gre_pkt_to_tuple, 351 .pkt_to_tuple = gre_pkt_to_tuple,
357#ifdef CONFIG_NF_CONNTRACK_PROCFS 352#ifdef CONFIG_NF_CONNTRACK_PROCFS
358 .print_conntrack = gre_print_conntrack, 353 .print_conntrack = gre_print_conntrack,
359#endif 354#endif
360 .packet = gre_packet, 355 .packet = gre_packet,
361 .new = gre_new,
362 .destroy = gre_destroy, 356 .destroy = gre_destroy,
363 .me = THIS_MODULE, 357 .me = THIS_MODULE,
364#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 358#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
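
[Editor's note] With .l3proto gone from the ops structure, family-specific trackers such as GRE now reject foreign families at run time through the hook state, and the one-time initialisation that used to live in gre_new() runs under !nf_ct_is_confirmed(ct). No locking is needed there: an unconfirmed conntrack is not yet visible to other CPUs, the same "not in circulation yet" argument the SCTP and TCP helpers state explicitly further down. The gate, condensed from the hunk above:

	if (state->pf != NFPROTO_IPV4)		/* GRE tracking is IPv4-only here */
		return -NF_ACCEPT;

	if (!nf_ct_is_confirmed(ct)) {		/* replaces the removed ->new() hook */
		unsigned int *timeouts = nf_ct_timeout_lookup(ct);

		if (!timeouts)
			timeouts = gre_get_timeouts(nf_ct_net(ct));
		ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED];
		ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED];
	}
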
diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index 43c7e1a217b9..3598520bd19b 100644
--- a/net/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -72,34 +72,17 @@ static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
72 return true; 72 return true;
73} 73}
74 74
75static unsigned int *icmp_get_timeouts(struct net *net)
76{
77 return &icmp_pernet(net)->timeout;
78}
79
80/* Returns verdict for packet, or -1 for invalid. */ 75/* Returns verdict for packet, or -1 for invalid. */
81static int icmp_packet(struct nf_conn *ct, 76static int icmp_packet(struct nf_conn *ct,
82 const struct sk_buff *skb, 77 struct sk_buff *skb,
83 unsigned int dataoff, 78 unsigned int dataoff,
84 enum ip_conntrack_info ctinfo) 79 enum ip_conntrack_info ctinfo,
80 const struct nf_hook_state *state)
85{ 81{
86 /* Do not immediately delete the connection after the first 82 /* Do not immediately delete the connection after the first
87 successful reply to avoid excessive conntrackd traffic 83 successful reply to avoid excessive conntrackd traffic
88 and also to handle correctly ICMP echo reply duplicates. */ 84 and also to handle correctly ICMP echo reply duplicates. */
89 unsigned int *timeout = nf_ct_timeout_lookup(ct); 85 unsigned int *timeout = nf_ct_timeout_lookup(ct);
90
91 if (!timeout)
92 timeout = icmp_get_timeouts(nf_ct_net(ct));
93
94 nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
95
96 return NF_ACCEPT;
97}
98
99/* Called when a new connection for this protocol found. */
100static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
101 unsigned int dataoff)
102{
103 static const u_int8_t valid_new[] = { 86 static const u_int8_t valid_new[] = {
104 [ICMP_ECHO] = 1, 87 [ICMP_ECHO] = 1,
105 [ICMP_TIMESTAMP] = 1, 88 [ICMP_TIMESTAMP] = 1,
@@ -107,21 +90,29 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
107 [ICMP_ADDRESS] = 1 90 [ICMP_ADDRESS] = 1
108 }; 91 };
109 92
93 if (state->pf != NFPROTO_IPV4)
94 return -NF_ACCEPT;
95
110 if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || 96 if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) ||
111 !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { 97 !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) {
112 /* Can't create a new ICMP `conn' with this. */ 98 /* Can't create a new ICMP `conn' with this. */
113 pr_debug("icmp: can't create new conn with type %u\n", 99 pr_debug("icmp: can't create new conn with type %u\n",
114 ct->tuplehash[0].tuple.dst.u.icmp.type); 100 ct->tuplehash[0].tuple.dst.u.icmp.type);
115 nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple); 101 nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple);
116 return false; 102 return -NF_ACCEPT;
117 } 103 }
118 return true; 104
105 if (!timeout)
106 timeout = &icmp_pernet(nf_ct_net(ct))->timeout;
107
108 nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
109 return NF_ACCEPT;
119} 110}
120 111
121/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ 112/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
122static int 113static int
123icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, 114icmp_error_message(struct nf_conn *tmpl, struct sk_buff *skb,
124 unsigned int hooknum) 115 const struct nf_hook_state *state)
125{ 116{
126 struct nf_conntrack_tuple innertuple, origtuple; 117 struct nf_conntrack_tuple innertuple, origtuple;
127 const struct nf_conntrack_l4proto *innerproto; 118 const struct nf_conntrack_l4proto *innerproto;
@@ -137,13 +128,13 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
137 if (!nf_ct_get_tuplepr(skb, 128 if (!nf_ct_get_tuplepr(skb,
138 skb_network_offset(skb) + ip_hdrlen(skb) 129 skb_network_offset(skb) + ip_hdrlen(skb)
139 + sizeof(struct icmphdr), 130 + sizeof(struct icmphdr),
140 PF_INET, net, &origtuple)) { 131 PF_INET, state->net, &origtuple)) {
141 pr_debug("icmp_error_message: failed to get tuple\n"); 132 pr_debug("icmp_error_message: failed to get tuple\n");
142 return -NF_ACCEPT; 133 return -NF_ACCEPT;
143 } 134 }
144 135
145 /* rcu_read_lock()ed by nf_hook_thresh */ 136 /* rcu_read_lock()ed by nf_hook_thresh */
146 innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum); 137 innerproto = __nf_ct_l4proto_find(origtuple.dst.protonum);
147 138
148 /* Ordinarily, we'd expect the inverted tupleproto, but it's 139 /* Ordinarily, we'd expect the inverted tupleproto, but it's
149 been preserved inside the ICMP. */ 140 been preserved inside the ICMP. */
@@ -154,7 +145,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
154 145
155 ctinfo = IP_CT_RELATED; 146 ctinfo = IP_CT_RELATED;
156 147
157 h = nf_conntrack_find_get(net, zone, &innertuple); 148 h = nf_conntrack_find_get(state->net, zone, &innertuple);
158 if (!h) { 149 if (!h) {
159 pr_debug("icmp_error_message: no match\n"); 150 pr_debug("icmp_error_message: no match\n");
160 return -NF_ACCEPT; 151 return -NF_ACCEPT;
@@ -168,17 +159,18 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
168 return NF_ACCEPT; 159 return NF_ACCEPT;
169} 160}
170 161
171static void icmp_error_log(const struct sk_buff *skb, struct net *net, 162static void icmp_error_log(const struct sk_buff *skb,
172 u8 pf, const char *msg) 163 const struct nf_hook_state *state,
164 const char *msg)
173{ 165{
174 nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg); 166 nf_l4proto_log_invalid(skb, state->net, state->pf,
167 IPPROTO_ICMP, "%s", msg);
175} 168}
176 169
177/* Small and modified version of icmp_rcv */ 170/* Small and modified version of icmp_rcv */
178static int 171int nf_conntrack_icmpv4_error(struct nf_conn *tmpl,
179icmp_error(struct net *net, struct nf_conn *tmpl, 172 struct sk_buff *skb, unsigned int dataoff,
180 struct sk_buff *skb, unsigned int dataoff, 173 const struct nf_hook_state *state)
181 u8 pf, unsigned int hooknum)
182{ 174{
183 const struct icmphdr *icmph; 175 const struct icmphdr *icmph;
184 struct icmphdr _ih; 176 struct icmphdr _ih;
@@ -186,14 +178,15 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
186 /* Not enough header? */ 178 /* Not enough header? */
187 icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); 179 icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);
188 if (icmph == NULL) { 180 if (icmph == NULL) {
189 icmp_error_log(skb, net, pf, "short packet"); 181 icmp_error_log(skb, state, "short packet");
190 return -NF_ACCEPT; 182 return -NF_ACCEPT;
191 } 183 }
192 184
193 /* See ip_conntrack_proto_tcp.c */ 185 /* See ip_conntrack_proto_tcp.c */
194 if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && 186 if (state->net->ct.sysctl_checksum &&
195 nf_ip_checksum(skb, hooknum, dataoff, 0)) { 187 state->hook == NF_INET_PRE_ROUTING &&
196 icmp_error_log(skb, net, pf, "bad hw icmp checksum"); 188 nf_ip_checksum(skb, state->hook, dataoff, 0)) {
189 icmp_error_log(skb, state, "bad hw icmp checksum");
197 return -NF_ACCEPT; 190 return -NF_ACCEPT;
198 } 191 }
199 192
@@ -204,7 +197,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
204 * discarded. 197 * discarded.
205 */ 198 */
206 if (icmph->type > NR_ICMP_TYPES) { 199 if (icmph->type > NR_ICMP_TYPES) {
207 icmp_error_log(skb, net, pf, "invalid icmp type"); 200 icmp_error_log(skb, state, "invalid icmp type");
208 return -NF_ACCEPT; 201 return -NF_ACCEPT;
209 } 202 }
210 203
@@ -216,7 +209,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
216 icmph->type != ICMP_REDIRECT) 209 icmph->type != ICMP_REDIRECT)
217 return NF_ACCEPT; 210 return NF_ACCEPT;
218 211
219 return icmp_error_message(net, tmpl, skb, hooknum); 212 return icmp_error_message(tmpl, skb, state);
220} 213}
221 214
222#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 215#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -342,7 +335,7 @@ static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
342 return 0; 335 return 0;
343} 336}
344 337
345static int icmp_init_net(struct net *net, u_int16_t proto) 338static int icmp_init_net(struct net *net)
346{ 339{
347 struct nf_icmp_net *in = icmp_pernet(net); 340 struct nf_icmp_net *in = icmp_pernet(net);
348 struct nf_proto_net *pn = &in->pn; 341 struct nf_proto_net *pn = &in->pn;
@@ -359,13 +352,10 @@ static struct nf_proto_net *icmp_get_net_proto(struct net *net)
359 352
360const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = 353const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
361{ 354{
362 .l3proto = PF_INET,
363 .l4proto = IPPROTO_ICMP, 355 .l4proto = IPPROTO_ICMP,
364 .pkt_to_tuple = icmp_pkt_to_tuple, 356 .pkt_to_tuple = icmp_pkt_to_tuple,
365 .invert_tuple = icmp_invert_tuple, 357 .invert_tuple = icmp_invert_tuple,
366 .packet = icmp_packet, 358 .packet = icmp_packet,
367 .new = icmp_new,
368 .error = icmp_error,
369 .destroy = NULL, 359 .destroy = NULL,
370 .me = NULL, 360 .me = NULL,
371#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 361#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index 97e40f77d678..378618feed5d 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -92,11 +92,31 @@ static unsigned int *icmpv6_get_timeouts(struct net *net)
92 92
93/* Returns verdict for packet, or -1 for invalid. */ 93/* Returns verdict for packet, or -1 for invalid. */
94static int icmpv6_packet(struct nf_conn *ct, 94static int icmpv6_packet(struct nf_conn *ct,
95 const struct sk_buff *skb, 95 struct sk_buff *skb,
96 unsigned int dataoff, 96 unsigned int dataoff,
97 enum ip_conntrack_info ctinfo) 97 enum ip_conntrack_info ctinfo,
98 const struct nf_hook_state *state)
98{ 99{
99 unsigned int *timeout = nf_ct_timeout_lookup(ct); 100 unsigned int *timeout = nf_ct_timeout_lookup(ct);
101 static const u8 valid_new[] = {
102 [ICMPV6_ECHO_REQUEST - 128] = 1,
103 [ICMPV6_NI_QUERY - 128] = 1
104 };
105
106 if (state->pf != NFPROTO_IPV6)
107 return -NF_ACCEPT;
108
109 if (!nf_ct_is_confirmed(ct)) {
110 int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128;
111
112 if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) {
113 /* Can't create a new ICMPv6 `conn' with this. */
114 pr_debug("icmpv6: can't create new conn with type %u\n",
115 type + 128);
116 nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple);
117 return -NF_ACCEPT;
118 }
119 }
100 120
101 if (!timeout) 121 if (!timeout)
102 timeout = icmpv6_get_timeouts(nf_ct_net(ct)); 122 timeout = icmpv6_get_timeouts(nf_ct_net(ct));
@@ -109,26 +129,6 @@ static int icmpv6_packet(struct nf_conn *ct,
109 return NF_ACCEPT; 129 return NF_ACCEPT;
110} 130}
111 131
112/* Called when a new connection for this protocol found. */
113static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb,
114 unsigned int dataoff)
115{
116 static const u_int8_t valid_new[] = {
117 [ICMPV6_ECHO_REQUEST - 128] = 1,
118 [ICMPV6_NI_QUERY - 128] = 1
119 };
120 int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128;
121
122 if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) {
123 /* Can't create a new ICMPv6 `conn' with this. */
124 pr_debug("icmpv6: can't create new conn with type %u\n",
125 type + 128);
126 nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple);
127 return false;
128 }
129 return true;
130}
131
132static int 132static int
133icmpv6_error_message(struct net *net, struct nf_conn *tmpl, 133icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
134 struct sk_buff *skb, 134 struct sk_buff *skb,
@@ -153,7 +153,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
153 } 153 }
154 154
155 /* rcu_read_lock()ed by nf_hook_thresh */ 155 /* rcu_read_lock()ed by nf_hook_thresh */
156 inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum); 156 inproto = __nf_ct_l4proto_find(origtuple.dst.protonum);
157 157
158 /* Ordinarily, we'd expect the inverted tupleproto, but it's 158 /* Ordinarily, we'd expect the inverted tupleproto, but it's
159 been preserved inside the ICMP. */ 159 been preserved inside the ICMP. */
@@ -179,16 +179,18 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
179 return NF_ACCEPT; 179 return NF_ACCEPT;
180} 180}
181 181
182static void icmpv6_error_log(const struct sk_buff *skb, struct net *net, 182static void icmpv6_error_log(const struct sk_buff *skb,
183 u8 pf, const char *msg) 183 const struct nf_hook_state *state,
184 const char *msg)
184{ 185{
185 nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMPV6, "%s", msg); 186 nf_l4proto_log_invalid(skb, state->net, state->pf,
187 IPPROTO_ICMPV6, "%s", msg);
186} 188}
187 189
188static int 190int nf_conntrack_icmpv6_error(struct nf_conn *tmpl,
189icmpv6_error(struct net *net, struct nf_conn *tmpl, 191 struct sk_buff *skb,
190 struct sk_buff *skb, unsigned int dataoff, 192 unsigned int dataoff,
191 u8 pf, unsigned int hooknum) 193 const struct nf_hook_state *state)
192{ 194{
193 const struct icmp6hdr *icmp6h; 195 const struct icmp6hdr *icmp6h;
194 struct icmp6hdr _ih; 196 struct icmp6hdr _ih;
@@ -196,13 +198,14 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
196 198
197 icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); 199 icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih);
198 if (icmp6h == NULL) { 200 if (icmp6h == NULL) {
199 icmpv6_error_log(skb, net, pf, "short packet"); 201 icmpv6_error_log(skb, state, "short packet");
200 return -NF_ACCEPT; 202 return -NF_ACCEPT;
201 } 203 }
202 204
203 if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && 205 if (state->hook == NF_INET_PRE_ROUTING &&
204 nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { 206 state->net->ct.sysctl_checksum &&
205 icmpv6_error_log(skb, net, pf, "ICMPv6 checksum failed"); 207 nf_ip6_checksum(skb, state->hook, dataoff, IPPROTO_ICMPV6)) {
208 icmpv6_error_log(skb, state, "ICMPv6 checksum failed");
206 return -NF_ACCEPT; 209 return -NF_ACCEPT;
207 } 210 }
208 211
@@ -217,7 +220,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
217 if (icmp6h->icmp6_type >= 128) 220 if (icmp6h->icmp6_type >= 128)
218 return NF_ACCEPT; 221 return NF_ACCEPT;
219 222
220 return icmpv6_error_message(net, tmpl, skb, dataoff); 223 return icmpv6_error_message(state->net, tmpl, skb, dataoff);
221} 224}
222 225
223#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 226#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -343,7 +346,7 @@ static int icmpv6_kmemdup_sysctl_table(struct nf_proto_net *pn,
343 return 0; 346 return 0;
344} 347}
345 348
346static int icmpv6_init_net(struct net *net, u_int16_t proto) 349static int icmpv6_init_net(struct net *net)
347{ 350{
348 struct nf_icmp_net *in = icmpv6_pernet(net); 351 struct nf_icmp_net *in = icmpv6_pernet(net);
349 struct nf_proto_net *pn = &in->pn; 352 struct nf_proto_net *pn = &in->pn;
@@ -360,13 +363,10 @@ static struct nf_proto_net *icmpv6_get_net_proto(struct net *net)
360 363
361const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = 364const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 =
362{ 365{
363 .l3proto = PF_INET6,
364 .l4proto = IPPROTO_ICMPV6, 366 .l4proto = IPPROTO_ICMPV6,
365 .pkt_to_tuple = icmpv6_pkt_to_tuple, 367 .pkt_to_tuple = icmpv6_pkt_to_tuple,
366 .invert_tuple = icmpv6_invert_tuple, 368 .invert_tuple = icmpv6_invert_tuple,
367 .packet = icmpv6_packet, 369 .packet = icmpv6_packet,
368 .new = icmpv6_new,
369 .error = icmpv6_error,
370#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 370#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
371 .tuple_to_nlattr = icmpv6_tuple_to_nlattr, 371 .tuple_to_nlattr = icmpv6_tuple_to_nlattr,
372 .nlattr_tuple_size = icmpv6_nlattr_tuple_size, 372 .nlattr_tuple_size = icmpv6_nlattr_tuple_size,
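
[Editor's note] For ICMP and ICMPv6 the old ->new() checks move into the packet handlers, and the error paths become exported entry points (nf_conntrack_icmpv4_error() / nf_conntrack_icmpv6_error()) driven by the nf_hook_state. A small worked example of the ICMPv6 valid_new[] indexing used above, assuming the standard type values (ICMPV6_ECHO_REQUEST = 128, ICMPV6_NI_QUERY = 139):

	static const u8 valid_new[] = {
		[ICMPV6_ECHO_REQUEST - 128] = 1,	/* slot 0  */
		[ICMPV6_NI_QUERY - 128] = 1,		/* slot 11 */
	};
	int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128;

	/* anything below 128 (an error message) or outside the table cannot
	 * start a new conntrack entry */
	if (type < 0 || type >= sizeof(valid_new) || !valid_new[type])
		return -NF_ACCEPT;
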
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index e4d738d34cd0..3d719d3eb9a3 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -273,11 +273,100 @@ static int sctp_new_state(enum ip_conntrack_dir dir,
273 return sctp_conntracks[dir][i][cur_state]; 273 return sctp_conntracks[dir][i][cur_state];
274} 274}
275 275
276/* Don't need lock here: this conntrack not in circulation yet */
277static noinline bool
278sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
279 const struct sctphdr *sh, unsigned int dataoff)
280{
281 enum sctp_conntrack new_state;
282 const struct sctp_chunkhdr *sch;
283 struct sctp_chunkhdr _sch;
284 u32 offset, count;
285
286 memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));
287 new_state = SCTP_CONNTRACK_MAX;
288 for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) {
289 new_state = sctp_new_state(IP_CT_DIR_ORIGINAL,
290 SCTP_CONNTRACK_NONE, sch->type);
291
292 /* Invalid: delete conntrack */
293 if (new_state == SCTP_CONNTRACK_NONE ||
294 new_state == SCTP_CONNTRACK_MAX) {
295 pr_debug("nf_conntrack_sctp: invalid new deleting.\n");
296 return false;
297 }
298
299 /* Copy the vtag into the state info */
300 if (sch->type == SCTP_CID_INIT) {
301 struct sctp_inithdr _inithdr, *ih;
302 /* Sec 8.5.1 (A) */
303 if (sh->vtag)
304 return false;
305
306 ih = skb_header_pointer(skb, offset + sizeof(_sch),
307 sizeof(_inithdr), &_inithdr);
308 if (!ih)
309 return false;
310
311 pr_debug("Setting vtag %x for new conn\n",
312 ih->init_tag);
313
314 ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag;
315 } else if (sch->type == SCTP_CID_HEARTBEAT) {
316 pr_debug("Setting vtag %x for secondary conntrack\n",
317 sh->vtag);
318 ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag;
319 } else {
320 /* If it is a shutdown ack OOTB packet, we expect a return
321 shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
322 pr_debug("Setting vtag %x for new conn OOTB\n",
323 sh->vtag);
324 ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
325 }
326
327 ct->proto.sctp.state = new_state;
328 }
329
330 return true;
331}
332
333static bool sctp_error(struct sk_buff *skb,
334 unsigned int dataoff,
335 const struct nf_hook_state *state)
336{
337 const struct sctphdr *sh;
338 const char *logmsg;
339
340 if (skb->len < dataoff + sizeof(struct sctphdr)) {
341 logmsg = "nf_ct_sctp: short packet ";
342 goto out_invalid;
343 }
344 if (state->hook == NF_INET_PRE_ROUTING &&
345 state->net->ct.sysctl_checksum &&
346 skb->ip_summed == CHECKSUM_NONE) {
347 if (!skb_make_writable(skb, dataoff + sizeof(struct sctphdr))) {
348 logmsg = "nf_ct_sctp: failed to read header ";
349 goto out_invalid;
350 }
351 sh = (const struct sctphdr *)(skb->data + dataoff);
352 if (sh->checksum != sctp_compute_cksum(skb, dataoff)) {
353 logmsg = "nf_ct_sctp: bad CRC ";
354 goto out_invalid;
355 }
356 skb->ip_summed = CHECKSUM_UNNECESSARY;
357 }
358 return false;
359out_invalid:
360 nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_SCTP, "%s", logmsg);
361 return true;
362}
363
276/* Returns verdict for packet, or -NF_ACCEPT for invalid. */ 364/* Returns verdict for packet, or -NF_ACCEPT for invalid. */
277static int sctp_packet(struct nf_conn *ct, 365static int sctp_packet(struct nf_conn *ct,
278 const struct sk_buff *skb, 366 struct sk_buff *skb,
279 unsigned int dataoff, 367 unsigned int dataoff,
280 enum ip_conntrack_info ctinfo) 368 enum ip_conntrack_info ctinfo,
369 const struct nf_hook_state *state)
281{ 370{
282 enum sctp_conntrack new_state, old_state; 371 enum sctp_conntrack new_state, old_state;
283 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 372 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
@@ -289,6 +378,9 @@ static int sctp_packet(struct nf_conn *ct,
289 unsigned int *timeouts; 378 unsigned int *timeouts;
290 unsigned long map[256 / sizeof(unsigned long)] = { 0 }; 379 unsigned long map[256 / sizeof(unsigned long)] = { 0 };
291 380
381 if (sctp_error(skb, dataoff, state))
382 return -NF_ACCEPT;
383
292 sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); 384 sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
293 if (sh == NULL) 385 if (sh == NULL)
294 goto out; 386 goto out;
@@ -296,6 +388,17 @@ static int sctp_packet(struct nf_conn *ct,
296 if (do_basic_checks(ct, skb, dataoff, map) != 0) 388 if (do_basic_checks(ct, skb, dataoff, map) != 0)
297 goto out; 389 goto out;
298 390
391 if (!nf_ct_is_confirmed(ct)) {
392 /* If an OOTB packet has any of these chunks discard (Sec 8.4) */
393 if (test_bit(SCTP_CID_ABORT, map) ||
394 test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) ||
395 test_bit(SCTP_CID_COOKIE_ACK, map))
396 return -NF_ACCEPT;
397
398 if (!sctp_new(ct, skb, sh, dataoff))
399 return -NF_ACCEPT;
400 }
401
299 /* Check the verification tag (Sec 8.5) */ 402 /* Check the verification tag (Sec 8.5) */
300 if (!test_bit(SCTP_CID_INIT, map) && 403 if (!test_bit(SCTP_CID_INIT, map) &&
301 !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && 404 !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) &&
@@ -397,110 +500,6 @@ out:
397 return -NF_ACCEPT; 500 return -NF_ACCEPT;
398} 501}
399 502
400/* Called when a new connection for this protocol found. */
401static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
402 unsigned int dataoff)
403{
404 enum sctp_conntrack new_state;
405 const struct sctphdr *sh;
406 struct sctphdr _sctph;
407 const struct sctp_chunkhdr *sch;
408 struct sctp_chunkhdr _sch;
409 u_int32_t offset, count;
410 unsigned long map[256 / sizeof(unsigned long)] = { 0 };
411
412 sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
413 if (sh == NULL)
414 return false;
415
416 if (do_basic_checks(ct, skb, dataoff, map) != 0)
417 return false;
418
419 /* If an OOTB packet has any of these chunks discard (Sec 8.4) */
420 if (test_bit(SCTP_CID_ABORT, map) ||
421 test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) ||
422 test_bit(SCTP_CID_COOKIE_ACK, map))
423 return false;
424
425 memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));
426 new_state = SCTP_CONNTRACK_MAX;
427 for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
428 /* Don't need lock here: this conntrack not in circulation yet */
429 new_state = sctp_new_state(IP_CT_DIR_ORIGINAL,
430 SCTP_CONNTRACK_NONE, sch->type);
431
432 /* Invalid: delete conntrack */
433 if (new_state == SCTP_CONNTRACK_NONE ||
434 new_state == SCTP_CONNTRACK_MAX) {
435 pr_debug("nf_conntrack_sctp: invalid new deleting.\n");
436 return false;
437 }
438
439 /* Copy the vtag into the state info */
440 if (sch->type == SCTP_CID_INIT) {
441 struct sctp_inithdr _inithdr, *ih;
442 /* Sec 8.5.1 (A) */
443 if (sh->vtag)
444 return false;
445
446 ih = skb_header_pointer(skb, offset + sizeof(_sch),
447 sizeof(_inithdr), &_inithdr);
448 if (!ih)
449 return false;
450
451 pr_debug("Setting vtag %x for new conn\n",
452 ih->init_tag);
453
454 ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag;
455 } else if (sch->type == SCTP_CID_HEARTBEAT) {
456 pr_debug("Setting vtag %x for secondary conntrack\n",
457 sh->vtag);
458 ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag;
459 }
460 /* If it is a shutdown ack OOTB packet, we expect a return
461 shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
462 else {
463 pr_debug("Setting vtag %x for new conn OOTB\n",
464 sh->vtag);
465 ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
466 }
467
468 ct->proto.sctp.state = new_state;
469 }
470
471 return true;
472}
473
474static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb,
475 unsigned int dataoff,
476 u8 pf, unsigned int hooknum)
477{
478 const struct sctphdr *sh;
479 const char *logmsg;
480
481 if (skb->len < dataoff + sizeof(struct sctphdr)) {
482 logmsg = "nf_ct_sctp: short packet ";
483 goto out_invalid;
484 }
485 if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
486 skb->ip_summed == CHECKSUM_NONE) {
487 if (!skb_make_writable(skb, dataoff + sizeof(struct sctphdr))) {
488 logmsg = "nf_ct_sctp: failed to read header ";
489 goto out_invalid;
490 }
491 sh = (const struct sctphdr *)(skb->data + dataoff);
492 if (sh->checksum != sctp_compute_cksum(skb, dataoff)) {
493 logmsg = "nf_ct_sctp: bad CRC ";
494 goto out_invalid;
495 }
496 skb->ip_summed = CHECKSUM_UNNECESSARY;
497 }
498 return NF_ACCEPT;
499out_invalid:
500 nf_l4proto_log_invalid(skb, net, pf, IPPROTO_SCTP, "%s", logmsg);
501 return -NF_ACCEPT;
502}
503
504static bool sctp_can_early_drop(const struct nf_conn *ct) 503static bool sctp_can_early_drop(const struct nf_conn *ct)
505{ 504{
506 switch (ct->proto.sctp.state) { 505 switch (ct->proto.sctp.state) {
@@ -735,7 +734,7 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
735 return 0; 734 return 0;
736} 735}
737 736
738static int sctp_init_net(struct net *net, u_int16_t proto) 737static int sctp_init_net(struct net *net)
739{ 738{
740 struct nf_sctp_net *sn = sctp_pernet(net); 739 struct nf_sctp_net *sn = sctp_pernet(net);
741 struct nf_proto_net *pn = &sn->pn; 740 struct nf_proto_net *pn = &sn->pn;
@@ -760,49 +759,12 @@ static struct nf_proto_net *sctp_get_net_proto(struct net *net)
760 return &net->ct.nf_ct_proto.sctp.pn; 759 return &net->ct.nf_ct_proto.sctp.pn;
761} 760}
762 761
763const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = { 762const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = {
764 .l3proto = PF_INET,
765 .l4proto = IPPROTO_SCTP,
766#ifdef CONFIG_NF_CONNTRACK_PROCFS
767 .print_conntrack = sctp_print_conntrack,
768#endif
769 .packet = sctp_packet,
770 .new = sctp_new,
771 .error = sctp_error,
772 .can_early_drop = sctp_can_early_drop,
773 .me = THIS_MODULE,
774#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
775 .nlattr_size = SCTP_NLATTR_SIZE,
776 .to_nlattr = sctp_to_nlattr,
777 .from_nlattr = nlattr_to_sctp,
778 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
779 .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
780 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
781 .nla_policy = nf_ct_port_nla_policy,
782#endif
783#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
784 .ctnl_timeout = {
785 .nlattr_to_obj = sctp_timeout_nlattr_to_obj,
786 .obj_to_nlattr = sctp_timeout_obj_to_nlattr,
787 .nlattr_max = CTA_TIMEOUT_SCTP_MAX,
788 .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX,
789 .nla_policy = sctp_timeout_nla_policy,
790 },
791#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
792 .init_net = sctp_init_net,
793 .get_net_proto = sctp_get_net_proto,
794};
795EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4);
796
797const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = {
798 .l3proto = PF_INET6,
799 .l4proto = IPPROTO_SCTP, 763 .l4proto = IPPROTO_SCTP,
800#ifdef CONFIG_NF_CONNTRACK_PROCFS 764#ifdef CONFIG_NF_CONNTRACK_PROCFS
801 .print_conntrack = sctp_print_conntrack, 765 .print_conntrack = sctp_print_conntrack,
802#endif 766#endif
803 .packet = sctp_packet, 767 .packet = sctp_packet,
804 .new = sctp_new,
805 .error = sctp_error,
806 .can_early_drop = sctp_can_early_drop, 768 .can_early_drop = sctp_can_early_drop,
807 .me = THIS_MODULE, 769 .me = THIS_MODULE,
808#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 770#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -826,4 +788,3 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = {
826 .init_net = sctp_init_net, 788 .init_net = sctp_init_net,
827 .get_net_proto = sctp_get_net_proto, 789 .get_net_proto = sctp_get_net_proto,
828}; 790};
829EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp6);
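
[Editor's note] The SCTP tracker follows the same shape, with one convention change worth noting: the old sctp_error() returned a netfilter verdict (NF_ACCEPT or -NF_ACCEPT), while the new helper returns bool, true meaning "invalid, log it and bail out". The CRC handling is unchanged in substance; it still makes the header writable, verifies sctp_compute_cksum(), and marks the skb CHECKSUM_UNNECESSARY so the checksum is not verified again later. The call-site pattern, as used above:

	if (sctp_error(skb, dataoff, state))	/* bool: true == invalid */
		return -NF_ACCEPT;
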
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 247b89784a6f..1bcf9984d45e 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -717,35 +717,26 @@ static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
717 [TCPHDR_ACK|TCPHDR_URG] = 1, 717 [TCPHDR_ACK|TCPHDR_URG] = 1,
718}; 718};
719 719
720static void tcp_error_log(const struct sk_buff *skb, struct net *net, 720static void tcp_error_log(const struct sk_buff *skb,
721 u8 pf, const char *msg) 721 const struct nf_hook_state *state,
722 const char *msg)
722{ 723{
723 nf_l4proto_log_invalid(skb, net, pf, IPPROTO_TCP, "%s", msg); 724 nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_TCP, "%s", msg);
724} 725}
725 726
726/* Protect conntrack against broken packets. Code taken from ipt_unclean.c. */ 727/* Protect conntrack against broken packets. Code taken from ipt_unclean.c. */
727static int tcp_error(struct net *net, struct nf_conn *tmpl, 728static bool tcp_error(const struct tcphdr *th,
728 struct sk_buff *skb, 729 struct sk_buff *skb,
729 unsigned int dataoff, 730 unsigned int dataoff,
730 u_int8_t pf, 731 const struct nf_hook_state *state)
731 unsigned int hooknum)
732{ 732{
733 const struct tcphdr *th;
734 struct tcphdr _tcph;
735 unsigned int tcplen = skb->len - dataoff; 733 unsigned int tcplen = skb->len - dataoff;
736 u_int8_t tcpflags; 734 u8 tcpflags;
737
738	/* Smaller than minimal TCP header? */
739 th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
740 if (th == NULL) {
741 tcp_error_log(skb, net, pf, "short packet");
742 return -NF_ACCEPT;
743 }
744 735
745 /* Not whole TCP header or malformed packet */ 736 /* Not whole TCP header or malformed packet */
746 if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { 737 if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
747 tcp_error_log(skb, net, pf, "truncated packet"); 738 tcp_error_log(skb, state, "truncated packet");
748 return -NF_ACCEPT; 739 return true;
749 } 740 }
750 741
751 /* Checksum invalid? Ignore. 742 /* Checksum invalid? Ignore.
@@ -753,27 +744,101 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
753 * because the checksum is assumed to be correct. 744 * because the checksum is assumed to be correct.
754 */ 745 */
755 /* FIXME: Source route IP option packets --RR */ 746 /* FIXME: Source route IP option packets --RR */
756 if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && 747 if (state->net->ct.sysctl_checksum &&
757 nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { 748 state->hook == NF_INET_PRE_ROUTING &&
758 tcp_error_log(skb, net, pf, "bad checksum"); 749 nf_checksum(skb, state->hook, dataoff, IPPROTO_TCP, state->pf)) {
759 return -NF_ACCEPT; 750 tcp_error_log(skb, state, "bad checksum");
751 return true;
760 } 752 }
761 753
762 /* Check TCP flags. */ 754 /* Check TCP flags. */
763 tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH)); 755 tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
764 if (!tcp_valid_flags[tcpflags]) { 756 if (!tcp_valid_flags[tcpflags]) {
765 tcp_error_log(skb, net, pf, "invalid tcp flag combination"); 757 tcp_error_log(skb, state, "invalid tcp flag combination");
766 return -NF_ACCEPT; 758 return true;
767 } 759 }
768 760
769 return NF_ACCEPT; 761 return false;
762}
763
764static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
765 unsigned int dataoff,
766 const struct tcphdr *th)
767{
768 enum tcp_conntrack new_state;
769 struct net *net = nf_ct_net(ct);
770 const struct nf_tcp_net *tn = tcp_pernet(net);
771 const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
772 const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
773
774 /* Don't need lock here: this conntrack not in circulation yet */
775 new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
776
777 /* Invalid: delete conntrack */
778 if (new_state >= TCP_CONNTRACK_MAX) {
779 pr_debug("nf_ct_tcp: invalid new deleting.\n");
780 return false;
781 }
782
783 if (new_state == TCP_CONNTRACK_SYN_SENT) {
784 memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
785 /* SYN packet */
786 ct->proto.tcp.seen[0].td_end =
787 segment_seq_plus_len(ntohl(th->seq), skb->len,
788 dataoff, th);
789 ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
790 if (ct->proto.tcp.seen[0].td_maxwin == 0)
791 ct->proto.tcp.seen[0].td_maxwin = 1;
792 ct->proto.tcp.seen[0].td_maxend =
793 ct->proto.tcp.seen[0].td_end;
794
795 tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
796 } else if (tn->tcp_loose == 0) {
797 /* Don't try to pick up connections. */
798 return false;
799 } else {
800 memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
801 /*
802 * We are in the middle of a connection,
803 * its history is lost for us.
804 * Let's try to use the data from the packet.
805 */
806 ct->proto.tcp.seen[0].td_end =
807 segment_seq_plus_len(ntohl(th->seq), skb->len,
808 dataoff, th);
809 ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
810 if (ct->proto.tcp.seen[0].td_maxwin == 0)
811 ct->proto.tcp.seen[0].td_maxwin = 1;
812 ct->proto.tcp.seen[0].td_maxend =
813 ct->proto.tcp.seen[0].td_end +
814 ct->proto.tcp.seen[0].td_maxwin;
815
816 /* We assume SACK and liberal window checking to handle
817 * window scaling */
818 ct->proto.tcp.seen[0].flags =
819 ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
820 IP_CT_TCP_FLAG_BE_LIBERAL;
821 }
822
823 /* tcp_packet will set them */
824 ct->proto.tcp.last_index = TCP_NONE_SET;
825
826 pr_debug("%s: sender end=%u maxend=%u maxwin=%u scale=%i "
827 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
828 __func__,
829 sender->td_end, sender->td_maxend, sender->td_maxwin,
830 sender->td_scale,
831 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
832 receiver->td_scale);
833 return true;
770} 834}
771 835
772/* Returns verdict for packet, or -1 for invalid. */ 836/* Returns verdict for packet, or -1 for invalid. */
773static int tcp_packet(struct nf_conn *ct, 837static int tcp_packet(struct nf_conn *ct,
774 const struct sk_buff *skb, 838 struct sk_buff *skb,
775 unsigned int dataoff, 839 unsigned int dataoff,
776 enum ip_conntrack_info ctinfo) 840 enum ip_conntrack_info ctinfo,
841 const struct nf_hook_state *state)
777{ 842{
778 struct net *net = nf_ct_net(ct); 843 struct net *net = nf_ct_net(ct);
779 struct nf_tcp_net *tn = tcp_pernet(net); 844 struct nf_tcp_net *tn = tcp_pernet(net);
@@ -786,7 +851,14 @@ static int tcp_packet(struct nf_conn *ct,
786 unsigned long timeout; 851 unsigned long timeout;
787 852
788 th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); 853 th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
789 BUG_ON(th == NULL); 854 if (th == NULL)
855 return -NF_ACCEPT;
856
857 if (tcp_error(th, skb, dataoff, state))
858 return -NF_ACCEPT;
859
860 if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th))
861 return -NF_ACCEPT;
790 862
791 spin_lock_bh(&ct->lock); 863 spin_lock_bh(&ct->lock);
792 old_state = ct->proto.tcp.state; 864 old_state = ct->proto.tcp.state;
@@ -1067,82 +1139,6 @@ static int tcp_packet(struct nf_conn *ct,
1067 return NF_ACCEPT; 1139 return NF_ACCEPT;
1068} 1140}
1069 1141
1070/* Called when a new connection for this protocol found. */
1071static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
1072 unsigned int dataoff)
1073{
1074 enum tcp_conntrack new_state;
1075 const struct tcphdr *th;
1076 struct tcphdr _tcph;
1077 struct net *net = nf_ct_net(ct);
1078 struct nf_tcp_net *tn = tcp_pernet(net);
1079 const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
1080 const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
1081
1082 th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
1083 BUG_ON(th == NULL);
1084
1085 /* Don't need lock here: this conntrack not in circulation yet */
1086 new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
1087
1088 /* Invalid: delete conntrack */
1089 if (new_state >= TCP_CONNTRACK_MAX) {
1090 pr_debug("nf_ct_tcp: invalid new deleting.\n");
1091 return false;
1092 }
1093
1094 if (new_state == TCP_CONNTRACK_SYN_SENT) {
1095 memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
1096 /* SYN packet */
1097 ct->proto.tcp.seen[0].td_end =
1098 segment_seq_plus_len(ntohl(th->seq), skb->len,
1099 dataoff, th);
1100 ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1101 if (ct->proto.tcp.seen[0].td_maxwin == 0)
1102 ct->proto.tcp.seen[0].td_maxwin = 1;
1103 ct->proto.tcp.seen[0].td_maxend =
1104 ct->proto.tcp.seen[0].td_end;
1105
1106 tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
1107 } else if (tn->tcp_loose == 0) {
1108 /* Don't try to pick up connections. */
1109 return false;
1110 } else {
1111 memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
1112 /*
1113 * We are in the middle of a connection,
1114 * its history is lost for us.
1115 * Let's try to use the data from the packet.
1116 */
1117 ct->proto.tcp.seen[0].td_end =
1118 segment_seq_plus_len(ntohl(th->seq), skb->len,
1119 dataoff, th);
1120 ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1121 if (ct->proto.tcp.seen[0].td_maxwin == 0)
1122 ct->proto.tcp.seen[0].td_maxwin = 1;
1123 ct->proto.tcp.seen[0].td_maxend =
1124 ct->proto.tcp.seen[0].td_end +
1125 ct->proto.tcp.seen[0].td_maxwin;
1126
1127 /* We assume SACK and liberal window checking to handle
1128 * window scaling */
1129 ct->proto.tcp.seen[0].flags =
1130 ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
1131 IP_CT_TCP_FLAG_BE_LIBERAL;
1132 }
1133
1134 /* tcp_packet will set them */
1135 ct->proto.tcp.last_index = TCP_NONE_SET;
1136
1137 pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
1138 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
1139 sender->td_end, sender->td_maxend, sender->td_maxwin,
1140 sender->td_scale,
1141 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
1142 receiver->td_scale);
1143 return true;
1144}
1145
1146static bool tcp_can_early_drop(const struct nf_conn *ct) 1142static bool tcp_can_early_drop(const struct nf_conn *ct)
1147{ 1143{
1148 switch (ct->proto.tcp.state) { 1144 switch (ct->proto.tcp.state) {
@@ -1510,7 +1506,7 @@ static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn,
1510 return 0; 1506 return 0;
1511} 1507}
1512 1508
1513static int tcp_init_net(struct net *net, u_int16_t proto) 1509static int tcp_init_net(struct net *net)
1514{ 1510{
1515 struct nf_tcp_net *tn = tcp_pernet(net); 1511 struct nf_tcp_net *tn = tcp_pernet(net);
1516 struct nf_proto_net *pn = &tn->pn; 1512 struct nf_proto_net *pn = &tn->pn;
@@ -1538,16 +1534,13 @@ static struct nf_proto_net *tcp_get_net_proto(struct net *net)
1538 return &net->ct.nf_ct_proto.tcp.pn; 1534 return &net->ct.nf_ct_proto.tcp.pn;
1539} 1535}
1540 1536
1541const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 = 1537const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp =
1542{ 1538{
1543 .l3proto = PF_INET,
1544 .l4proto = IPPROTO_TCP, 1539 .l4proto = IPPROTO_TCP,
1545#ifdef CONFIG_NF_CONNTRACK_PROCFS 1540#ifdef CONFIG_NF_CONNTRACK_PROCFS
1546 .print_conntrack = tcp_print_conntrack, 1541 .print_conntrack = tcp_print_conntrack,
1547#endif 1542#endif
1548 .packet = tcp_packet, 1543 .packet = tcp_packet,
1549 .new = tcp_new,
1550 .error = tcp_error,
1551 .can_early_drop = tcp_can_early_drop, 1544 .can_early_drop = tcp_can_early_drop,
1552#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 1545#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1553 .to_nlattr = tcp_to_nlattr, 1546 .to_nlattr = tcp_to_nlattr,
@@ -1571,39 +1564,3 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 =
1571 .init_net = tcp_init_net, 1564 .init_net = tcp_init_net,
1572 .get_net_proto = tcp_get_net_proto, 1565 .get_net_proto = tcp_get_net_proto,
1573}; 1566};
1574EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4);
1575
1576const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 =
1577{
1578 .l3proto = PF_INET6,
1579 .l4proto = IPPROTO_TCP,
1580#ifdef CONFIG_NF_CONNTRACK_PROCFS
1581 .print_conntrack = tcp_print_conntrack,
1582#endif
1583 .packet = tcp_packet,
1584 .new = tcp_new,
1585 .error = tcp_error,
1586 .can_early_drop = tcp_can_early_drop,
1587#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1588 .nlattr_size = TCP_NLATTR_SIZE,
1589 .to_nlattr = tcp_to_nlattr,
1590 .from_nlattr = nlattr_to_tcp,
1591 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
1592 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
1593 .nlattr_tuple_size = tcp_nlattr_tuple_size,
1594 .nla_policy = nf_ct_port_nla_policy,
1595#endif
1596#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
1597 .ctnl_timeout = {
1598 .nlattr_to_obj = tcp_timeout_nlattr_to_obj,
1599 .obj_to_nlattr = tcp_timeout_obj_to_nlattr,
1600 .nlattr_max = CTA_TIMEOUT_TCP_MAX,
1601 .obj_size = sizeof(unsigned int) *
1602 TCP_CONNTRACK_TIMEOUT_MAX,
1603 .nla_policy = tcp_timeout_nla_policy,
1604 },
1605#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
1606 .init_net = tcp_init_net,
1607 .get_net_proto = tcp_get_net_proto,
1608};
1609EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6);
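
[Editor's note] In the TCP tracker the header fetch that used to be backed by BUG_ON() now fails gracefully: a truncated header yields -NF_ACCEPT instead of crashing the kernel, and the former tcp_new() lives on as a noinline helper called only for unconfirmed conntracks. The resulting entry sequence, condensed from the hunk above:

	th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
	if (th == NULL)
		return -NF_ACCEPT;		/* previously BUG_ON(th == NULL) */

	if (tcp_error(th, skb, dataoff, state))
		return -NF_ACCEPT;		/* bad length, flags or checksum */

	if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th))
		return -NF_ACCEPT;		/* could not pick up this connection */
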
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 3065fb8ef91b..a7aa70370913 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -42,14 +42,65 @@ static unsigned int *udp_get_timeouts(struct net *net)
42 return udp_pernet(net)->timeouts; 42 return udp_pernet(net)->timeouts;
43} 43}
44 44
45static void udp_error_log(const struct sk_buff *skb,
46 const struct nf_hook_state *state,
47 const char *msg)
48{
49 nf_l4proto_log_invalid(skb, state->net, state->pf,
50 IPPROTO_UDP, "%s", msg);
51}
52
53static bool udp_error(struct sk_buff *skb,
54 unsigned int dataoff,
55 const struct nf_hook_state *state)
56{
57 unsigned int udplen = skb->len - dataoff;
58 const struct udphdr *hdr;
59 struct udphdr _hdr;
60
61 /* Header is too small? */
62 hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
63 if (!hdr) {
64 udp_error_log(skb, state, "short packet");
65 return true;
66 }
67
68 /* Truncated/malformed packets */
69 if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
70 udp_error_log(skb, state, "truncated/malformed packet");
71 return true;
72 }
73
74 /* Packet with no checksum */
75 if (!hdr->check)
76 return false;
77
78 /* Checksum invalid? Ignore.
79 * We skip checking packets on the outgoing path
80 * because the checksum is assumed to be correct.
81 * FIXME: Source route IP option packets --RR */
82 if (state->hook == NF_INET_PRE_ROUTING &&
83 state->net->ct.sysctl_checksum &&
84 nf_checksum(skb, state->hook, dataoff, IPPROTO_UDP, state->pf)) {
85 udp_error_log(skb, state, "bad checksum");
86 return true;
87 }
88
89 return false;
90}
91
45/* Returns verdict for packet, and may modify conntrack type */ 92/* Returns verdict for packet, and may modify conntrack type */
46static int udp_packet(struct nf_conn *ct, 93static int udp_packet(struct nf_conn *ct,
47 const struct sk_buff *skb, 94 struct sk_buff *skb,
48 unsigned int dataoff, 95 unsigned int dataoff,
49 enum ip_conntrack_info ctinfo) 96 enum ip_conntrack_info ctinfo,
97 const struct nf_hook_state *state)
50{ 98{
51 unsigned int *timeouts; 99 unsigned int *timeouts;
52 100
101 if (udp_error(skb, dataoff, state))
102 return -NF_ACCEPT;
103
53 timeouts = nf_ct_timeout_lookup(ct); 104 timeouts = nf_ct_timeout_lookup(ct);
54 if (!timeouts) 105 if (!timeouts)
55 timeouts = udp_get_timeouts(nf_ct_net(ct)); 106 timeouts = udp_get_timeouts(nf_ct_net(ct));
@@ -69,24 +120,18 @@ static int udp_packet(struct nf_conn *ct,
69 return NF_ACCEPT; 120 return NF_ACCEPT;
70} 121}
71 122
72/* Called when a new connection for this protocol found. */
73static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb,
74 unsigned int dataoff)
75{
76 return true;
77}
78
79#ifdef CONFIG_NF_CT_PROTO_UDPLITE 123#ifdef CONFIG_NF_CT_PROTO_UDPLITE
80static void udplite_error_log(const struct sk_buff *skb, struct net *net, 124static void udplite_error_log(const struct sk_buff *skb,
81 u8 pf, const char *msg) 125 const struct nf_hook_state *state,
126 const char *msg)
82{ 127{
83 nf_l4proto_log_invalid(skb, net, pf, IPPROTO_UDPLITE, "%s", msg); 128 nf_l4proto_log_invalid(skb, state->net, state->pf,
129 IPPROTO_UDPLITE, "%s", msg);
84} 130}
85 131
86static int udplite_error(struct net *net, struct nf_conn *tmpl, 132static bool udplite_error(struct sk_buff *skb,
87 struct sk_buff *skb, 133 unsigned int dataoff,
88 unsigned int dataoff, 134 const struct nf_hook_state *state)
89 u8 pf, unsigned int hooknum)
90{ 135{
91 unsigned int udplen = skb->len - dataoff; 136 unsigned int udplen = skb->len - dataoff;
92 const struct udphdr *hdr; 137 const struct udphdr *hdr;
@@ -96,80 +141,67 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
96 /* Header is too small? */ 141 /* Header is too small? */
97 hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); 142 hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
98 if (!hdr) { 143 if (!hdr) {
99 udplite_error_log(skb, net, pf, "short packet"); 144 udplite_error_log(skb, state, "short packet");
100 return -NF_ACCEPT; 145 return true;
101 } 146 }
102 147
103 cscov = ntohs(hdr->len); 148 cscov = ntohs(hdr->len);
104 if (cscov == 0) { 149 if (cscov == 0) {
105 cscov = udplen; 150 cscov = udplen;
106 } else if (cscov < sizeof(*hdr) || cscov > udplen) { 151 } else if (cscov < sizeof(*hdr) || cscov > udplen) {
107 udplite_error_log(skb, net, pf, "invalid checksum coverage"); 152 udplite_error_log(skb, state, "invalid checksum coverage");
108 return -NF_ACCEPT; 153 return true;
109 } 154 }
110 155
111 /* UDPLITE mandates checksums */ 156 /* UDPLITE mandates checksums */
112 if (!hdr->check) { 157 if (!hdr->check) {
113 udplite_error_log(skb, net, pf, "checksum missing"); 158 udplite_error_log(skb, state, "checksum missing");
114 return -NF_ACCEPT; 159 return true;
115 } 160 }
116 161
117 /* Checksum invalid? Ignore. */ 162 /* Checksum invalid? Ignore. */
118 if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && 163 if (state->hook == NF_INET_PRE_ROUTING &&
119 nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP, 164 state->net->ct.sysctl_checksum &&
120 pf)) { 165 nf_checksum_partial(skb, state->hook, dataoff, cscov, IPPROTO_UDP,
121 udplite_error_log(skb, net, pf, "bad checksum"); 166 state->pf)) {
122 return -NF_ACCEPT; 167 udplite_error_log(skb, state, "bad checksum");
168 return true;
123 } 169 }
124 170
125 return NF_ACCEPT; 171 return false;
126}
127#endif
128
129static void udp_error_log(const struct sk_buff *skb, struct net *net,
130 u8 pf, const char *msg)
131{
132 nf_l4proto_log_invalid(skb, net, pf, IPPROTO_UDP, "%s", msg);
133} 172}
134 173
135static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, 174/* Returns verdict for packet, and may modify conntrack type */
136 unsigned int dataoff, 175static int udplite_packet(struct nf_conn *ct,
137 u_int8_t pf, 176 struct sk_buff *skb,
138 unsigned int hooknum) 177 unsigned int dataoff,
178 enum ip_conntrack_info ctinfo,
179 const struct nf_hook_state *state)
139{ 180{
140 unsigned int udplen = skb->len - dataoff; 181 unsigned int *timeouts;
141 const struct udphdr *hdr;
142 struct udphdr _hdr;
143
144 /* Header is too small? */
145 hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
146 if (hdr == NULL) {
147 udp_error_log(skb, net, pf, "short packet");
148 return -NF_ACCEPT;
149 }
150 182
151 /* Truncated/malformed packets */ 183 if (udplite_error(skb, dataoff, state))
152 if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
153 udp_error_log(skb, net, pf, "truncated/malformed packet");
154 return -NF_ACCEPT; 184 return -NF_ACCEPT;
155 }
156 185
157 /* Packet with no checksum */ 186 timeouts = nf_ct_timeout_lookup(ct);
158 if (!hdr->check) 187 if (!timeouts)
159 return NF_ACCEPT; 188 timeouts = udp_get_timeouts(nf_ct_net(ct));
160 189
161 /* Checksum invalid? Ignore. 190 /* If we've seen traffic both ways, this is some kind of UDP
162 * We skip checking packets on the outgoing path 191 stream. Extend timeout. */
163 * because the checksum is assumed to be correct. 192 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
164 * FIXME: Source route IP option packets --RR */ 193 nf_ct_refresh_acct(ct, ctinfo, skb,
165 if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && 194 timeouts[UDP_CT_REPLIED]);
166 nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { 195 /* Also, more likely to be important, and not a probe */
167 udp_error_log(skb, net, pf, "bad checksum"); 196 if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
168 return -NF_ACCEPT; 197 nf_conntrack_event_cache(IPCT_ASSURED, ct);
198 } else {
199 nf_ct_refresh_acct(ct, ctinfo, skb,
200 timeouts[UDP_CT_UNREPLIED]);
169 } 201 }
170
171 return NF_ACCEPT; 202 return NF_ACCEPT;
172} 203}
204#endif
173 205
174#ifdef CONFIG_NF_CONNTRACK_TIMEOUT 206#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
175 207
@@ -258,7 +290,7 @@ static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn,
258 return 0; 290 return 0;
259} 291}
260 292
261static int udp_init_net(struct net *net, u_int16_t proto) 293static int udp_init_net(struct net *net)
262{ 294{
263 struct nf_udp_net *un = udp_pernet(net); 295 struct nf_udp_net *un = udp_pernet(net);
264 struct nf_proto_net *pn = &un->pn; 296 struct nf_proto_net *pn = &un->pn;
@@ -278,72 +310,11 @@ static struct nf_proto_net *udp_get_net_proto(struct net *net)
278 return &net->ct.nf_ct_proto.udp.pn; 310 return &net->ct.nf_ct_proto.udp.pn;
279} 311}
280 312
281const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 = 313const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp =
282{
283 .l3proto = PF_INET,
284 .l4proto = IPPROTO_UDP,
285 .allow_clash = true,
286 .packet = udp_packet,
287 .new = udp_new,
288 .error = udp_error,
289#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
290 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
291 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
292 .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
293 .nla_policy = nf_ct_port_nla_policy,
294#endif
295#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
296 .ctnl_timeout = {
297 .nlattr_to_obj = udp_timeout_nlattr_to_obj,
298 .obj_to_nlattr = udp_timeout_obj_to_nlattr,
299 .nlattr_max = CTA_TIMEOUT_UDP_MAX,
300 .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
301 .nla_policy = udp_timeout_nla_policy,
302 },
303#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
304 .init_net = udp_init_net,
305 .get_net_proto = udp_get_net_proto,
306};
307EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4);
308
309#ifdef CONFIG_NF_CT_PROTO_UDPLITE
310const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 =
311{
312 .l3proto = PF_INET,
313 .l4proto = IPPROTO_UDPLITE,
314 .allow_clash = true,
315 .packet = udp_packet,
316 .new = udp_new,
317 .error = udplite_error,
318#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
319 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
320 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
321 .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
322 .nla_policy = nf_ct_port_nla_policy,
323#endif
324#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
325 .ctnl_timeout = {
326 .nlattr_to_obj = udp_timeout_nlattr_to_obj,
327 .obj_to_nlattr = udp_timeout_obj_to_nlattr,
328 .nlattr_max = CTA_TIMEOUT_UDP_MAX,
329 .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
330 .nla_policy = udp_timeout_nla_policy,
331 },
332#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
333 .init_net = udp_init_net,
334 .get_net_proto = udp_get_net_proto,
335};
336EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4);
337#endif
338
339const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 =
340{ 314{
341 .l3proto = PF_INET6,
342 .l4proto = IPPROTO_UDP, 315 .l4proto = IPPROTO_UDP,
343 .allow_clash = true, 316 .allow_clash = true,
344 .packet = udp_packet, 317 .packet = udp_packet,
345 .new = udp_new,
346 .error = udp_error,
347#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 318#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
348 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, 319 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
349 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, 320 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
@@ -362,17 +333,13 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 =
362 .init_net = udp_init_net, 333 .init_net = udp_init_net,
363 .get_net_proto = udp_get_net_proto, 334 .get_net_proto = udp_get_net_proto,
364}; 335};
365EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6);
366 336
367#ifdef CONFIG_NF_CT_PROTO_UDPLITE 337#ifdef CONFIG_NF_CT_PROTO_UDPLITE
368const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 = 338const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite =
369{ 339{
370 .l3proto = PF_INET6,
371 .l4proto = IPPROTO_UDPLITE, 340 .l4proto = IPPROTO_UDPLITE,
372 .allow_clash = true, 341 .allow_clash = true,
373 .packet = udp_packet, 342 .packet = udplite_packet,
374 .new = udp_new,
375 .error = udplite_error,
376#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 343#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
377 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, 344 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
378 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, 345 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
@@ -391,5 +358,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 =
391 .init_net = udp_init_net, 358 .init_net = udp_init_net,
392 .get_net_proto = udp_get_net_proto, 359 .get_net_proto = udp_get_net_proto,
393}; 360};
394EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6);
395#endif 361#endif
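The rework above collapses the per-l3proto UDP/UDPLITE trackers into single nf_conntrack_l4proto_udp / nf_conntrack_l4proto_udplite definitions and folds the old ->new()/->error() callbacks into ->packet(), which now takes one const struct nf_hook_state * instead of separate net/pf/hooknum arguments. A minimal sketch of the resulting handler shape, assuming it sits in this file (udp_get_timeouts() is the file's static helper; example_udp_packet is an illustrative name, not part of the patch):

static int example_udp_packet(struct nf_conn *ct, struct sk_buff *skb,
			      unsigned int dataoff,
			      enum ip_conntrack_info ctinfo,
			      const struct nf_hook_state *state)
{
	unsigned int *timeouts = nf_ct_timeout_lookup(ct);

	if (!timeouts)
		timeouts = udp_get_timeouts(nf_ct_net(ct));

	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
		/* Traffic seen both ways: extend timeout, mark assured. */
		nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_REPLIED]);
		if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
			nf_conntrack_event_cache(IPCT_ASSURED, ct);
	} else {
		nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]);
	}

	return NF_ACCEPT;
}

The sanity checks that used to live in ->error() (short header, bad checksum coverage, missing checksum) become a boolean helper called at the top of ->packet(), returning -NF_ACCEPT when it trips, as udplite_packet() does in the hunk above.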
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 13279f683da9..463d17d349c1 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -292,7 +292,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
292 if (!net_eq(nf_ct_net(ct), net)) 292 if (!net_eq(nf_ct_net(ct), net))
293 goto release; 293 goto release;
294 294
295 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); 295 l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
296 WARN_ON(!l4proto); 296 WARN_ON(!l4proto);
297 297
298 ret = -ENOSPC; 298 ret = -ENOSPC;
@@ -720,10 +720,3 @@ static void __exit nf_conntrack_standalone_fini(void)
720 720
721module_init(nf_conntrack_standalone_init); 721module_init(nf_conntrack_standalone_init);
722module_exit(nf_conntrack_standalone_fini); 722module_exit(nf_conntrack_standalone_fini);
723
724/* Some modules need us, but don't depend directly on any symbol.
725 They should call this. */
726void need_conntrack(void)
727{
728}
729EXPORT_SYMBOL_GPL(need_conntrack);
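With the l3proto dimension removed from the tracker API, ct_seq_show() above (and the flowtable, cttimeout and nft_ct call sites later in this diff) look the handler up by layer-4 protocol number alone. A hedged sketch of the single-argument lookup as it would be used from conntrack code:

static void example_show_l4proto(const struct nf_conn *ct)
{
	const struct nf_conntrack_l4proto *l4proto;

	/* keyed only by the l4 protocol number after this series */
	l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (WARN_ON(!l4proto))
		return;

	pr_debug("conntrack uses l4 protocol %u\n", l4proto->l4proto);
}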
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index d8125616edc7..b7a4816add76 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -120,7 +120,7 @@ static void flow_offload_fixup_ct_state(struct nf_conn *ct)
120 if (l4num == IPPROTO_TCP) 120 if (l4num == IPPROTO_TCP)
121 flow_offload_fixup_tcp(&ct->proto.tcp); 121 flow_offload_fixup_tcp(&ct->proto.tcp);
122 122
123 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), l4num); 123 l4proto = __nf_ct_l4proto_find(l4num);
124 if (!l4proto) 124 if (!l4proto)
125 return; 125 return;
126 126
@@ -233,8 +233,8 @@ flow_offload_lookup(struct nf_flowtable *flow_table,
233 struct flow_offload *flow; 233 struct flow_offload *flow;
234 int dir; 234 int dir;
235 235
236 tuplehash = rhashtable_lookup_fast(&flow_table->rhashtable, tuple, 236 tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple,
237 nf_flow_offload_rhash_params); 237 nf_flow_offload_rhash_params);
238 if (!tuplehash) 238 if (!tuplehash)
239 return NULL; 239 return NULL;
240 240
@@ -254,20 +254,17 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table,
254 struct flow_offload_tuple_rhash *tuplehash; 254 struct flow_offload_tuple_rhash *tuplehash;
255 struct rhashtable_iter hti; 255 struct rhashtable_iter hti;
256 struct flow_offload *flow; 256 struct flow_offload *flow;
257 int err; 257 int err = 0;
258
259 err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
260 if (err)
261 return err;
262 258
259 rhashtable_walk_enter(&flow_table->rhashtable, &hti);
263 rhashtable_walk_start(&hti); 260 rhashtable_walk_start(&hti);
264 261
265 while ((tuplehash = rhashtable_walk_next(&hti))) { 262 while ((tuplehash = rhashtable_walk_next(&hti))) {
266 if (IS_ERR(tuplehash)) { 263 if (IS_ERR(tuplehash)) {
267 err = PTR_ERR(tuplehash); 264 if (PTR_ERR(tuplehash) != -EAGAIN) {
268 if (err != -EAGAIN) 265 err = PTR_ERR(tuplehash);
269 goto out; 266 break;
270 267 }
271 continue; 268 continue;
272 } 269 }
273 if (tuplehash->tuple.dir) 270 if (tuplehash->tuple.dir)
@@ -277,7 +274,6 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table,
277 274
278 iter(flow, data); 275 iter(flow, data);
279 } 276 }
280out:
281 rhashtable_walk_stop(&hti); 277 rhashtable_walk_stop(&hti);
282 rhashtable_walk_exit(&hti); 278 rhashtable_walk_exit(&hti);
283 279
@@ -290,25 +286,19 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow)
290 return (__s32)(flow->timeout - (u32)jiffies) <= 0; 286 return (__s32)(flow->timeout - (u32)jiffies) <= 0;
291} 287}
292 288
293static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table) 289static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
294{ 290{
295 struct flow_offload_tuple_rhash *tuplehash; 291 struct flow_offload_tuple_rhash *tuplehash;
296 struct rhashtable_iter hti; 292 struct rhashtable_iter hti;
297 struct flow_offload *flow; 293 struct flow_offload *flow;
298 int err;
299
300 err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
301 if (err)
302 return 0;
303 294
295 rhashtable_walk_enter(&flow_table->rhashtable, &hti);
304 rhashtable_walk_start(&hti); 296 rhashtable_walk_start(&hti);
305 297
306 while ((tuplehash = rhashtable_walk_next(&hti))) { 298 while ((tuplehash = rhashtable_walk_next(&hti))) {
307 if (IS_ERR(tuplehash)) { 299 if (IS_ERR(tuplehash)) {
308 err = PTR_ERR(tuplehash); 300 if (PTR_ERR(tuplehash) != -EAGAIN)
309 if (err != -EAGAIN) 301 break;
310 goto out;
311
312 continue; 302 continue;
313 } 303 }
314 if (tuplehash->tuple.dir) 304 if (tuplehash->tuple.dir)
@@ -321,11 +311,8 @@ static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
321 FLOW_OFFLOAD_TEARDOWN))) 311 FLOW_OFFLOAD_TEARDOWN)))
322 flow_offload_del(flow_table, flow); 312 flow_offload_del(flow_table, flow);
323 } 313 }
324out:
325 rhashtable_walk_stop(&hti); 314 rhashtable_walk_stop(&hti);
326 rhashtable_walk_exit(&hti); 315 rhashtable_walk_exit(&hti);
327
328 return 1;
329} 316}
330 317
331static void nf_flow_offload_work_gc(struct work_struct *work) 318static void nf_flow_offload_work_gc(struct work_struct *work)
@@ -478,14 +465,17 @@ EXPORT_SYMBOL_GPL(nf_flow_table_init);
478static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) 465static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
479{ 466{
480 struct net_device *dev = data; 467 struct net_device *dev = data;
468 struct flow_offload_entry *e;
469
470 e = container_of(flow, struct flow_offload_entry, flow);
481 471
482 if (!dev) { 472 if (!dev) {
483 flow_offload_teardown(flow); 473 flow_offload_teardown(flow);
484 return; 474 return;
485 } 475 }
486 476 if (net_eq(nf_ct_net(e->ct), dev_net(dev)) &&
487 if (flow->tuplehash[0].tuple.iifidx == dev->ifindex || 477 (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
488 flow->tuplehash[1].tuple.iifidx == dev->ifindex) 478 flow->tuplehash[1].tuple.iifidx == dev->ifindex))
489 flow_offload_dead(flow); 479 flow_offload_dead(flow);
490} 480}
491 481
@@ -496,7 +486,7 @@ static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
496 flush_delayed_work(&flowtable->gc_work); 486 flush_delayed_work(&flowtable->gc_work);
497} 487}
498 488
499void nf_flow_table_cleanup(struct net *net, struct net_device *dev) 489void nf_flow_table_cleanup(struct net_device *dev)
500{ 490{
501 struct nf_flowtable *flowtable; 491 struct nf_flowtable *flowtable;
502 492
@@ -514,7 +504,7 @@ void nf_flow_table_free(struct nf_flowtable *flow_table)
514 mutex_unlock(&flowtable_lock); 504 mutex_unlock(&flowtable_lock);
515 cancel_delayed_work_sync(&flow_table->gc_work); 505 cancel_delayed_work_sync(&flow_table->gc_work);
516 nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); 506 nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
517 WARN_ON(!nf_flow_offload_gc_step(flow_table)); 507 nf_flow_offload_gc_step(flow_table);
518 rhashtable_destroy(&flow_table->rhashtable); 508 rhashtable_destroy(&flow_table->rhashtable);
519} 509}
520EXPORT_SYMBOL_GPL(nf_flow_table_free); 510EXPORT_SYMBOL_GPL(nf_flow_table_free);
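The flowtable walkers above switch from rhashtable_walk_init(), which could fail because it allocates, to rhashtable_walk_enter(), which cannot fail; the early-return error path and the out: label therefore go away, and -EAGAIN from rhashtable_walk_next() is treated purely as "table resized, keep walking". A sketch of the resulting iteration pattern (generic object type, illustrative name):

static void example_walk(struct rhashtable *ht)
{
	struct rhashtable_iter hti;
	void *obj;

	rhashtable_walk_enter(ht, &hti);	/* never fails */
	rhashtable_walk_start(&hti);

	while ((obj = rhashtable_walk_next(&hti))) {
		if (IS_ERR(obj)) {
			if (PTR_ERR(obj) == -EAGAIN)
				continue;	/* concurrent resize, retry */
			break;
		}
		/* ... operate on obj ... */
	}

	rhashtable_walk_stop(&hti);
	rhashtable_walk_exit(&hti);
}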
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 15ed91309992..1d291a51cd45 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -254,8 +254,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
254 if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff)) 254 if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff))
255 return NF_ACCEPT; 255 return NF_ACCEPT;
256 256
257 if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && 257 if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0)
258 nf_flow_nat_ip(flow, skb, thoff, dir) < 0)
259 return NF_DROP; 258 return NF_DROP;
260 259
261 flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; 260 flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
@@ -471,8 +470,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
471 if (skb_try_make_writable(skb, sizeof(*ip6h))) 470 if (skb_try_make_writable(skb, sizeof(*ip6h)))
472 return NF_DROP; 471 return NF_DROP;
473 472
474 if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && 473 if (nf_flow_nat_ipv6(flow, skb, dir) < 0)
475 nf_flow_nat_ipv6(flow, skb, dir) < 0)
476 return NF_DROP; 474 return NF_DROP;
477 475
478 flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; 476 flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 99606baedda4..38793b95d9bc 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -37,7 +37,7 @@ static void mangle_contents(struct sk_buff *skb,
37{ 37{
38 unsigned char *data; 38 unsigned char *data;
39 39
40 BUG_ON(skb_is_nonlinear(skb)); 40 SKB_LINEAR_ASSERT(skb);
41 data = skb_network_header(skb) + dataoff; 41 data = skb_network_header(skb) + dataoff;
42 42
43 /* move post-replacement */ 43 /* move post-replacement */
@@ -110,8 +110,6 @@ bool __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
110 !enlarge_skb(skb, rep_len - match_len)) 110 !enlarge_skb(skb, rep_len - match_len))
111 return false; 111 return false;
112 112
113 SKB_LINEAR_ASSERT(skb);
114
115 tcph = (void *)skb->data + protoff; 113 tcph = (void *)skb->data + protoff;
116 114
117 oldlen = skb->len - protoff; 115 oldlen = skb->len - protoff;
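The mangle_contents() change above is cosmetic: it uses the existing skbuff helper instead of an open-coded BUG_ON, and the now-redundant second assertion in __nf_nat_mangle_tcp_packet() is dropped. For reference, the macro amounts to the following (per include/linux/skbuff.h):

#define SKB_LINEAR_ASSERT(skb)	BUG_ON(skb_is_nonlinear(skb))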
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index adee04af8d43..78a9e6454ff3 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -52,13 +52,11 @@ nf_nat_redirect_ipv4(struct sk_buff *skb,
52 52
53 newdst = 0; 53 newdst = 0;
54 54
55 rcu_read_lock();
56 indev = __in_dev_get_rcu(skb->dev); 55 indev = __in_dev_get_rcu(skb->dev);
57 if (indev && indev->ifa_list) { 56 if (indev && indev->ifa_list) {
58 ifa = indev->ifa_list; 57 ifa = indev->ifa_list;
59 newdst = ifa->ifa_local; 58 newdst = ifa->ifa_local;
60 } 59 }
61 rcu_read_unlock();
62 60
63 if (!newdst) 61 if (!newdst)
64 return NF_DROP; 62 return NF_DROP;
@@ -97,7 +95,6 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
97 struct inet6_ifaddr *ifa; 95 struct inet6_ifaddr *ifa;
98 bool addr = false; 96 bool addr = false;
99 97
100 rcu_read_lock();
101 idev = __in6_dev_get(skb->dev); 98 idev = __in6_dev_get(skb->dev);
102 if (idev != NULL) { 99 if (idev != NULL) {
103 read_lock_bh(&idev->lock); 100 read_lock_bh(&idev->lock);
@@ -108,7 +105,6 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
108 } 105 }
109 read_unlock_bh(&idev->lock); 106 read_unlock_bh(&idev->lock);
110 } 107 }
111 rcu_read_unlock();
112 108
113 if (!addr) 109 if (!addr)
114 return NF_DROP; 110 return NF_DROP;
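The rcu_read_lock()/rcu_read_unlock() pairs removed above were redundant: these redirect helpers only run from netfilter hook context, which is already inside an RCU read-side critical section. A hedged sketch of the IPv4 address pick with that precondition made explicit (illustrative name; the WARN_ON_ONCE is not part of the patch):

static __be32 example_redirect_addr(const struct sk_buff *skb)
{
	const struct in_device *indev;
	__be32 newdst = 0;

	WARN_ON_ONCE(!rcu_read_lock_held());	/* hook context guarantee */

	indev = __in_dev_get_rcu(skb->dev);
	if (indev && indev->ifa_list)
		newdst = indev->ifa_list->ifa_local;

	return newdst;
}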
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 2cfb173cd0b2..42487d01a3ed 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -27,6 +27,8 @@
27static LIST_HEAD(nf_tables_expressions); 27static LIST_HEAD(nf_tables_expressions);
28static LIST_HEAD(nf_tables_objects); 28static LIST_HEAD(nf_tables_objects);
29static LIST_HEAD(nf_tables_flowtables); 29static LIST_HEAD(nf_tables_flowtables);
30static LIST_HEAD(nf_tables_destroy_list);
31static DEFINE_SPINLOCK(nf_tables_destroy_list_lock);
30static u64 table_handle; 32static u64 table_handle;
31 33
32enum { 34enum {
@@ -64,6 +66,8 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state)
64 66
65 net->nft.validate_state = new_validate_state; 67 net->nft.validate_state = new_validate_state;
66} 68}
69static void nf_tables_trans_destroy_work(struct work_struct *w);
70static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work);
67 71
68static void nft_ctx_init(struct nft_ctx *ctx, 72static void nft_ctx_init(struct nft_ctx *ctx,
69 struct net *net, 73 struct net *net,
@@ -207,6 +211,18 @@ static int nft_delchain(struct nft_ctx *ctx)
207 return err; 211 return err;
208} 212}
209 213
214/* either expr ops provide both activate/deactivate, or neither */
215static bool nft_expr_check_ops(const struct nft_expr_ops *ops)
216{
217 if (!ops)
218 return true;
219
220 if (WARN_ON_ONCE((!ops->activate ^ !ops->deactivate)))
221 return false;
222
223 return true;
224}
225
210static void nft_rule_expr_activate(const struct nft_ctx *ctx, 226static void nft_rule_expr_activate(const struct nft_ctx *ctx,
211 struct nft_rule *rule) 227 struct nft_rule *rule)
212{ 228{
@@ -298,7 +314,7 @@ static int nft_delrule_by_chain(struct nft_ctx *ctx)
298 return 0; 314 return 0;
299} 315}
300 316
301static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, 317static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
302 struct nft_set *set) 318 struct nft_set *set)
303{ 319{
304 struct nft_trans *trans; 320 struct nft_trans *trans;
@@ -318,7 +334,7 @@ static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type,
318 return 0; 334 return 0;
319} 335}
320 336
321static int nft_delset(struct nft_ctx *ctx, struct nft_set *set) 337static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set)
322{ 338{
323 int err; 339 int err;
324 340
@@ -1005,7 +1021,8 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
1005 1021
1006static void nf_tables_table_destroy(struct nft_ctx *ctx) 1022static void nf_tables_table_destroy(struct nft_ctx *ctx)
1007{ 1023{
1008 BUG_ON(ctx->table->use > 0); 1024 if (WARN_ON(ctx->table->use > 0))
1025 return;
1009 1026
1010 rhltable_destroy(&ctx->table->chains_ht); 1027 rhltable_destroy(&ctx->table->chains_ht);
1011 kfree(ctx->table->name); 1028 kfree(ctx->table->name);
@@ -1412,7 +1429,8 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx)
1412{ 1429{
1413 struct nft_chain *chain = ctx->chain; 1430 struct nft_chain *chain = ctx->chain;
1414 1431
1415 BUG_ON(chain->use > 0); 1432 if (WARN_ON(chain->use > 0))
1433 return;
1416 1434
1417 /* no concurrent access possible anymore */ 1435 /* no concurrent access possible anymore */
1418 nf_tables_chain_free_chain_rules(chain); 1436 nf_tables_chain_free_chain_rules(chain);
@@ -1907,6 +1925,9 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
1907 */ 1925 */
1908int nft_register_expr(struct nft_expr_type *type) 1926int nft_register_expr(struct nft_expr_type *type)
1909{ 1927{
1928 if (!nft_expr_check_ops(type->ops))
1929 return -EINVAL;
1930
1910 nfnl_lock(NFNL_SUBSYS_NFTABLES); 1931 nfnl_lock(NFNL_SUBSYS_NFTABLES);
1911 if (type->family == NFPROTO_UNSPEC) 1932 if (type->family == NFPROTO_UNSPEC)
1912 list_add_tail_rcu(&type->list, &nf_tables_expressions); 1933 list_add_tail_rcu(&type->list, &nf_tables_expressions);
@@ -2054,6 +2075,10 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx,
2054 err = PTR_ERR(ops); 2075 err = PTR_ERR(ops);
2055 goto err1; 2076 goto err1;
2056 } 2077 }
2078 if (!nft_expr_check_ops(ops)) {
2079 err = -EINVAL;
2080 goto err1;
2081 }
2057 } else 2082 } else
2058 ops = type->ops; 2083 ops = type->ops;
2059 2084
@@ -2434,7 +2459,6 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
2434{ 2459{
2435 struct nft_expr *expr; 2460 struct nft_expr *expr;
2436 2461
2437 lockdep_assert_held(&ctx->net->nft.commit_mutex);
2438 /* 2462 /*
2439 * Careful: some expressions might not be initialized in case this 2463 * Careful: some expressions might not be initialized in case this
2440 * is called on error from nf_tables_newrule(). 2464 * is called on error from nf_tables_newrule().
@@ -3567,13 +3591,6 @@ static void nft_set_destroy(struct nft_set *set)
3567 kvfree(set); 3591 kvfree(set);
3568} 3592}
3569 3593
3570static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
3571{
3572 list_del_rcu(&set->list);
3573 nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC);
3574 nft_set_destroy(set);
3575}
3576
3577static int nf_tables_delset(struct net *net, struct sock *nlsk, 3594static int nf_tables_delset(struct net *net, struct sock *nlsk,
3578 struct sk_buff *skb, const struct nlmsghdr *nlh, 3595 struct sk_buff *skb, const struct nlmsghdr *nlh,
3579 const struct nlattr * const nla[], 3596 const struct nlattr * const nla[],
@@ -3668,17 +3685,38 @@ bind:
3668} 3685}
3669EXPORT_SYMBOL_GPL(nf_tables_bind_set); 3686EXPORT_SYMBOL_GPL(nf_tables_bind_set);
3670 3687
3671void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, 3688void nf_tables_rebind_set(const struct nft_ctx *ctx, struct nft_set *set,
3672 struct nft_set_binding *binding) 3689 struct nft_set_binding *binding)
3673{ 3690{
3691 if (list_empty(&set->bindings) && nft_set_is_anonymous(set) &&
3692 nft_is_active(ctx->net, set))
3693 list_add_tail_rcu(&set->list, &ctx->table->sets);
3694
3695 list_add_tail_rcu(&binding->list, &set->bindings);
3696}
3697EXPORT_SYMBOL_GPL(nf_tables_rebind_set);
3698
3699void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
3700 struct nft_set_binding *binding)
3701{
3674 list_del_rcu(&binding->list); 3702 list_del_rcu(&binding->list);
3675 3703
3676 if (list_empty(&set->bindings) && nft_set_is_anonymous(set) && 3704 if (list_empty(&set->bindings) && nft_set_is_anonymous(set) &&
3677 nft_is_active(ctx->net, set)) 3705 nft_is_active(ctx->net, set))
3678 nf_tables_set_destroy(ctx, set); 3706 list_del_rcu(&set->list);
3679} 3707}
3680EXPORT_SYMBOL_GPL(nf_tables_unbind_set); 3708EXPORT_SYMBOL_GPL(nf_tables_unbind_set);
3681 3709
3710void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set)
3711{
3712 if (list_empty(&set->bindings) && nft_set_is_anonymous(set) &&
3713 nft_is_active(ctx->net, set)) {
3714 nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC);
3715 nft_set_destroy(set);
3716 }
3717}
3718EXPORT_SYMBOL_GPL(nf_tables_destroy_set);
3719
3682const struct nft_set_ext_type nft_set_ext_types[] = { 3720const struct nft_set_ext_type nft_set_ext_types[] = {
3683 [NFT_SET_EXT_KEY] = { 3721 [NFT_SET_EXT_KEY] = {
3684 .align = __alignof__(u32), 3722 .align = __alignof__(u32),
@@ -6191,19 +6229,28 @@ static void nft_commit_release(struct nft_trans *trans)
6191 nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); 6229 nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
6192 break; 6230 break;
6193 } 6231 }
6232
6233 if (trans->put_net)
6234 put_net(trans->ctx.net);
6235
6194 kfree(trans); 6236 kfree(trans);
6195} 6237}
6196 6238
6197static void nf_tables_commit_release(struct net *net) 6239static void nf_tables_trans_destroy_work(struct work_struct *w)
6198{ 6240{
6199 struct nft_trans *trans, *next; 6241 struct nft_trans *trans, *next;
6242 LIST_HEAD(head);
6200 6243
6201 if (list_empty(&net->nft.commit_list)) 6244 spin_lock(&nf_tables_destroy_list_lock);
6245 list_splice_init(&nf_tables_destroy_list, &head);
6246 spin_unlock(&nf_tables_destroy_list_lock);
6247
6248 if (list_empty(&head))
6202 return; 6249 return;
6203 6250
6204 synchronize_rcu(); 6251 synchronize_rcu();
6205 6252
6206 list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { 6253 list_for_each_entry_safe(trans, next, &head, list) {
6207 list_del(&trans->list); 6254 list_del(&trans->list);
6208 nft_commit_release(trans); 6255 nft_commit_release(trans);
6209 } 6256 }
@@ -6334,6 +6381,37 @@ static void nft_chain_del(struct nft_chain *chain)
6334 list_del_rcu(&chain->list); 6381 list_del_rcu(&chain->list);
6335} 6382}
6336 6383
6384static void nf_tables_commit_release(struct net *net)
6385{
6386 struct nft_trans *trans;
6387
6388 /* all side effects have to be made visible.
6389 * For example, if a chain named 'foo' has been deleted, a
6390 * new transaction must not find it anymore.
6391 *
6392 * Memory reclaim happens asynchronously from work queue
6393 * to prevent expensive synchronize_rcu() in commit phase.
6394 */
6395 if (list_empty(&net->nft.commit_list)) {
6396 mutex_unlock(&net->nft.commit_mutex);
6397 return;
6398 }
6399
6400 trans = list_last_entry(&net->nft.commit_list,
6401 struct nft_trans, list);
6402 get_net(trans->ctx.net);
6403 WARN_ON_ONCE(trans->put_net);
6404
6405 trans->put_net = true;
6406 spin_lock(&nf_tables_destroy_list_lock);
6407 list_splice_tail_init(&net->nft.commit_list, &nf_tables_destroy_list);
6408 spin_unlock(&nf_tables_destroy_list_lock);
6409
6410 mutex_unlock(&net->nft.commit_mutex);
6411
6412 schedule_work(&trans_destroy_work);
6413}
6414
6337static int nf_tables_commit(struct net *net, struct sk_buff *skb) 6415static int nf_tables_commit(struct net *net, struct sk_buff *skb)
6338{ 6416{
6339 struct nft_trans *trans, *next; 6417 struct nft_trans *trans, *next;
@@ -6495,9 +6573,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
6495 } 6573 }
6496 } 6574 }
6497 6575
6498 nf_tables_commit_release(net);
6499 nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); 6576 nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
6500 mutex_unlock(&net->nft.commit_mutex); 6577 nf_tables_commit_release(net);
6501 6578
6502 return 0; 6579 return 0;
6503} 6580}
@@ -7168,7 +7245,8 @@ int __nft_release_basechain(struct nft_ctx *ctx)
7168{ 7245{
7169 struct nft_rule *rule, *nr; 7246 struct nft_rule *rule, *nr;
7170 7247
7171 BUG_ON(!nft_is_base_chain(ctx->chain)); 7248 if (WARN_ON(!nft_is_base_chain(ctx->chain)))
7249 return 0;
7172 7250
7173 nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain); 7251 nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain);
7174 list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { 7252 list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) {
@@ -7202,9 +7280,6 @@ static void __nft_release_tables(struct net *net)
7202 7280
7203 list_for_each_entry(chain, &table->chains, list) 7281 list_for_each_entry(chain, &table->chains, list)
7204 nf_tables_unregister_hook(net, table, chain); 7282 nf_tables_unregister_hook(net, table, chain);
7205 list_for_each_entry(flowtable, &table->flowtables, list)
7206 nf_unregister_net_hooks(net, flowtable->ops,
7207 flowtable->ops_len);
7208 /* No packets are walking on these chains anymore. */ 7283 /* No packets are walking on these chains anymore. */
7209 ctx.table = table; 7284 ctx.table = table;
7210 list_for_each_entry(chain, &table->chains, list) { 7285 list_for_each_entry(chain, &table->chains, list) {
@@ -7271,6 +7346,7 @@ static int __init nf_tables_module_init(void)
7271{ 7346{
7272 int err; 7347 int err;
7273 7348
7349 spin_lock_init(&nf_tables_destroy_list_lock);
7274 err = register_pernet_subsys(&nf_tables_net_ops); 7350 err = register_pernet_subsys(&nf_tables_net_ops);
7275 if (err < 0) 7351 if (err < 0)
7276 return err; 7352 return err;
@@ -7310,6 +7386,7 @@ static void __exit nf_tables_module_exit(void)
7310 unregister_netdevice_notifier(&nf_tables_flowtable_notifier); 7386 unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
7311 nft_chain_filter_fini(); 7387 nft_chain_filter_fini();
7312 unregister_pernet_subsys(&nf_tables_net_ops); 7388 unregister_pernet_subsys(&nf_tables_net_ops);
7389 cancel_work_sync(&trans_destroy_work);
7313 rcu_barrier(); 7390 rcu_barrier();
7314 nf_tables_core_module_exit(); 7391 nf_tables_core_module_exit();
7315} 7392}
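The nf_tables_api.c changes above move transaction teardown off the commit path: instead of calling synchronize_rcu() under the commit mutex, nf_tables_commit_release() pins the netns via get_net() on the last transaction, splices the commit list onto a global destroy list under a spinlock, drops the mutex and schedules a work item that does the grace-period wait and the actual freeing. A condensed sketch of that deferral pattern, with illustrative names:

static LIST_HEAD(example_destroy_list);
static DEFINE_SPINLOCK(example_destroy_lock);

static void example_destroy_work(struct work_struct *w)
{
	struct nft_trans *trans, *next;
	LIST_HEAD(head);

	spin_lock(&example_destroy_lock);
	list_splice_init(&example_destroy_list, &head);
	spin_unlock(&example_destroy_lock);

	if (list_empty(&head))
		return;

	synchronize_rcu();	/* no packet path still sees the old objects */

	list_for_each_entry_safe(trans, next, &head, list) {
		list_del(&trans->list);
		/* release whatever object the transaction carried, then
		 * put_net() on the entry that pinned the namespace */
	}
}
static DECLARE_WORK(example_work, example_destroy_work);

Because freeing can now outlive the commit call, module unload has to drain the worker, which is exactly what the cancel_work_sync() added to nf_tables_module_exit() does.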
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index ffd5c0f9412b..3fbce3b9c5ec 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -249,12 +249,24 @@ static struct nft_expr_type *nft_basic_types[] = {
249 &nft_exthdr_type, 249 &nft_exthdr_type,
250}; 250};
251 251
252static struct nft_object_type *nft_basic_objects[] = {
253#ifdef CONFIG_NETWORK_SECMARK
254 &nft_secmark_obj_type,
255#endif
256};
257
252int __init nf_tables_core_module_init(void) 258int __init nf_tables_core_module_init(void)
253{ 259{
254 int err, i; 260 int err, i, j = 0;
261
262 for (i = 0; i < ARRAY_SIZE(nft_basic_objects); i++) {
263 err = nft_register_obj(nft_basic_objects[i]);
264 if (err)
265 goto err;
266 }
255 267
256 for (i = 0; i < ARRAY_SIZE(nft_basic_types); i++) { 268 for (j = 0; j < ARRAY_SIZE(nft_basic_types); j++) {
257 err = nft_register_expr(nft_basic_types[i]); 269 err = nft_register_expr(nft_basic_types[j]);
258 if (err) 270 if (err)
259 goto err; 271 goto err;
260 } 272 }
@@ -262,8 +274,12 @@ int __init nf_tables_core_module_init(void)
262 return 0; 274 return 0;
263 275
264err: 276err:
277 while (j-- > 0)
278 nft_unregister_expr(nft_basic_types[j]);
279
265 while (i-- > 0) 280 while (i-- > 0)
266 nft_unregister_expr(nft_basic_types[i]); 281 nft_unregister_obj(nft_basic_objects[i]);
282
267 return err; 283 return err;
268} 284}
269 285
@@ -274,4 +290,8 @@ void nf_tables_core_module_exit(void)
274 i = ARRAY_SIZE(nft_basic_types); 290 i = ARRAY_SIZE(nft_basic_types);
275 while (i-- > 0) 291 while (i-- > 0)
276 nft_unregister_expr(nft_basic_types[i]); 292 nft_unregister_expr(nft_basic_types[i]);
293
294 i = ARRAY_SIZE(nft_basic_objects);
295 while (i-- > 0)
296 nft_unregister_obj(nft_basic_objects[i]);
277} 297}
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index a30f8ba4b89a..e7a50af1b3d6 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -53,9 +53,6 @@ ctnl_timeout_parse_policy(void *timeout,
53 struct nlattr **tb; 53 struct nlattr **tb;
54 int ret = 0; 54 int ret = 0;
55 55
56 if (!l4proto->ctnl_timeout.nlattr_to_obj)
57 return 0;
58
59 tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb), 56 tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb),
60 GFP_KERNEL); 57 GFP_KERNEL);
61 58
@@ -125,7 +122,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
125 return -EBUSY; 122 return -EBUSY;
126 } 123 }
127 124
128 l4proto = nf_ct_l4proto_find_get(l3num, l4num); 125 l4proto = nf_ct_l4proto_find_get(l4num);
129 126
130 /* This protocol is not supportted, skip. */ 127 /* This protocol is not supportted, skip. */
131 if (l4proto->l4proto != l4num) { 128 if (l4proto->l4proto != l4num) {
@@ -167,6 +164,8 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
167 struct nfgenmsg *nfmsg; 164 struct nfgenmsg *nfmsg;
168 unsigned int flags = portid ? NLM_F_MULTI : 0; 165 unsigned int flags = portid ? NLM_F_MULTI : 0;
169 const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto; 166 const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto;
167 struct nlattr *nest_parms;
168 int ret;
170 169
171 event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); 170 event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event);
172 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); 171 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
@@ -186,22 +185,15 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
186 htonl(refcount_read(&timeout->refcnt)))) 185 htonl(refcount_read(&timeout->refcnt))))
187 goto nla_put_failure; 186 goto nla_put_failure;
188 187
189 if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { 188 nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA | NLA_F_NESTED);
190 struct nlattr *nest_parms; 189 if (!nest_parms)
191 int ret; 190 goto nla_put_failure;
192
193 nest_parms = nla_nest_start(skb,
194 CTA_TIMEOUT_DATA | NLA_F_NESTED);
195 if (!nest_parms)
196 goto nla_put_failure;
197 191
198 ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, 192 ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->timeout.data);
199 &timeout->timeout.data); 193 if (ret < 0)
200 if (ret < 0) 194 goto nla_put_failure;
201 goto nla_put_failure;
202 195
203 nla_nest_end(skb, nest_parms); 196 nla_nest_end(skb, nest_parms);
204 }
205 197
206 nlmsg_end(skb, nlh); 198 nlmsg_end(skb, nlh);
207 return skb->len; 199 return skb->len;
@@ -358,7 +350,6 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl,
358 struct netlink_ext_ack *extack) 350 struct netlink_ext_ack *extack)
359{ 351{
360 const struct nf_conntrack_l4proto *l4proto; 352 const struct nf_conntrack_l4proto *l4proto;
361 __u16 l3num;
362 __u8 l4num; 353 __u8 l4num;
363 int ret; 354 int ret;
364 355
@@ -367,9 +358,8 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl,
367 !cda[CTA_TIMEOUT_DATA]) 358 !cda[CTA_TIMEOUT_DATA])
368 return -EINVAL; 359 return -EINVAL;
369 360
370 l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
371 l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); 361 l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
372 l4proto = nf_ct_l4proto_find_get(l3num, l4num); 362 l4proto = nf_ct_l4proto_find_get(l4num);
373 363
374 /* This protocol is not supported, skip. */ 364 /* This protocol is not supported, skip. */
375 if (l4proto->l4proto != l4num) { 365 if (l4proto->l4proto != l4num) {
@@ -391,12 +381,14 @@ err:
391 381
392static int 382static int
393cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, 383cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
394 u32 seq, u32 type, int event, 384 u32 seq, u32 type, int event, u16 l3num,
395 const struct nf_conntrack_l4proto *l4proto) 385 const struct nf_conntrack_l4proto *l4proto)
396{ 386{
397 struct nlmsghdr *nlh; 387 struct nlmsghdr *nlh;
398 struct nfgenmsg *nfmsg; 388 struct nfgenmsg *nfmsg;
399 unsigned int flags = portid ? NLM_F_MULTI : 0; 389 unsigned int flags = portid ? NLM_F_MULTI : 0;
390 struct nlattr *nest_parms;
391 int ret;
400 392
401 event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); 393 event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event);
402 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); 394 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
@@ -408,25 +400,19 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
408 nfmsg->version = NFNETLINK_V0; 400 nfmsg->version = NFNETLINK_V0;
409 nfmsg->res_id = 0; 401 nfmsg->res_id = 0;
410 402
411 if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l4proto->l3proto)) || 403 if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l3num)) ||
412 nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto)) 404 nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto))
413 goto nla_put_failure; 405 goto nla_put_failure;
414 406
415 if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { 407 nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA | NLA_F_NESTED);
416 struct nlattr *nest_parms; 408 if (!nest_parms)
417 int ret; 409 goto nla_put_failure;
418
419 nest_parms = nla_nest_start(skb,
420 CTA_TIMEOUT_DATA | NLA_F_NESTED);
421 if (!nest_parms)
422 goto nla_put_failure;
423 410
424 ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, NULL); 411 ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, NULL);
425 if (ret < 0) 412 if (ret < 0)
426 goto nla_put_failure; 413 goto nla_put_failure;
427 414
428 nla_nest_end(skb, nest_parms); 415 nla_nest_end(skb, nest_parms);
429 }
430 416
431 nlmsg_end(skb, nlh); 417 nlmsg_end(skb, nlh);
432 return skb->len; 418 return skb->len;
@@ -454,7 +440,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
454 440
455 l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); 441 l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
456 l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); 442 l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
457 l4proto = nf_ct_l4proto_find_get(l3num, l4num); 443 l4proto = nf_ct_l4proto_find_get(l4num);
458 444
459 /* This protocol is not supported, skip. */ 445 /* This protocol is not supported, skip. */
460 if (l4proto->l4proto != l4num) { 446 if (l4proto->l4proto != l4num) {
@@ -472,6 +458,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
472 nlh->nlmsg_seq, 458 nlh->nlmsg_seq,
473 NFNL_MSG_TYPE(nlh->nlmsg_type), 459 NFNL_MSG_TYPE(nlh->nlmsg_type),
474 IPCTNL_MSG_TIMEOUT_DEFAULT_SET, 460 IPCTNL_MSG_TIMEOUT_DEFAULT_SET,
461 l3num,
475 l4proto); 462 l4proto);
476 if (ret <= 0) { 463 if (ret <= 0) {
477 kfree_skb(skb2); 464 kfree_skb(skb2);
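Since every remaining l4proto now provides the ctnl_timeout callbacks, the "is obj_to_nlattr set?" guards above disappear and the CTA_TIMEOUT_DATA nest is emitted unconditionally. A sketch of the nesting pattern as it ends up being used (illustrative wrapper name; the -EMSGSIZE return is an assumption, the real callers jump to nla_put_failure):

static int example_put_timeout_data(struct sk_buff *skb,
				    const struct nf_conntrack_l4proto *l4proto,
				    const void *data)
{
	struct nlattr *nest_parms;
	int ret;

	nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA | NLA_F_NESTED);
	if (!nest_parms)
		return -EMSGSIZE;

	ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, data);
	if (ret < 0)
		return ret;

	nla_nest_end(skb, nest_parms);
	return 0;
}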
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index 00db27dfd2ff..6f41dd74729d 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -30,32 +30,27 @@ EXPORT_SYMBOL_GPL(nf_osf_fingers);
30static inline int nf_osf_ttl(const struct sk_buff *skb, 30static inline int nf_osf_ttl(const struct sk_buff *skb,
31 int ttl_check, unsigned char f_ttl) 31 int ttl_check, unsigned char f_ttl)
32{ 32{
33 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
33 const struct iphdr *ip = ip_hdr(skb); 34 const struct iphdr *ip = ip_hdr(skb);
34 35 int ret = 0;
35 if (ttl_check != -1) { 36
36 if (ttl_check == NF_OSF_TTL_TRUE) 37 if (ttl_check == NF_OSF_TTL_TRUE)
37 return ip->ttl == f_ttl; 38 return ip->ttl == f_ttl;
38 if (ttl_check == NF_OSF_TTL_NOCHECK) 39 if (ttl_check == NF_OSF_TTL_NOCHECK)
39 return 1; 40 return 1;
40 else if (ip->ttl <= f_ttl) 41 else if (ip->ttl <= f_ttl)
41 return 1; 42 return 1;
42 else { 43
43 struct in_device *in_dev = __in_dev_get_rcu(skb->dev); 44 for_ifa(in_dev) {
44 int ret = 0; 45 if (inet_ifa_match(ip->saddr, ifa)) {
45 46 ret = (ip->ttl == f_ttl);
46 for_ifa(in_dev) { 47 break;
47 if (inet_ifa_match(ip->saddr, ifa)) {
48 ret = (ip->ttl == f_ttl);
49 break;
50 }
51 }
52 endfor_ifa(in_dev);
53
54 return ret;
55 } 48 }
56 } 49 }
57 50
58 return ip->ttl == f_ttl; 51 endfor_ifa(in_dev);
52
53 return ret;
59} 54}
60 55
61struct nf_osf_hdr_ctx { 56struct nf_osf_hdr_ctx {
@@ -213,7 +208,7 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family,
213 if (!tcp) 208 if (!tcp)
214 return false; 209 return false;
215 210
216 ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : -1; 211 ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : 0;
217 212
218 list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { 213 list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) {
219 214
@@ -257,7 +252,8 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family,
257EXPORT_SYMBOL_GPL(nf_osf_match); 252EXPORT_SYMBOL_GPL(nf_osf_match);
258 253
259const char *nf_osf_find(const struct sk_buff *skb, 254const char *nf_osf_find(const struct sk_buff *skb,
260 const struct list_head *nf_osf_fingers) 255 const struct list_head *nf_osf_fingers,
256 const int ttl_check)
261{ 257{
262 const struct iphdr *ip = ip_hdr(skb); 258 const struct iphdr *ip = ip_hdr(skb);
263 const struct nf_osf_user_finger *f; 259 const struct nf_osf_user_finger *f;
@@ -275,7 +271,7 @@ const char *nf_osf_find(const struct sk_buff *skb,
275 271
276 list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { 272 list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) {
277 f = &kf->finger; 273 f = &kf->finger;
278 if (!nf_osf_match_one(skb, f, -1, &ctx)) 274 if (!nf_osf_match_one(skb, f, ttl_check, &ctx))
279 continue; 275 continue;
280 276
281 genre = f->genre; 277 genre = f->genre;
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index d33094f4ec41..43041f087eb3 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -765,7 +765,7 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
765 return ret; 765 return ret;
766 } 766 }
767 767
768 skb->next = NULL; 768 skb_mark_not_on_list(skb);
769 769
770 entry_seg = nf_queue_entry_dup(entry); 770 entry_seg = nf_queue_entry_dup(entry);
771 if (entry_seg) { 771 if (entry_seg) {
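skb_mark_not_on_list() above is a drop-in replacement for the open-coded skb->next = NULL when detaching a segment from a GSO chain; the helper amounts to the following (per include/linux/skbuff.h):

static inline void skb_mark_not_on_list(struct sk_buff *skb)
{
	skb->next = NULL;
}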
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index fa90a8402845..79d48c1d06f4 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -79,7 +79,8 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
79 79
80 err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc, 80 err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc,
81 tb[NFTA_CMP_DATA]); 81 tb[NFTA_CMP_DATA]);
82 BUG_ON(err < 0); 82 if (err < 0)
83 return err;
83 84
84 priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]); 85 priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]);
85 err = nft_validate_register_load(priv->sreg, desc.len); 86 err = nft_validate_register_load(priv->sreg, desc.len);
@@ -129,7 +130,8 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx,
129 130
130 err = nft_data_init(NULL, &data, sizeof(data), &desc, 131 err = nft_data_init(NULL, &data, sizeof(data), &desc,
131 tb[NFTA_CMP_DATA]); 132 tb[NFTA_CMP_DATA]);
132 BUG_ON(err < 0); 133 if (err < 0)
134 return err;
133 135
134 priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]); 136 priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]);
135 err = nft_validate_register_load(priv->sreg, desc.len); 137 err = nft_validate_register_load(priv->sreg, desc.len);
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 32535eea51b2..768292eac2a4 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -290,6 +290,24 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
290 module_put(target->me); 290 module_put(target->me);
291} 291}
292 292
293static int nft_extension_dump_info(struct sk_buff *skb, int attr,
294 const void *info,
295 unsigned int size, unsigned int user_size)
296{
297 unsigned int info_size, aligned_size = XT_ALIGN(size);
298 struct nlattr *nla;
299
300 nla = nla_reserve(skb, attr, aligned_size);
301 if (!nla)
302 return -1;
303
304 info_size = user_size ? : size;
305 memcpy(nla_data(nla), info, info_size);
306 memset(nla_data(nla) + info_size, 0, aligned_size - info_size);
307
308 return 0;
309}
310
293static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr) 311static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr)
294{ 312{
295 const struct xt_target *target = expr->ops->data; 313 const struct xt_target *target = expr->ops->data;
@@ -297,7 +315,8 @@ static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr)
297 315
298 if (nla_put_string(skb, NFTA_TARGET_NAME, target->name) || 316 if (nla_put_string(skb, NFTA_TARGET_NAME, target->name) ||
299 nla_put_be32(skb, NFTA_TARGET_REV, htonl(target->revision)) || 317 nla_put_be32(skb, NFTA_TARGET_REV, htonl(target->revision)) ||
300 nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(target->targetsize), info)) 318 nft_extension_dump_info(skb, NFTA_TARGET_INFO, info,
319 target->targetsize, target->usersize))
301 goto nla_put_failure; 320 goto nla_put_failure;
302 321
303 return 0; 322 return 0;
@@ -532,7 +551,8 @@ static int __nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr,
532 551
533 if (nla_put_string(skb, NFTA_MATCH_NAME, match->name) || 552 if (nla_put_string(skb, NFTA_MATCH_NAME, match->name) ||
534 nla_put_be32(skb, NFTA_MATCH_REV, htonl(match->revision)) || 553 nla_put_be32(skb, NFTA_MATCH_REV, htonl(match->revision)) ||
535 nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(match->matchsize), info)) 554 nft_extension_dump_info(skb, NFTA_MATCH_INFO, info,
555 match->matchsize, match->usersize))
536 goto nla_put_failure; 556 goto nla_put_failure;
537 557
538 return 0; 558 return 0;
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 5dd87748afa8..586627c361df 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -279,7 +279,7 @@ static void nft_ct_set_eval(const struct nft_expr *expr,
279{ 279{
280 const struct nft_ct *priv = nft_expr_priv(expr); 280 const struct nft_ct *priv = nft_expr_priv(expr);
281 struct sk_buff *skb = pkt->skb; 281 struct sk_buff *skb = pkt->skb;
282#ifdef CONFIG_NF_CONNTRACK_MARK 282#if defined(CONFIG_NF_CONNTRACK_MARK) || defined(CONFIG_NF_CONNTRACK_SECMARK)
283 u32 value = regs->data[priv->sreg]; 283 u32 value = regs->data[priv->sreg];
284#endif 284#endif
285 enum ip_conntrack_info ctinfo; 285 enum ip_conntrack_info ctinfo;
@@ -298,6 +298,14 @@ static void nft_ct_set_eval(const struct nft_expr *expr,
298 } 298 }
299 break; 299 break;
300#endif 300#endif
301#ifdef CONFIG_NF_CONNTRACK_SECMARK
302 case NFT_CT_SECMARK:
303 if (ct->secmark != value) {
304 ct->secmark = value;
305 nf_conntrack_event_cache(IPCT_SECMARK, ct);
306 }
307 break;
308#endif
301#ifdef CONFIG_NF_CONNTRACK_LABELS 309#ifdef CONFIG_NF_CONNTRACK_LABELS
302 case NFT_CT_LABELS: 310 case NFT_CT_LABELS:
303 nf_connlabels_replace(ct, 311 nf_connlabels_replace(ct,
@@ -565,6 +573,13 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
565 len = sizeof(u32); 573 len = sizeof(u32);
566 break; 574 break;
567#endif 575#endif
576#ifdef CONFIG_NF_CONNTRACK_SECMARK
577 case NFT_CT_SECMARK:
578 if (tb[NFTA_CT_DIRECTION])
579 return -EINVAL;
580 len = sizeof(u32);
581 break;
582#endif
568 default: 583 default:
569 return -EOPNOTSUPP; 584 return -EOPNOTSUPP;
570 } 585 }
@@ -776,9 +791,6 @@ nft_ct_timeout_parse_policy(void *timeouts,
776 struct nlattr **tb; 791 struct nlattr **tb;
777 int ret = 0; 792 int ret = 0;
778 793
779 if (!l4proto->ctnl_timeout.nlattr_to_obj)
780 return 0;
781
782 tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb), 794 tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb),
783 GFP_KERNEL); 795 GFP_KERNEL);
784 796
@@ -858,7 +870,7 @@ static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx,
858 l4num = nla_get_u8(tb[NFTA_CT_TIMEOUT_L4PROTO]); 870 l4num = nla_get_u8(tb[NFTA_CT_TIMEOUT_L4PROTO]);
859 priv->l4proto = l4num; 871 priv->l4proto = l4num;
860 872
861 l4proto = nf_ct_l4proto_find_get(l3num, l4num); 873 l4proto = nf_ct_l4proto_find_get(l4num);
862 874
863 if (l4proto->l4proto != l4num) { 875 if (l4proto->l4proto != l4num) {
864 ret = -EOPNOTSUPP; 876 ret = -EOPNOTSUPP;
diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c
index 2cc1e0ef56e8..15cc62b293d6 100644
--- a/net/netfilter/nft_dup_netdev.c
+++ b/net/netfilter/nft_dup_netdev.c
@@ -46,8 +46,6 @@ static int nft_dup_netdev_init(const struct nft_ctx *ctx,
46 return nft_validate_register_load(priv->sreg_dev, sizeof(int)); 46 return nft_validate_register_load(priv->sreg_dev, sizeof(int));
47} 47}
48 48
49static const struct nft_expr_ops nft_dup_netdev_ingress_ops;
50
51static int nft_dup_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) 49static int nft_dup_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr)
52{ 50{
53 struct nft_dup_netdev *priv = nft_expr_priv(expr); 51 struct nft_dup_netdev *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 6e91a37d57f2..07d4efd3d851 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -235,14 +235,31 @@ err1:
235 return err; 235 return err;
236} 236}
237 237
238static void nft_dynset_activate(const struct nft_ctx *ctx,
239 const struct nft_expr *expr)
240{
241 struct nft_dynset *priv = nft_expr_priv(expr);
242
243 nf_tables_rebind_set(ctx, priv->set, &priv->binding);
244}
245
246static void nft_dynset_deactivate(const struct nft_ctx *ctx,
247 const struct nft_expr *expr)
248{
249 struct nft_dynset *priv = nft_expr_priv(expr);
250
251 nf_tables_unbind_set(ctx, priv->set, &priv->binding);
252}
253
238static void nft_dynset_destroy(const struct nft_ctx *ctx, 254static void nft_dynset_destroy(const struct nft_ctx *ctx,
239 const struct nft_expr *expr) 255 const struct nft_expr *expr)
240{ 256{
241 struct nft_dynset *priv = nft_expr_priv(expr); 257 struct nft_dynset *priv = nft_expr_priv(expr);
242 258
243 nf_tables_unbind_set(ctx, priv->set, &priv->binding);
244 if (priv->expr != NULL) 259 if (priv->expr != NULL)
245 nft_expr_destroy(ctx, priv->expr); 260 nft_expr_destroy(ctx, priv->expr);
261
262 nf_tables_destroy_set(ctx, priv->set);
246} 263}
247 264
248static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) 265static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -279,6 +296,8 @@ static const struct nft_expr_ops nft_dynset_ops = {
279 .eval = nft_dynset_eval, 296 .eval = nft_dynset_eval,
280 .init = nft_dynset_init, 297 .init = nft_dynset_init,
281 .destroy = nft_dynset_destroy, 298 .destroy = nft_dynset_destroy,
299 .activate = nft_dynset_activate,
300 .deactivate = nft_dynset_deactivate,
282 .dump = nft_dynset_dump, 301 .dump = nft_dynset_dump,
283}; 302};
284 303
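nft_dynset here (and nft_lookup and nft_objref further down) gains ->activate()/->deactivate() callbacks so set bindings survive an aborted transaction: deactivate only unbinds, activate rebinds through the new nf_tables_rebind_set(), and the final release of an anonymous set moves into ->destroy() via nf_tables_destroy_set(). These callbacks are driven per rule expression from nf_tables_api.c; a plausible sketch of that driver loop, whose body is not part of this diff (helper names from nf_tables.h, treat the exact shape as an assumption):

static void example_rule_expr_activate(const struct nft_ctx *ctx,
				       struct nft_rule *rule)
{
	struct nft_expr *expr;

	expr = nft_expr_first(rule);
	while (expr != nft_expr_last(rule) && expr->ops) {
		if (expr->ops->activate)
			expr->ops->activate(ctx, expr);

		expr = nft_expr_next(expr);
	}
}

This also explains the nft_expr_check_ops() addition earlier in the series: an expression must provide both activate and deactivate or neither, so the driver can rely on the pair being consistent.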
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index d6bab8c3cbb0..e82d9a966c45 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -201,7 +201,7 @@ static int flow_offload_netdev_event(struct notifier_block *this,
201 if (event != NETDEV_DOWN) 201 if (event != NETDEV_DOWN)
202 return NOTIFY_DONE; 202 return NOTIFY_DONE;
203 203
204 nf_flow_table_cleanup(dev_net(dev), dev); 204 nf_flow_table_cleanup(dev);
205 205
206 return NOTIFY_DONE; 206 return NOTIFY_DONE;
207} 207}
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 8abb9891cdf2..d7694e7255a0 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -53,8 +53,6 @@ static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
53 return nft_validate_register_load(priv->sreg_dev, sizeof(int)); 53 return nft_validate_register_load(priv->sreg_dev, sizeof(int));
54} 54}
55 55
56static const struct nft_expr_ops nft_fwd_netdev_ingress_ops;
57
58static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) 56static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr)
59{ 57{
60 struct nft_fwd_netdev *priv = nft_expr_priv(expr); 58 struct nft_fwd_netdev *priv = nft_expr_priv(expr);
@@ -169,8 +167,6 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx,
169 return nft_validate_register_load(priv->sreg_addr, addr_len); 167 return nft_validate_register_load(priv->sreg_addr, addr_len);
170} 168}
171 169
172static const struct nft_expr_ops nft_fwd_netdev_ingress_ops;
173
174static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr) 170static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr)
175{ 171{
176 struct nft_fwd_neigh *priv = nft_expr_priv(expr); 172 struct nft_fwd_neigh *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index ad13e8643599..227b2b15a19c 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -121,12 +121,28 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
121 return 0; 121 return 0;
122} 122}
123 123
124static void nft_lookup_activate(const struct nft_ctx *ctx,
125 const struct nft_expr *expr)
126{
127 struct nft_lookup *priv = nft_expr_priv(expr);
128
129 nf_tables_rebind_set(ctx, priv->set, &priv->binding);
130}
131
132static void nft_lookup_deactivate(const struct nft_ctx *ctx,
133 const struct nft_expr *expr)
134{
135 struct nft_lookup *priv = nft_expr_priv(expr);
136
137 nf_tables_unbind_set(ctx, priv->set, &priv->binding);
138}
139
124static void nft_lookup_destroy(const struct nft_ctx *ctx, 140static void nft_lookup_destroy(const struct nft_ctx *ctx,
125 const struct nft_expr *expr) 141 const struct nft_expr *expr)
126{ 142{
127 struct nft_lookup *priv = nft_expr_priv(expr); 143 struct nft_lookup *priv = nft_expr_priv(expr);
128 144
129 nf_tables_unbind_set(ctx, priv->set, &priv->binding); 145 nf_tables_destroy_set(ctx, priv->set);
130} 146}
131 147
132static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr) 148static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -209,6 +225,8 @@ static const struct nft_expr_ops nft_lookup_ops = {
209 .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)), 225 .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
210 .eval = nft_lookup_eval, 226 .eval = nft_lookup_eval,
211 .init = nft_lookup_init, 227 .init = nft_lookup_init,
228 .activate = nft_lookup_activate,
229 .deactivate = nft_lookup_deactivate,
212 .destroy = nft_lookup_destroy, 230 .destroy = nft_lookup_destroy,
213 .dump = nft_lookup_dump, 231 .dump = nft_lookup_dump,
214 .validate = nft_lookup_validate, 232 .validate = nft_lookup_validate,
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 297fe7d97c18..6180626c3f80 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -284,6 +284,11 @@ static void nft_meta_set_eval(const struct nft_expr *expr,
284 284
285 skb->nf_trace = !!value8; 285 skb->nf_trace = !!value8;
286 break; 286 break;
287#ifdef CONFIG_NETWORK_SECMARK
288 case NFT_META_SECMARK:
289 skb->secmark = value;
290 break;
291#endif
287 default: 292 default:
288 WARN_ON(1); 293 WARN_ON(1);
289 } 294 }
@@ -436,6 +441,9 @@ static int nft_meta_set_init(const struct nft_ctx *ctx,
436 switch (priv->key) { 441 switch (priv->key) {
437 case NFT_META_MARK: 442 case NFT_META_MARK:
438 case NFT_META_PRIORITY: 443 case NFT_META_PRIORITY:
444#ifdef CONFIG_NETWORK_SECMARK
445 case NFT_META_SECMARK:
446#endif
439 len = sizeof(u32); 447 len = sizeof(u32);
440 break; 448 break;
441 case NFT_META_NFTRACE: 449 case NFT_META_NFTRACE:
@@ -543,3 +551,111 @@ struct nft_expr_type nft_meta_type __read_mostly = {
543 .maxattr = NFTA_META_MAX, 551 .maxattr = NFTA_META_MAX,
544 .owner = THIS_MODULE, 552 .owner = THIS_MODULE,
545}; 553};
554
555#ifdef CONFIG_NETWORK_SECMARK
556struct nft_secmark {
557 u32 secid;
558 char *ctx;
559};
560
561static const struct nla_policy nft_secmark_policy[NFTA_SECMARK_MAX + 1] = {
562 [NFTA_SECMARK_CTX] = { .type = NLA_STRING, .len = NFT_SECMARK_CTX_MAXLEN },
563};
564
565static int nft_secmark_compute_secid(struct nft_secmark *priv)
566{
567 u32 tmp_secid = 0;
568 int err;
569
570 err = security_secctx_to_secid(priv->ctx, strlen(priv->ctx), &tmp_secid);
571 if (err)
572 return err;
573
574 if (!tmp_secid)
575 return -ENOENT;
576
577 err = security_secmark_relabel_packet(tmp_secid);
578 if (err)
579 return err;
580
581 priv->secid = tmp_secid;
582 return 0;
583}
584
585static void nft_secmark_obj_eval(struct nft_object *obj, struct nft_regs *regs,
586 const struct nft_pktinfo *pkt)
587{
588 const struct nft_secmark *priv = nft_obj_data(obj);
589 struct sk_buff *skb = pkt->skb;
590
591 skb->secmark = priv->secid;
592}
593
594static int nft_secmark_obj_init(const struct nft_ctx *ctx,
595 const struct nlattr * const tb[],
596 struct nft_object *obj)
597{
598 struct nft_secmark *priv = nft_obj_data(obj);
599 int err;
600
601 if (tb[NFTA_SECMARK_CTX] == NULL)
602 return -EINVAL;
603
604 priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL);
605 if (!priv->ctx)
606 return -ENOMEM;
607
608 err = nft_secmark_compute_secid(priv);
609 if (err) {
610 kfree(priv->ctx);
611 return err;
612 }
613
614 security_secmark_refcount_inc();
615
616 return 0;
617}
618
619static int nft_secmark_obj_dump(struct sk_buff *skb, struct nft_object *obj,
620 bool reset)
621{
622 struct nft_secmark *priv = nft_obj_data(obj);
623 int err;
624
625 if (nla_put_string(skb, NFTA_SECMARK_CTX, priv->ctx))
626 return -1;
627
628 if (reset) {
629 err = nft_secmark_compute_secid(priv);
630 if (err)
631 return err;
632 }
633
634 return 0;
635}
636
637static void nft_secmark_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
638{
639 struct nft_secmark *priv = nft_obj_data(obj);
640
641 security_secmark_refcount_dec();
642
643 kfree(priv->ctx);
644}
645
646static const struct nft_object_ops nft_secmark_obj_ops = {
647 .type = &nft_secmark_obj_type,
648 .size = sizeof(struct nft_secmark),
649 .init = nft_secmark_obj_init,
650 .eval = nft_secmark_obj_eval,
651 .dump = nft_secmark_obj_dump,
652 .destroy = nft_secmark_obj_destroy,
653};
654struct nft_object_type nft_secmark_obj_type __read_mostly = {
655 .type = NFT_OBJECT_SECMARK,
656 .ops = &nft_secmark_obj_ops,
657 .maxattr = NFTA_SECMARK_MAX,
658 .policy = nft_secmark_policy,
659 .owner = THIS_MODULE,
660};
661#endif /* CONFIG_NETWORK_SECMARK */
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index cdf348f751ec..a3185ca2a3a9 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -155,12 +155,28 @@ nla_put_failure:
155 return -1; 155 return -1;
156} 156}
157 157
158static void nft_objref_map_activate(const struct nft_ctx *ctx,
159 const struct nft_expr *expr)
160{
161 struct nft_objref_map *priv = nft_expr_priv(expr);
162
163 nf_tables_rebind_set(ctx, priv->set, &priv->binding);
164}
165
166static void nft_objref_map_deactivate(const struct nft_ctx *ctx,
167 const struct nft_expr *expr)
168{
169 struct nft_objref_map *priv = nft_expr_priv(expr);
170
171 nf_tables_unbind_set(ctx, priv->set, &priv->binding);
172}
173
158static void nft_objref_map_destroy(const struct nft_ctx *ctx, 174static void nft_objref_map_destroy(const struct nft_ctx *ctx,
159 const struct nft_expr *expr) 175 const struct nft_expr *expr)
160{ 176{
161 struct nft_objref_map *priv = nft_expr_priv(expr); 177 struct nft_objref_map *priv = nft_expr_priv(expr);
162 178
163 nf_tables_unbind_set(ctx, priv->set, &priv->binding); 179 nf_tables_destroy_set(ctx, priv->set);
164} 180}
165 181
166static struct nft_expr_type nft_objref_type; 182static struct nft_expr_type nft_objref_type;
@@ -169,6 +185,8 @@ static const struct nft_expr_ops nft_objref_map_ops = {
169 .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)), 185 .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)),
170 .eval = nft_objref_map_eval, 186 .eval = nft_objref_map_eval,
171 .init = nft_objref_map_init, 187 .init = nft_objref_map_init,
188 .activate = nft_objref_map_activate,
189 .deactivate = nft_objref_map_deactivate,
172 .destroy = nft_objref_map_destroy, 190 .destroy = nft_objref_map_destroy,
173 .dump = nft_objref_map_dump, 191 .dump = nft_objref_map_dump,
174}; 192};
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index a35fb59ace73..ca5e5d8c5ef8 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -6,10 +6,12 @@
6 6
7struct nft_osf { 7struct nft_osf {
8 enum nft_registers dreg:8; 8 enum nft_registers dreg:8;
9 u8 ttl;
9}; 10};
10 11
11static const struct nla_policy nft_osf_policy[NFTA_OSF_MAX + 1] = { 12static const struct nla_policy nft_osf_policy[NFTA_OSF_MAX + 1] = {
12 [NFTA_OSF_DREG] = { .type = NLA_U32 }, 13 [NFTA_OSF_DREG] = { .type = NLA_U32 },
14 [NFTA_OSF_TTL] = { .type = NLA_U8 },
13}; 15};
14 16
15static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, 17static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
@@ -33,7 +35,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
33 return; 35 return;
34 } 36 }
35 37
36 os_name = nf_osf_find(skb, nf_osf_fingers); 38 os_name = nf_osf_find(skb, nf_osf_fingers, priv->ttl);
37 if (!os_name) 39 if (!os_name)
38 strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN); 40 strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN);
39 else 41 else
@@ -46,6 +48,14 @@ static int nft_osf_init(const struct nft_ctx *ctx,
46{ 48{
47 struct nft_osf *priv = nft_expr_priv(expr); 49 struct nft_osf *priv = nft_expr_priv(expr);
48 int err; 50 int err;
51 u8 ttl;
52
53 if (nla_get_u8(tb[NFTA_OSF_TTL])) {
54 ttl = nla_get_u8(tb[NFTA_OSF_TTL]);
55 if (ttl > 2)
56 return -EINVAL;
57 priv->ttl = ttl;
58 }
49 59
50 priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]); 60 priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]);
51 err = nft_validate_register_store(ctx, priv->dreg, NULL, 61 err = nft_validate_register_store(ctx, priv->dreg, NULL,
@@ -60,6 +70,9 @@ static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr)
60{ 70{
61 const struct nft_osf *priv = nft_expr_priv(expr); 71 const struct nft_osf *priv = nft_expr_priv(expr);
62 72
73 if (nla_put_u8(skb, NFTA_OSF_TTL, priv->ttl))
74 goto nla_put_failure;
75
63 if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg)) 76 if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg))
64 goto nla_put_failure; 77 goto nla_put_failure;
65 78
@@ -69,6 +82,15 @@ nla_put_failure:
69 return -1; 82 return -1;
70} 83}
71 84
85static int nft_osf_validate(const struct nft_ctx *ctx,
86 const struct nft_expr *expr,
87 const struct nft_data **data)
88{
89 return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) |
90 (1 << NF_INET_PRE_ROUTING) |
91 (1 << NF_INET_FORWARD));
92}
93
72static struct nft_expr_type nft_osf_type; 94static struct nft_expr_type nft_osf_type;
73static const struct nft_expr_ops nft_osf_op = { 95static const struct nft_expr_ops nft_osf_op = {
74 .eval = nft_osf_eval, 96 .eval = nft_osf_eval,
@@ -76,6 +98,7 @@ static const struct nft_expr_ops nft_osf_op = {
76 .init = nft_osf_init, 98 .init = nft_osf_init,
77 .dump = nft_osf_dump, 99 .dump = nft_osf_dump,
78 .type = &nft_osf_type, 100 .type = &nft_osf_type,
101 .validate = nft_osf_validate,
79}; 102};
80 103
81static struct nft_expr_type nft_osf_type __read_mostly = { 104static struct nft_expr_type nft_osf_type __read_mostly = {
diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
index 29f5bd2377b0..b48e58cceeb7 100644
--- a/net/netfilter/nft_reject.c
+++ b/net/netfilter/nft_reject.c
@@ -94,7 +94,8 @@ static u8 icmp_code_v4[NFT_REJECT_ICMPX_MAX + 1] = {
94 94
95int nft_reject_icmp_code(u8 code) 95int nft_reject_icmp_code(u8 code)
96{ 96{
97 BUG_ON(code > NFT_REJECT_ICMPX_MAX); 97 if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX))
98 return ICMP_NET_UNREACH;
98 99
99 return icmp_code_v4[code]; 100 return icmp_code_v4[code];
100} 101}
@@ -111,7 +112,8 @@ static u8 icmp_code_v6[NFT_REJECT_ICMPX_MAX + 1] = {
111 112
112int nft_reject_icmpv6_code(u8 code) 113int nft_reject_icmpv6_code(u8 code)
113{ 114{
114 BUG_ON(code > NFT_REJECT_ICMPX_MAX); 115 if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX))
116 return ICMPV6_NOROUTE;
115 117
116 return icmp_code_v6[code]; 118 return icmp_code_v6[code];
117} 119}
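
The two hunks above replace BUG_ON() with WARN_ON_ONCE() plus a safe fallback, the same hardening applied to xt_IDLETIMER and xt_SECMARK later in this series. A minimal sketch of the pattern, using a hypothetical lookup table rather than anything from the patch:

#include <linux/kernel.h>
#include <linux/types.h>

static const u8 example_code_map[] = { 0, 1, 3 };

static u8 example_code_lookup(u8 code)
{
	/* Out-of-range input should be impossible; warn once and return a
	 * conservative default instead of taking the machine down.
	 */
	if (WARN_ON_ONCE(code >= ARRAY_SIZE(example_code_map)))
		return example_code_map[0];

	return example_code_map[code];
}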
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index 76dba9f6b6f6..f35fa33913ae 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -90,6 +90,11 @@ static void nft_rt_get_eval(const struct nft_expr *expr,
90 case NFT_RT_TCPMSS: 90 case NFT_RT_TCPMSS:
91 nft_reg_store16(dest, get_tcpmss(pkt, dst)); 91 nft_reg_store16(dest, get_tcpmss(pkt, dst));
92 break; 92 break;
93#ifdef CONFIG_XFRM
94 case NFT_RT_XFRM:
95 nft_reg_store8(dest, !!dst->xfrm);
96 break;
97#endif
93 default: 98 default:
94 WARN_ON(1); 99 WARN_ON(1);
95 goto err; 100 goto err;
@@ -130,6 +135,11 @@ static int nft_rt_get_init(const struct nft_ctx *ctx,
130 case NFT_RT_TCPMSS: 135 case NFT_RT_TCPMSS:
131 len = sizeof(u16); 136 len = sizeof(u16);
132 break; 137 break;
138#ifdef CONFIG_XFRM
139 case NFT_RT_XFRM:
140 len = sizeof(u8);
141 break;
142#endif
133 default: 143 default:
134 return -EOPNOTSUPP; 144 return -EOPNOTSUPP;
135 } 145 }
@@ -164,6 +174,7 @@ static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *exp
164 case NFT_RT_NEXTHOP4: 174 case NFT_RT_NEXTHOP4:
165 case NFT_RT_NEXTHOP6: 175 case NFT_RT_NEXTHOP6:
166 case NFT_RT_CLASSID: 176 case NFT_RT_CLASSID:
177 case NFT_RT_XFRM:
167 return 0; 178 return 0;
168 case NFT_RT_TCPMSS: 179 case NFT_RT_TCPMSS:
169 hooks = (1 << NF_INET_FORWARD) | 180 hooks = (1 << NF_INET_FORWARD) |
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 015124e649cb..339a9dd1c832 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -88,7 +88,7 @@ static bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
88 .key = key, 88 .key = key,
89 }; 89 };
90 90
91 he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params); 91 he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
92 if (he != NULL) 92 if (he != NULL)
93 *ext = &he->ext; 93 *ext = &he->ext;
94 94
@@ -106,7 +106,7 @@ static void *nft_rhash_get(const struct net *net, const struct nft_set *set,
106 .key = elem->key.val.data, 106 .key = elem->key.val.data,
107 }; 107 };
108 108
109 he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params); 109 he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
110 if (he != NULL) 110 if (he != NULL)
111 return he; 111 return he;
112 112
@@ -129,7 +129,7 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key,
129 .key = key, 129 .key = key,
130 }; 130 };
131 131
132 he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params); 132 he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
133 if (he != NULL) 133 if (he != NULL)
134 goto out; 134 goto out;
135 135
@@ -217,7 +217,7 @@ static void *nft_rhash_deactivate(const struct net *net,
217 }; 217 };
218 218
219 rcu_read_lock(); 219 rcu_read_lock();
220 he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params); 220 he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
221 if (he != NULL && 221 if (he != NULL &&
222 !nft_rhash_flush(net, set, he)) 222 !nft_rhash_flush(net, set, he))
223 he = NULL; 223 he = NULL;
@@ -244,21 +244,15 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
244 struct nft_rhash_elem *he; 244 struct nft_rhash_elem *he;
245 struct rhashtable_iter hti; 245 struct rhashtable_iter hti;
246 struct nft_set_elem elem; 246 struct nft_set_elem elem;
247 int err;
248
249 err = rhashtable_walk_init(&priv->ht, &hti, GFP_ATOMIC);
250 iter->err = err;
251 if (err)
252 return;
253 247
248 rhashtable_walk_enter(&priv->ht, &hti);
254 rhashtable_walk_start(&hti); 249 rhashtable_walk_start(&hti);
255 250
256 while ((he = rhashtable_walk_next(&hti))) { 251 while ((he = rhashtable_walk_next(&hti))) {
257 if (IS_ERR(he)) { 252 if (IS_ERR(he)) {
258 err = PTR_ERR(he); 253 if (PTR_ERR(he) != -EAGAIN) {
259 if (err != -EAGAIN) { 254 iter->err = PTR_ERR(he);
260 iter->err = err; 255 break;
261 goto out;
262 } 256 }
263 257
264 continue; 258 continue;
@@ -275,13 +269,11 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
275 269
276 iter->err = iter->fn(ctx, set, iter, &elem); 270 iter->err = iter->fn(ctx, set, iter, &elem);
277 if (iter->err < 0) 271 if (iter->err < 0)
278 goto out; 272 break;
279 273
280cont: 274cont:
281 iter->count++; 275 iter->count++;
282 } 276 }
283
284out:
285 rhashtable_walk_stop(&hti); 277 rhashtable_walk_stop(&hti);
286 rhashtable_walk_exit(&hti); 278 rhashtable_walk_exit(&hti);
287} 279}
@@ -293,21 +285,17 @@ static void nft_rhash_gc(struct work_struct *work)
293 struct nft_rhash *priv; 285 struct nft_rhash *priv;
294 struct nft_set_gc_batch *gcb = NULL; 286 struct nft_set_gc_batch *gcb = NULL;
295 struct rhashtable_iter hti; 287 struct rhashtable_iter hti;
296 int err;
297 288
298 priv = container_of(work, struct nft_rhash, gc_work.work); 289 priv = container_of(work, struct nft_rhash, gc_work.work);
299 set = nft_set_container_of(priv); 290 set = nft_set_container_of(priv);
300 291
301 err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL); 292 rhashtable_walk_enter(&priv->ht, &hti);
302 if (err)
303 goto schedule;
304
305 rhashtable_walk_start(&hti); 293 rhashtable_walk_start(&hti);
306 294
307 while ((he = rhashtable_walk_next(&hti))) { 295 while ((he = rhashtable_walk_next(&hti))) {
308 if (IS_ERR(he)) { 296 if (IS_ERR(he)) {
309 if (PTR_ERR(he) != -EAGAIN) 297 if (PTR_ERR(he) != -EAGAIN)
310 goto out; 298 break;
311 continue; 299 continue;
312 } 300 }
313 301
@@ -326,17 +314,15 @@ gc:
326 314
327 gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); 315 gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
328 if (gcb == NULL) 316 if (gcb == NULL)
329 goto out; 317 break;
330 rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); 318 rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params);
331 atomic_dec(&set->nelems); 319 atomic_dec(&set->nelems);
332 nft_set_gc_batch_add(gcb, he); 320 nft_set_gc_batch_add(gcb, he);
333 } 321 }
334out:
335 rhashtable_walk_stop(&hti); 322 rhashtable_walk_stop(&hti);
336 rhashtable_walk_exit(&hti); 323 rhashtable_walk_exit(&hti);
337 324
338 nft_set_gc_batch_complete(gcb); 325 nft_set_gc_batch_complete(gcb);
339schedule:
340 queue_delayed_work(system_power_efficient_wq, &priv->gc_work, 326 queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
341 nft_set_gc_interval(set)); 327 nft_set_gc_interval(set));
342} 328}
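
Both the set walker and the GC worker above switch from rhashtable_walk_init(), which can fail with -ENOMEM and therefore needs error plumbing, to rhashtable_walk_enter(), which cannot fail; the out:/schedule: labels disappear as a result. A stripped-down sketch of the resulting iteration pattern follows; the element type and callback are illustrative placeholders, not part of the patch:

#include <linux/err.h>
#include <linux/rhashtable.h>

struct example_elem {
	struct rhash_head node;
};

static void example_walk(struct rhashtable *ht,
			 void (*cb)(struct example_elem *elem))
{
	struct rhashtable_iter hti;
	struct example_elem *elem;

	rhashtable_walk_enter(ht, &hti);	/* cannot fail */
	rhashtable_walk_start(&hti);

	while ((elem = rhashtable_walk_next(&hti))) {
		if (IS_ERR(elem)) {
			/* -EAGAIN only means the table resized; keep going */
			if (PTR_ERR(elem) != -EAGAIN)
				break;
			continue;
		}
		cb(elem);
	}

	rhashtable_walk_stop(&hti);
	rhashtable_walk_exit(&hti);
}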
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 0e5ec126f6ad..fa61208371f8 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -135,9 +135,12 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
135 d = memcmp(this, key, set->klen); 135 d = memcmp(this, key, set->klen);
136 if (d < 0) { 136 if (d < 0) {
137 parent = rcu_dereference_raw(parent->rb_left); 137 parent = rcu_dereference_raw(parent->rb_left);
138 interval = rbe; 138 if (!(flags & NFT_SET_ELEM_INTERVAL_END))
139 interval = rbe;
139 } else if (d > 0) { 140 } else if (d > 0) {
140 parent = rcu_dereference_raw(parent->rb_right); 141 parent = rcu_dereference_raw(parent->rb_right);
142 if (flags & NFT_SET_ELEM_INTERVAL_END)
143 interval = rbe;
141 } else { 144 } else {
142 if (!nft_set_elem_active(&rbe->ext, genmask)) 145 if (!nft_set_elem_active(&rbe->ext, genmask))
143 parent = rcu_dereference_raw(parent->rb_left); 146 parent = rcu_dereference_raw(parent->rb_left);
@@ -154,7 +157,10 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
154 157
155 if (set->flags & NFT_SET_INTERVAL && interval != NULL && 158 if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
156 nft_set_elem_active(&interval->ext, genmask) && 159 nft_set_elem_active(&interval->ext, genmask) &&
157 !nft_rbtree_interval_end(interval)) { 160 ((!nft_rbtree_interval_end(interval) &&
161 !(flags & NFT_SET_ELEM_INTERVAL_END)) ||
162 (nft_rbtree_interval_end(interval) &&
163 (flags & NFT_SET_ELEM_INTERVAL_END)))) {
158 *elem = interval; 164 *elem = interval;
159 return true; 165 return true;
160 } 166 }
diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c
new file mode 100644
index 000000000000..5322609f7662
--- /dev/null
+++ b/net/netfilter/nft_xfrm.c
@@ -0,0 +1,294 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License version 2 as
4 * published by the Free Software Foundation.
5 *
6 * Generic part shared by ipv4 and ipv6 backends.
7 */
8
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/module.h>
12#include <linux/netlink.h>
13#include <linux/netfilter.h>
14#include <linux/netfilter/nf_tables.h>
15#include <net/netfilter/nf_tables_core.h>
16#include <net/netfilter/nf_tables.h>
17#include <linux/in.h>
18#include <net/xfrm.h>
19
20static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = {
21 [NFTA_XFRM_KEY] = { .type = NLA_U32 },
22 [NFTA_XFRM_DIR] = { .type = NLA_U8 },
23 [NFTA_XFRM_SPNUM] = { .type = NLA_U32 },
24 [NFTA_XFRM_DREG] = { .type = NLA_U32 },
25};
26
27struct nft_xfrm {
28 enum nft_xfrm_keys key:8;
29 enum nft_registers dreg:8;
30 u8 dir;
31 u8 spnum;
32};
33
34static int nft_xfrm_get_init(const struct nft_ctx *ctx,
35 const struct nft_expr *expr,
36 const struct nlattr * const tb[])
37{
38 struct nft_xfrm *priv = nft_expr_priv(expr);
39 unsigned int len = 0;
40 u32 spnum = 0;
41 u8 dir;
42
43 if (!tb[NFTA_XFRM_KEY] || !tb[NFTA_XFRM_DIR] || !tb[NFTA_XFRM_DREG])
44 return -EINVAL;
45
46 switch (ctx->family) {
47 case NFPROTO_IPV4:
48 case NFPROTO_IPV6:
49 case NFPROTO_INET:
50 break;
51 default:
52 return -EOPNOTSUPP;
53 }
54
55 priv->key = ntohl(nla_get_u32(tb[NFTA_XFRM_KEY]));
56 switch (priv->key) {
57 case NFT_XFRM_KEY_REQID:
58 case NFT_XFRM_KEY_SPI:
59 len = sizeof(u32);
60 break;
61 case NFT_XFRM_KEY_DADDR_IP4:
62 case NFT_XFRM_KEY_SADDR_IP4:
63 len = sizeof(struct in_addr);
64 break;
65 case NFT_XFRM_KEY_DADDR_IP6:
66 case NFT_XFRM_KEY_SADDR_IP6:
67 len = sizeof(struct in6_addr);
68 break;
69 default:
70 return -EINVAL;
71 }
72
73 dir = nla_get_u8(tb[NFTA_XFRM_DIR]);
74 switch (dir) {
75 case XFRM_POLICY_IN:
76 case XFRM_POLICY_OUT:
77 priv->dir = dir;
78 break;
79 default:
80 return -EINVAL;
81 }
82
83 if (tb[NFTA_XFRM_SPNUM])
84 spnum = ntohl(nla_get_be32(tb[NFTA_XFRM_SPNUM]));
85
86 if (spnum >= XFRM_MAX_DEPTH)
87 return -ERANGE;
88
89 priv->spnum = spnum;
90
91 priv->dreg = nft_parse_register(tb[NFTA_XFRM_DREG]);
92 return nft_validate_register_store(ctx, priv->dreg, NULL,
93 NFT_DATA_VALUE, len);
94}
95
96/* Return true if key asks for daddr/saddr and current
97 * state does have a valid address (BEET, TUNNEL).
98 */
99static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode)
100{
101 switch (k) {
102 case NFT_XFRM_KEY_DADDR_IP4:
103 case NFT_XFRM_KEY_SADDR_IP4:
104 if (family == NFPROTO_IPV4)
105 break;
106 return false;
107 case NFT_XFRM_KEY_DADDR_IP6:
108 case NFT_XFRM_KEY_SADDR_IP6:
109 if (family == NFPROTO_IPV6)
110 break;
111 return false;
112 default:
113 return true;
114 }
115
116 return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL;
117}
118
119static void nft_xfrm_state_get_key(const struct nft_xfrm *priv,
120 struct nft_regs *regs,
121 const struct xfrm_state *state)
122{
123 u32 *dest = &regs->data[priv->dreg];
124
125 if (!xfrm_state_addr_ok(priv->key,
126 state->props.family,
127 state->props.mode)) {
128 regs->verdict.code = NFT_BREAK;
129 return;
130 }
131
132 switch (priv->key) {
133 case NFT_XFRM_KEY_UNSPEC:
134 case __NFT_XFRM_KEY_MAX:
135 WARN_ON_ONCE(1);
136 break;
137 case NFT_XFRM_KEY_DADDR_IP4:
138 *dest = state->id.daddr.a4;
139 return;
140 case NFT_XFRM_KEY_DADDR_IP6:
141 memcpy(dest, &state->id.daddr.in6, sizeof(struct in6_addr));
142 return;
143 case NFT_XFRM_KEY_SADDR_IP4:
144 *dest = state->props.saddr.a4;
145 return;
146 case NFT_XFRM_KEY_SADDR_IP6:
147 memcpy(dest, &state->props.saddr.in6, sizeof(struct in6_addr));
148 return;
149 case NFT_XFRM_KEY_REQID:
150 *dest = state->props.reqid;
151 return;
152 case NFT_XFRM_KEY_SPI:
153 *dest = state->id.spi;
154 return;
155 }
156
157 regs->verdict.code = NFT_BREAK;
158}
159
160static void nft_xfrm_get_eval_in(const struct nft_xfrm *priv,
161 struct nft_regs *regs,
162 const struct nft_pktinfo *pkt)
163{
164 const struct sec_path *sp = pkt->skb->sp;
165 const struct xfrm_state *state;
166
167 if (sp == NULL || sp->len <= priv->spnum) {
168 regs->verdict.code = NFT_BREAK;
169 return;
170 }
171
172 state = sp->xvec[priv->spnum];
173 nft_xfrm_state_get_key(priv, regs, state);
174}
175
176static void nft_xfrm_get_eval_out(const struct nft_xfrm *priv,
177 struct nft_regs *regs,
178 const struct nft_pktinfo *pkt)
179{
180 const struct dst_entry *dst = skb_dst(pkt->skb);
181 int i;
182
183 for (i = 0; dst && dst->xfrm;
184 dst = ((const struct xfrm_dst *)dst)->child, i++) {
185 if (i < priv->spnum)
186 continue;
187
188 nft_xfrm_state_get_key(priv, regs, dst->xfrm);
189 return;
190 }
191
192 regs->verdict.code = NFT_BREAK;
193}
194
195static void nft_xfrm_get_eval(const struct nft_expr *expr,
196 struct nft_regs *regs,
197 const struct nft_pktinfo *pkt)
198{
199 const struct nft_xfrm *priv = nft_expr_priv(expr);
200
201 switch (priv->dir) {
202 case XFRM_POLICY_IN:
203 nft_xfrm_get_eval_in(priv, regs, pkt);
204 break;
205 case XFRM_POLICY_OUT:
206 nft_xfrm_get_eval_out(priv, regs, pkt);
207 break;
208 default:
209 WARN_ON_ONCE(1);
210 regs->verdict.code = NFT_BREAK;
211 break;
212 }
213}
214
215static int nft_xfrm_get_dump(struct sk_buff *skb,
216 const struct nft_expr *expr)
217{
218 const struct nft_xfrm *priv = nft_expr_priv(expr);
219
220 if (nft_dump_register(skb, NFTA_XFRM_DREG, priv->dreg))
221 return -1;
222
223 if (nla_put_be32(skb, NFTA_XFRM_KEY, htonl(priv->key)))
224 return -1;
225 if (nla_put_u8(skb, NFTA_XFRM_DIR, priv->dir))
226 return -1;
227 if (nla_put_be32(skb, NFTA_XFRM_SPNUM, htonl(priv->spnum)))
228 return -1;
229
230 return 0;
231}
232
233static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
234 const struct nft_data **data)
235{
236 const struct nft_xfrm *priv = nft_expr_priv(expr);
237 unsigned int hooks;
238
239 switch (priv->dir) {
240 case XFRM_POLICY_IN:
241 hooks = (1 << NF_INET_FORWARD) |
242 (1 << NF_INET_LOCAL_IN) |
243 (1 << NF_INET_PRE_ROUTING);
244 break;
245 case XFRM_POLICY_OUT:
246 hooks = (1 << NF_INET_FORWARD) |
247 (1 << NF_INET_LOCAL_OUT) |
248 (1 << NF_INET_POST_ROUTING);
249 break;
250 default:
251 WARN_ON_ONCE(1);
252 return -EINVAL;
253 }
254
255 return nft_chain_validate_hooks(ctx->chain, hooks);
256}
257
258
259static struct nft_expr_type nft_xfrm_type;
260static const struct nft_expr_ops nft_xfrm_get_ops = {
261 .type = &nft_xfrm_type,
262 .size = NFT_EXPR_SIZE(sizeof(struct nft_xfrm)),
263 .eval = nft_xfrm_get_eval,
264 .init = nft_xfrm_get_init,
265 .dump = nft_xfrm_get_dump,
266 .validate = nft_xfrm_validate,
267};
268
269static struct nft_expr_type nft_xfrm_type __read_mostly = {
270 .name = "xfrm",
271 .ops = &nft_xfrm_get_ops,
272 .policy = nft_xfrm_policy,
273 .maxattr = NFTA_XFRM_MAX,
274 .owner = THIS_MODULE,
275};
276
277static int __init nft_xfrm_module_init(void)
278{
279 return nft_register_expr(&nft_xfrm_type);
280}
281
282static void __exit nft_xfrm_module_exit(void)
283{
284 nft_unregister_expr(&nft_xfrm_type);
285}
286
287module_init(nft_xfrm_module_init);
288module_exit(nft_xfrm_module_exit);
289
290MODULE_LICENSE("GPL");
291MODULE_DESCRIPTION("nf_tables: xfrm/IPSec matching");
292MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
293MODULE_AUTHOR("Máté Eckl <ecklm94@gmail.com>");
294MODULE_ALIAS_NFT_EXPR("xfrm");
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 89457efd2e00..2c7a4b80206f 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -159,7 +159,7 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par,
159 /* Make sure the timeout policy matches any existing protocol tracker, 159 /* Make sure the timeout policy matches any existing protocol tracker,
160 * otherwise default to generic. 160 * otherwise default to generic.
161 */ 161 */
162 l4proto = __nf_ct_l4proto_find(par->family, proto); 162 l4proto = __nf_ct_l4proto_find(proto);
163 if (timeout->l4proto->l4proto != l4proto->l4proto) { 163 if (timeout->l4proto->l4proto != l4proto->l4proto) {
164 ret = -EINVAL; 164 ret = -EINVAL;
165 pr_info_ratelimited("Timeout policy `%s' can only be used by L%d protocol number %d\n", 165 pr_info_ratelimited("Timeout policy `%s' can only be used by L%d protocol number %d\n",
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index 5ee859193783..c6acfc2d9c84 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -68,8 +68,6 @@ struct idletimer_tg *__idletimer_tg_find_by_label(const char *label)
68{ 68{
69 struct idletimer_tg *entry; 69 struct idletimer_tg *entry;
70 70
71 BUG_ON(!label);
72
73 list_for_each_entry(entry, &idletimer_tg_list, entry) { 71 list_for_each_entry(entry, &idletimer_tg_list, entry) {
74 if (!strcmp(label, entry->attr.attr.name)) 72 if (!strcmp(label, entry->attr.attr.name))
75 return entry; 73 return entry;
@@ -172,8 +170,6 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb,
172 pr_debug("resetting timer %s, timeout period %u\n", 170 pr_debug("resetting timer %s, timeout period %u\n",
173 info->label, info->timeout); 171 info->label, info->timeout);
174 172
175 BUG_ON(!info->timer);
176
177 mod_timer(&info->timer->timer, 173 mod_timer(&info->timer->timer,
178 msecs_to_jiffies(info->timeout * 1000) + jiffies); 174 msecs_to_jiffies(info->timeout * 1000) + jiffies);
179 175
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 4ad5fe27e08b..f16202d26c20 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -35,8 +35,6 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
35 u32 secmark = 0; 35 u32 secmark = 0;
36 const struct xt_secmark_target_info *info = par->targinfo; 36 const struct xt_secmark_target_info *info = par->targinfo;
37 37
38 BUG_ON(info->mode != mode);
39
40 switch (mode) { 38 switch (mode) {
41 case SECMARK_MODE_SEL: 39 case SECMARK_MODE_SEL:
42 secmark = info->secid; 40 secmark = info->secid;
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 0d0d68c989df..1dae02a97ee3 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -14,6 +14,8 @@
14#include <linux/skbuff.h> 14#include <linux/skbuff.h>
15#include <linux/route.h> 15#include <linux/route.h>
16#include <linux/netfilter/x_tables.h> 16#include <linux/netfilter/x_tables.h>
17#include <net/net_namespace.h>
18#include <net/netns/generic.h>
17#include <net/route.h> 19#include <net/route.h>
18#include <net/netfilter/ipv4/nf_dup_ipv4.h> 20#include <net/netfilter/ipv4/nf_dup_ipv4.h>
19#include <net/netfilter/ipv6/nf_dup_ipv6.h> 21#include <net/netfilter/ipv6/nf_dup_ipv6.h>
@@ -25,8 +27,15 @@ struct xt_tee_priv {
25 int oif; 27 int oif;
26}; 28};
27 29
30static unsigned int tee_net_id __read_mostly;
28static const union nf_inet_addr tee_zero_address; 31static const union nf_inet_addr tee_zero_address;
29 32
33struct tee_net {
34 struct list_head priv_list;
35 /* lock protects the priv_list */
36 struct mutex lock;
37};
38
30static unsigned int 39static unsigned int
31tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) 40tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
32{ 41{
@@ -51,17 +60,16 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
51} 60}
52#endif 61#endif
53 62
54static DEFINE_MUTEX(priv_list_mutex);
55static LIST_HEAD(priv_list);
56
57static int tee_netdev_event(struct notifier_block *this, unsigned long event, 63static int tee_netdev_event(struct notifier_block *this, unsigned long event,
58 void *ptr) 64 void *ptr)
59{ 65{
60 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 66 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
67 struct net *net = dev_net(dev);
68 struct tee_net *tn = net_generic(net, tee_net_id);
61 struct xt_tee_priv *priv; 69 struct xt_tee_priv *priv;
62 70
63 mutex_lock(&priv_list_mutex); 71 mutex_lock(&tn->lock);
64 list_for_each_entry(priv, &priv_list, list) { 72 list_for_each_entry(priv, &tn->priv_list, list) {
65 switch (event) { 73 switch (event) {
66 case NETDEV_REGISTER: 74 case NETDEV_REGISTER:
67 if (!strcmp(dev->name, priv->tginfo->oif)) 75 if (!strcmp(dev->name, priv->tginfo->oif))
@@ -79,13 +87,14 @@ static int tee_netdev_event(struct notifier_block *this, unsigned long event,
79 break; 87 break;
80 } 88 }
81 } 89 }
82 mutex_unlock(&priv_list_mutex); 90 mutex_unlock(&tn->lock);
83 91
84 return NOTIFY_DONE; 92 return NOTIFY_DONE;
85} 93}
86 94
87static int tee_tg_check(const struct xt_tgchk_param *par) 95static int tee_tg_check(const struct xt_tgchk_param *par)
88{ 96{
97 struct tee_net *tn = net_generic(par->net, tee_net_id);
89 struct xt_tee_tginfo *info = par->targinfo; 98 struct xt_tee_tginfo *info = par->targinfo;
90 struct xt_tee_priv *priv; 99 struct xt_tee_priv *priv;
91 100
@@ -95,6 +104,8 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
95 return -EINVAL; 104 return -EINVAL;
96 105
97 if (info->oif[0]) { 106 if (info->oif[0]) {
107 struct net_device *dev;
108
98 if (info->oif[sizeof(info->oif)-1] != '\0') 109 if (info->oif[sizeof(info->oif)-1] != '\0')
99 return -EINVAL; 110 return -EINVAL;
100 111
@@ -106,9 +117,14 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
106 priv->oif = -1; 117 priv->oif = -1;
107 info->priv = priv; 118 info->priv = priv;
108 119
109 mutex_lock(&priv_list_mutex); 120 dev = dev_get_by_name(par->net, info->oif);
110 list_add(&priv->list, &priv_list); 121 if (dev) {
111 mutex_unlock(&priv_list_mutex); 122 priv->oif = dev->ifindex;
123 dev_put(dev);
124 }
125 mutex_lock(&tn->lock);
126 list_add(&priv->list, &tn->priv_list);
127 mutex_unlock(&tn->lock);
112 } else 128 } else
113 info->priv = NULL; 129 info->priv = NULL;
114 130
@@ -118,12 +134,13 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
118 134
119static void tee_tg_destroy(const struct xt_tgdtor_param *par) 135static void tee_tg_destroy(const struct xt_tgdtor_param *par)
120{ 136{
137 struct tee_net *tn = net_generic(par->net, tee_net_id);
121 struct xt_tee_tginfo *info = par->targinfo; 138 struct xt_tee_tginfo *info = par->targinfo;
122 139
123 if (info->priv) { 140 if (info->priv) {
124 mutex_lock(&priv_list_mutex); 141 mutex_lock(&tn->lock);
125 list_del(&info->priv->list); 142 list_del(&info->priv->list);
126 mutex_unlock(&priv_list_mutex); 143 mutex_unlock(&tn->lock);
127 kfree(info->priv); 144 kfree(info->priv);
128 } 145 }
129 static_key_slow_dec(&xt_tee_enabled); 146 static_key_slow_dec(&xt_tee_enabled);
@@ -156,6 +173,21 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
156#endif 173#endif
157}; 174};
158 175
176static int __net_init tee_net_init(struct net *net)
177{
178 struct tee_net *tn = net_generic(net, tee_net_id);
179
180 INIT_LIST_HEAD(&tn->priv_list);
181 mutex_init(&tn->lock);
182 return 0;
183}
184
185static struct pernet_operations tee_net_ops = {
186 .init = tee_net_init,
187 .id = &tee_net_id,
188 .size = sizeof(struct tee_net),
189};
190
159static struct notifier_block tee_netdev_notifier = { 191static struct notifier_block tee_netdev_notifier = {
160 .notifier_call = tee_netdev_event, 192 .notifier_call = tee_netdev_event,
161}; 193};
@@ -164,22 +196,32 @@ static int __init tee_tg_init(void)
164{ 196{
165 int ret; 197 int ret;
166 198
167 ret = xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); 199 ret = register_pernet_subsys(&tee_net_ops);
168 if (ret) 200 if (ret < 0)
169 return ret; 201 return ret;
202
203 ret = xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
204 if (ret < 0)
205 goto cleanup_subsys;
206
170 ret = register_netdevice_notifier(&tee_netdev_notifier); 207 ret = register_netdevice_notifier(&tee_netdev_notifier);
171 if (ret) { 208 if (ret < 0)
172 xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); 209 goto unregister_targets;
173 return ret;
174 }
175 210
176 return 0; 211 return 0;
212
213unregister_targets:
214 xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
215cleanup_subsys:
216 unregister_pernet_subsys(&tee_net_ops);
217 return ret;
177} 218}
178 219
179static void __exit tee_tg_exit(void) 220static void __exit tee_tg_exit(void)
180{ 221{
181 unregister_netdevice_notifier(&tee_netdev_notifier); 222 unregister_netdevice_notifier(&tee_netdev_notifier);
182 xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); 223 xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
224 unregister_pernet_subsys(&tee_net_ops);
183} 225}
184 226
185module_init(tee_tg_init); 227module_init(tee_tg_init);
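
The TEE conversion above drops the single global priv_list/priv_list_mutex in favour of per-namespace state: register_pernet_subsys() allocates one struct tee_net per struct net, and net_generic() retrieves it wherever a namespace pointer is available (par->net in checkentry/destroy, dev_net(dev) in the notifier). A generic sketch of that pattern, with illustrative names rather than the patch's own:

#include <linux/list.h>
#include <linux/mutex.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

struct example_net {
	struct list_head items;
	struct mutex lock;	/* protects items */
};

static unsigned int example_net_id __read_mostly;

static int __net_init example_net_init(struct net *net)
{
	struct example_net *en = net_generic(net, example_net_id);

	INIT_LIST_HEAD(&en->items);
	mutex_init(&en->lock);
	return 0;
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.id   = &example_net_id,
	.size = sizeof(struct example_net),
};

/* register_pernet_subsys(&example_net_ops) in module init allocates the
 * per-namespace area; unregister_pernet_subsys() in module exit frees it.
 */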
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index 5d92e1781980..5cb1ecb29ea4 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -68,6 +68,38 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par)
68 return 0; 68 return 0;
69} 69}
70 70
71static int cgroup_mt_check_v2(const struct xt_mtchk_param *par)
72{
73 struct xt_cgroup_info_v2 *info = par->matchinfo;
74 struct cgroup *cgrp;
75
76 if ((info->invert_path & ~1) || (info->invert_classid & ~1))
77 return -EINVAL;
78
79 if (!info->has_path && !info->has_classid) {
80 pr_info("xt_cgroup: no path or classid specified\n");
81 return -EINVAL;
82 }
83
84 if (info->has_path && info->has_classid) {
85 pr_info_ratelimited("path and classid specified\n");
86 return -EINVAL;
87 }
88
89 info->priv = NULL;
90 if (info->has_path) {
91 cgrp = cgroup_get_from_path(info->path);
92 if (IS_ERR(cgrp)) {
93 pr_info_ratelimited("invalid path, errno=%ld\n",
94 PTR_ERR(cgrp));
95 return -EINVAL;
96 }
97 info->priv = cgrp;
98 }
99
100 return 0;
101}
102
71static bool 103static bool
72cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) 104cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
73{ 105{
@@ -99,6 +131,24 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
99 info->invert_classid; 131 info->invert_classid;
100} 132}
101 133
134static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)
135{
136 const struct xt_cgroup_info_v2 *info = par->matchinfo;
137 struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data;
138 struct cgroup *ancestor = info->priv;
139 struct sock *sk = skb->sk;
140
141 if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk)))
142 return false;
143
144 if (ancestor)
145 return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^
146 info->invert_path;
147 else
148 return (info->classid == sock_cgroup_classid(skcd)) ^
149 info->invert_classid;
150}
151
102static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par) 152static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par)
103{ 153{
104 struct xt_cgroup_info_v1 *info = par->matchinfo; 154 struct xt_cgroup_info_v1 *info = par->matchinfo;
@@ -107,6 +157,14 @@ static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par)
107 cgroup_put(info->priv); 157 cgroup_put(info->priv);
108} 158}
109 159
160static void cgroup_mt_destroy_v2(const struct xt_mtdtor_param *par)
161{
162 struct xt_cgroup_info_v2 *info = par->matchinfo;
163
164 if (info->priv)
165 cgroup_put(info->priv);
166}
167
110static struct xt_match cgroup_mt_reg[] __read_mostly = { 168static struct xt_match cgroup_mt_reg[] __read_mostly = {
111 { 169 {
112 .name = "cgroup", 170 .name = "cgroup",
@@ -134,6 +192,20 @@ static struct xt_match cgroup_mt_reg[] __read_mostly = {
134 (1 << NF_INET_POST_ROUTING) | 192 (1 << NF_INET_POST_ROUTING) |
135 (1 << NF_INET_LOCAL_IN), 193 (1 << NF_INET_LOCAL_IN),
136 }, 194 },
195 {
196 .name = "cgroup",
197 .revision = 2,
198 .family = NFPROTO_UNSPEC,
199 .checkentry = cgroup_mt_check_v2,
200 .match = cgroup_mt_v2,
201 .matchsize = sizeof(struct xt_cgroup_info_v2),
202 .usersize = offsetof(struct xt_cgroup_info_v2, priv),
203 .destroy = cgroup_mt_destroy_v2,
204 .me = THIS_MODULE,
205 .hooks = (1 << NF_INET_LOCAL_OUT) |
206 (1 << NF_INET_POST_ROUTING) |
207 (1 << NF_INET_LOCAL_IN),
208 },
137}; 209};
138 210
139static int __init cgroup_mt_init(void) 211static int __init cgroup_mt_init(void)
diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c
index 8af9707f8789..ac91170fc8c8 100644
--- a/net/netfilter/xt_nat.c
+++ b/net/netfilter/xt_nat.c
@@ -216,6 +216,8 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
216 { 216 {
217 .name = "DNAT", 217 .name = "DNAT",
218 .revision = 2, 218 .revision = 2,
219 .checkentry = xt_nat_checkentry,
220 .destroy = xt_nat_destroy,
219 .target = xt_dnat_target_v2, 221 .target = xt_dnat_target_v2,
220 .targetsize = sizeof(struct nf_nat_range2), 222 .targetsize = sizeof(struct nf_nat_range2),
221 .table = "nat", 223 .table = "nat",
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index bf7bba80e24c..7a103553d10d 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -40,14 +40,8 @@
40static bool 40static bool
41xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) 41xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
42{ 42{
43 const struct xt_osf_info *info = p->matchinfo;
44 struct net *net = xt_net(p);
45
46 if (!info)
47 return false;
48
49 return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p), 43 return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p),
50 xt_out(p), info, net, nf_osf_fingers); 44 xt_out(p), p->matchinfo, xt_net(p), nf_osf_fingers);
51} 45}
52 46
53static struct xt_match xt_osf_match = { 47static struct xt_match xt_osf_match = {
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 930d17fa906c..6bb9f3cde0b0 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -574,11 +574,6 @@ static int netlink_insert(struct sock *sk, u32 portid)
574 if (nlk_sk(sk)->bound) 574 if (nlk_sk(sk)->bound)
575 goto err; 575 goto err;
576 576
577 err = -ENOMEM;
578 if (BITS_PER_LONG > 32 &&
579 unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX))
580 goto err;
581
582 nlk_sk(sk)->portid = portid; 577 nlk_sk(sk)->portid = portid;
583 sock_hold(sk); 578 sock_hold(sk);
584 579
@@ -993,7 +988,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr,
993 struct netlink_sock *nlk = nlk_sk(sk); 988 struct netlink_sock *nlk = nlk_sk(sk);
994 struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; 989 struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
995 int err = 0; 990 int err = 0;
996 long unsigned int groups = nladdr->nl_groups; 991 unsigned long groups = nladdr->nl_groups;
997 bool bound; 992 bool bound;
998 993
999 if (addr_len < sizeof(struct sockaddr_nl)) 994 if (addr_len < sizeof(struct sockaddr_nl))
@@ -1011,9 +1006,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr,
1011 return err; 1006 return err;
1012 } 1007 }
1013 1008
1014 if (nlk->ngroups == 0) 1009 if (nlk->ngroups < BITS_PER_LONG)
1015 groups = 0;
1016 else if (nlk->ngroups < 8*sizeof(groups))
1017 groups &= (1UL << nlk->ngroups) - 1; 1010 groups &= (1UL << nlk->ngroups) - 1;
1018 1011
1019 bound = nlk->bound; 1012 bound = nlk->bound;
@@ -1713,6 +1706,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
1713 nlk->flags &= ~NETLINK_F_EXT_ACK; 1706 nlk->flags &= ~NETLINK_F_EXT_ACK;
1714 err = 0; 1707 err = 0;
1715 break; 1708 break;
1709 case NETLINK_DUMP_STRICT_CHK:
1710 if (val)
1711 nlk->flags |= NETLINK_F_STRICT_CHK;
1712 else
1713 nlk->flags &= ~NETLINK_F_STRICT_CHK;
1714 err = 0;
1715 break;
1716 default: 1716 default:
1717 err = -ENOPROTOOPT; 1717 err = -ENOPROTOOPT;
1718 } 1718 }
@@ -1806,6 +1806,15 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname,
1806 return -EFAULT; 1806 return -EFAULT;
1807 err = 0; 1807 err = 0;
1808 break; 1808 break;
1809 case NETLINK_DUMP_STRICT_CHK:
1810 if (len < sizeof(int))
1811 return -EINVAL;
1812 len = sizeof(int);
1813 val = nlk->flags & NETLINK_F_STRICT_CHK ? 1 : 0;
1814 if (put_user(len, optlen) || put_user(val, optval))
1815 return -EFAULT;
1816 err = 0;
1817 break;
1809 default: 1818 default:
1810 err = -ENOPROTOOPT; 1819 err = -ENOPROTOOPT;
1811 } 1820 }
@@ -2178,6 +2187,7 @@ EXPORT_SYMBOL(__nlmsg_put);
2178static int netlink_dump(struct sock *sk) 2187static int netlink_dump(struct sock *sk)
2179{ 2188{
2180 struct netlink_sock *nlk = nlk_sk(sk); 2189 struct netlink_sock *nlk = nlk_sk(sk);
2190 struct netlink_ext_ack extack = {};
2181 struct netlink_callback *cb; 2191 struct netlink_callback *cb;
2182 struct sk_buff *skb = NULL; 2192 struct sk_buff *skb = NULL;
2183 struct nlmsghdr *nlh; 2193 struct nlmsghdr *nlh;
@@ -2229,8 +2239,11 @@ static int netlink_dump(struct sock *sk)
2229 skb_reserve(skb, skb_tailroom(skb) - alloc_size); 2239 skb_reserve(skb, skb_tailroom(skb) - alloc_size);
2230 netlink_skb_set_owner_r(skb, sk); 2240 netlink_skb_set_owner_r(skb, sk);
2231 2241
2232 if (nlk->dump_done_errno > 0) 2242 if (nlk->dump_done_errno > 0) {
2243 cb->extack = &extack;
2233 nlk->dump_done_errno = cb->dump(skb, cb); 2244 nlk->dump_done_errno = cb->dump(skb, cb);
2245 cb->extack = NULL;
2246 }
2234 2247
2235 if (nlk->dump_done_errno > 0 || 2248 if (nlk->dump_done_errno > 0 ||
2236 skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) { 2249 skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) {
@@ -2244,7 +2257,8 @@ static int netlink_dump(struct sock *sk)
2244 } 2257 }
2245 2258
2246 nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, 2259 nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE,
2247 sizeof(nlk->dump_done_errno), NLM_F_MULTI); 2260 sizeof(nlk->dump_done_errno),
2261 NLM_F_MULTI | cb->answer_flags);
2248 if (WARN_ON(!nlh)) 2262 if (WARN_ON(!nlh))
2249 goto errout_skb; 2263 goto errout_skb;
2250 2264
@@ -2253,6 +2267,12 @@ static int netlink_dump(struct sock *sk)
2253 memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, 2267 memcpy(nlmsg_data(nlh), &nlk->dump_done_errno,
2254 sizeof(nlk->dump_done_errno)); 2268 sizeof(nlk->dump_done_errno));
2255 2269
2270 if (extack._msg && nlk->flags & NETLINK_F_EXT_ACK) {
2271 nlh->nlmsg_flags |= NLM_F_ACK_TLVS;
2272 if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack._msg))
2273 nlmsg_end(skb, nlh);
2274 }
2275
2256 if (sk_filter(sk, skb)) 2276 if (sk_filter(sk, skb))
2257 kfree_skb(skb); 2277 kfree_skb(skb);
2258 else 2278 else
@@ -2279,9 +2299,9 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
2279 const struct nlmsghdr *nlh, 2299 const struct nlmsghdr *nlh,
2280 struct netlink_dump_control *control) 2300 struct netlink_dump_control *control)
2281{ 2301{
2302 struct netlink_sock *nlk, *nlk2;
2282 struct netlink_callback *cb; 2303 struct netlink_callback *cb;
2283 struct sock *sk; 2304 struct sock *sk;
2284 struct netlink_sock *nlk;
2285 int ret; 2305 int ret;
2286 2306
2287 refcount_inc(&skb->users); 2307 refcount_inc(&skb->users);
@@ -2315,6 +2335,9 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
2315 cb->min_dump_alloc = control->min_dump_alloc; 2335 cb->min_dump_alloc = control->min_dump_alloc;
2316 cb->skb = skb; 2336 cb->skb = skb;
2317 2337
2338 nlk2 = nlk_sk(NETLINK_CB(skb).sk);
2339 cb->strict_check = !!(nlk2->flags & NETLINK_F_STRICT_CHK);
2340
2318 if (control->start) { 2341 if (control->start) {
2319 ret = control->start(cb); 2342 ret = control->start(cb);
2320 if (ret) 2343 if (ret)
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index 962de7b3c023..5f454c8de6a4 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -15,6 +15,7 @@
15#define NETLINK_F_LISTEN_ALL_NSID 0x10 15#define NETLINK_F_LISTEN_ALL_NSID 0x10
16#define NETLINK_F_CAP_ACK 0x20 16#define NETLINK_F_CAP_ACK 0x20
17#define NETLINK_F_EXT_ACK 0x40 17#define NETLINK_F_EXT_ACK 0x40
18#define NETLINK_F_STRICT_CHK 0x80
18 19
19#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) 20#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
20#define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) 21#define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long))
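
NETLINK_DUMP_STRICT_CHK is a per-socket opt-in: once set, dumps started from that socket run with cb->strict_check true, letting handlers reject malformed or over-specified dump requests instead of silently ignoring them. A hedged userspace sketch, assuming libc and uapi headers recent enough to define SOL_NETLINK and NETLINK_DUMP_STRICT_CHK (error handling omitted):

#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int open_strict_rtnl_socket(void)
{
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	int one = 1;

	/* Opt in to strict validation of dump requests on this socket. */
	setsockopt(fd, SOL_NETLINK, NETLINK_DUMP_STRICT_CHK,
		   &one, sizeof(one));
	return fd;
}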
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index dd4adf8b1167..ae296273ce3d 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -556,7 +556,7 @@ static __poll_t llcp_sock_poll(struct file *file, struct socket *sock,
556 556
557 pr_debug("%p\n", sk); 557 pr_debug("%p\n", sk);
558 558
559 sock_poll_wait(file, wait); 559 sock_poll_wait(file, sock, wait);
560 560
561 if (sk->sk_state == LLCP_LISTEN) 561 if (sk->sk_state == LLCP_LISTEN)
562 return llcp_accept_poll(sk); 562 return llcp_accept_poll(sk);
diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c
index a66f102c6c01..78fe622eba65 100644
--- a/net/nfc/nci/uart.c
+++ b/net/nfc/nci/uart.c
@@ -192,10 +192,8 @@ static void nci_uart_tty_close(struct tty_struct *tty)
192 if (!nu) 192 if (!nu)
193 return; 193 return;
194 194
195 if (nu->tx_skb) 195 kfree_skb(nu->tx_skb);
196 kfree_skb(nu->tx_skb); 196 kfree_skb(nu->rx_skb);
197 if (nu->rx_skb)
198 kfree_skb(nu->rx_skb);
199 197
200 skb_queue_purge(&nu->tx_q); 198 skb_queue_purge(&nu->tx_q);
201 199
@@ -465,6 +463,7 @@ static struct tty_ldisc_ops nci_uart_ldisc = {
465 .receive_buf = nci_uart_tty_receive, 463 .receive_buf = nci_uart_tty_receive,
466 .write_wakeup = nci_uart_tty_wakeup, 464 .write_wakeup = nci_uart_tty_wakeup,
467 .ioctl = nci_uart_tty_ioctl, 465 .ioctl = nci_uart_tty_ioctl,
466 .compat_ioctl = nci_uart_tty_ioctl,
468}; 467};
469 468
470static int __init nci_uart_init(void) 469static int __init nci_uart_init(void)
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 35ae64cbef33..6bec37ab4472 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -933,6 +933,11 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
933 struct nf_conn *ct; 933 struct nf_conn *ct;
934 934
935 if (!cached) { 935 if (!cached) {
936 struct nf_hook_state state = {
937 .hook = NF_INET_PRE_ROUTING,
938 .pf = info->family,
939 .net = net,
940 };
936 struct nf_conn *tmpl = info->ct; 941 struct nf_conn *tmpl = info->ct;
937 int err; 942 int err;
938 943
@@ -944,8 +949,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
944 nf_ct_set(skb, tmpl, IP_CT_NEW); 949 nf_ct_set(skb, tmpl, IP_CT_NEW);
945 } 950 }
946 951
947 err = nf_conntrack_in(net, info->family, 952 err = nf_conntrack_in(skb, &state);
948 NF_INET_PRE_ROUTING, skb);
949 if (err != NF_ACCEPT) 953 if (err != NF_ACCEPT)
950 return -ENOENT; 954 return -ENOENT;
951 955
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 0f5ce77460d4..6679e96ab1dc 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1182,14 +1182,14 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
1182 ovs_header->dp_ifindex, 1182 ovs_header->dp_ifindex,
1183 reply, info->snd_portid, 1183 reply, info->snd_portid,
1184 info->snd_seq, 0, 1184 info->snd_seq, 0,
1185 OVS_FLOW_CMD_NEW, 1185 OVS_FLOW_CMD_SET,
1186 ufid_flags); 1186 ufid_flags);
1187 BUG_ON(error < 0); 1187 BUG_ON(error < 0);
1188 } 1188 }
1189 } else { 1189 } else {
1190 /* Could not alloc without acts before locking. */ 1190 /* Could not alloc without acts before locking. */
1191 reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, 1191 reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex,
1192 info, OVS_FLOW_CMD_NEW, false, 1192 info, OVS_FLOW_CMD_SET, false,
1193 ufid_flags); 1193 ufid_flags);
1194 1194
1195 if (IS_ERR(reply)) { 1195 if (IS_ERR(reply)) {
@@ -1265,7 +1265,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
1265 } 1265 }
1266 1266
1267 reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info, 1267 reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info,
1268 OVS_FLOW_CMD_NEW, true, ufid_flags); 1268 OVS_FLOW_CMD_GET, true, ufid_flags);
1269 if (IS_ERR(reply)) { 1269 if (IS_ERR(reply)) {
1270 err = PTR_ERR(reply); 1270 err = PTR_ERR(reply);
1271 goto unlock; 1271 goto unlock;
@@ -1389,7 +1389,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
1389 if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb, 1389 if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb,
1390 NETLINK_CB(cb->skb).portid, 1390 NETLINK_CB(cb->skb).portid,
1391 cb->nlh->nlmsg_seq, NLM_F_MULTI, 1391 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1392 OVS_FLOW_CMD_NEW, ufid_flags) < 0) 1392 OVS_FLOW_CMD_GET, ufid_flags) < 0)
1393 break; 1393 break;
1394 1394
1395 cb->args[0] = bucket; 1395 cb->args[0] = bucket;
@@ -1730,7 +1730,7 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
1730 ovs_dp_change(dp, info->attrs); 1730 ovs_dp_change(dp, info->attrs);
1731 1731
1732 err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, 1732 err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
1733 info->snd_seq, 0, OVS_DP_CMD_NEW); 1733 info->snd_seq, 0, OVS_DP_CMD_SET);
1734 BUG_ON(err < 0); 1734 BUG_ON(err < 0);
1735 1735
1736 ovs_unlock(); 1736 ovs_unlock();
@@ -1761,7 +1761,7 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
1761 goto err_unlock_free; 1761 goto err_unlock_free;
1762 } 1762 }
1763 err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, 1763 err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
1764 info->snd_seq, 0, OVS_DP_CMD_NEW); 1764 info->snd_seq, 0, OVS_DP_CMD_GET);
1765 BUG_ON(err < 0); 1765 BUG_ON(err < 0);
1766 ovs_unlock(); 1766 ovs_unlock();
1767 1767
@@ -1785,7 +1785,7 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
1785 if (i >= skip && 1785 if (i >= skip &&
1786 ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid, 1786 ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid,
1787 cb->nlh->nlmsg_seq, NLM_F_MULTI, 1787 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1788 OVS_DP_CMD_NEW) < 0) 1788 OVS_DP_CMD_GET) < 0)
1789 break; 1789 break;
1790 i++; 1790 i++;
1791 } 1791 }
@@ -2101,7 +2101,7 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
2101 2101
2102 err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), 2102 err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
2103 info->snd_portid, info->snd_seq, 0, 2103 info->snd_portid, info->snd_seq, 0,
2104 OVS_VPORT_CMD_NEW); 2104 OVS_VPORT_CMD_SET);
2105 BUG_ON(err < 0); 2105 BUG_ON(err < 0);
2106 2106
2107 ovs_unlock(); 2107 ovs_unlock();
@@ -2182,7 +2182,7 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
2182 goto exit_unlock_free; 2182 goto exit_unlock_free;
2183 err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), 2183 err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
2184 info->snd_portid, info->snd_seq, 0, 2184 info->snd_portid, info->snd_seq, 0,
2185 OVS_VPORT_CMD_NEW); 2185 OVS_VPORT_CMD_GET);
2186 BUG_ON(err < 0); 2186 BUG_ON(err < 0);
2187 rcu_read_unlock(); 2187 rcu_read_unlock();
2188 2188
@@ -2218,7 +2218,7 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
2218 NETLINK_CB(cb->skb).portid, 2218 NETLINK_CB(cb->skb).portid,
2219 cb->nlh->nlmsg_seq, 2219 cb->nlh->nlmsg_seq,
2220 NLM_F_MULTI, 2220 NLM_F_MULTI,
2221 OVS_VPORT_CMD_NEW) < 0) 2221 OVS_VPORT_CMD_GET) < 0)
2222 goto out; 2222 goto out;
2223 2223
2224 j++; 2224 j++;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 56b8e7167790..35966da84769 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -254,21 +254,18 @@ static bool icmphdr_ok(struct sk_buff *skb)
254 254
255static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) 255static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
256{ 256{
257 unsigned short frag_off;
258 unsigned int payload_ofs = 0;
257 unsigned int nh_ofs = skb_network_offset(skb); 259 unsigned int nh_ofs = skb_network_offset(skb);
258 unsigned int nh_len; 260 unsigned int nh_len;
259 int payload_ofs;
260 struct ipv6hdr *nh; 261 struct ipv6hdr *nh;
261 uint8_t nexthdr; 262 int err, nexthdr, flags = 0;
262 __be16 frag_off;
263 int err;
264 263
265 err = check_header(skb, nh_ofs + sizeof(*nh)); 264 err = check_header(skb, nh_ofs + sizeof(*nh));
266 if (unlikely(err)) 265 if (unlikely(err))
267 return err; 266 return err;
268 267
269 nh = ipv6_hdr(skb); 268 nh = ipv6_hdr(skb);
270 nexthdr = nh->nexthdr;
271 payload_ofs = (u8 *)(nh + 1) - skb->data;
272 269
273 key->ip.proto = NEXTHDR_NONE; 270 key->ip.proto = NEXTHDR_NONE;
274 key->ip.tos = ipv6_get_dsfield(nh); 271 key->ip.tos = ipv6_get_dsfield(nh);
@@ -277,10 +274,9 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
277 key->ipv6.addr.src = nh->saddr; 274 key->ipv6.addr.src = nh->saddr;
278 key->ipv6.addr.dst = nh->daddr; 275 key->ipv6.addr.dst = nh->daddr;
279 276
280 payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off); 277 nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags);
281 278 if (flags & IP6_FH_F_FRAG) {
282 if (frag_off) { 279 if (frag_off)
283 if (frag_off & htons(~0x7))
284 key->ip.frag = OVS_FRAG_TYPE_LATER; 280 key->ip.frag = OVS_FRAG_TYPE_LATER;
285 else 281 else
286 key->ip.frag = OVS_FRAG_TYPE_FIRST; 282 key->ip.frag = OVS_FRAG_TYPE_FIRST;
@@ -288,11 +284,11 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
288 key->ip.frag = OVS_FRAG_TYPE_NONE; 284 key->ip.frag = OVS_FRAG_TYPE_NONE;
289 } 285 }
290 286
291 /* Delayed handling of error in ipv6_skip_exthdr() as it 287 /* Delayed handling of error in ipv6_find_hdr() as it
292 * always sets frag_off to a valid value which may be 288 * always sets flags and frag_off to a valid value which may be
293 * used to set key->ip.frag above. 289 * used to set key->ip.frag above.
294 */ 290 */
295 if (unlikely(payload_ofs < 0)) 291 if (unlikely(nexthdr < 0))
296 return -EPROTO; 292 return -EPROTO;
297 293
298 nh_len = payload_ofs - nh_ofs; 294 nh_len = payload_ofs - nh_ofs;
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index bb95c43aae76..26f71cbf7527 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -43,7 +43,8 @@ static struct internal_dev *internal_dev_priv(struct net_device *netdev)
43} 43}
44 44
45/* Called with rcu_read_lock_bh. */ 45/* Called with rcu_read_lock_bh. */
46static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) 46static netdev_tx_t
47internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
47{ 48{
48 int len, err; 49 int len, err;
49 50
@@ -62,7 +63,7 @@ static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
62 } else { 63 } else {
63 netdev->stats.tx_errors++; 64 netdev->stats.tx_errors++;
64 } 65 }
65 return 0; 66 return NETDEV_TX_OK;
66} 67}
67 68
68static int internal_dev_open(struct net_device *netdev) 69static int internal_dev_open(struct net_device *netdev)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index d6e94dc7e290..ec3095f13aae 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3808,6 +3808,20 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
3808 3808
3809 return fanout_set_data(po, optval, optlen); 3809 return fanout_set_data(po, optval, optlen);
3810 } 3810 }
3811 case PACKET_IGNORE_OUTGOING:
3812 {
3813 int val;
3814
3815 if (optlen != sizeof(val))
3816 return -EINVAL;
3817 if (copy_from_user(&val, optval, sizeof(val)))
3818 return -EFAULT;
3819 if (val < 0 || val > 1)
3820 return -EINVAL;
3821
3822 po->prot_hook.ignore_outgoing = !!val;
3823 return 0;
3824 }
3811 case PACKET_TX_HAS_OFF: 3825 case PACKET_TX_HAS_OFF:
3812 { 3826 {
3813 unsigned int val; 3827 unsigned int val;
@@ -3931,6 +3945,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
3931 ((u32)po->fanout->flags << 24)) : 3945 ((u32)po->fanout->flags << 24)) :
3932 0); 3946 0);
3933 break; 3947 break;
3948 case PACKET_IGNORE_OUTGOING:
3949 val = po->prot_hook.ignore_outgoing;
3950 break;
3934 case PACKET_ROLLOVER_STATS: 3951 case PACKET_ROLLOVER_STATS:
3935 if (!po->rollover) 3952 if (!po->rollover)
3936 return -EINVAL; 3953 return -EINVAL;
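
PACKET_IGNORE_OUTGOING flips prot_hook.ignore_outgoing, so the AF_PACKET socket no longer receives looped-back copies of frames the host itself transmits; capture tools that only care about ingress traffic can use it to cut noise. A hedged userspace sketch, assuming headers that already carry the new option (error handling omitted):

#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>

int open_rx_only_packet_socket(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	int one = 1;

	/* Stop delivery of locally generated (outgoing) packets. */
	setsockopt(fd, SOL_PACKET, PACKET_IGNORE_OUTGOING,
		   &one, sizeof(one));
	return fd;
}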
diff --git a/net/rds/rds.h b/net/rds/rds.h
index c4dcf654d8fe..6bfaf05b63b2 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -278,7 +278,7 @@ struct rds_incoming {
278 struct in6_addr i_saddr; 278 struct in6_addr i_saddr;
279 279
280 rds_rdma_cookie_t i_rdma_cookie; 280 rds_rdma_cookie_t i_rdma_cookie;
281 struct timeval i_rx_tstamp; 281 ktime_t i_rx_tstamp;
282 u64 i_rx_lat_trace[RDS_RX_MAX_TRACES]; 282 u64 i_rx_lat_trace[RDS_RX_MAX_TRACES];
283}; 283};
284 284
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 504cd6bcc54c..727639dac8a7 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -43,18 +43,14 @@
43void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, 43void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
44 struct in6_addr *saddr) 44 struct in6_addr *saddr)
45{ 45{
46 int i;
47
48 refcount_set(&inc->i_refcount, 1); 46 refcount_set(&inc->i_refcount, 1);
49 INIT_LIST_HEAD(&inc->i_item); 47 INIT_LIST_HEAD(&inc->i_item);
50 inc->i_conn = conn; 48 inc->i_conn = conn;
51 inc->i_saddr = *saddr; 49 inc->i_saddr = *saddr;
52 inc->i_rdma_cookie = 0; 50 inc->i_rdma_cookie = 0;
53 inc->i_rx_tstamp.tv_sec = 0; 51 inc->i_rx_tstamp = ktime_set(0, 0);
54 inc->i_rx_tstamp.tv_usec = 0;
55 52
56 for (i = 0; i < RDS_RX_MAX_TRACES; i++) 53 memset(inc->i_rx_lat_trace, 0, sizeof(inc->i_rx_lat_trace));
57 inc->i_rx_lat_trace[i] = 0;
58} 54}
59EXPORT_SYMBOL_GPL(rds_inc_init); 55EXPORT_SYMBOL_GPL(rds_inc_init);
60 56
@@ -67,8 +63,7 @@ void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp,
67 inc->i_conn_path = cp; 63 inc->i_conn_path = cp;
68 inc->i_saddr = *saddr; 64 inc->i_saddr = *saddr;
69 inc->i_rdma_cookie = 0; 65 inc->i_rdma_cookie = 0;
70 inc->i_rx_tstamp.tv_sec = 0; 66 inc->i_rx_tstamp = ktime_set(0, 0);
71 inc->i_rx_tstamp.tv_usec = 0;
72} 67}
73EXPORT_SYMBOL_GPL(rds_inc_path_init); 68EXPORT_SYMBOL_GPL(rds_inc_path_init);
74 69
@@ -385,7 +380,7 @@ void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
385 be32_to_cpu(inc->i_hdr.h_len), 380 be32_to_cpu(inc->i_hdr.h_len),
386 inc->i_hdr.h_dport); 381 inc->i_hdr.h_dport);
387 if (sock_flag(sk, SOCK_RCVTSTAMP)) 382 if (sock_flag(sk, SOCK_RCVTSTAMP))
388 do_gettimeofday(&inc->i_rx_tstamp); 383 inc->i_rx_tstamp = ktime_get_real();
389 rds_inc_addref(inc); 384 rds_inc_addref(inc);
390 inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock(); 385 inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock();
391 list_add_tail(&inc->i_item, &rs->rs_recv_queue); 386 list_add_tail(&inc->i_item, &rs->rs_recv_queue);
@@ -552,11 +547,11 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
552 goto out; 547 goto out;
553 } 548 }
554 549
555 if ((inc->i_rx_tstamp.tv_sec != 0) && 550 if ((inc->i_rx_tstamp != 0) &&
556 sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) { 551 sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) {
552 struct timeval tv = ktime_to_timeval(inc->i_rx_tstamp);
557 ret = put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, 553 ret = put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
558 sizeof(struct timeval), 554 sizeof(tv), &tv);
559 &inc->i_rx_tstamp);
560 if (ret) 555 if (ret)
561 goto out; 556 goto out;
562 } 557 }
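
The rds.h and recv.c hunks above replace the struct timeval receive timestamp with a ktime_t, so initialisation, stamping and the SCM_TIMESTAMP control-message conversion all go through the ktime helpers. A reduced sketch of the pattern, outside any RDS context and purely illustrative:

    #include <linux/ktime.h>
    #include <linux/time.h>

    /* The stamp is kept in nanosecond-resolution ktime_t and only converted
     * back to the microsecond struct timeval at the SCM_TIMESTAMP boundary,
     * mirroring the rds_cmsg_recv() change above. */
    static void example_stamp(void)
    {
            ktime_t stamp = ktime_get_real();               /* wall clock, ns */
            struct timeval tv = ktime_to_timeval(stamp);    /* truncated to us */

            (void)tv;
    }
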
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 1355f5ca8d22..abca57040f37 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -510,8 +510,8 @@ void rfkill_remove_epo_lock(void)
510/** 510/**
511 * rfkill_is_epo_lock_active - returns true EPO is active 511 * rfkill_is_epo_lock_active - returns true EPO is active
512 * 512 *
513 * Returns 0 (false) if there is NOT an active EPO contidion, 513 * Returns 0 (false) if there is NOT an active EPO condition,
514 * and 1 (true) if there is an active EPO contition, which 514 * and 1 (true) if there is an active EPO condition, which
515 * locks all radios in one of the BLOCKED states. 515 * locks all radios in one of the BLOCKED states.
516 * 516 *
517 * Can be called in atomic context. 517 * Can be called in atomic context.
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index ac44d8afffb1..64362d078da8 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -97,7 +97,8 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx,
97 srx->transport_len > len) 97 srx->transport_len > len)
98 return -EINVAL; 98 return -EINVAL;
99 99
100 if (srx->transport.family != rx->family) 100 if (srx->transport.family != rx->family &&
101 srx->transport.family == AF_INET && rx->family != AF_INET6)
101 return -EAFNOSUPPORT; 102 return -EAFNOSUPPORT;
102 103
103 switch (srx->transport.family) { 104 switch (srx->transport.family) {
@@ -385,6 +386,20 @@ u32 rxrpc_kernel_check_life(struct socket *sock, struct rxrpc_call *call)
385EXPORT_SYMBOL(rxrpc_kernel_check_life); 386EXPORT_SYMBOL(rxrpc_kernel_check_life);
386 387
387/** 388/**
389 * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call.
390 * @sock: The socket the call is on
391 * @call: The call to query
392 *
393 * Allow a kernel service to retrieve the epoch value from a service call to
394 * see if the client at the other end rebooted.
395 */
396u32 rxrpc_kernel_get_epoch(struct socket *sock, struct rxrpc_call *call)
397{
398 return call->conn->proto.epoch;
399}
400EXPORT_SYMBOL(rxrpc_kernel_get_epoch);
401
402/**
388 * rxrpc_kernel_check_call - Check a call's state 403 * rxrpc_kernel_check_call - Check a call's state
389 * @sock: The socket the call is on 404 * @sock: The socket the call is on
390 * @call: The call to check 405 * @call: The call to check
@@ -741,7 +756,7 @@ static __poll_t rxrpc_poll(struct file *file, struct socket *sock,
741 struct rxrpc_sock *rx = rxrpc_sk(sk); 756 struct rxrpc_sock *rx = rxrpc_sk(sk);
742 __poll_t mask; 757 __poll_t mask;
743 758
744 sock_poll_wait(file, wait); 759 sock_poll_wait(file, sock, wait);
745 mask = 0; 760 mask = 0;
746 761
747 /* the socket is readable if there are any messages waiting on the Rx 762 /* the socket is readable if there are any messages waiting on the Rx
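
The new rxrpc_kernel_get_epoch() exported above lets an in-kernel rxrpc user read the connection epoch on a service call to spot a rebooted peer. A sketch of plausible caller-side use; the remembered_epoch state and the function itself are hypothetical, not part of the rxrpc API:

    #include <net/af_rxrpc.h>

    /* Sketch only: detect a restart by comparing epochs across calls. */
    static bool example_peer_rebooted(struct socket *sock, struct rxrpc_call *call,
                                      u32 *remembered_epoch)
    {
            u32 epoch = rxrpc_kernel_get_epoch(sock, call);

            if (*remembered_epoch && epoch != *remembered_epoch)
                    return true;            /* epoch changed: other end restarted */

            *remembered_epoch = epoch;
            return false;
    }
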
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index a6e6cae82c30..382196e57a26 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -435,7 +435,7 @@ struct rxrpc_connection {
435 struct sk_buff_head rx_queue; /* received conn-level packets */ 435 struct sk_buff_head rx_queue; /* received conn-level packets */
436 const struct rxrpc_security *security; /* applied security module */ 436 const struct rxrpc_security *security; /* applied security module */
437 struct key *server_key; /* security for this service */ 437 struct key *server_key; /* security for this service */
438 struct crypto_skcipher *cipher; /* encryption handle */ 438 struct crypto_sync_skcipher *cipher; /* encryption handle */
439 struct rxrpc_crypt csum_iv; /* packet checksum base */ 439 struct rxrpc_crypt csum_iv; /* packet checksum base */
440 unsigned long flags; 440 unsigned long flags;
441 unsigned long events; 441 unsigned long events;
@@ -1062,6 +1062,7 @@ void rxrpc_put_peer(struct rxrpc_peer *);
1062 */ 1062 */
1063extern const struct seq_operations rxrpc_call_seq_ops; 1063extern const struct seq_operations rxrpc_call_seq_ops;
1064extern const struct seq_operations rxrpc_connection_seq_ops; 1064extern const struct seq_operations rxrpc_connection_seq_ops;
1065extern const struct seq_operations rxrpc_peer_seq_ops;
1065 1066
1066/* 1067/*
1067 * recvmsg.c 1068 * recvmsg.c
@@ -1098,7 +1099,6 @@ void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace);
1098void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace); 1099void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace);
1099void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace); 1100void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace);
1100void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace); 1101void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace);
1101void rxrpc_lose_skb(struct sk_buff *, enum rxrpc_skb_trace);
1102void rxrpc_purge_queue(struct sk_buff_head *); 1102void rxrpc_purge_queue(struct sk_buff_head *);
1103 1103
1104/* 1104/*
@@ -1115,8 +1115,7 @@ static inline void rxrpc_sysctl_exit(void) {}
1115/* 1115/*
1116 * utils.c 1116 * utils.c
1117 */ 1117 */
1118int rxrpc_extract_addr_from_skb(struct rxrpc_local *, struct sockaddr_rxrpc *, 1118int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *);
1119 struct sk_buff *);
1120 1119
1121static inline bool before(u32 seq1, u32 seq2) 1120static inline bool before(u32 seq1, u32 seq2)
1122{ 1121{
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 8079aacaecac..44860505246d 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -280,7 +280,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
280 peer = NULL; 280 peer = NULL;
281 if (!peer) { 281 if (!peer) {
282 peer = b->peer_backlog[peer_tail]; 282 peer = b->peer_backlog[peer_tail];
283 if (rxrpc_extract_addr_from_skb(local, &peer->srx, skb) < 0) 283 if (rxrpc_extract_addr_from_skb(&peer->srx, skb) < 0)
284 return NULL; 284 return NULL;
285 b->peer_backlog[peer_tail] = NULL; 285 b->peer_backlog[peer_tail] = NULL;
286 smp_store_release(&b->peer_backlog_tail, 286 smp_store_release(&b->peer_backlog_tail,
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 885dae829f4a..c332722820c2 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -86,11 +86,12 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local,
86 86
87 _enter(",%x", sp->hdr.cid & RXRPC_CIDMASK); 87 _enter(",%x", sp->hdr.cid & RXRPC_CIDMASK);
88 88
89 if (rxrpc_extract_addr_from_skb(local, &srx, skb) < 0) 89 if (rxrpc_extract_addr_from_skb(&srx, skb) < 0)
90 goto not_found; 90 goto not_found;
91 91
92 /* We may have to handle mixing IPv4 and IPv6 */ 92 if (srx.transport.family != local->srx.transport.family &&
93 if (srx.transport.family != local->srx.transport.family) { 93 (srx.transport.family == AF_INET &&
94 local->srx.transport.family != AF_INET6)) {
94 pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n", 95 pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n",
95 srx.transport.family, 96 srx.transport.family,
96 local->srx.transport.family); 97 local->srx.transport.family);
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 570b49d2da42..9128aa0e40aa 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -262,7 +262,7 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
262 while (list) { 262 while (list) {
263 skb = list; 263 skb = list;
264 list = skb->next; 264 list = skb->next;
265 skb->next = NULL; 265 skb_mark_not_on_list(skb);
266 rxrpc_free_skb(skb, rxrpc_skb_tx_freed); 266 rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
267 } 267 }
268 268
diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c
index 13bd8a4dfac7..927ead43df42 100644
--- a/net/rxrpc/local_event.c
+++ b/net/rxrpc/local_event.c
@@ -39,7 +39,7 @@ static void rxrpc_send_version_request(struct rxrpc_local *local,
39 39
40 _enter(""); 40 _enter("");
41 41
42 if (rxrpc_extract_addr_from_skb(local, &srx, skb) < 0) 42 if (rxrpc_extract_addr_from_skb(&srx, skb) < 0)
43 return; 43 return;
44 44
45 msg.msg_name = &srx.transport; 45 msg.msg_name = &srx.transport;
diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index 417d80867c4f..fd7eba8467fa 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -102,6 +102,9 @@ static __net_init int rxrpc_init_net(struct net *net)
102 proc_create_net("conns", 0444, rxnet->proc_net, 102 proc_create_net("conns", 0444, rxnet->proc_net,
103 &rxrpc_connection_seq_ops, 103 &rxrpc_connection_seq_ops,
104 sizeof(struct seq_net_private)); 104 sizeof(struct seq_net_private));
105 proc_create_net("peers", 0444, rxnet->proc_net,
106 &rxrpc_peer_seq_ops,
107 sizeof(struct seq_net_private));
105 return 0; 108 return 0;
106 109
107err_proc: 110err_proc:
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index a141ee3ab812..189418888839 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -378,11 +378,13 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
378 if ((lose++ & 7) == 7) { 378 if ((lose++ & 7) == 7) {
379 ret = 0; 379 ret = 0;
380 lost = true; 380 lost = true;
381 goto done;
382 } 381 }
383 } 382 }
384 383
385 _proto("Tx DATA %%%u { #%u }", serial, sp->hdr.seq); 384 trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags,
385 retrans, lost);
386 if (lost)
387 goto done;
386 388
387 /* send the packet with the don't fragment bit set if we currently 389 /* send the packet with the don't fragment bit set if we currently
388 * think it's small enough */ 390 * think it's small enough */
@@ -415,8 +417,6 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
415 goto send_fragmentable; 417 goto send_fragmentable;
416 418
417done: 419done:
418 trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags,
419 retrans, lost);
420 if (ret >= 0) { 420 if (ret >= 0) {
421 if (whdr.flags & RXRPC_REQUEST_ACK) { 421 if (whdr.flags & RXRPC_REQUEST_ACK) {
422 call->peer->rtt_last_req = skb->tstamp; 422 call->peer->rtt_last_req = skb->tstamp;
@@ -561,7 +561,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local)
561 continue; 561 continue;
562 } 562 }
563 563
564 if (rxrpc_extract_addr_from_skb(local, &srx, skb) == 0) { 564 if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) {
565 msg.msg_namelen = srx.transport_len; 565 msg.msg_namelen = srx.transport_len;
566 566
567 whdr.epoch = htonl(sp->hdr.epoch); 567 whdr.epoch = htonl(sp->hdr.epoch);
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index bd2fa3b7caa7..bc05af89fc38 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -47,6 +47,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local,
47 */ 47 */
48 switch (srx->transport.family) { 48 switch (srx->transport.family) {
49 case AF_INET: 49 case AF_INET:
50 srx->transport_len = sizeof(srx->transport.sin);
51 srx->transport.family = AF_INET;
50 srx->transport.sin.sin_port = serr->port; 52 srx->transport.sin.sin_port = serr->port;
51 switch (serr->ee.ee_origin) { 53 switch (serr->ee.ee_origin) {
52 case SO_EE_ORIGIN_ICMP: 54 case SO_EE_ORIGIN_ICMP:
@@ -70,20 +72,20 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local,
70 72
71#ifdef CONFIG_AF_RXRPC_IPV6 73#ifdef CONFIG_AF_RXRPC_IPV6
72 case AF_INET6: 74 case AF_INET6:
73 srx->transport.sin6.sin6_port = serr->port;
74 switch (serr->ee.ee_origin) { 75 switch (serr->ee.ee_origin) {
75 case SO_EE_ORIGIN_ICMP6: 76 case SO_EE_ORIGIN_ICMP6:
76 _net("Rx ICMP6"); 77 _net("Rx ICMP6");
78 srx->transport.sin6.sin6_port = serr->port;
77 memcpy(&srx->transport.sin6.sin6_addr, 79 memcpy(&srx->transport.sin6.sin6_addr,
78 skb_network_header(skb) + serr->addr_offset, 80 skb_network_header(skb) + serr->addr_offset,
79 sizeof(struct in6_addr)); 81 sizeof(struct in6_addr));
80 break; 82 break;
81 case SO_EE_ORIGIN_ICMP: 83 case SO_EE_ORIGIN_ICMP:
82 _net("Rx ICMP on v6 sock"); 84 _net("Rx ICMP on v6 sock");
83 srx->transport.sin6.sin6_addr.s6_addr32[0] = 0; 85 srx->transport_len = sizeof(srx->transport.sin);
84 srx->transport.sin6.sin6_addr.s6_addr32[1] = 0; 86 srx->transport.family = AF_INET;
85 srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff); 87 srx->transport.sin.sin_port = serr->port;
86 memcpy(srx->transport.sin6.sin6_addr.s6_addr + 12, 88 memcpy(&srx->transport.sin.sin_addr,
87 skb_network_header(skb) + serr->addr_offset, 89 skb_network_header(skb) + serr->addr_offset,
88 sizeof(struct in_addr)); 90 sizeof(struct in_addr));
89 break; 91 break;
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index 9805e3b85c36..c7d976859d40 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -212,3 +212,129 @@ const struct seq_operations rxrpc_connection_seq_ops = {
212 .stop = rxrpc_connection_seq_stop, 212 .stop = rxrpc_connection_seq_stop,
213 .show = rxrpc_connection_seq_show, 213 .show = rxrpc_connection_seq_show,
214}; 214};
215
216/*
217 * generate a list of extant virtual peers in /proc/net/rxrpc/peers
218 */
219static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)
220{
221 struct rxrpc_peer *peer;
222 time64_t now;
223 char lbuff[50], rbuff[50];
224
225 if (v == SEQ_START_TOKEN) {
226 seq_puts(seq,
227 "Proto Local "
228 " Remote "
229 " Use CW MTU LastUse RTT Rc\n"
230 );
231 return 0;
232 }
233
234 peer = list_entry(v, struct rxrpc_peer, hash_link);
235
236 sprintf(lbuff, "%pISpc", &peer->local->srx.transport);
237
238 sprintf(rbuff, "%pISpc", &peer->srx.transport);
239
240 now = ktime_get_seconds();
241 seq_printf(seq,
242 "UDP %-47.47s %-47.47s %3u"
243 " %3u %5u %6llus %12llu %2u\n",
244 lbuff,
245 rbuff,
246 atomic_read(&peer->usage),
247 peer->cong_cwnd,
248 peer->mtu,
249 now - peer->last_tx_at,
250 peer->rtt,
251 peer->rtt_cursor);
252
253 return 0;
254}
255
256static void *rxrpc_peer_seq_start(struct seq_file *seq, loff_t *_pos)
257 __acquires(rcu)
258{
259 struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq));
260 unsigned int bucket, n;
261 unsigned int shift = 32 - HASH_BITS(rxnet->peer_hash);
262 void *p;
263
264 rcu_read_lock();
265
266 if (*_pos >= UINT_MAX)
267 return NULL;
268
269 n = *_pos & ((1U << shift) - 1);
270 bucket = *_pos >> shift;
271 for (;;) {
272 if (bucket >= HASH_SIZE(rxnet->peer_hash)) {
273 *_pos = UINT_MAX;
274 return NULL;
275 }
276 if (n == 0) {
277 if (bucket == 0)
278 return SEQ_START_TOKEN;
279 *_pos += 1;
280 n++;
281 }
282
283 p = seq_hlist_start_rcu(&rxnet->peer_hash[bucket], n - 1);
284 if (p)
285 return p;
286 bucket++;
287 n = 1;
288 *_pos = (bucket << shift) | n;
289 }
290}
291
292static void *rxrpc_peer_seq_next(struct seq_file *seq, void *v, loff_t *_pos)
293{
294 struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq));
295 unsigned int bucket, n;
296 unsigned int shift = 32 - HASH_BITS(rxnet->peer_hash);
297 void *p;
298
299 if (*_pos >= UINT_MAX)
300 return NULL;
301
302 bucket = *_pos >> shift;
303
304 p = seq_hlist_next_rcu(v, &rxnet->peer_hash[bucket], _pos);
305 if (p)
306 return p;
307
308 for (;;) {
309 bucket++;
310 n = 1;
311 *_pos = (bucket << shift) | n;
312
313 if (bucket >= HASH_SIZE(rxnet->peer_hash)) {
314 *_pos = UINT_MAX;
315 return NULL;
316 }
317 if (n == 0) {
318 *_pos += 1;
319 n++;
320 }
321
322 p = seq_hlist_start_rcu(&rxnet->peer_hash[bucket], n - 1);
323 if (p)
324 return p;
325 }
326}
327
328static void rxrpc_peer_seq_stop(struct seq_file *seq, void *v)
329 __releases(rcu)
330{
331 rcu_read_unlock();
332}
333
334
335const struct seq_operations rxrpc_peer_seq_ops = {
336 .start = rxrpc_peer_seq_start,
337 .next = rxrpc_peer_seq_next,
338 .stop = rxrpc_peer_seq_stop,
339 .show = rxrpc_peer_seq_show,
340};
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 816b19a78809..eaf19ebaa964 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -715,3 +715,46 @@ call_complete:
715 goto out; 715 goto out;
716} 716}
717EXPORT_SYMBOL(rxrpc_kernel_recv_data); 717EXPORT_SYMBOL(rxrpc_kernel_recv_data);
718
719/**
720 * rxrpc_kernel_get_reply_time - Get timestamp on first reply packet
721 * @sock: The socket that the call exists on
722 * @call: The call to query
723 * @_ts: Where to put the timestamp
724 *
725 * Retrieve the timestamp from the first DATA packet of the reply if it is
726 * in the ring. Returns true if successful, false if not.
727 */
728bool rxrpc_kernel_get_reply_time(struct socket *sock, struct rxrpc_call *call,
729 ktime_t *_ts)
730{
731 struct sk_buff *skb;
732 rxrpc_seq_t hard_ack, top, seq;
733 bool success = false;
734
735 mutex_lock(&call->user_mutex);
736
737 if (READ_ONCE(call->state) != RXRPC_CALL_CLIENT_RECV_REPLY)
738 goto out;
739
740 hard_ack = call->rx_hard_ack;
741 if (hard_ack != 0)
742 goto out;
743
744 seq = hard_ack + 1;
745 top = smp_load_acquire(&call->rx_top);
746 if (after(seq, top))
747 goto out;
748
749 skb = call->rxtx_buffer[seq & RXRPC_RXTX_BUFF_MASK];
750 if (!skb)
751 goto out;
752
753 *_ts = skb_get_ktime(skb);
754 success = true;
755
756out:
757 mutex_unlock(&call->user_mutex);
758 return success;
759}
760EXPORT_SYMBOL(rxrpc_kernel_get_reply_time);
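
rxrpc_kernel_get_reply_time() above only succeeds while the call is in the client-receiving-reply state and the first DATA packet is still in the Rx ring. A minimal sketch of how a kernel-service caller might use it; the logging wrapper is hypothetical:

    #include <net/af_rxrpc.h>
    #include <linux/ktime.h>
    #include <linux/printk.h>

    /* Sketch only: report when the first reply DATA packet arrived, if known. */
    static void example_log_reply_time(struct socket *sock, struct rxrpc_call *call)
    {
            ktime_t ts;

            if (rxrpc_kernel_get_reply_time(sock, call, &ts))
                    pr_debug("first reply arrived at %lld ns\n", ktime_to_ns(ts));
    }
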
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index cea16838d588..cbef9ea43dec 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -46,7 +46,7 @@ struct rxkad_level2_hdr {
46 * alloc routine, but since we have it to hand, we use it to decrypt RESPONSE 46 * alloc routine, but since we have it to hand, we use it to decrypt RESPONSE
47 * packets 47 * packets
48 */ 48 */
49static struct crypto_skcipher *rxkad_ci; 49static struct crypto_sync_skcipher *rxkad_ci;
50static DEFINE_MUTEX(rxkad_ci_mutex); 50static DEFINE_MUTEX(rxkad_ci_mutex);
51 51
52/* 52/*
@@ -54,7 +54,7 @@ static DEFINE_MUTEX(rxkad_ci_mutex);
54 */ 54 */
55static int rxkad_init_connection_security(struct rxrpc_connection *conn) 55static int rxkad_init_connection_security(struct rxrpc_connection *conn)
56{ 56{
57 struct crypto_skcipher *ci; 57 struct crypto_sync_skcipher *ci;
58 struct rxrpc_key_token *token; 58 struct rxrpc_key_token *token;
59 int ret; 59 int ret;
60 60
@@ -63,14 +63,14 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn)
63 token = conn->params.key->payload.data[0]; 63 token = conn->params.key->payload.data[0];
64 conn->security_ix = token->security_index; 64 conn->security_ix = token->security_index;
65 65
66 ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC); 66 ci = crypto_alloc_sync_skcipher("pcbc(fcrypt)", 0, 0);
67 if (IS_ERR(ci)) { 67 if (IS_ERR(ci)) {
68 _debug("no cipher"); 68 _debug("no cipher");
69 ret = PTR_ERR(ci); 69 ret = PTR_ERR(ci);
70 goto error; 70 goto error;
71 } 71 }
72 72
73 if (crypto_skcipher_setkey(ci, token->kad->session_key, 73 if (crypto_sync_skcipher_setkey(ci, token->kad->session_key,
74 sizeof(token->kad->session_key)) < 0) 74 sizeof(token->kad->session_key)) < 0)
75 BUG(); 75 BUG();
76 76
@@ -104,7 +104,7 @@ error:
104static int rxkad_prime_packet_security(struct rxrpc_connection *conn) 104static int rxkad_prime_packet_security(struct rxrpc_connection *conn)
105{ 105{
106 struct rxrpc_key_token *token; 106 struct rxrpc_key_token *token;
107 SKCIPHER_REQUEST_ON_STACK(req, conn->cipher); 107 SYNC_SKCIPHER_REQUEST_ON_STACK(req, conn->cipher);
108 struct scatterlist sg; 108 struct scatterlist sg;
109 struct rxrpc_crypt iv; 109 struct rxrpc_crypt iv;
110 __be32 *tmpbuf; 110 __be32 *tmpbuf;
@@ -128,7 +128,7 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn)
128 tmpbuf[3] = htonl(conn->security_ix); 128 tmpbuf[3] = htonl(conn->security_ix);
129 129
130 sg_init_one(&sg, tmpbuf, tmpsize); 130 sg_init_one(&sg, tmpbuf, tmpsize);
131 skcipher_request_set_tfm(req, conn->cipher); 131 skcipher_request_set_sync_tfm(req, conn->cipher);
132 skcipher_request_set_callback(req, 0, NULL, NULL); 132 skcipher_request_set_callback(req, 0, NULL, NULL);
133 skcipher_request_set_crypt(req, &sg, &sg, tmpsize, iv.x); 133 skcipher_request_set_crypt(req, &sg, &sg, tmpsize, iv.x);
134 crypto_skcipher_encrypt(req); 134 crypto_skcipher_encrypt(req);
@@ -167,7 +167,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
167 memset(&iv, 0, sizeof(iv)); 167 memset(&iv, 0, sizeof(iv));
168 168
169 sg_init_one(&sg, sechdr, 8); 169 sg_init_one(&sg, sechdr, 8);
170 skcipher_request_set_tfm(req, call->conn->cipher); 170 skcipher_request_set_sync_tfm(req, call->conn->cipher);
171 skcipher_request_set_callback(req, 0, NULL, NULL); 171 skcipher_request_set_callback(req, 0, NULL, NULL);
172 skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); 172 skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x);
173 crypto_skcipher_encrypt(req); 173 crypto_skcipher_encrypt(req);
@@ -212,7 +212,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
212 memcpy(&iv, token->kad->session_key, sizeof(iv)); 212 memcpy(&iv, token->kad->session_key, sizeof(iv));
213 213
214 sg_init_one(&sg[0], sechdr, sizeof(rxkhdr)); 214 sg_init_one(&sg[0], sechdr, sizeof(rxkhdr));
215 skcipher_request_set_tfm(req, call->conn->cipher); 215 skcipher_request_set_sync_tfm(req, call->conn->cipher);
216 skcipher_request_set_callback(req, 0, NULL, NULL); 216 skcipher_request_set_callback(req, 0, NULL, NULL);
217 skcipher_request_set_crypt(req, &sg[0], &sg[0], sizeof(rxkhdr), iv.x); 217 skcipher_request_set_crypt(req, &sg[0], &sg[0], sizeof(rxkhdr), iv.x);
218 crypto_skcipher_encrypt(req); 218 crypto_skcipher_encrypt(req);
@@ -250,7 +250,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
250 void *sechdr) 250 void *sechdr)
251{ 251{
252 struct rxrpc_skb_priv *sp; 252 struct rxrpc_skb_priv *sp;
253 SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); 253 SYNC_SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
254 struct rxrpc_crypt iv; 254 struct rxrpc_crypt iv;
255 struct scatterlist sg; 255 struct scatterlist sg;
256 u32 x, y; 256 u32 x, y;
@@ -279,7 +279,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
279 call->crypto_buf[1] = htonl(x); 279 call->crypto_buf[1] = htonl(x);
280 280
281 sg_init_one(&sg, call->crypto_buf, 8); 281 sg_init_one(&sg, call->crypto_buf, 8);
282 skcipher_request_set_tfm(req, call->conn->cipher); 282 skcipher_request_set_sync_tfm(req, call->conn->cipher);
283 skcipher_request_set_callback(req, 0, NULL, NULL); 283 skcipher_request_set_callback(req, 0, NULL, NULL);
284 skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); 284 skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x);
285 crypto_skcipher_encrypt(req); 285 crypto_skcipher_encrypt(req);
@@ -352,7 +352,7 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
352 /* start the decryption afresh */ 352 /* start the decryption afresh */
353 memset(&iv, 0, sizeof(iv)); 353 memset(&iv, 0, sizeof(iv));
354 354
355 skcipher_request_set_tfm(req, call->conn->cipher); 355 skcipher_request_set_sync_tfm(req, call->conn->cipher);
356 skcipher_request_set_callback(req, 0, NULL, NULL); 356 skcipher_request_set_callback(req, 0, NULL, NULL);
357 skcipher_request_set_crypt(req, sg, sg, 8, iv.x); 357 skcipher_request_set_crypt(req, sg, sg, 8, iv.x);
358 crypto_skcipher_decrypt(req); 358 crypto_skcipher_decrypt(req);
@@ -450,7 +450,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
450 token = call->conn->params.key->payload.data[0]; 450 token = call->conn->params.key->payload.data[0];
451 memcpy(&iv, token->kad->session_key, sizeof(iv)); 451 memcpy(&iv, token->kad->session_key, sizeof(iv));
452 452
453 skcipher_request_set_tfm(req, call->conn->cipher); 453 skcipher_request_set_sync_tfm(req, call->conn->cipher);
454 skcipher_request_set_callback(req, 0, NULL, NULL); 454 skcipher_request_set_callback(req, 0, NULL, NULL);
455 skcipher_request_set_crypt(req, sg, sg, len, iv.x); 455 skcipher_request_set_crypt(req, sg, sg, len, iv.x);
456 crypto_skcipher_decrypt(req); 456 crypto_skcipher_decrypt(req);
@@ -506,7 +506,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
506 unsigned int offset, unsigned int len, 506 unsigned int offset, unsigned int len,
507 rxrpc_seq_t seq, u16 expected_cksum) 507 rxrpc_seq_t seq, u16 expected_cksum)
508{ 508{
509 SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); 509 SYNC_SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
510 struct rxrpc_crypt iv; 510 struct rxrpc_crypt iv;
511 struct scatterlist sg; 511 struct scatterlist sg;
512 bool aborted; 512 bool aborted;
@@ -529,7 +529,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
529 call->crypto_buf[1] = htonl(x); 529 call->crypto_buf[1] = htonl(x);
530 530
531 sg_init_one(&sg, call->crypto_buf, 8); 531 sg_init_one(&sg, call->crypto_buf, 8);
532 skcipher_request_set_tfm(req, call->conn->cipher); 532 skcipher_request_set_sync_tfm(req, call->conn->cipher);
533 skcipher_request_set_callback(req, 0, NULL, NULL); 533 skcipher_request_set_callback(req, 0, NULL, NULL);
534 skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); 534 skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x);
535 crypto_skcipher_encrypt(req); 535 crypto_skcipher_encrypt(req);
@@ -755,7 +755,7 @@ static void rxkad_encrypt_response(struct rxrpc_connection *conn,
755 struct rxkad_response *resp, 755 struct rxkad_response *resp,
756 const struct rxkad_key *s2) 756 const struct rxkad_key *s2)
757{ 757{
758 SKCIPHER_REQUEST_ON_STACK(req, conn->cipher); 758 SYNC_SKCIPHER_REQUEST_ON_STACK(req, conn->cipher);
759 struct rxrpc_crypt iv; 759 struct rxrpc_crypt iv;
760 struct scatterlist sg[1]; 760 struct scatterlist sg[1];
761 761
@@ -764,7 +764,7 @@ static void rxkad_encrypt_response(struct rxrpc_connection *conn,
764 764
765 sg_init_table(sg, 1); 765 sg_init_table(sg, 1);
766 sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted)); 766 sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted));
767 skcipher_request_set_tfm(req, conn->cipher); 767 skcipher_request_set_sync_tfm(req, conn->cipher);
768 skcipher_request_set_callback(req, 0, NULL, NULL); 768 skcipher_request_set_callback(req, 0, NULL, NULL);
769 skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); 769 skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x);
770 crypto_skcipher_encrypt(req); 770 crypto_skcipher_encrypt(req);
@@ -1021,7 +1021,7 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn,
1021 struct rxkad_response *resp, 1021 struct rxkad_response *resp,
1022 const struct rxrpc_crypt *session_key) 1022 const struct rxrpc_crypt *session_key)
1023{ 1023{
1024 SKCIPHER_REQUEST_ON_STACK(req, rxkad_ci); 1024 SYNC_SKCIPHER_REQUEST_ON_STACK(req, rxkad_ci);
1025 struct scatterlist sg[1]; 1025 struct scatterlist sg[1];
1026 struct rxrpc_crypt iv; 1026 struct rxrpc_crypt iv;
1027 1027
@@ -1031,7 +1031,7 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn,
1031 ASSERT(rxkad_ci != NULL); 1031 ASSERT(rxkad_ci != NULL);
1032 1032
1033 mutex_lock(&rxkad_ci_mutex); 1033 mutex_lock(&rxkad_ci_mutex);
1034 if (crypto_skcipher_setkey(rxkad_ci, session_key->x, 1034 if (crypto_sync_skcipher_setkey(rxkad_ci, session_key->x,
1035 sizeof(*session_key)) < 0) 1035 sizeof(*session_key)) < 0)
1036 BUG(); 1036 BUG();
1037 1037
@@ -1039,7 +1039,7 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn,
1039 1039
1040 sg_init_table(sg, 1); 1040 sg_init_table(sg, 1);
1041 sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted)); 1041 sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted));
1042 skcipher_request_set_tfm(req, rxkad_ci); 1042 skcipher_request_set_sync_tfm(req, rxkad_ci);
1043 skcipher_request_set_callback(req, 0, NULL, NULL); 1043 skcipher_request_set_callback(req, 0, NULL, NULL);
1044 skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); 1044 skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x);
1045 crypto_skcipher_decrypt(req); 1045 crypto_skcipher_decrypt(req);
@@ -1218,7 +1218,7 @@ static void rxkad_clear(struct rxrpc_connection *conn)
1218 _enter(""); 1218 _enter("");
1219 1219
1220 if (conn->cipher) 1220 if (conn->cipher)
1221 crypto_free_skcipher(conn->cipher); 1221 crypto_free_sync_skcipher(conn->cipher);
1222} 1222}
1223 1223
1224/* 1224/*
@@ -1228,7 +1228,7 @@ static int rxkad_init(void)
1228{ 1228{
1229 /* pin the cipher we need so that the crypto layer doesn't invoke 1229 /* pin the cipher we need so that the crypto layer doesn't invoke
1230 * keventd to go get it */ 1230 * keventd to go get it */
1231 rxkad_ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC); 1231 rxkad_ci = crypto_alloc_sync_skcipher("pcbc(fcrypt)", 0, 0);
1232 return PTR_ERR_OR_ZERO(rxkad_ci); 1232 return PTR_ERR_OR_ZERO(rxkad_ci);
1233} 1233}
1234 1234
@@ -1238,7 +1238,7 @@ static int rxkad_init(void)
1238static void rxkad_exit(void) 1238static void rxkad_exit(void)
1239{ 1239{
1240 if (rxkad_ci) 1240 if (rxkad_ci)
1241 crypto_free_skcipher(rxkad_ci); 1241 crypto_free_sync_skcipher(rxkad_ci);
1242} 1242}
1243 1243
1244/* 1244/*
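
The rxkad.c and ar-internal.h changes above switch the connection cipher from a crypto_skcipher allocated with the CRYPTO_ALG_ASYNC mask to the dedicated synchronous skcipher API, which permits on-stack requests. The calling pattern, reduced to a standalone sketch with an illustrative key and the same "pcbc(fcrypt)" algorithm name used by rxkad:

    #include <crypto/skcipher.h>
    #include <linux/err.h>
    #include <linux/scatterlist.h>

    /* Sketch of the crypto_sync_skcipher pattern adopted above. */
    static int example_sync_encrypt(const u8 *key, unsigned int keylen,
                                    struct scatterlist *sg, unsigned int len, u8 *iv)
    {
            struct crypto_sync_skcipher *tfm;
            int ret;

            tfm = crypto_alloc_sync_skcipher("pcbc(fcrypt)", 0, 0);
            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);

            ret = crypto_sync_skcipher_setkey(tfm, key, keylen);
            if (!ret) {
                    SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);

                    skcipher_request_set_sync_tfm(req, tfm);
                    skcipher_request_set_callback(req, 0, NULL, NULL);
                    skcipher_request_set_crypt(req, sg, sg, len, iv);
                    ret = crypto_skcipher_encrypt(req);
                    skcipher_request_zero(req);
            }

            crypto_free_sync_skcipher(tfm);
            return ret;
    }
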
diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c
index b8985d01876a..913dca65cc65 100644
--- a/net/rxrpc/skbuff.c
+++ b/net/rxrpc/skbuff.c
@@ -69,21 +69,6 @@ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
69} 69}
70 70
71/* 71/*
72 * Note the injected loss of a socket buffer.
73 */
74void rxrpc_lose_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
75{
76 const void *here = __builtin_return_address(0);
77 if (skb) {
78 int n;
79 CHECK_SLAB_OKAY(&skb->users);
80 n = atomic_dec_return(select_skb_count(op));
81 trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here);
82 kfree_skb(skb);
83 }
84}
85
86/*
87 * Clear a queue of socket buffers. 72 * Clear a queue of socket buffers.
88 */ 73 */
89void rxrpc_purge_queue(struct sk_buff_head *list) 74void rxrpc_purge_queue(struct sk_buff_head *list)
diff --git a/net/rxrpc/utils.c b/net/rxrpc/utils.c
index e801171fa351..ff7af71c4b49 100644
--- a/net/rxrpc/utils.c
+++ b/net/rxrpc/utils.c
@@ -17,28 +17,17 @@
17/* 17/*
18 * Fill out a peer address from a socket buffer containing a packet. 18 * Fill out a peer address from a socket buffer containing a packet.
19 */ 19 */
20int rxrpc_extract_addr_from_skb(struct rxrpc_local *local, 20int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *srx, struct sk_buff *skb)
21 struct sockaddr_rxrpc *srx,
22 struct sk_buff *skb)
23{ 21{
24 memset(srx, 0, sizeof(*srx)); 22 memset(srx, 0, sizeof(*srx));
25 23
26 switch (ntohs(skb->protocol)) { 24 switch (ntohs(skb->protocol)) {
27 case ETH_P_IP: 25 case ETH_P_IP:
28 if (local->srx.transport.family == AF_INET6) { 26 srx->transport_type = SOCK_DGRAM;
29 srx->transport_type = SOCK_DGRAM; 27 srx->transport_len = sizeof(srx->transport.sin);
30 srx->transport_len = sizeof(srx->transport.sin6); 28 srx->transport.sin.sin_family = AF_INET;
31 srx->transport.sin6.sin6_family = AF_INET6; 29 srx->transport.sin.sin_port = udp_hdr(skb)->source;
32 srx->transport.sin6.sin6_port = udp_hdr(skb)->source; 30 srx->transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
33 srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff);
34 srx->transport.sin6.sin6_addr.s6_addr32[3] = ip_hdr(skb)->saddr;
35 } else {
36 srx->transport_type = SOCK_DGRAM;
37 srx->transport_len = sizeof(srx->transport.sin);
38 srx->transport.sin.sin_family = AF_INET;
39 srx->transport.sin.sin_port = udp_hdr(skb)->source;
40 srx->transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
41 }
42 return 0; 31 return 0;
43 32
44#ifdef CONFIG_AF_RXRPC_IPV6 33#ifdef CONFIG_AF_RXRPC_IPV6
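
The utils.c simplification above stops synthesising IPv4-mapped IPv6 addresses when IPv4 traffic arrives on an IPv6 local endpoint; the relaxed family checks added in af_rxrpc.c and conn_object.c compensate on the caller side. For reference, a sketch of the "::ffff:a.b.c.d" mapped encoding that the removed branch used to build (illustrative only, not part of this patch):

    #include <linux/in6.h>
    #include <linux/string.h>
    #include <asm/byteorder.h>

    /* Sketch of the IPv4-mapped IPv6 form the old code produced. */
    static void example_map_v4_into_v6(struct in6_addr *v6, __be32 v4)
    {
            memset(v6, 0, sizeof(*v6));
            v6->s6_addr32[2] = htonl(0xffff);
            v6->s6_addr32[3] = v4;
    }
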
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index e95741388311..1b9afdee5ba9 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -194,6 +194,17 @@ config NET_SCH_ETF
194 To compile this code as a module, choose M here: the 194 To compile this code as a module, choose M here: the
195 module will be called sch_etf. 195 module will be called sch_etf.
196 196
197config NET_SCH_TAPRIO
198 tristate "Time Aware Priority (taprio) Scheduler"
199 help
200 Say Y here if you want to use the Time Aware Priority (taprio) packet
201 scheduling algorithm.
202
203 See the top of <file:net/sched/sch_taprio.c> for more details.
204
205 To compile this code as a module, choose M here: the
206 module will be called sch_taprio.
207
197config NET_SCH_GRED 208config NET_SCH_GRED
198 tristate "Generic Random Early Detection (GRED)" 209 tristate "Generic Random Early Detection (GRED)"
199 ---help--- 210 ---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index f0403f49edcb..8a40431d7b5c 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -57,6 +57,7 @@ obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o
57obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o 57obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o
58obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o 58obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o
59obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o 59obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o
60obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o
60 61
61obj-$(CONFIG_NET_CLS_U32) += cls_u32.o 62obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
62obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o 63obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index e12f8ef7baa4..9c1b0729aebf 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -81,6 +81,7 @@ static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie,
81static void free_tcf(struct tc_action *p) 81static void free_tcf(struct tc_action *p)
82{ 82{
83 free_percpu(p->cpu_bstats); 83 free_percpu(p->cpu_bstats);
84 free_percpu(p->cpu_bstats_hw);
84 free_percpu(p->cpu_qstats); 85 free_percpu(p->cpu_qstats);
85 86
86 tcf_set_action_cookie(&p->act_cookie, NULL); 87 tcf_set_action_cookie(&p->act_cookie, NULL);
@@ -103,11 +104,11 @@ static int __tcf_action_put(struct tc_action *p, bool bind)
103{ 104{
104 struct tcf_idrinfo *idrinfo = p->idrinfo; 105 struct tcf_idrinfo *idrinfo = p->idrinfo;
105 106
106 if (refcount_dec_and_lock(&p->tcfa_refcnt, &idrinfo->lock)) { 107 if (refcount_dec_and_mutex_lock(&p->tcfa_refcnt, &idrinfo->lock)) {
107 if (bind) 108 if (bind)
108 atomic_dec(&p->tcfa_bindcnt); 109 atomic_dec(&p->tcfa_bindcnt);
109 idr_remove(&idrinfo->action_idr, p->tcfa_index); 110 idr_remove(&idrinfo->action_idr, p->tcfa_index);
110 spin_unlock(&idrinfo->lock); 111 mutex_unlock(&idrinfo->lock);
111 112
112 tcf_action_cleanup(p); 113 tcf_action_cleanup(p);
113 return 1; 114 return 1;
@@ -199,7 +200,7 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
199 struct tc_action *p; 200 struct tc_action *p;
200 unsigned long id = 1; 201 unsigned long id = 1;
201 202
202 spin_lock(&idrinfo->lock); 203 mutex_lock(&idrinfo->lock);
203 204
204 s_i = cb->args[0]; 205 s_i = cb->args[0];
205 206
@@ -234,7 +235,7 @@ done:
234 if (index >= 0) 235 if (index >= 0)
235 cb->args[0] = index + 1; 236 cb->args[0] = index + 1;
236 237
237 spin_unlock(&idrinfo->lock); 238 mutex_unlock(&idrinfo->lock);
238 if (n_i) { 239 if (n_i) {
239 if (act_flags & TCA_FLAG_LARGE_DUMP_ON) 240 if (act_flags & TCA_FLAG_LARGE_DUMP_ON)
240 cb->args[1] = n_i; 241 cb->args[1] = n_i;
@@ -246,6 +247,20 @@ nla_put_failure:
246 goto done; 247 goto done;
247} 248}
248 249
250static int tcf_idr_release_unsafe(struct tc_action *p)
251{
252 if (atomic_read(&p->tcfa_bindcnt) > 0)
253 return -EPERM;
254
255 if (refcount_dec_and_test(&p->tcfa_refcnt)) {
256 idr_remove(&p->idrinfo->action_idr, p->tcfa_index);
257 tcf_action_cleanup(p);
258 return ACT_P_DELETED;
259 }
260
261 return 0;
262}
263
249static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, 264static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
250 const struct tc_action_ops *ops) 265 const struct tc_action_ops *ops)
251{ 266{
@@ -262,15 +277,19 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
262 if (nla_put_string(skb, TCA_KIND, ops->kind)) 277 if (nla_put_string(skb, TCA_KIND, ops->kind))
263 goto nla_put_failure; 278 goto nla_put_failure;
264 279
280 mutex_lock(&idrinfo->lock);
265 idr_for_each_entry_ul(idr, p, id) { 281 idr_for_each_entry_ul(idr, p, id) {
266 ret = __tcf_idr_release(p, false, true); 282 ret = tcf_idr_release_unsafe(p);
267 if (ret == ACT_P_DELETED) { 283 if (ret == ACT_P_DELETED) {
268 module_put(ops->owner); 284 module_put(ops->owner);
269 n_i++; 285 n_i++;
270 } else if (ret < 0) { 286 } else if (ret < 0) {
287 mutex_unlock(&idrinfo->lock);
271 goto nla_put_failure; 288 goto nla_put_failure;
272 } 289 }
273 } 290 }
291 mutex_unlock(&idrinfo->lock);
292
274 if (nla_put_u32(skb, TCA_FCNT, n_i)) 293 if (nla_put_u32(skb, TCA_FCNT, n_i))
275 goto nla_put_failure; 294 goto nla_put_failure;
276 nla_nest_end(skb, nest); 295 nla_nest_end(skb, nest);
@@ -305,13 +324,13 @@ int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index)
305 struct tcf_idrinfo *idrinfo = tn->idrinfo; 324 struct tcf_idrinfo *idrinfo = tn->idrinfo;
306 struct tc_action *p; 325 struct tc_action *p;
307 326
308 spin_lock(&idrinfo->lock); 327 mutex_lock(&idrinfo->lock);
309 p = idr_find(&idrinfo->action_idr, index); 328 p = idr_find(&idrinfo->action_idr, index);
310 if (IS_ERR(p)) 329 if (IS_ERR(p))
311 p = NULL; 330 p = NULL;
312 else if (p) 331 else if (p)
313 refcount_inc(&p->tcfa_refcnt); 332 refcount_inc(&p->tcfa_refcnt);
314 spin_unlock(&idrinfo->lock); 333 mutex_unlock(&idrinfo->lock);
315 334
316 if (p) { 335 if (p) {
317 *a = p; 336 *a = p;
@@ -326,10 +345,10 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index)
326 struct tc_action *p; 345 struct tc_action *p;
327 int ret = 0; 346 int ret = 0;
328 347
329 spin_lock(&idrinfo->lock); 348 mutex_lock(&idrinfo->lock);
330 p = idr_find(&idrinfo->action_idr, index); 349 p = idr_find(&idrinfo->action_idr, index);
331 if (!p) { 350 if (!p) {
332 spin_unlock(&idrinfo->lock); 351 mutex_unlock(&idrinfo->lock);
333 return -ENOENT; 352 return -ENOENT;
334 } 353 }
335 354
@@ -339,7 +358,7 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index)
339 358
340 WARN_ON(p != idr_remove(&idrinfo->action_idr, 359 WARN_ON(p != idr_remove(&idrinfo->action_idr,
341 p->tcfa_index)); 360 p->tcfa_index));
342 spin_unlock(&idrinfo->lock); 361 mutex_unlock(&idrinfo->lock);
343 362
344 tcf_action_cleanup(p); 363 tcf_action_cleanup(p);
345 module_put(owner); 364 module_put(owner);
@@ -350,7 +369,7 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index)
350 ret = -EPERM; 369 ret = -EPERM;
351 } 370 }
352 371
353 spin_unlock(&idrinfo->lock); 372 mutex_unlock(&idrinfo->lock);
354 return ret; 373 return ret;
355} 374}
356 375
@@ -372,9 +391,12 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
372 p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); 391 p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
373 if (!p->cpu_bstats) 392 if (!p->cpu_bstats)
374 goto err1; 393 goto err1;
394 p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
395 if (!p->cpu_bstats_hw)
396 goto err2;
375 p->cpu_qstats = alloc_percpu(struct gnet_stats_queue); 397 p->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
376 if (!p->cpu_qstats) 398 if (!p->cpu_qstats)
377 goto err2; 399 goto err3;
378 } 400 }
379 spin_lock_init(&p->tcfa_lock); 401 spin_lock_init(&p->tcfa_lock);
380 p->tcfa_index = index; 402 p->tcfa_index = index;
@@ -386,15 +408,17 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
386 &p->tcfa_rate_est, 408 &p->tcfa_rate_est,
387 &p->tcfa_lock, NULL, est); 409 &p->tcfa_lock, NULL, est);
388 if (err) 410 if (err)
389 goto err3; 411 goto err4;
390 } 412 }
391 413
392 p->idrinfo = idrinfo; 414 p->idrinfo = idrinfo;
393 p->ops = ops; 415 p->ops = ops;
394 *a = p; 416 *a = p;
395 return 0; 417 return 0;
396err3: 418err4:
397 free_percpu(p->cpu_qstats); 419 free_percpu(p->cpu_qstats);
420err3:
421 free_percpu(p->cpu_bstats_hw);
398err2: 422err2:
399 free_percpu(p->cpu_bstats); 423 free_percpu(p->cpu_bstats);
400err1: 424err1:
@@ -407,10 +431,10 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a)
407{ 431{
408 struct tcf_idrinfo *idrinfo = tn->idrinfo; 432 struct tcf_idrinfo *idrinfo = tn->idrinfo;
409 433
410 spin_lock(&idrinfo->lock); 434 mutex_lock(&idrinfo->lock);
411 /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ 435 /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */
412 WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index))); 436 WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index)));
413 spin_unlock(&idrinfo->lock); 437 mutex_unlock(&idrinfo->lock);
414} 438}
415EXPORT_SYMBOL(tcf_idr_insert); 439EXPORT_SYMBOL(tcf_idr_insert);
416 440
@@ -420,10 +444,10 @@ void tcf_idr_cleanup(struct tc_action_net *tn, u32 index)
420{ 444{
421 struct tcf_idrinfo *idrinfo = tn->idrinfo; 445 struct tcf_idrinfo *idrinfo = tn->idrinfo;
422 446
423 spin_lock(&idrinfo->lock); 447 mutex_lock(&idrinfo->lock);
424 /* Remove ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ 448 /* Remove ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */
425 WARN_ON(!IS_ERR(idr_remove(&idrinfo->action_idr, index))); 449 WARN_ON(!IS_ERR(idr_remove(&idrinfo->action_idr, index)));
426 spin_unlock(&idrinfo->lock); 450 mutex_unlock(&idrinfo->lock);
427} 451}
428EXPORT_SYMBOL(tcf_idr_cleanup); 452EXPORT_SYMBOL(tcf_idr_cleanup);
429 453
@@ -441,14 +465,14 @@ int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index,
441 int ret; 465 int ret;
442 466
443again: 467again:
444 spin_lock(&idrinfo->lock); 468 mutex_lock(&idrinfo->lock);
445 if (*index) { 469 if (*index) {
446 p = idr_find(&idrinfo->action_idr, *index); 470 p = idr_find(&idrinfo->action_idr, *index);
447 if (IS_ERR(p)) { 471 if (IS_ERR(p)) {
448 /* This means that another process allocated 472 /* This means that another process allocated
449 * index but did not assign the pointer yet. 473 * index but did not assign the pointer yet.
450 */ 474 */
451 spin_unlock(&idrinfo->lock); 475 mutex_unlock(&idrinfo->lock);
452 goto again; 476 goto again;
453 } 477 }
454 478
@@ -461,7 +485,7 @@ again:
461 } else { 485 } else {
462 *a = NULL; 486 *a = NULL;
463 ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, 487 ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index,
464 *index, GFP_ATOMIC); 488 *index, GFP_KERNEL);
465 if (!ret) 489 if (!ret)
466 idr_replace(&idrinfo->action_idr, 490 idr_replace(&idrinfo->action_idr,
467 ERR_PTR(-EBUSY), *index); 491 ERR_PTR(-EBUSY), *index);
@@ -470,12 +494,12 @@ again:
470 *index = 1; 494 *index = 1;
471 *a = NULL; 495 *a = NULL;
472 ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, 496 ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index,
473 UINT_MAX, GFP_ATOMIC); 497 UINT_MAX, GFP_KERNEL);
474 if (!ret) 498 if (!ret)
475 idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY), 499 idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY),
476 *index); 500 *index);
477 } 501 }
478 spin_unlock(&idrinfo->lock); 502 mutex_unlock(&idrinfo->lock);
479 return ret; 503 return ret;
480} 504}
481EXPORT_SYMBOL(tcf_idr_check_alloc); 505EXPORT_SYMBOL(tcf_idr_check_alloc);
@@ -979,6 +1003,8 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
979 goto errout; 1003 goto errout;
980 1004
981 if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 || 1005 if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 ||
1006 gnet_stats_copy_basic_hw(NULL, &d, p->cpu_bstats_hw,
1007 &p->tcfa_bstats_hw) < 0 ||
982 gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || 1008 gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 ||
983 gnet_stats_copy_queue(&d, p->cpu_qstats, 1009 gnet_stats_copy_queue(&d, p->cpu_qstats,
984 &p->tcfa_qstats, 1010 &p->tcfa_qstats,
@@ -1073,12 +1099,14 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla,
1073 err = -EINVAL; 1099 err = -EINVAL;
1074 ops = tc_lookup_action(tb[TCA_ACT_KIND]); 1100 ops = tc_lookup_action(tb[TCA_ACT_KIND]);
1075 if (!ops) { /* could happen in batch of actions */ 1101 if (!ops) { /* could happen in batch of actions */
1076 NL_SET_ERR_MSG(extack, "Specified TC action not found"); 1102 NL_SET_ERR_MSG(extack, "Specified TC action kind not found");
1077 goto err_out; 1103 goto err_out;
1078 } 1104 }
1079 err = -ENOENT; 1105 err = -ENOENT;
1080 if (ops->lookup(net, &a, index, extack) == 0) 1106 if (ops->lookup(net, &a, index) == 0) {
1107 NL_SET_ERR_MSG(extack, "TC action with specified index not found");
1081 goto err_mod; 1108 goto err_mod;
1109 }
1082 1110
1083 module_put(ops->owner); 1111 module_put(ops->owner);
1084 return a; 1112 return a;
@@ -1424,7 +1452,7 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
1424 u32 act_count = 0; 1452 u32 act_count = 0;
1425 1453
1426 ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX, 1454 ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX,
1427 tcaa_policy, NULL); 1455 tcaa_policy, cb->extack);
1428 if (ret < 0) 1456 if (ret < 0)
1429 return ret; 1457 return ret;
1430 1458
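
The act_api.c changes above convert idrinfo->lock from a spinlock to a mutex, allocate per-action hardware counters (cpu_bstats_hw) alongside the software ones, and, because the lock is now sleepable, reserve IDR slots with GFP_KERNEL instead of GFP_ATOMIC. A reduced sketch of that locking/allocation pattern; the function name is hypothetical while the fields follow the patched struct tcf_idrinfo:

    #include <linux/idr.h>
    #include <linux/mutex.h>
    #include <linux/err.h>
    #include <linux/kernel.h>
    #include <net/act_api.h>

    /* Sketch mirroring tcf_idr_check_alloc(): GFP_KERNEL is safe here only
     * because a mutex, unlike a spinlock, may sleep across the allocation. */
    static int example_reserve_index(struct tcf_idrinfo *idrinfo, u32 *index)
    {
            int ret;

            mutex_lock(&idrinfo->lock);
            ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index,
                                UINT_MAX, GFP_KERNEL);
            if (!ret)
                    idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY), *index);
            mutex_unlock(&idrinfo->lock);

            return ret;
    }
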
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 0c68bc9cf0b4..c7633843e223 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -387,8 +387,7 @@ static int tcf_bpf_walker(struct net *net, struct sk_buff *skb,
387 return tcf_generic_walker(tn, skb, cb, type, ops, extack); 387 return tcf_generic_walker(tn, skb, cb, type, ops, extack);
388} 388}
389 389
390static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index, 390static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index)
391 struct netlink_ext_ack *extack)
392{ 391{
393 struct tc_action_net *tn = net_generic(net, bpf_net_id); 392 struct tc_action_net *tn = net_generic(net, bpf_net_id);
394 393
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 6f0f273f1139..8475913f2070 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -143,8 +143,10 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
143 return -EEXIST; 143 return -EEXIST;
144 } 144 }
145 /* replacing action and zone */ 145 /* replacing action and zone */
146 spin_lock_bh(&ci->tcf_lock);
146 ci->tcf_action = parm->action; 147 ci->tcf_action = parm->action;
147 ci->zone = parm->zone; 148 ci->zone = parm->zone;
149 spin_unlock_bh(&ci->tcf_lock);
148 ret = 0; 150 ret = 0;
149 } 151 }
150 152
@@ -156,16 +158,16 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
156{ 158{
157 unsigned char *b = skb_tail_pointer(skb); 159 unsigned char *b = skb_tail_pointer(skb);
158 struct tcf_connmark_info *ci = to_connmark(a); 160 struct tcf_connmark_info *ci = to_connmark(a);
159
160 struct tc_connmark opt = { 161 struct tc_connmark opt = {
161 .index = ci->tcf_index, 162 .index = ci->tcf_index,
162 .refcnt = refcount_read(&ci->tcf_refcnt) - ref, 163 .refcnt = refcount_read(&ci->tcf_refcnt) - ref,
163 .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, 164 .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
164 .action = ci->tcf_action,
165 .zone = ci->zone,
166 }; 165 };
167 struct tcf_t t; 166 struct tcf_t t;
168 167
168 spin_lock_bh(&ci->tcf_lock);
169 opt.action = ci->tcf_action;
170 opt.zone = ci->zone;
169 if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt)) 171 if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt))
170 goto nla_put_failure; 172 goto nla_put_failure;
171 173
@@ -173,9 +175,12 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
173 if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t, 175 if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t,
174 TCA_CONNMARK_PAD)) 176 TCA_CONNMARK_PAD))
175 goto nla_put_failure; 177 goto nla_put_failure;
178 spin_unlock_bh(&ci->tcf_lock);
176 179
177 return skb->len; 180 return skb->len;
181
178nla_put_failure: 182nla_put_failure:
183 spin_unlock_bh(&ci->tcf_lock);
179 nlmsg_trim(skb, b); 184 nlmsg_trim(skb, b);
180 return -1; 185 return -1;
181} 186}
@@ -190,8 +195,7 @@ static int tcf_connmark_walker(struct net *net, struct sk_buff *skb,
190 return tcf_generic_walker(tn, skb, cb, type, ops, extack); 195 return tcf_generic_walker(tn, skb, cb, type, ops, extack);
191} 196}
192 197
193static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index, 198static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index)
194 struct netlink_ext_ack *extack)
195{ 199{
196 struct tc_action_net *tn = net_generic(net, connmark_net_id); 200 struct tc_action_net *tn = net_generic(net, connmark_net_id);
197 201
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index b8a67ae3105a..3dc25b7806d7 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -646,8 +646,7 @@ static int tcf_csum_walker(struct net *net, struct sk_buff *skb,
646 return tcf_generic_walker(tn, skb, cb, type, ops, extack); 646 return tcf_generic_walker(tn, skb, cb, type, ops, extack);
647} 647}
648 648
649static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index, 649static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index)
650 struct netlink_ext_ack *extack)
651{ 650{
652 struct tc_action_net *tn = net_generic(net, csum_net_id); 651 struct tc_action_net *tn = net_generic(net, csum_net_id);
653 652
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index cd1d9bd32ef9..b61c20ebb314 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -88,6 +88,11 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
88 p_parm = nla_data(tb[TCA_GACT_PROB]); 88 p_parm = nla_data(tb[TCA_GACT_PROB]);
89 if (p_parm->ptype >= MAX_RAND) 89 if (p_parm->ptype >= MAX_RAND)
90 return -EINVAL; 90 return -EINVAL;
91 if (TC_ACT_EXT_CMP(p_parm->paction, TC_ACT_GOTO_CHAIN)) {
92 NL_SET_ERR_MSG(extack,
93 "goto chain not allowed on fallback");
94 return -EINVAL;
95 }
91 } 96 }
92#endif 97#endif
93 98
@@ -157,7 +162,7 @@ static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a,
157} 162}
158 163
159static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, 164static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
160 u64 lastuse) 165 u64 lastuse, bool hw)
161{ 166{
162 struct tcf_gact *gact = to_gact(a); 167 struct tcf_gact *gact = to_gact(a);
163 int action = READ_ONCE(gact->tcf_action); 168 int action = READ_ONCE(gact->tcf_action);
@@ -168,6 +173,10 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
168 if (action == TC_ACT_SHOT) 173 if (action == TC_ACT_SHOT)
169 this_cpu_ptr(gact->common.cpu_qstats)->drops += packets; 174 this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
170 175
176 if (hw)
177 _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats_hw),
178 bytes, packets);
179
171 tm->lastuse = max_t(u64, tm->lastuse, lastuse); 180 tm->lastuse = max_t(u64, tm->lastuse, lastuse);
172} 181}
173 182
@@ -222,8 +231,7 @@ static int tcf_gact_walker(struct net *net, struct sk_buff *skb,
222 return tcf_generic_walker(tn, skb, cb, type, ops, extack); 231 return tcf_generic_walker(tn, skb, cb, type, ops, extack);
223} 232}
224 233
225static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index, 234static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index)
226 struct netlink_ext_ack *extack)
227{ 235{
228 struct tc_action_net *tn = net_generic(net, gact_net_id); 236 struct tc_action_net *tn = net_generic(net, gact_net_id);
229 237
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 06a3d4801878..30b63fa23ee2 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -855,8 +855,7 @@ static int tcf_ife_walker(struct net *net, struct sk_buff *skb,
855 return tcf_generic_walker(tn, skb, cb, type, ops, extack); 855 return tcf_generic_walker(tn, skb, cb, type, ops, extack);
856} 856}
857 857
858static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index, 858static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index)
859 struct netlink_ext_ack *extack)
860{ 859{
861 struct tc_action_net *tn = net_generic(net, ife_net_id); 860 struct tc_action_net *tn = net_generic(net, ife_net_id);
862 861
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 8525de811616..8af6c11d2482 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -329,8 +329,7 @@ static int tcf_ipt_walker(struct net *net, struct sk_buff *skb,
329 return tcf_generic_walker(tn, skb, cb, type, ops, extack); 329 return tcf_generic_walker(tn, skb, cb, type, ops, extack);
330} 330}
331 331
332static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index, 332static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index)
333 struct netlink_ext_ack *extack)
334{ 333{
335 struct tc_action_net *tn = net_generic(net, ipt_net_id); 334 struct tc_action_net *tn = net_generic(net, ipt_net_id);
336 335
@@ -379,8 +378,7 @@ static int tcf_xt_walker(struct net *net, struct sk_buff *skb,
379 return tcf_generic_walker(tn, skb, cb, type, ops, extack); 378 return tcf_generic_walker(tn, skb, cb, type, ops, extack);
380} 379}
381 380
382static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index, 381static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index)
383 struct netlink_ext_ack *extack)
384{ 382{
385 struct tc_action_net *tn = net_generic(net, xt_net_id); 383 struct tc_action_net *tn = net_generic(net, xt_net_id);
386 384
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 8bf66d0a6800..1dae5f2b358f 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -283,12 +283,15 @@ out:
283} 283}
284 284
285static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, 285static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
286 u64 lastuse) 286 u64 lastuse, bool hw)
287{ 287{
288 struct tcf_mirred *m = to_mirred(a); 288 struct tcf_mirred *m = to_mirred(a);
289 struct tcf_t *tm = &m->tcf_tm; 289 struct tcf_t *tm = &m->tcf_tm;
290 290
291 _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); 291 _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
292 if (hw)
293 _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
294 bytes, packets);
292 tm->lastuse = max_t(u64, tm->lastuse, lastuse); 295 tm->lastuse = max_t(u64, tm->lastuse, lastuse);
293} 296}
294 297
@@ -338,8 +341,7 @@ static int tcf_mirred_walker(struct net *net, struct sk_buff *skb,
338 return tcf_generic_walker(tn, skb, cb, type, ops, extack); 341 return tcf_generic_walker(tn, skb, cb, type, ops, extack);
339} 342}
340 343
341static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index, 344static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index)
342 struct netlink_ext_ack *extack)
343{ 345{
344 struct tc_action_net *tn = net_generic(net, mirred_net_id); 346 struct tc_action_net *tn = net_generic(net, mirred_net_id);
345 347
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 4313aa102440..c5c1e23add77 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -256,28 +256,31 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
256 unsigned char *b = skb_tail_pointer(skb); 256 unsigned char *b = skb_tail_pointer(skb);
257 struct tcf_nat *p = to_tcf_nat(a); 257 struct tcf_nat *p = to_tcf_nat(a);
258 struct tc_nat opt = { 258 struct tc_nat opt = {
259 .old_addr = p->old_addr,
260 .new_addr = p->new_addr,
261 .mask = p->mask,
262 .flags = p->flags,
263
264 .index = p->tcf_index, 259 .index = p->tcf_index,
265 .action = p->tcf_action,
266 .refcnt = refcount_read(&p->tcf_refcnt) - ref, 260 .refcnt = refcount_read(&p->tcf_refcnt) - ref,
267 .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, 261 .bindcnt = atomic_read(&p->tcf_bindcnt) - bind,
268 }; 262 };
269 struct tcf_t t; 263 struct tcf_t t;
270 264
265 spin_lock_bh(&p->tcf_lock);
266 opt.old_addr = p->old_addr;
267 opt.new_addr = p->new_addr;
268 opt.mask = p->mask;
269 opt.flags = p->flags;
270 opt.action = p->tcf_action;
271
271 if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt)) 272 if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt))
272 goto nla_put_failure; 273 goto nla_put_failure;
273 274
274 tcf_tm_dump(&t, &p->tcf_tm); 275 tcf_tm_dump(&t, &p->tcf_tm);
275 if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD)) 276 if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD))
276 goto nla_put_failure; 277 goto nla_put_failure;
278 spin_unlock_bh(&p->tcf_lock);
277 279
278 return skb->len; 280 return skb->len;
279 281
280nla_put_failure: 282nla_put_failure:
283 spin_unlock_bh(&p->tcf_lock);
281 nlmsg_trim(skb, b); 284 nlmsg_trim(skb, b);
282 return -1; 285 return -1;
283} 286}
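
act_nat's dump previously filled struct tc_nat from the action with no protection against a concurrent ->init() update; the hunk snapshots every parameter under tcf_lock and releases the lock on both the success path and the nla_put_failure path. The shape of that pattern, sketched for a made-up attribute (EXAMPLE_ATTR_STATE and example_dump are invented; tcfa_lock and tcfa_action are the generic struct tc_action fields):

#include <linux/skbuff.h>
#include <net/act_api.h>
#include <net/netlink.h>

#define EXAMPLE_ATTR_STATE 1    /* invented attribute id, for the sketch only */

/* Sketch: snapshot action state under the per-action lock while dumping. */
static int example_dump(struct sk_buff *skb, struct tc_action *a)
{
        unsigned char *b = skb_tail_pointer(skb);
        u32 state;

        spin_lock_bh(&a->tcfa_lock);            /* same lock ->init() uses to update */
        state = a->tcfa_action;                 /* consistent snapshot of the fields */
        if (nla_put_u32(skb, EXAMPLE_ATTR_STATE, state))
                goto nla_put_failure;
        spin_unlock_bh(&a->tcfa_lock);

        return skb->len;

nla_put_failure:
        spin_unlock_bh(&a->tcfa_lock);          /* drop the lock on the error path too */
        nlmsg_trim(skb, b);
        return -1;
}
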
@@ -292,8 +295,7 @@ static int tcf_nat_walker(struct net *net, struct sk_buff *skb,
292 return tcf_generic_walker(tn, skb, cb, type, ops, extack); 295 return tcf_generic_walker(tn, skb, cb, type, ops, extack);
293} 296}
294 297
295static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index, 298static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index)
296 struct netlink_ext_ack *extack)
297{ 299{
298 struct tc_action_net *tn = net_generic(net, nat_net_id); 300 struct tc_action_net *tn = net_generic(net, nat_net_id);
299 301
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index ad99a99f11f6..da3dd0f68cc2 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -460,8 +460,7 @@ static int tcf_pedit_walker(struct net *net, struct sk_buff *skb,
460 return tcf_generic_walker(tn, skb, cb, type, ops, extack); 460 return tcf_generic_walker(tn, skb, cb, type, ops, extack);
461} 461}
462 462
463static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index, 463static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index)
464 struct netlink_ext_ack *extack)
465{ 464{
466 struct tc_action_net *tn = net_generic(net, pedit_net_id); 465 struct tc_action_net *tn = net_generic(net, pedit_net_id);
467 466
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 5d8bfa878477..052855d47354 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -22,8 +22,7 @@
22#include <net/act_api.h> 22#include <net/act_api.h>
23#include <net/netlink.h> 23#include <net/netlink.h>
24 24
25struct tcf_police { 25struct tcf_police_params {
26 struct tc_action common;
27 int tcfp_result; 26 int tcfp_result;
28 u32 tcfp_ewma_rate; 27 u32 tcfp_ewma_rate;
29 s64 tcfp_burst; 28 s64 tcfp_burst;
@@ -36,6 +35,12 @@ struct tcf_police {
36 bool rate_present; 35 bool rate_present;
37 struct psched_ratecfg peak; 36 struct psched_ratecfg peak;
38 bool peak_present; 37 bool peak_present;
38 struct rcu_head rcu;
39};
40
41struct tcf_police {
42 struct tc_action common;
43 struct tcf_police_params __rcu *params;
39}; 44};
40 45
41#define to_police(pc) ((struct tcf_police *)pc) 46#define to_police(pc) ((struct tcf_police *)pc)
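
act_police splits everything its packet path reads out of struct tcf_police into a separately allocated tcf_police_params carrying its own rcu_head, leaving the action with only an __rcu pointer to the current parameter block. That is the usual layout for allowing lockless readers while the control path replaces parameters wholesale. A self-contained sketch of the layout (example_* names are invented):

#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct example_params {
        u32             rate;           /* everything the packet path reads */
        u32             burst;
        struct rcu_head rcu;            /* lets the old block go after a grace period */
};

struct example_action {
        spinlock_t                      lock;    /* serialises control-path updates */
        struct example_params __rcu     *params; /* packet path dereferences under RCU */
};
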
@@ -84,6 +89,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
84 struct tcf_police *police; 89 struct tcf_police *police;
85 struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; 90 struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
86 struct tc_action_net *tn = net_generic(net, police_net_id); 91 struct tc_action_net *tn = net_generic(net, police_net_id);
92 struct tcf_police_params *new;
87 bool exists = false; 93 bool exists = false;
88 int size; 94 int size;
89 95
@@ -110,7 +116,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
110 116
111 if (!exists) { 117 if (!exists) {
112 ret = tcf_idr_create(tn, parm->index, NULL, a, 118 ret = tcf_idr_create(tn, parm->index, NULL, a,
113 &act_police_ops, bind, false); 119 &act_police_ops, bind, true);
114 if (ret) { 120 if (ret) {
115 tcf_idr_cleanup(tn, parm->index); 121 tcf_idr_cleanup(tn, parm->index);
116 return ret; 122 return ret;
@@ -137,7 +143,8 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
137 } 143 }
138 144
139 if (est) { 145 if (est) {
140 err = gen_replace_estimator(&police->tcf_bstats, NULL, 146 err = gen_replace_estimator(&police->tcf_bstats,
147 police->common.cpu_bstats,
141 &police->tcf_rate_est, 148 &police->tcf_rate_est,
142 &police->tcf_lock, 149 &police->tcf_lock,
143 NULL, est); 150 NULL, est);
@@ -150,50 +157,68 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
150 goto failure; 157 goto failure;
151 } 158 }
152 159
153 spin_lock_bh(&police->tcf_lock); 160 new = kzalloc(sizeof(*new), GFP_KERNEL);
161 if (unlikely(!new)) {
162 err = -ENOMEM;
163 goto failure;
164 }
165
154 /* No failure allowed after this point */ 166 /* No failure allowed after this point */
155 police->tcfp_mtu = parm->mtu; 167 new->tcfp_mtu = parm->mtu;
156 if (police->tcfp_mtu == 0) { 168 if (!new->tcfp_mtu) {
157 police->tcfp_mtu = ~0; 169 new->tcfp_mtu = ~0;
158 if (R_tab) 170 if (R_tab)
159 police->tcfp_mtu = 255 << R_tab->rate.cell_log; 171 new->tcfp_mtu = 255 << R_tab->rate.cell_log;
160 } 172 }
161 if (R_tab) { 173 if (R_tab) {
162 police->rate_present = true; 174 new->rate_present = true;
163 psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0); 175 psched_ratecfg_precompute(&new->rate, &R_tab->rate, 0);
164 qdisc_put_rtab(R_tab); 176 qdisc_put_rtab(R_tab);
165 } else { 177 } else {
166 police->rate_present = false; 178 new->rate_present = false;
167 } 179 }
168 if (P_tab) { 180 if (P_tab) {
169 police->peak_present = true; 181 new->peak_present = true;
170 psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0); 182 psched_ratecfg_precompute(&new->peak, &P_tab->rate, 0);
171 qdisc_put_rtab(P_tab); 183 qdisc_put_rtab(P_tab);
172 } else { 184 } else {
173 police->peak_present = false; 185 new->peak_present = false;
174 } 186 }
175 187
176 if (tb[TCA_POLICE_RESULT]) 188 new->tcfp_burst = PSCHED_TICKS2NS(parm->burst);
177 police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); 189 new->tcfp_toks = new->tcfp_burst;
178 police->tcfp_burst = PSCHED_TICKS2NS(parm->burst); 190 if (new->peak_present) {
179 police->tcfp_toks = police->tcfp_burst; 191 new->tcfp_mtu_ptoks = (s64)psched_l2t_ns(&new->peak,
180 if (police->peak_present) { 192 new->tcfp_mtu);
181 police->tcfp_mtu_ptoks = (s64) psched_l2t_ns(&police->peak, 193 new->tcfp_ptoks = new->tcfp_mtu_ptoks;
182 police->tcfp_mtu);
183 police->tcfp_ptoks = police->tcfp_mtu_ptoks;
184 } 194 }
185 police->tcf_action = parm->action;
186 195
187 if (tb[TCA_POLICE_AVRATE]) 196 if (tb[TCA_POLICE_AVRATE])
188 police->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); 197 new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
198
199 if (tb[TCA_POLICE_RESULT]) {
200 new->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]);
201 if (TC_ACT_EXT_CMP(new->tcfp_result, TC_ACT_GOTO_CHAIN)) {
202 NL_SET_ERR_MSG(extack,
203 "goto chain not allowed on fallback");
204 err = -EINVAL;
205 goto failure;
206 }
207 }
189 208
209 spin_lock_bh(&police->tcf_lock);
210 new->tcfp_t_c = ktime_get_ns();
211 police->tcf_action = parm->action;
212 rcu_swap_protected(police->params,
213 new,
214 lockdep_is_held(&police->tcf_lock));
190 spin_unlock_bh(&police->tcf_lock); 215 spin_unlock_bh(&police->tcf_lock);
191 if (ret != ACT_P_CREATED)
192 return ret;
193 216
194 police->tcfp_t_c = ktime_get_ns(); 217 if (new)
195 tcf_idr_insert(tn, *a); 218 kfree_rcu(new, rcu);
196 219
220 if (ret == ACT_P_CREATED)
221 tcf_idr_insert(tn, *a);
197 return ret; 222 return ret;
198 223
199failure: 224failure:
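
With the parameters split out, ->init() now assembles a complete new block off to the side, validates it, and only then publishes it with rcu_swap_protected() under tcf_lock; the macro leaves the previous pointer in the local variable, which is handed to kfree_rcu() once the lock is dropped. A condensed sketch of that writer sequence (example_* types are invented; the real code also updates tcf_action and tcfp_t_c inside the locked region):

#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct example_params {
        u32             rate;
        struct rcu_head rcu;
};

struct example_action {
        spinlock_t                      lock;
        struct example_params __rcu     *params;
};

/* Build the new block unlocked, publish it under the lock, free the old one. */
static int example_update(struct example_action *act, u32 rate)
{
        struct example_params *new;

        new = kzalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return -ENOMEM;
        new->rate = rate;

        spin_lock_bh(&act->lock);
        /* After the swap, 'new' holds the previously installed block. */
        rcu_swap_protected(act->params, new, lockdep_is_held(&act->lock));
        spin_unlock_bh(&act->lock);

        if (new)                        /* NULL only on the very first configuration */
                kfree_rcu(new, rcu);
        return 0;
}
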
@@ -207,64 +232,69 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
207 struct tcf_result *res) 232 struct tcf_result *res)
208{ 233{
209 struct tcf_police *police = to_police(a); 234 struct tcf_police *police = to_police(a);
210 s64 now; 235 struct tcf_police_params *p;
211 s64 toks; 236 s64 now, toks, ptoks = 0;
212 s64 ptoks = 0; 237 int ret;
213 238
214 spin_lock(&police->tcf_lock);
215
216 bstats_update(&police->tcf_bstats, skb);
217 tcf_lastuse_update(&police->tcf_tm); 239 tcf_lastuse_update(&police->tcf_tm);
240 bstats_cpu_update(this_cpu_ptr(police->common.cpu_bstats), skb);
241
242 ret = READ_ONCE(police->tcf_action);
243 p = rcu_dereference_bh(police->params);
218 244
219 if (police->tcfp_ewma_rate) { 245 if (p->tcfp_ewma_rate) {
220 struct gnet_stats_rate_est64 sample; 246 struct gnet_stats_rate_est64 sample;
221 247
222 if (!gen_estimator_read(&police->tcf_rate_est, &sample) || 248 if (!gen_estimator_read(&police->tcf_rate_est, &sample) ||
223 sample.bps >= police->tcfp_ewma_rate) { 249 sample.bps >= p->tcfp_ewma_rate)
224 police->tcf_qstats.overlimits++; 250 goto inc_overlimits;
225 if (police->tcf_action == TC_ACT_SHOT)
226 police->tcf_qstats.drops++;
227 spin_unlock(&police->tcf_lock);
228 return police->tcf_action;
229 }
230 } 251 }
231 252
232 if (qdisc_pkt_len(skb) <= police->tcfp_mtu) { 253 if (qdisc_pkt_len(skb) <= p->tcfp_mtu) {
233 if (!police->rate_present) { 254 if (!p->rate_present) {
234 spin_unlock(&police->tcf_lock); 255 ret = p->tcfp_result;
235 return police->tcfp_result; 256 goto end;
236 } 257 }
237 258
238 now = ktime_get_ns(); 259 now = ktime_get_ns();
239 toks = min_t(s64, now - police->tcfp_t_c, 260 toks = min_t(s64, now - p->tcfp_t_c, p->tcfp_burst);
240 police->tcfp_burst); 261 if (p->peak_present) {
241 if (police->peak_present) { 262 ptoks = toks + p->tcfp_ptoks;
242 ptoks = toks + police->tcfp_ptoks; 263 if (ptoks > p->tcfp_mtu_ptoks)
243 if (ptoks > police->tcfp_mtu_ptoks) 264 ptoks = p->tcfp_mtu_ptoks;
244 ptoks = police->tcfp_mtu_ptoks; 265 ptoks -= (s64)psched_l2t_ns(&p->peak,
245 ptoks -= (s64) psched_l2t_ns(&police->peak, 266 qdisc_pkt_len(skb));
246 qdisc_pkt_len(skb));
247 } 267 }
248 toks += police->tcfp_toks; 268 toks += p->tcfp_toks;
249 if (toks > police->tcfp_burst) 269 if (toks > p->tcfp_burst)
250 toks = police->tcfp_burst; 270 toks = p->tcfp_burst;
251 toks -= (s64) psched_l2t_ns(&police->rate, qdisc_pkt_len(skb)); 271 toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb));
252 if ((toks|ptoks) >= 0) { 272 if ((toks|ptoks) >= 0) {
253 police->tcfp_t_c = now; 273 p->tcfp_t_c = now;
254 police->tcfp_toks = toks; 274 p->tcfp_toks = toks;
255 police->tcfp_ptoks = ptoks; 275 p->tcfp_ptoks = ptoks;
256 if (police->tcfp_result == TC_ACT_SHOT) 276 ret = p->tcfp_result;
257 police->tcf_qstats.drops++; 277 goto inc_drops;
258 spin_unlock(&police->tcf_lock);
259 return police->tcfp_result;
260 } 278 }
261 } 279 }
262 280
263 police->tcf_qstats.overlimits++; 281inc_overlimits:
264 if (police->tcf_action == TC_ACT_SHOT) 282 qstats_overlimit_inc(this_cpu_ptr(police->common.cpu_qstats));
265 police->tcf_qstats.drops++; 283inc_drops:
266 spin_unlock(&police->tcf_lock); 284 if (ret == TC_ACT_SHOT)
267 return police->tcf_action; 285 qstats_drop_inc(this_cpu_ptr(police->common.cpu_qstats));
286end:
287 return ret;
288}
289
290static void tcf_police_cleanup(struct tc_action *a)
291{
292 struct tcf_police *police = to_police(a);
293 struct tcf_police_params *p;
294
295 p = rcu_dereference_protected(police->params, 1);
296 if (p)
297 kfree_rcu(p, rcu);
268} 298}
269 299
270static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, 300static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
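
The rewritten tcf_police_act() takes no lock at all on the packet path: statistics go to per-CPU counters, the verdict is read with READ_ONCE(), and the parameter block is fetched with rcu_dereference_bh(), the flavour appropriate for softirq context; tcf_police_cleanup() then releases the final params block with kfree_rcu(). A stripped-down sketch of that read side (example_* names are invented; the caller is assumed to run in BH context, as the TC datapath does):

#include <linux/compiler.h>
#include <linux/rcupdate.h>
#include <linux/types.h>

struct example_params {
        u32             limit;
        struct rcu_head rcu;
};

struct example_action {
        int                             action;
        struct example_params __rcu     *params;
};

/* Lockless read side for an RCU-managed parameter block. */
static int example_act(struct example_action *act, u32 pkt_len)
{
        struct example_params *p;
        int ret;

        ret = READ_ONCE(act->action);           /* single word, no lock needed */
        p = rcu_dereference_bh(act->params);    /* stable for this BH/RCU section */

        if (pkt_len > p->limit)
                ret = -1;                       /* stand-in for the TC_ACT_SHOT bookkeeping */
        return ret;
}
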
@@ -272,6 +302,7 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
272{ 302{
273 unsigned char *b = skb_tail_pointer(skb); 303 unsigned char *b = skb_tail_pointer(skb);
274 struct tcf_police *police = to_police(a); 304 struct tcf_police *police = to_police(a);
305 struct tcf_police_params *p;
275 struct tc_police opt = { 306 struct tc_police opt = {
276 .index = police->tcf_index, 307 .index = police->tcf_index,
277 .refcnt = refcount_read(&police->tcf_refcnt) - ref, 308 .refcnt = refcount_read(&police->tcf_refcnt) - ref,
@@ -281,19 +312,21 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
281 312
282 spin_lock_bh(&police->tcf_lock); 313 spin_lock_bh(&police->tcf_lock);
283 opt.action = police->tcf_action; 314 opt.action = police->tcf_action;
284 opt.mtu = police->tcfp_mtu; 315 p = rcu_dereference_protected(police->params,
285 opt.burst = PSCHED_NS2TICKS(police->tcfp_burst); 316 lockdep_is_held(&police->tcf_lock));
286 if (police->rate_present) 317 opt.mtu = p->tcfp_mtu;
287 psched_ratecfg_getrate(&opt.rate, &police->rate); 318 opt.burst = PSCHED_NS2TICKS(p->tcfp_burst);
288 if (police->peak_present) 319 if (p->rate_present)
289 psched_ratecfg_getrate(&opt.peakrate, &police->peak); 320 psched_ratecfg_getrate(&opt.rate, &p->rate);
321 if (p->peak_present)
322 psched_ratecfg_getrate(&opt.peakrate, &p->peak);
290 if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt)) 323 if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt))
291 goto nla_put_failure; 324 goto nla_put_failure;
292 if (police->tcfp_result && 325 if (p->tcfp_result &&
293 nla_put_u32(skb, TCA_POLICE_RESULT, police->tcfp_result)) 326 nla_put_u32(skb, TCA_POLICE_RESULT, p->tcfp_result))
294 goto nla_put_failure; 327 goto nla_put_failure;
295 if (police->tcfp_ewma_rate && 328 if (p->tcfp_ewma_rate &&
296 nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate)) 329 nla_put_u32(skb, TCA_POLICE_AVRATE, p->tcfp_ewma_rate))
297 goto nla_put_failure; 330 goto nla_put_failure;
298 331
299 t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install); 332 t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install);
@@ -312,8 +345,7 @@ nla_put_failure:
312 return -1; 345 return -1;
313} 346}
314 347
315static int tcf_police_search(struct net *net, struct tc_action **a, u32 index, 348static int tcf_police_search(struct net *net, struct tc_action **a, u32 index)
316 struct netlink_ext_ack *extack)
317{ 349{
318 struct tc_action_net *tn = net_generic(net, police_net_id); 350 struct tc_action_net *tn = net_generic(net, police_net_id);
319 351
@@ -333,6 +365,7 @@ static struct tc_action_ops act_police_ops = {
333 .init = tcf_police_init, 365 .init = tcf_police_init,
334 .walk = tcf_police_walker, 366 .walk = tcf_police_walker,
335 .lookup = tcf_police_search, 367 .lookup = tcf_police_search,
368 .cleanup = tcf_police_cleanup,
336 .size = sizeof(struct tcf_police), 369 .size = sizeof(struct tcf_police),
337}; 370};
338 371
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 6b67aa13d2dd..1a0c682fd734 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -224,8 +224,7 @@ static int tcf_sample_walker(struct net *net, struct sk_buff *skb,
224 return tcf_generic_walker(tn, skb, cb, type, ops, extack); 224 return tcf_generic_walker(tn, skb, cb, type, ops, extack);
225} 225}
226 226
227static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index, 227static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index)
228 struct netlink_ext_ack *extack)
229{ 228{
230 struct tc_action_net *tn = net_generic(net, sample_net_id); 229 struct tc_action_net *tn = net_generic(net, sample_net_id);
231 230
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 52400d49f81f..902957beceb3 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -188,8 +188,7 @@ static int tcf_simp_walker(struct net *net, struct sk_buff *skb,
188 return tcf_generic_walker(tn, skb, cb, type, ops, extack); 188 return tcf_generic_walker(tn, skb, cb, type, ops, extack);
189} 189}
190 190
191static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index, 191static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index)
192 struct netlink_ext_ack *extack)
193{ 192{
194 struct tc_action_net *tn = net_generic(net, simp_net_id); 193 struct tc_action_net *tn = net_generic(net, simp_net_id);
195 194
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 73e44ce2a883..64dba3708fce 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -99,7 +99,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
99 struct netlink_ext_ack *extack) 99 struct netlink_ext_ack *extack)
100{ 100{
101 struct tc_action_net *tn = net_generic(net, skbedit_net_id); 101 struct tc_action_net *tn = net_generic(net, skbedit_net_id);
102 struct tcf_skbedit_params *params_old, *params_new; 102 struct tcf_skbedit_params *params_new;
103 struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; 103 struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
104 struct tc_skbedit *parm; 104 struct tc_skbedit *parm;
105 struct tcf_skbedit *d; 105 struct tcf_skbedit *d;
@@ -187,8 +187,6 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
187 } 187 }
188 } 188 }
189 189
190 ASSERT_RTNL();
191
192 params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); 190 params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
193 if (unlikely(!params_new)) { 191 if (unlikely(!params_new)) {
194 if (ret == ACT_P_CREATED) 192 if (ret == ACT_P_CREATED)
@@ -210,11 +208,13 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
210 if (flags & SKBEDIT_F_MASK) 208 if (flags & SKBEDIT_F_MASK)
211 params_new->mask = *mask; 209 params_new->mask = *mask;
212 210
211 spin_lock_bh(&d->tcf_lock);
213 d->tcf_action = parm->action; 212 d->tcf_action = parm->action;
214 params_old = rtnl_dereference(d->params); 213 rcu_swap_protected(d->params, params_new,
215 rcu_assign_pointer(d->params, params_new); 214 lockdep_is_held(&d->tcf_lock));
216 if (params_old) 215 spin_unlock_bh(&d->tcf_lock);
217 kfree_rcu(params_old, rcu); 216 if (params_new)
217 kfree_rcu(params_new, rcu);
218 218
219 if (ret == ACT_P_CREATED) 219 if (ret == ACT_P_CREATED)
220 tcf_idr_insert(tn, *a); 220 tcf_idr_insert(tn, *a);
@@ -231,12 +231,14 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
231 .index = d->tcf_index, 231 .index = d->tcf_index,
232 .refcnt = refcount_read(&d->tcf_refcnt) - ref, 232 .refcnt = refcount_read(&d->tcf_refcnt) - ref,
233 .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, 233 .bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
234 .action = d->tcf_action,
235 }; 234 };
236 u64 pure_flags = 0; 235 u64 pure_flags = 0;
237 struct tcf_t t; 236 struct tcf_t t;
238 237
239 params = rtnl_dereference(d->params); 238 spin_lock_bh(&d->tcf_lock);
239 params = rcu_dereference_protected(d->params,
240 lockdep_is_held(&d->tcf_lock));
241 opt.action = d->tcf_action;
240 242
241 if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) 243 if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
242 goto nla_put_failure; 244 goto nla_put_failure;
@@ -264,9 +266,12 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
264 tcf_tm_dump(&t, &d->tcf_tm); 266 tcf_tm_dump(&t, &d->tcf_tm);
265 if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) 267 if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))
266 goto nla_put_failure; 268 goto nla_put_failure;
269 spin_unlock_bh(&d->tcf_lock);
270
267 return skb->len; 271 return skb->len;
268 272
269nla_put_failure: 273nla_put_failure:
274 spin_unlock_bh(&d->tcf_lock);
270 nlmsg_trim(skb, b); 275 nlmsg_trim(skb, b);
271 return -1; 276 return -1;
272} 277}
@@ -291,8 +296,7 @@ static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb,
291 return tcf_generic_walker(tn, skb, cb, type, ops, extack); 296 return tcf_generic_walker(tn, skb, cb, type, ops, extack);
292} 297}
293 298
294static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index, 299static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index)
295 struct netlink_ext_ack *extack)
296{ 300{
297 struct tc_action_net *tn = net_generic(net, skbedit_net_id); 301 struct tc_action_net *tn = net_generic(net, skbedit_net_id);
298 302
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index 588077fafd6c..59710a183bd3 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -251,8 +251,7 @@ static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb,
251 return tcf_generic_walker(tn, skb, cb, type, ops, extack); 251 return tcf_generic_walker(tn, skb, cb, type, ops, extack);
252} 252}
253 253
254static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index, 254static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index)
255 struct netlink_ext_ack *extack)
256{ 255{
257 struct tc_action_net *tn = net_generic(net, skbmod_net_id); 256 struct tc_action_net *tn = net_generic(net, skbmod_net_id);
258 257
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 681f6f04e7da..4cca8f274662 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -548,8 +548,7 @@ static int tunnel_key_walker(struct net *net, struct sk_buff *skb,
548 return tcf_generic_walker(tn, skb, cb, type, ops, extack); 548 return tcf_generic_walker(tn, skb, cb, type, ops, extack);
549} 549}
550 550
551static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index, 551static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index)
552 struct netlink_ext_ack *extack)
553{ 552{
554 struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); 553 struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
555 554
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 033d273afe50..ba677d54a7af 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -288,8 +288,7 @@ static int tcf_vlan_walker(struct net *net, struct sk_buff *skb,
288 return tcf_generic_walker(tn, skb, cb, type, ops, extack); 288 return tcf_generic_walker(tn, skb, cb, type, ops, extack);
289} 289}
290 290
291static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index, 291static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index)
292 struct netlink_ext_ack *extack)
293{ 292{
294 struct tc_action_net *tn = net_generic(net, vlan_net_id); 293 struct tc_action_net *tn = net_generic(net, vlan_net_id);
295 294
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 70f144ac5e1d..f427a1e00e7e 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -242,8 +242,8 @@ static void tcf_chain_destroy(struct tcf_chain *chain)
242 if (!chain->index) 242 if (!chain->index)
243 block->chain0.chain = NULL; 243 block->chain0.chain = NULL;
244 kfree(chain); 244 kfree(chain);
245 if (list_empty(&block->chain_list) && block->refcnt == 0) 245 if (list_empty(&block->chain_list) && !refcount_read(&block->refcnt))
246 kfree(block); 246 kfree_rcu(block, rcu);
247} 247}
248 248
249static void tcf_chain_hold(struct tcf_chain *chain) 249static void tcf_chain_hold(struct tcf_chain *chain)
@@ -475,6 +475,7 @@ tcf_chain0_head_change_cb_del(struct tcf_block *block,
475} 475}
476 476
477struct tcf_net { 477struct tcf_net {
478 spinlock_t idr_lock; /* Protects idr */
478 struct idr idr; 479 struct idr idr;
479}; 480};
480 481
@@ -484,16 +485,25 @@ static int tcf_block_insert(struct tcf_block *block, struct net *net,
484 struct netlink_ext_ack *extack) 485 struct netlink_ext_ack *extack)
485{ 486{
486 struct tcf_net *tn = net_generic(net, tcf_net_id); 487 struct tcf_net *tn = net_generic(net, tcf_net_id);
488 int err;
489
490 idr_preload(GFP_KERNEL);
491 spin_lock(&tn->idr_lock);
492 err = idr_alloc_u32(&tn->idr, block, &block->index, block->index,
493 GFP_NOWAIT);
494 spin_unlock(&tn->idr_lock);
495 idr_preload_end();
487 496
488 return idr_alloc_u32(&tn->idr, block, &block->index, block->index, 497 return err;
489 GFP_KERNEL);
490} 498}
491 499
492static void tcf_block_remove(struct tcf_block *block, struct net *net) 500static void tcf_block_remove(struct tcf_block *block, struct net *net)
493{ 501{
494 struct tcf_net *tn = net_generic(net, tcf_net_id); 502 struct tcf_net *tn = net_generic(net, tcf_net_id);
495 503
504 spin_lock(&tn->idr_lock);
496 idr_remove(&tn->idr, block->index); 505 idr_remove(&tn->idr, block->index);
506 spin_unlock(&tn->idr_lock);
497} 507}
498 508
499static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, 509static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
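
Shared-block lookup is moving out from under RTNL, so the per-netns IDR gets its own spinlock; allocation wraps the locked idr_alloc_u32() call in idr_preload()/idr_preload_end() so the sleeping part happens before the lock is taken and the insertion itself can use GFP_NOWAIT. The recipe in isolation (example_* names are invented; passing *index as both start and max pins the exact requested id, as the block code above does):

#include <linux/gfp.h>
#include <linux/idr.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct example_registry {
        spinlock_t      lock;
        struct idr      idr;
};

/* Preallocate outside the lock, insert with GFP_NOWAIT inside it. */
static int example_insert(struct example_registry *reg, void *obj, u32 *index)
{
        int err;

        idr_preload(GFP_KERNEL);
        spin_lock(&reg->lock);
        err = idr_alloc_u32(&reg->idr, obj, index, *index, GFP_NOWAIT);
        spin_unlock(&reg->lock);
        idr_preload_end();

        return err;
}

static void example_remove(struct example_registry *reg, u32 index)
{
        spin_lock(&reg->lock);
        idr_remove(&reg->idr, index);
        spin_unlock(&reg->lock);
}
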
@@ -512,7 +522,7 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
512 INIT_LIST_HEAD(&block->owner_list); 522 INIT_LIST_HEAD(&block->owner_list);
513 INIT_LIST_HEAD(&block->chain0.filter_chain_list); 523 INIT_LIST_HEAD(&block->chain0.filter_chain_list);
514 524
515 block->refcnt = 1; 525 refcount_set(&block->refcnt, 1);
516 block->net = net; 526 block->net = net;
517 block->index = block_index; 527 block->index = block_index;
518 528
@@ -529,6 +539,78 @@ static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index)
529 return idr_find(&tn->idr, block_index); 539 return idr_find(&tn->idr, block_index);
530} 540}
531 541
542static struct tcf_block *tcf_block_refcnt_get(struct net *net, u32 block_index)
543{
544 struct tcf_block *block;
545
546 rcu_read_lock();
547 block = tcf_block_lookup(net, block_index);
548 if (block && !refcount_inc_not_zero(&block->refcnt))
549 block = NULL;
550 rcu_read_unlock();
551
552 return block;
553}
554
555static void tcf_block_flush_all_chains(struct tcf_block *block)
556{
557 struct tcf_chain *chain;
558
559 /* Hold a refcnt for all chains, so that they don't disappear
560 * while we are iterating.
561 */
562 list_for_each_entry(chain, &block->chain_list, list)
563 tcf_chain_hold(chain);
564
565 list_for_each_entry(chain, &block->chain_list, list)
566 tcf_chain_flush(chain);
567}
568
569static void tcf_block_put_all_chains(struct tcf_block *block)
570{
571 struct tcf_chain *chain, *tmp;
572
573 /* At this point, all the chains should have refcnt >= 1. */
574 list_for_each_entry_safe(chain, tmp, &block->chain_list, list) {
575 tcf_chain_put_explicitly_created(chain);
576 tcf_chain_put(chain);
577 }
578}
579
580static void __tcf_block_put(struct tcf_block *block, struct Qdisc *q,
581 struct tcf_block_ext_info *ei)
582{
583 if (refcount_dec_and_test(&block->refcnt)) {
584 /* Flushing/putting all chains will cause the block to be
585 * deallocated when last chain is freed. However, if chain_list
586 * is empty, block has to be manually deallocated. After block
587 * reference counter reached 0, it is no longer possible to
588 * increment it or add new chains to block.
589 */
590 bool free_block = list_empty(&block->chain_list);
591
592 if (tcf_block_shared(block))
593 tcf_block_remove(block, block->net);
594 if (!free_block)
595 tcf_block_flush_all_chains(block);
596
597 if (q)
598 tcf_block_offload_unbind(block, q, ei);
599
600 if (free_block)
601 kfree_rcu(block, rcu);
602 else
603 tcf_block_put_all_chains(block);
604 } else if (q) {
605 tcf_block_offload_unbind(block, q, ei);
606 }
607}
608
609static void tcf_block_refcnt_put(struct tcf_block *block)
610{
611 __tcf_block_put(block, NULL, NULL);
612}
613
532/* Find tcf block. 614/* Find tcf block.
533 * Set q, parent, cl when appropriate. 615 * Set q, parent, cl when appropriate.
534 */ 616 */
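
struct tcf_block now carries a real refcount_t, teardown is centralised in __tcf_block_put(), and lookups take their reference under rcu_read_lock() with refcount_inc_not_zero(), which refuses to resurrect a block whose count has already hit zero. The lookup-and-hold idiom on its own (example_* names are invented; idr_find() is usable here because the IDR's internal tree supports RCU lookups):

#include <linux/idr.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/types.h>

struct example_block {
        refcount_t      refcnt;
        u32             index;
};

/* Keep the object only if a reference can still be taken under RCU. */
static struct example_block *example_get(struct idr *registry, u32 index)
{
        struct example_block *blk;

        rcu_read_lock();
        blk = idr_find(registry, index);
        if (blk && !refcount_inc_not_zero(&blk->refcnt))
                blk = NULL;             /* already being torn down: treat as missing */
        rcu_read_unlock();

        return blk;
}
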
@@ -539,9 +621,10 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q,
539 struct netlink_ext_ack *extack) 621 struct netlink_ext_ack *extack)
540{ 622{
541 struct tcf_block *block; 623 struct tcf_block *block;
624 int err = 0;
542 625
543 if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) { 626 if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
544 block = tcf_block_lookup(net, block_index); 627 block = tcf_block_refcnt_get(net, block_index);
545 if (!block) { 628 if (!block) {
546 NL_SET_ERR_MSG(extack, "Block of given index was not found"); 629 NL_SET_ERR_MSG(extack, "Block of given index was not found");
547 return ERR_PTR(-EINVAL); 630 return ERR_PTR(-EINVAL);
@@ -550,55 +633,106 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q,
550 const struct Qdisc_class_ops *cops; 633 const struct Qdisc_class_ops *cops;
551 struct net_device *dev; 634 struct net_device *dev;
552 635
636 rcu_read_lock();
637
553 /* Find link */ 638 /* Find link */
554 dev = __dev_get_by_index(net, ifindex); 639 dev = dev_get_by_index_rcu(net, ifindex);
555 if (!dev) 640 if (!dev) {
641 rcu_read_unlock();
556 return ERR_PTR(-ENODEV); 642 return ERR_PTR(-ENODEV);
643 }
557 644
558 /* Find qdisc */ 645 /* Find qdisc */
559 if (!*parent) { 646 if (!*parent) {
560 *q = dev->qdisc; 647 *q = dev->qdisc;
561 *parent = (*q)->handle; 648 *parent = (*q)->handle;
562 } else { 649 } else {
563 *q = qdisc_lookup(dev, TC_H_MAJ(*parent)); 650 *q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent));
564 if (!*q) { 651 if (!*q) {
565 NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); 652 NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
566 return ERR_PTR(-EINVAL); 653 err = -EINVAL;
654 goto errout_rcu;
567 } 655 }
568 } 656 }
569 657
658 *q = qdisc_refcount_inc_nz(*q);
659 if (!*q) {
660 NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
661 err = -EINVAL;
662 goto errout_rcu;
663 }
664
570 /* Is it classful? */ 665 /* Is it classful? */
571 cops = (*q)->ops->cl_ops; 666 cops = (*q)->ops->cl_ops;
572 if (!cops) { 667 if (!cops) {
573 NL_SET_ERR_MSG(extack, "Qdisc not classful"); 668 NL_SET_ERR_MSG(extack, "Qdisc not classful");
574 return ERR_PTR(-EINVAL); 669 err = -EINVAL;
670 goto errout_rcu;
575 } 671 }
576 672
577 if (!cops->tcf_block) { 673 if (!cops->tcf_block) {
578 NL_SET_ERR_MSG(extack, "Class doesn't support blocks"); 674 NL_SET_ERR_MSG(extack, "Class doesn't support blocks");
579 return ERR_PTR(-EOPNOTSUPP); 675 err = -EOPNOTSUPP;
676 goto errout_rcu;
580 } 677 }
581 678
679 /* At this point we know that qdisc is not noop_qdisc,
680 * which means that qdisc holds a reference to net_device
681 * and we hold a reference to qdisc, so it is safe to release
682 * rcu read lock.
683 */
684 rcu_read_unlock();
685
582 /* Do we search for filter, attached to class? */ 686 /* Do we search for filter, attached to class? */
583 if (TC_H_MIN(*parent)) { 687 if (TC_H_MIN(*parent)) {
584 *cl = cops->find(*q, *parent); 688 *cl = cops->find(*q, *parent);
585 if (*cl == 0) { 689 if (*cl == 0) {
586 NL_SET_ERR_MSG(extack, "Specified class doesn't exist"); 690 NL_SET_ERR_MSG(extack, "Specified class doesn't exist");
587 return ERR_PTR(-ENOENT); 691 err = -ENOENT;
692 goto errout_qdisc;
588 } 693 }
589 } 694 }
590 695
591 /* And the last stroke */ 696 /* And the last stroke */
592 block = cops->tcf_block(*q, *cl, extack); 697 block = cops->tcf_block(*q, *cl, extack);
593 if (!block) 698 if (!block) {
594 return ERR_PTR(-EINVAL); 699 err = -EINVAL;
700 goto errout_qdisc;
701 }
595 if (tcf_block_shared(block)) { 702 if (tcf_block_shared(block)) {
596 NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters"); 703 NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters");
597 return ERR_PTR(-EOPNOTSUPP); 704 err = -EOPNOTSUPP;
705 goto errout_qdisc;
598 } 706 }
707
708 /* Always take reference to block in order to support execution
709 * of rules update path of cls API without rtnl lock. Caller
710 * must release block when it is finished using it. 'if' block
711 * of this conditional obtain reference to block by calling
712 * tcf_block_refcnt_get().
713 */
714 refcount_inc(&block->refcnt);
599 } 715 }
600 716
601 return block; 717 return block;
718
719errout_rcu:
720 rcu_read_unlock();
721errout_qdisc:
722 if (*q) {
723 qdisc_put(*q);
724 *q = NULL;
725 }
726 return ERR_PTR(err);
727}
728
729static void tcf_block_release(struct Qdisc *q, struct tcf_block *block)
730{
731 if (!IS_ERR_OR_NULL(block))
732 tcf_block_refcnt_put(block);
733
734 if (q)
735 qdisc_put(q);
602} 736}
603 737
604struct tcf_block_owner_item { 738struct tcf_block_owner_item {
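
tcf_block_find() is reworked so it no longer assumes RTNL: the device and qdisc are resolved under rcu_read_lock() via dev_get_by_index_rcu() and the new qdisc_lookup_rcu(), a qdisc reference is taken with qdisc_refcount_inc_nz() before leaving the RCU section, every failure unwinds through errout labels, and callers release the result through the new tcf_block_release(). The acquire-then-pin-then-unlock shape, reduced to the device lookup alone (example_find_dev is invented; only dev_get_by_index_rcu(), dev_hold() and dev_put() are real APIs here):

#include <linux/err.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>

/* Resolve under rcu_read_lock(), pin with a real reference before leaving
 * the RCU section, unwind through one error label. The caller releases the
 * device with dev_put().
 */
static struct net_device *example_find_dev(struct net *net, int ifindex)
{
        struct net_device *dev;
        int err;

        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);       /* lookup only, no reference */
        if (!dev) {
                err = -ENODEV;
                goto errout_rcu;
        }
        dev_hold(dev);                                  /* pin before dropping RCU */
        rcu_read_unlock();

        return dev;

errout_rcu:
        rcu_read_unlock();
        return ERR_PTR(err);
}
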
@@ -666,21 +800,16 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
666{ 800{
667 struct net *net = qdisc_net(q); 801 struct net *net = qdisc_net(q);
668 struct tcf_block *block = NULL; 802 struct tcf_block *block = NULL;
669 bool created = false;
670 int err; 803 int err;
671 804
672 if (ei->block_index) { 805 if (ei->block_index)
673 /* block_index not 0 means the shared block is requested */ 806 /* block_index not 0 means the shared block is requested */
674 block = tcf_block_lookup(net, ei->block_index); 807 block = tcf_block_refcnt_get(net, ei->block_index);
675 if (block)
676 block->refcnt++;
677 }
678 808
679 if (!block) { 809 if (!block) {
680 block = tcf_block_create(net, q, ei->block_index, extack); 810 block = tcf_block_create(net, q, ei->block_index, extack);
681 if (IS_ERR(block)) 811 if (IS_ERR(block))
682 return PTR_ERR(block); 812 return PTR_ERR(block);
683 created = true;
684 if (tcf_block_shared(block)) { 813 if (tcf_block_shared(block)) {
685 err = tcf_block_insert(block, net, extack); 814 err = tcf_block_insert(block, net, extack);
686 if (err) 815 if (err)
@@ -710,14 +839,8 @@ err_block_offload_bind:
710err_chain0_head_change_cb_add: 839err_chain0_head_change_cb_add:
711 tcf_block_owner_del(block, q, ei->binder_type); 840 tcf_block_owner_del(block, q, ei->binder_type);
712err_block_owner_add: 841err_block_owner_add:
713 if (created) {
714 if (tcf_block_shared(block))
715 tcf_block_remove(block, net);
716err_block_insert: 842err_block_insert:
717 kfree(block); 843 tcf_block_refcnt_put(block);
718 } else {
719 block->refcnt--;
720 }
721 return err; 844 return err;
722} 845}
723EXPORT_SYMBOL(tcf_block_get_ext); 846EXPORT_SYMBOL(tcf_block_get_ext);
@@ -749,42 +872,12 @@ EXPORT_SYMBOL(tcf_block_get);
749void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, 872void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
750 struct tcf_block_ext_info *ei) 873 struct tcf_block_ext_info *ei)
751{ 874{
752 struct tcf_chain *chain, *tmp;
753
754 if (!block) 875 if (!block)
755 return; 876 return;
756 tcf_chain0_head_change_cb_del(block, ei); 877 tcf_chain0_head_change_cb_del(block, ei);
757 tcf_block_owner_del(block, q, ei->binder_type); 878 tcf_block_owner_del(block, q, ei->binder_type);
758 879
759 if (block->refcnt == 1) { 880 __tcf_block_put(block, q, ei);
760 if (tcf_block_shared(block))
761 tcf_block_remove(block, block->net);
762
763 /* Hold a refcnt for all chains, so that they don't disappear
764 * while we are iterating.
765 */
766 list_for_each_entry(chain, &block->chain_list, list)
767 tcf_chain_hold(chain);
768
769 list_for_each_entry(chain, &block->chain_list, list)
770 tcf_chain_flush(chain);
771 }
772
773 tcf_block_offload_unbind(block, q, ei);
774
775 if (block->refcnt == 1) {
776 /* At this point, all the chains should have refcnt >= 1. */
777 list_for_each_entry_safe(chain, tmp, &block->chain_list, list) {
778 tcf_chain_put_explicitly_created(chain);
779 tcf_chain_put(chain);
780 }
781
782 block->refcnt--;
783 if (list_empty(&block->chain_list))
784 kfree(block);
785 } else {
786 block->refcnt--;
787 }
788} 881}
789EXPORT_SYMBOL(tcf_block_put_ext); 882EXPORT_SYMBOL(tcf_block_put_ext);
790 883
@@ -1334,6 +1427,7 @@ replay:
1334errout: 1427errout:
1335 if (chain) 1428 if (chain)
1336 tcf_chain_put(chain); 1429 tcf_chain_put(chain);
1430 tcf_block_release(q, block);
1337 if (err == -EAGAIN) 1431 if (err == -EAGAIN)
1338 /* Replay the request. */ 1432 /* Replay the request. */
1339 goto replay; 1433 goto replay;
@@ -1455,6 +1549,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
1455errout: 1549errout:
1456 if (chain) 1550 if (chain)
1457 tcf_chain_put(chain); 1551 tcf_chain_put(chain);
1552 tcf_block_release(q, block);
1458 return err; 1553 return err;
1459} 1554}
1460 1555
@@ -1540,6 +1635,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
1540errout: 1635errout:
1541 if (chain) 1636 if (chain)
1542 tcf_chain_put(chain); 1637 tcf_chain_put(chain);
1638 tcf_block_release(q, block);
1543 return err; 1639 return err;
1544} 1640}
1545 1641
@@ -1633,12 +1729,13 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
1633 if (nlmsg_len(cb->nlh) < sizeof(*tcm)) 1729 if (nlmsg_len(cb->nlh) < sizeof(*tcm))
1634 return skb->len; 1730 return skb->len;
1635 1731
1636 err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL); 1732 err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL,
1733 cb->extack);
1637 if (err) 1734 if (err)
1638 return err; 1735 return err;
1639 1736
1640 if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { 1737 if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
1641 block = tcf_block_lookup(net, tcm->tcm_block_index); 1738 block = tcf_block_refcnt_get(net, tcm->tcm_block_index);
1642 if (!block) 1739 if (!block)
1643 goto out; 1740 goto out;
1644 /* If we work with block index, q is NULL and parent value 1741 /* If we work with block index, q is NULL and parent value
@@ -1697,6 +1794,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
1697 } 1794 }
1698 } 1795 }
1699 1796
1797 if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK)
1798 tcf_block_refcnt_put(block);
1700 cb->args[0] = index; 1799 cb->args[0] = index;
1701 1800
1702out: 1801out:
@@ -1856,7 +1955,8 @@ replay:
1856 chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; 1955 chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
1857 if (chain_index > TC_ACT_EXT_VAL_MASK) { 1956 if (chain_index > TC_ACT_EXT_VAL_MASK) {
1858 NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); 1957 NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
1859 return -EINVAL; 1958 err = -EINVAL;
1959 goto errout_block;
1860 } 1960 }
1861 chain = tcf_chain_lookup(block, chain_index); 1961 chain = tcf_chain_lookup(block, chain_index);
1862 if (n->nlmsg_type == RTM_NEWCHAIN) { 1962 if (n->nlmsg_type == RTM_NEWCHAIN) {
@@ -1868,23 +1968,27 @@ replay:
1868 tcf_chain_hold(chain); 1968 tcf_chain_hold(chain);
1869 } else { 1969 } else {
1870 NL_SET_ERR_MSG(extack, "Filter chain already exists"); 1970 NL_SET_ERR_MSG(extack, "Filter chain already exists");
1871 return -EEXIST; 1971 err = -EEXIST;
1972 goto errout_block;
1872 } 1973 }
1873 } else { 1974 } else {
1874 if (!(n->nlmsg_flags & NLM_F_CREATE)) { 1975 if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1875 NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); 1976 NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain");
1876 return -ENOENT; 1977 err = -ENOENT;
1978 goto errout_block;
1877 } 1979 }
1878 chain = tcf_chain_create(block, chain_index); 1980 chain = tcf_chain_create(block, chain_index);
1879 if (!chain) { 1981 if (!chain) {
1880 NL_SET_ERR_MSG(extack, "Failed to create filter chain"); 1982 NL_SET_ERR_MSG(extack, "Failed to create filter chain");
1881 return -ENOMEM; 1983 err = -ENOMEM;
1984 goto errout_block;
1882 } 1985 }
1883 } 1986 }
1884 } else { 1987 } else {
1885 if (!chain || tcf_chain_held_by_acts_only(chain)) { 1988 if (!chain || tcf_chain_held_by_acts_only(chain)) {
1886 NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); 1989 NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
1887 return -EINVAL; 1990 err = -EINVAL;
1991 goto errout_block;
1888 } 1992 }
1889 tcf_chain_hold(chain); 1993 tcf_chain_hold(chain);
1890 } 1994 }
@@ -1928,6 +2032,8 @@ replay:
1928 2032
1929errout: 2033errout:
1930 tcf_chain_put(chain); 2034 tcf_chain_put(chain);
2035errout_block:
2036 tcf_block_release(q, block);
1931 if (err == -EAGAIN) 2037 if (err == -EAGAIN)
1932 /* Replay the request. */ 2038 /* Replay the request. */
1933 goto replay; 2039 goto replay;
@@ -1952,12 +2058,12 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
1952 return skb->len; 2058 return skb->len;
1953 2059
1954 err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, 2060 err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
1955 NULL); 2061 cb->extack);
1956 if (err) 2062 if (err)
1957 return err; 2063 return err;
1958 2064
1959 if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { 2065 if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
1960 block = tcf_block_lookup(net, tcm->tcm_block_index); 2066 block = tcf_block_refcnt_get(net, tcm->tcm_block_index);
1961 if (!block) 2067 if (!block)
1962 goto out; 2068 goto out;
1963 /* If we work with block index, q is NULL and parent value 2069 /* If we work with block index, q is NULL and parent value
@@ -2024,6 +2130,8 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
2024 index++; 2130 index++;
2025 } 2131 }
2026 2132
2133 if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK)
2134 tcf_block_refcnt_put(block);
2027 cb->args[0] = index; 2135 cb->args[0] = index;
2028 2136
2029out: 2137out:
@@ -2216,6 +2324,7 @@ static __net_init int tcf_net_init(struct net *net)
2216{ 2324{
2217 struct tcf_net *tn = net_generic(net, tcf_net_id); 2325 struct tcf_net *tn = net_generic(net, tcf_net_id);
2218 2326
2327 spin_lock_init(&tn->idr_lock);
2219 idr_init(&tn->idr); 2328 idr_init(&tn->idr);
2220 return 0; 2329 return 0;
2221} 2330}
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 6fd9bdd93796..9aada2d0ef06 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -98,7 +98,7 @@ struct cls_fl_filter {
98 struct list_head list; 98 struct list_head list;
99 u32 handle; 99 u32 handle;
100 u32 flags; 100 u32 flags;
101 unsigned int in_hw_count; 101 u32 in_hw_count;
102 struct rcu_work rwork; 102 struct rcu_work rwork;
103 struct net_device *hw_dev; 103 struct net_device *hw_dev;
104}; 104};
@@ -993,7 +993,7 @@ static int fl_init_mask_hashtable(struct fl_flow_mask *mask)
993} 993}
994 994
995#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member) 995#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member)
996#define FL_KEY_MEMBER_SIZE(member) (sizeof(((struct fl_flow_key *) 0)->member)) 996#define FL_KEY_MEMBER_SIZE(member) FIELD_SIZEOF(struct fl_flow_key, member)
997 997
998#define FL_KEY_IS_MASKED(mask, member) \ 998#define FL_KEY_IS_MASKED(mask, member) \
999 memchr_inv(((char *)mask) + FL_KEY_MEMBER_OFFSET(member), \ 999 memchr_inv(((char *)mask) + FL_KEY_MEMBER_OFFSET(member), \
@@ -1880,6 +1880,9 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
1880 if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags)) 1880 if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
1881 goto nla_put_failure; 1881 goto nla_put_failure;
1882 1882
1883 if (nla_put_u32(skb, TCA_FLOWER_IN_HW_COUNT, f->in_hw_count))
1884 goto nla_put_failure;
1885
1883 if (tcf_exts_dump(skb, &f->exts)) 1886 if (tcf_exts_dump(skb, &f->exts))
1884 goto nla_put_failure; 1887 goto nla_put_failure;
1885 1888
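
Two small flower changes: in_hw_count becomes an explicit u32 and is now reported to userspace as TCA_FLOWER_IN_HW_COUNT, and the hand-rolled sizeof-through-a-NULL-pointer macro is replaced by FIELD_SIZEOF(), which expands to the same expression under a standard name. A compile-time sketch of that equivalence (struct example_key and both macros are invented):

#include <linux/build_bug.h>
#include <linux/kernel.h>       /* FIELD_SIZEOF() lives here in kernels of this vintage */
#include <linux/types.h>

struct example_key {
        u32     basic;
        u8      addr[16];
};

/* The old flower spelling: sizeof() through a NULL-based struct pointer. */
#define EXAMPLE_MEMBER_SIZE_OLD(member) \
        (sizeof(((struct example_key *)0)->member))

/* Both forms are the same compile-time constant; FIELD_SIZEOF just names the idiom. */
static void __maybe_unused example_check(void)
{
        BUILD_BUG_ON(EXAMPLE_MEMBER_SIZE_OLD(addr) !=
                     FIELD_SIZEOF(struct example_key, addr));
}
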
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index b2c3406a2cf2..4b28fd44576d 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -68,7 +68,6 @@ struct tc_u_knode {
68 u32 mask; 68 u32 mask;
69 u32 __percpu *pcpu_success; 69 u32 __percpu *pcpu_success;
70#endif 70#endif
71 struct tcf_proto *tp;
72 struct rcu_work rwork; 71 struct rcu_work rwork;
73 /* The 'sel' field MUST be the last field in structure to allow for 72 /* The 'sel' field MUST be the last field in structure to allow for
74 * tc_u32_keys allocated at end of structure. 73 * tc_u32_keys allocated at end of structure.
@@ -80,10 +79,10 @@ struct tc_u_hnode {
80 struct tc_u_hnode __rcu *next; 79 struct tc_u_hnode __rcu *next;
81 u32 handle; 80 u32 handle;
82 u32 prio; 81 u32 prio;
83 struct tc_u_common *tp_c;
84 int refcnt; 82 int refcnt;
85 unsigned int divisor; 83 unsigned int divisor;
86 struct idr handle_idr; 84 struct idr handle_idr;
85 bool is_root;
87 struct rcu_head rcu; 86 struct rcu_head rcu;
88 u32 flags; 87 u32 flags;
89 /* The 'ht' field MUST be the last field in structure to allow for 88 /* The 'ht' field MUST be the last field in structure to allow for
@@ -98,7 +97,7 @@ struct tc_u_common {
98 int refcnt; 97 int refcnt;
99 struct idr handle_idr; 98 struct idr handle_idr;
100 struct hlist_node hnode; 99 struct hlist_node hnode;
101 struct rcu_head rcu; 100 long knodes;
102}; 101};
103 102
104static inline unsigned int u32_hash_fold(__be32 key, 103static inline unsigned int u32_hash_fold(__be32 key,
@@ -344,19 +343,16 @@ static void *tc_u_common_ptr(const struct tcf_proto *tp)
344 return block->q; 343 return block->q;
345} 344}
346 345
347static unsigned int tc_u_hash(const struct tcf_proto *tp) 346static struct hlist_head *tc_u_hash(void *key)
348{ 347{
349 return hash_ptr(tc_u_common_ptr(tp), U32_HASH_SHIFT); 348 return tc_u_common_hash + hash_ptr(key, U32_HASH_SHIFT);
350} 349}
351 350
352static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp) 351static struct tc_u_common *tc_u_common_find(void *key)
353{ 352{
354 struct tc_u_common *tc; 353 struct tc_u_common *tc;
355 unsigned int h; 354 hlist_for_each_entry(tc, tc_u_hash(key), hnode) {
356 355 if (tc->ptr == key)
357 h = tc_u_hash(tp);
358 hlist_for_each_entry(tc, &tc_u_common_hash[h], hnode) {
359 if (tc->ptr == tc_u_common_ptr(tp))
360 return tc; 356 return tc;
361 } 357 }
362 return NULL; 358 return NULL;
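
cls_u32 stops hashing on the tcf_proto and keys its table of shared tc_u_common objects directly on the void * identity returned by tc_u_common_ptr(), so lookup and insertion both go through the same tc_u_hash(key) bucket helper. The underlying idiom, hash_ptr() into a fixed array of hlist heads, looks roughly like this (example_* names are invented; EXAMPLE_HASH_SHIFT stands in for U32_HASH_SHIFT):

#include <linux/hash.h>
#include <linux/list.h>
#include <linux/types.h>

#define EXAMPLE_HASH_SHIFT 10

static struct hlist_head example_hash[1 << EXAMPLE_HASH_SHIFT];

struct example_common {
        void                    *ptr;   /* identity of the owning qdisc/block */
        struct hlist_node       hnode;
};

static struct hlist_head *example_bucket(void *key)
{
        return example_hash + hash_ptr(key, EXAMPLE_HASH_SHIFT);
}

static struct example_common *example_find(void *key)
{
        struct example_common *c;

        hlist_for_each_entry(c, example_bucket(key), hnode)
                if (c->ptr == key)
                        return c;
        return NULL;
}

static void example_insert(struct example_common *c)
{
        hlist_add_head(&c->hnode, example_bucket(c->ptr));
}
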
@@ -365,10 +361,8 @@ static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp)
365static int u32_init(struct tcf_proto *tp) 361static int u32_init(struct tcf_proto *tp)
366{ 362{
367 struct tc_u_hnode *root_ht; 363 struct tc_u_hnode *root_ht;
368 struct tc_u_common *tp_c; 364 void *key = tc_u_common_ptr(tp);
369 unsigned int h; 365 struct tc_u_common *tp_c = tc_u_common_find(key);
370
371 tp_c = tc_u_common_find(tp);
372 366
373 root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL); 367 root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL);
374 if (root_ht == NULL) 368 if (root_ht == NULL)
@@ -377,6 +371,7 @@ static int u32_init(struct tcf_proto *tp)
377 root_ht->refcnt++; 371 root_ht->refcnt++;
378 root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000; 372 root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000;
379 root_ht->prio = tp->prio; 373 root_ht->prio = tp->prio;
374 root_ht->is_root = true;
380 idr_init(&root_ht->handle_idr); 375 idr_init(&root_ht->handle_idr);
381 376
382 if (tp_c == NULL) { 377 if (tp_c == NULL) {
@@ -385,18 +380,16 @@ static int u32_init(struct tcf_proto *tp)
385 kfree(root_ht); 380 kfree(root_ht);
386 return -ENOBUFS; 381 return -ENOBUFS;
387 } 382 }
388 tp_c->ptr = tc_u_common_ptr(tp); 383 tp_c->ptr = key;
389 INIT_HLIST_NODE(&tp_c->hnode); 384 INIT_HLIST_NODE(&tp_c->hnode);
390 idr_init(&tp_c->handle_idr); 385 idr_init(&tp_c->handle_idr);
391 386
392 h = tc_u_hash(tp); 387 hlist_add_head(&tp_c->hnode, tc_u_hash(key));
393 hlist_add_head(&tp_c->hnode, &tc_u_common_hash[h]);
394 } 388 }
395 389
396 tp_c->refcnt++; 390 tp_c->refcnt++;
397 RCU_INIT_POINTER(root_ht->next, tp_c->hlist); 391 RCU_INIT_POINTER(root_ht->next, tp_c->hlist);
398 rcu_assign_pointer(tp_c->hlist, root_ht); 392 rcu_assign_pointer(tp_c->hlist, root_ht);
399 root_ht->tp_c = tp_c;
400 393
401 root_ht->refcnt++; 394 root_ht->refcnt++;
402 rcu_assign_pointer(tp->root, root_ht); 395 rcu_assign_pointer(tp->root, root_ht);
@@ -404,8 +397,7 @@ static int u32_init(struct tcf_proto *tp)
404 return 0; 397 return 0;
405} 398}
406 399
407static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n, 400static int u32_destroy_key(struct tc_u_knode *n, bool free_pf)
408 bool free_pf)
409{ 401{
410 struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); 402 struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
411 403
@@ -439,7 +431,7 @@ static void u32_delete_key_work(struct work_struct *work)
439 struct tc_u_knode, 431 struct tc_u_knode,
440 rwork); 432 rwork);
441 rtnl_lock(); 433 rtnl_lock();
442 u32_destroy_key(key->tp, key, false); 434 u32_destroy_key(key, false);
443 rtnl_unlock(); 435 rtnl_unlock();
444} 436}
445 437
@@ -456,12 +448,13 @@ static void u32_delete_key_freepf_work(struct work_struct *work)
456 struct tc_u_knode, 448 struct tc_u_knode,
457 rwork); 449 rwork);
458 rtnl_lock(); 450 rtnl_lock();
459 u32_destroy_key(key->tp, key, true); 451 u32_destroy_key(key, true);
460 rtnl_unlock(); 452 rtnl_unlock();
461} 453}
462 454
463static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) 455static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
464{ 456{
457 struct tc_u_common *tp_c = tp->data;
465 struct tc_u_knode __rcu **kp; 458 struct tc_u_knode __rcu **kp;
466 struct tc_u_knode *pkp; 459 struct tc_u_knode *pkp;
467 struct tc_u_hnode *ht = rtnl_dereference(key->ht_up); 460 struct tc_u_hnode *ht = rtnl_dereference(key->ht_up);
@@ -472,6 +465,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
472 kp = &pkp->next, pkp = rtnl_dereference(*kp)) { 465 kp = &pkp->next, pkp = rtnl_dereference(*kp)) {
473 if (pkp == key) { 466 if (pkp == key) {
474 RCU_INIT_POINTER(*kp, key->next); 467 RCU_INIT_POINTER(*kp, key->next);
468 tp_c->knodes--;
475 469
476 tcf_unbind_filter(tp, &key->res); 470 tcf_unbind_filter(tp, &key->res);
477 idr_remove(&ht->handle_idr, key->handle); 471 idr_remove(&ht->handle_idr, key->handle);
@@ -586,6 +580,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
586static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, 580static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
587 struct netlink_ext_ack *extack) 581 struct netlink_ext_ack *extack)
588{ 582{
583 struct tc_u_common *tp_c = tp->data;
589 struct tc_u_knode *n; 584 struct tc_u_knode *n;
590 unsigned int h; 585 unsigned int h;
591 586
@@ -593,13 +588,14 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
593 while ((n = rtnl_dereference(ht->ht[h])) != NULL) { 588 while ((n = rtnl_dereference(ht->ht[h])) != NULL) {
594 RCU_INIT_POINTER(ht->ht[h], 589 RCU_INIT_POINTER(ht->ht[h],
595 rtnl_dereference(n->next)); 590 rtnl_dereference(n->next));
591 tp_c->knodes--;
596 tcf_unbind_filter(tp, &n->res); 592 tcf_unbind_filter(tp, &n->res);
597 u32_remove_hw_knode(tp, n, extack); 593 u32_remove_hw_knode(tp, n, extack);
598 idr_remove(&ht->handle_idr, n->handle); 594 idr_remove(&ht->handle_idr, n->handle);
599 if (tcf_exts_get_net(&n->exts)) 595 if (tcf_exts_get_net(&n->exts))
600 tcf_queue_work(&n->rwork, u32_delete_key_freepf_work); 596 tcf_queue_work(&n->rwork, u32_delete_key_freepf_work);
601 else 597 else
602 u32_destroy_key(n->tp, n, true); 598 u32_destroy_key(n, true);
603 } 599 }
604 } 600 }
605} 601}
@@ -632,17 +628,6 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
632 return -ENOENT; 628 return -ENOENT;
633} 629}
634 630
635static bool ht_empty(struct tc_u_hnode *ht)
636{
637 unsigned int h;
638
639 for (h = 0; h <= ht->divisor; h++)
640 if (rcu_access_pointer(ht->ht[h]))
641 return false;
642
643 return true;
644}
645
646static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) 631static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
647{ 632{
648 struct tc_u_common *tp_c = tp->data; 633 struct tc_u_common *tp_c = tp->data;
@@ -680,20 +665,16 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
680 struct netlink_ext_ack *extack) 665 struct netlink_ext_ack *extack)
681{ 666{
682 struct tc_u_hnode *ht = arg; 667 struct tc_u_hnode *ht = arg;
683 struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
684 struct tc_u_common *tp_c = tp->data; 668 struct tc_u_common *tp_c = tp->data;
685 int ret = 0; 669 int ret = 0;
686 670
687 if (ht == NULL)
688 goto out;
689
690 if (TC_U32_KEY(ht->handle)) { 671 if (TC_U32_KEY(ht->handle)) {
691 u32_remove_hw_knode(tp, (struct tc_u_knode *)ht, extack); 672 u32_remove_hw_knode(tp, (struct tc_u_knode *)ht, extack);
692 ret = u32_delete_key(tp, (struct tc_u_knode *)ht); 673 ret = u32_delete_key(tp, (struct tc_u_knode *)ht);
693 goto out; 674 goto out;
694 } 675 }
695 676
696 if (root_ht == ht) { 677 if (ht->is_root) {
697 NL_SET_ERR_MSG_MOD(extack, "Not allowed to delete root node"); 678 NL_SET_ERR_MSG_MOD(extack, "Not allowed to delete root node");
698 return -EINVAL; 679 return -EINVAL;
699 } 680 }
@@ -706,38 +687,7 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
706 } 687 }
707 688
708out: 689out:
709 *last = true; 690 *last = tp_c->refcnt == 1 && tp_c->knodes == 0;
710 if (root_ht) {
711 if (root_ht->refcnt > 2) {
712 *last = false;
713 goto ret;
714 }
715 if (root_ht->refcnt == 2) {
716 if (!ht_empty(root_ht)) {
717 *last = false;
718 goto ret;
719 }
720 }
721 }
722
723 if (tp_c->refcnt > 1) {
724 *last = false;
725 goto ret;
726 }
727
728 if (tp_c->refcnt == 1) {
729 struct tc_u_hnode *ht;
730
731 for (ht = rtnl_dereference(tp_c->hlist);
732 ht;
733 ht = rtnl_dereference(ht->next))
734 if (!ht_empty(ht)) {
735 *last = false;
736 break;
737 }
738 }
739
740ret:
741 return ret; 691 return ret;
742} 692}
743 693
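
u32_delete() used to decide whether the last filter was gone by walking every hash table with ht_empty(); the hunk replaces that with a knodes counter maintained on every insert and unlink, so the answer reduces to a single comparison: the proto is empty when it holds the only reference to tc_u_common and no key nodes remain. A sketch of the counting version (illustrative types; in the real patch the counter is touched in u32_change(), u32_delete_key() and u32_clear_hnode()):

#include <linux/types.h>

struct example_common {
        int     refcnt;         /* protos sharing this table */
        long    knodes;         /* live key nodes across all hash tables */
};

static void example_knode_inserted(struct example_common *c)
{
        c->knodes++;            /* bumped on every successful insert */
}

static void example_knode_removed(struct example_common *c)
{
        c->knodes--;            /* dropped on every unlink and on flush */
}

static bool example_is_last(const struct example_common *c)
{
        /* Equivalent of: *last = tp_c->refcnt == 1 && tp_c->knodes == 0; */
        return c->refcnt == 1 && c->knodes == 0;
}
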
@@ -768,7 +718,7 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
768}; 718};
769 719
770static int u32_set_parms(struct net *net, struct tcf_proto *tp, 720static int u32_set_parms(struct net *net, struct tcf_proto *tp,
771 unsigned long base, struct tc_u_hnode *ht, 721 unsigned long base,
772 struct tc_u_knode *n, struct nlattr **tb, 722 struct tc_u_knode *n, struct nlattr **tb,
773 struct nlattr *est, bool ovr, 723 struct nlattr *est, bool ovr,
774 struct netlink_ext_ack *extack) 724 struct netlink_ext_ack *extack)
@@ -789,12 +739,16 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
789 } 739 }
790 740
791 if (handle) { 741 if (handle) {
792 ht_down = u32_lookup_ht(ht->tp_c, handle); 742 ht_down = u32_lookup_ht(tp->data, handle);
793 743
794 if (!ht_down) { 744 if (!ht_down) {
795 NL_SET_ERR_MSG_MOD(extack, "Link hash table not found"); 745 NL_SET_ERR_MSG_MOD(extack, "Link hash table not found");
796 return -EINVAL; 746 return -EINVAL;
797 } 747 }
748 if (ht_down->is_root) {
749 NL_SET_ERR_MSG_MOD(extack, "Not linking to root node");
750 return -EINVAL;
751 }
798 ht_down->refcnt++; 752 ht_down->refcnt++;
799 } 753 }
800 754
@@ -891,7 +845,6 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
891 /* Similarly success statistics must be moved as pointers */ 845 /* Similarly success statistics must be moved as pointers */
892 new->pcpu_success = n->pcpu_success; 846 new->pcpu_success = n->pcpu_success;
893#endif 847#endif
894 new->tp = tp;
895 memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); 848 memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
896 849
897 if (tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE)) { 850 if (tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE)) {
@@ -960,18 +913,17 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
960 if (!new) 913 if (!new)
961 return -ENOMEM; 914 return -ENOMEM;
962 915
963 err = u32_set_parms(net, tp, base, 916 err = u32_set_parms(net, tp, base, new, tb,
964 rtnl_dereference(n->ht_up), new, tb,
965 tca[TCA_RATE], ovr, extack); 917 tca[TCA_RATE], ovr, extack);
966 918
967 if (err) { 919 if (err) {
968 u32_destroy_key(tp, new, false); 920 u32_destroy_key(new, false);
969 return err; 921 return err;
970 } 922 }
971 923
972 err = u32_replace_hw_knode(tp, new, flags, extack); 924 err = u32_replace_hw_knode(tp, new, flags, extack);
973 if (err) { 925 if (err) {
974 u32_destroy_key(tp, new, false); 926 u32_destroy_key(new, false);
975 return err; 927 return err;
976 } 928 }
977 929
@@ -988,7 +940,11 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
988 if (tb[TCA_U32_DIVISOR]) { 940 if (tb[TCA_U32_DIVISOR]) {
989 unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]); 941 unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);
990 942
991 if (--divisor > 0x100) { 943 if (!is_power_of_2(divisor)) {
944 NL_SET_ERR_MSG_MOD(extack, "Divisor is not a power of 2");
945 return -EINVAL;
946 }
947 if (divisor-- > 0x100) {
992 NL_SET_ERR_MSG_MOD(extack, "Exceeded maximum 256 hash buckets"); 948 NL_SET_ERR_MSG_MOD(extack, "Exceeded maximum 256 hash buckets");
993 return -EINVAL; 949 return -EINVAL;
994 } 950 }
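
The divisor hunk makes two corrections: the user-supplied divisor must now be a power of two, because the stored value (divisor - 1) is used as a hash mask and any other size would silently alias buckets, and the decrement is moved after the range comparison so the 256-bucket limit is checked against the value the user actually passed. A tiny sketch of that validation order (example_check_divisor is invented):

#include <linux/errno.h>
#include <linux/log2.h>
#include <linux/types.h>

/* Validate the user value first, derive the stored mask afterwards. */
static int example_check_divisor(u32 divisor, u32 *mask)
{
        if (!is_power_of_2(divisor))
                return -EINVAL;         /* mask-based hashing needs 2^n buckets */
        if (divisor > 0x100)
                return -EINVAL;         /* at most 256 buckets */
        *mask = divisor - 1;            /* stored form, used as a hash mask */
        return 0;
}
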
@@ -1013,7 +969,6 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
1013 return err; 969 return err;
1014 } 970 }
1015 } 971 }
1016 ht->tp_c = tp_c;
1017 ht->refcnt = 1; 972 ht->refcnt = 1;
1018 ht->divisor = divisor; 973 ht->divisor = divisor;
1019 ht->handle = handle; 974 ht->handle = handle;
@@ -1103,7 +1058,6 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
1103 n->handle = handle; 1058 n->handle = handle;
1104 n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; 1059 n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
1105 n->flags = flags; 1060 n->flags = flags;
1106 n->tp = tp;
1107 1061
1108 err = tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); 1062 err = tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
1109 if (err < 0) 1063 if (err < 0)
@@ -1125,7 +1079,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
1125 } 1079 }
1126#endif 1080#endif
1127 1081
1128 err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr, 1082 err = u32_set_parms(net, tp, base, n, tb, tca[TCA_RATE], ovr,
1129 extack); 1083 extack);
1130 if (err == 0) { 1084 if (err == 0) {
1131 struct tc_u_knode __rcu **ins; 1085 struct tc_u_knode __rcu **ins;
@@ -1146,6 +1100,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
1146 1100
1147 RCU_INIT_POINTER(n->next, pins); 1101 RCU_INIT_POINTER(n->next, pins);
1148 rcu_assign_pointer(*ins, n); 1102 rcu_assign_pointer(*ins, n);
1103 tp_c->knodes++;
1149 *arg = n; 1104 *arg = n;
1150 return 0; 1105 return 0;
1151 } 1106 }
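
For reference, a minimal user-space C sketch of the divisor validation added in the hunk above: the divisor must be a power of two and may not exceed 256 buckets. The helper names are illustrative stand-ins, not the kernel implementation; the strings in the comments echo the extack messages above.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the kernel's is_power_of_2() helper. */
static bool power_of_2(unsigned int n)
{
        return n != 0 && (n & (n - 1)) == 0;
}

/* Validate a u32 hash-table divisor the way the new code does:
 * it must be a power of two and may not exceed 256 buckets. */
static int validate_divisor(unsigned int divisor)
{
        if (!power_of_2(divisor))
                return -1;      /* "Divisor is not a power of 2" */
        if (divisor > 0x100)
                return -1;      /* "Exceeded maximum 256 hash buckets" */
        return 0;
}

int main(void)
{
        printf("%d %d %d\n",
               validate_divisor(256),   /* accepted */
               validate_divisor(100),   /* rejected: not a power of 2 */
               validate_divisor(512));  /* rejected: too many buckets */
        return 0;
}
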
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 3dc0acf54245..ca3b0f46de53 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -27,7 +27,6 @@
27#include <linux/kmod.h> 27#include <linux/kmod.h>
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/hrtimer.h> 29#include <linux/hrtimer.h>
30#include <linux/lockdep.h>
31#include <linux/slab.h> 30#include <linux/slab.h>
32#include <linux/hashtable.h> 31#include <linux/hashtable.h>
33 32
@@ -315,6 +314,24 @@ out:
315 return q; 314 return q;
316} 315}
317 316
317struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
318{
319 struct netdev_queue *nq;
320 struct Qdisc *q;
321
322 if (!handle)
323 return NULL;
324 q = qdisc_match_from_root(dev->qdisc, handle);
325 if (q)
326 goto out;
327
328 nq = dev_ingress_queue_rcu(dev);
329 if (nq)
330 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
331out:
332 return q;
333}
334
318static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) 335static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
319{ 336{
320 unsigned long cl; 337 unsigned long cl;
@@ -921,7 +938,7 @@ static void notify_and_destroy(struct net *net, struct sk_buff *skb,
921 qdisc_notify(net, skb, n, clid, old, new); 938 qdisc_notify(net, skb, n, clid, old, new);
922 939
923 if (old) 940 if (old)
924 qdisc_destroy(old); 941 qdisc_put(old);
925} 942}
926 943
927/* Graft qdisc "new" to class "classid" of qdisc "parent" or 944/* Graft qdisc "new" to class "classid" of qdisc "parent" or
@@ -974,7 +991,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
974 qdisc_refcount_inc(new); 991 qdisc_refcount_inc(new);
975 992
976 if (!ingress) 993 if (!ingress)
977 qdisc_destroy(old); 994 qdisc_put(old);
978 } 995 }
979 996
980skip: 997skip:
@@ -1053,10 +1070,6 @@ static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1053 return 0; 1070 return 0;
1054} 1071}
1055 1072
1056/* lockdep annotation is needed for ingress; egress gets it only for name */
1057static struct lock_class_key qdisc_tx_lock;
1058static struct lock_class_key qdisc_rx_lock;
1059
1060/* 1073/*
1061 Allocate and initialize new qdisc. 1074 Allocate and initialize new qdisc.
1062 1075
@@ -1121,7 +1134,6 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
1121 if (handle == TC_H_INGRESS) { 1134 if (handle == TC_H_INGRESS) {
1122 sch->flags |= TCQ_F_INGRESS; 1135 sch->flags |= TCQ_F_INGRESS;
1123 handle = TC_H_MAKE(TC_H_INGRESS, 0); 1136 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1124 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
1125 } else { 1137 } else {
1126 if (handle == 0) { 1138 if (handle == 0) {
1127 handle = qdisc_alloc_handle(dev); 1139 handle = qdisc_alloc_handle(dev);
@@ -1129,7 +1141,6 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
1129 if (handle == 0) 1141 if (handle == 0)
1130 goto err_out3; 1142 goto err_out3;
1131 } 1143 }
1132 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
1133 if (!netif_is_multiqueue(dev)) 1144 if (!netif_is_multiqueue(dev))
1134 sch->flags |= TCQ_F_ONETXQUEUE; 1145 sch->flags |= TCQ_F_ONETXQUEUE;
1135 } 1146 }
@@ -1309,7 +1320,6 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1309 1320
1310const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { 1321const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1311 [TCA_KIND] = { .type = NLA_STRING }, 1322 [TCA_KIND] = { .type = NLA_STRING },
1312 [TCA_OPTIONS] = { .type = NLA_NESTED },
1313 [TCA_RATE] = { .type = NLA_BINARY, 1323 [TCA_RATE] = { .type = NLA_BINARY,
1314 .len = sizeof(struct tc_estimator) }, 1324 .len = sizeof(struct tc_estimator) },
1315 [TCA_STAB] = { .type = NLA_NESTED }, 1325 [TCA_STAB] = { .type = NLA_NESTED },
@@ -1582,7 +1592,7 @@ graft:
1582 err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack); 1592 err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1583 if (err) { 1593 if (err) {
1584 if (q) 1594 if (q)
1585 qdisc_destroy(q); 1595 qdisc_put(q);
1586 return err; 1596 return err;
1587 } 1597 }
1588 1598
@@ -1660,7 +1670,7 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1660 ASSERT_RTNL(); 1670 ASSERT_RTNL();
1661 1671
1662 err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, 1672 err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1663 rtm_tca_policy, NULL); 1673 rtm_tca_policy, cb->extack);
1664 if (err < 0) 1674 if (err < 0)
1665 return err; 1675 return err;
1666 1676
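
The new qdisc_lookup_rcu() above tries the egress hierarchy first and only then falls back to the device's ingress queue. A heavily reduced user-space sketch of that control flow follows; the struct definitions are hypothetical stand-ins, and a flat list walk stands in for qdisc_match_from_root().

#include <stddef.h>
#include <stdio.h>

struct qdisc { unsigned int handle; struct qdisc *next; };
struct device { struct qdisc *root; struct qdisc *ingress; };

/* Flat list walk standing in for qdisc_match_from_root(). */
static struct qdisc *match_from(struct qdisc *q, unsigned int handle)
{
        for (; q; q = q->next)
                if (q->handle == handle)
                        return q;
        return NULL;
}

/* Same control flow as qdisc_lookup_rcu(): reject handle 0, search
 * the egress hierarchy, then fall back to the ingress queue. */
static struct qdisc *lookup(struct device *dev, unsigned int handle)
{
        struct qdisc *q;

        if (!handle)
                return NULL;
        q = match_from(dev->root, handle);
        if (q)
                return q;
        return dev->ingress ? match_from(dev->ingress, handle) : NULL;
}

int main(void)
{
        struct qdisc egress = { 0x10000u, NULL };
        struct qdisc ingress = { 0xffff0000u, NULL };
        struct device dev = { &egress, &ingress };

        printf("%s\n", lookup(&dev, 0x10000u) ? "found egress" : "miss");
        printf("%s\n", lookup(&dev, 0xffff0000u) ? "found ingress" : "miss");
        return 0;
}
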
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index cd49afca9617..d714d3747bcb 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -150,7 +150,7 @@ static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
150 pr_debug("atm_tc_put: destroying\n"); 150 pr_debug("atm_tc_put: destroying\n");
151 list_del_init(&flow->list); 151 list_del_init(&flow->list);
152 pr_debug("atm_tc_put: qdisc %p\n", flow->q); 152 pr_debug("atm_tc_put: qdisc %p\n", flow->q);
153 qdisc_destroy(flow->q); 153 qdisc_put(flow->q);
154 tcf_block_put(flow->block); 154 tcf_block_put(flow->block);
155 if (flow->sock) { 155 if (flow->sock) {
156 pr_debug("atm_tc_put: f_count %ld\n", 156 pr_debug("atm_tc_put: f_count %ld\n",
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 793016d722ec..b910cd5c56f7 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -812,7 +812,7 @@ static struct sk_buff *dequeue_head(struct cake_flow *flow)
812 812
813 if (skb) { 813 if (skb) {
814 flow->head = skb->next; 814 flow->head = skb->next;
815 skb->next = NULL; 815 skb_mark_not_on_list(skb);
816 } 816 }
817 817
818 return skb; 818 return skb;
@@ -1252,7 +1252,7 @@ found:
1252 else 1252 else
1253 flow->head = elig_ack->next; 1253 flow->head = elig_ack->next;
1254 1254
1255 elig_ack->next = NULL; 1255 skb_mark_not_on_list(elig_ack);
1256 1256
1257 return elig_ack; 1257 return elig_ack;
1258} 1258}
@@ -1675,7 +1675,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1675 1675
1676 while (segs) { 1676 while (segs) {
1677 nskb = segs->next; 1677 nskb = segs->next;
1678 segs->next = NULL; 1678 skb_mark_not_on_list(segs);
1679 qdisc_skb_cb(segs)->pkt_len = segs->len; 1679 qdisc_skb_cb(segs)->pkt_len = segs->len;
1680 cobalt_set_enqueue_time(segs, now); 1680 cobalt_set_enqueue_time(segs, now);
1681 get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, 1681 get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
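
The hunks in this file (and the similar ones in sch_fq, sch_fq_codel, sch_hhf, sch_generic and sch_netem below) replace the open-coded "skb->next = NULL" with skb_mark_not_on_list() when a packet is popped off a flow list or a GSO segment chain. A toy user-space version of that pop-and-detach pattern, with hypothetical names, is sketched here.

#include <stdio.h>

/* Toy packet chain mimicking the skb->next list used by the flow
 * queues above; mark_not_on_list() is the analogue of
 * skb_mark_not_on_list(), which simply clears the next pointer. */
struct pkt { int id; struct pkt *next; };

static void mark_not_on_list(struct pkt *p) { p->next = NULL; }

/* Same shape as dequeue_head(): pop the head and detach it from the
 * list before handing it back to the caller. */
static struct pkt *dequeue_head(struct pkt **head)
{
        struct pkt *p = *head;

        if (p) {
                *head = p->next;
                mark_not_on_list(p);
        }
        return p;
}

int main(void)
{
        struct pkt c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
        struct pkt *head = &a, *p;

        while ((p = dequeue_head(&head)))
                printf("dequeued %d (next=%p)\n", p->id, (void *)p->next);
        return 0;
}
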
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index f42025d53cfe..4dc05409e3fb 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1418,7 +1418,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1418 WARN_ON(cl->filters); 1418 WARN_ON(cl->filters);
1419 1419
1420 tcf_block_put(cl->block); 1420 tcf_block_put(cl->block);
1421 qdisc_destroy(cl->q); 1421 qdisc_put(cl->q);
1422 qdisc_put_rtab(cl->R_tab); 1422 qdisc_put_rtab(cl->R_tab);
1423 gen_kill_estimator(&cl->rate_est); 1423 gen_kill_estimator(&cl->rate_est);
1424 if (cl != &q->link) 1424 if (cl != &q->link)
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index e26a24017faa..e689e11b6d0f 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -379,7 +379,7 @@ static void cbs_destroy(struct Qdisc *sch)
379 cbs_disable_offload(dev, q); 379 cbs_disable_offload(dev, q);
380 380
381 if (q->qdisc) 381 if (q->qdisc)
382 qdisc_destroy(q->qdisc); 382 qdisc_put(q->qdisc);
383} 383}
384 384
385static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) 385static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index e0b0cf8a9939..cdebaed0f8cf 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -134,7 +134,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
134 tca[TCA_RATE]); 134 tca[TCA_RATE]);
135 if (err) { 135 if (err) {
136 NL_SET_ERR_MSG(extack, "Failed to replace estimator"); 136 NL_SET_ERR_MSG(extack, "Failed to replace estimator");
137 qdisc_destroy(cl->qdisc); 137 qdisc_put(cl->qdisc);
138 kfree(cl); 138 kfree(cl);
139 return err; 139 return err;
140 } 140 }
@@ -153,7 +153,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
153static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl) 153static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
154{ 154{
155 gen_kill_estimator(&cl->rate_est); 155 gen_kill_estimator(&cl->rate_est);
156 qdisc_destroy(cl->qdisc); 156 qdisc_put(cl->qdisc);
157 kfree(cl); 157 kfree(cl);
158} 158}
159 159
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 049714c57075..f6f480784bc6 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -412,7 +412,7 @@ static void dsmark_destroy(struct Qdisc *sch)
412 pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); 412 pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
413 413
414 tcf_block_put(p->block); 414 tcf_block_put(p->block);
415 qdisc_destroy(p->q); 415 qdisc_put(p->q);
416 if (p->mv != p->embedded) 416 if (p->mv != p->embedded)
417 kfree(p->mv); 417 kfree(p->mv);
418} 418}
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 24893d3b5d22..3809c9bf8896 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -177,7 +177,7 @@ struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops,
177 if (q) { 177 if (q) {
178 err = fifo_set_limit(q, limit); 178 err = fifo_set_limit(q, limit);
179 if (err < 0) { 179 if (err < 0) {
180 qdisc_destroy(q); 180 qdisc_put(q);
181 q = NULL; 181 q = NULL;
182 } 182 }
183 } 183 }
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 4808713c73b9..4b1af706896c 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -92,8 +92,8 @@ struct fq_sched_data {
92 u32 quantum; 92 u32 quantum;
93 u32 initial_quantum; 93 u32 initial_quantum;
94 u32 flow_refill_delay; 94 u32 flow_refill_delay;
95 u32 flow_max_rate; /* optional max rate per flow */
96 u32 flow_plimit; /* max packets per flow */ 95 u32 flow_plimit; /* max packets per flow */
96 unsigned long flow_max_rate; /* optional max rate per flow */
97 u32 orphan_mask; /* mask for orphaned skb */ 97 u32 orphan_mask; /* mask for orphaned skb */
98 u32 low_rate_threshold; 98 u32 low_rate_threshold;
99 struct rb_root *fq_root; 99 struct rb_root *fq_root;
@@ -106,7 +106,6 @@ struct fq_sched_data {
106 106
107 u64 stat_gc_flows; 107 u64 stat_gc_flows;
108 u64 stat_internal_packets; 108 u64 stat_internal_packets;
109 u64 stat_tcp_retrans;
110 u64 stat_throttled; 109 u64 stat_throttled;
111 u64 stat_flows_plimit; 110 u64 stat_flows_plimit;
112 u64 stat_pkts_too_long; 111 u64 stat_pkts_too_long;
@@ -319,7 +318,7 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
319 318
320 if (skb) { 319 if (skb) {
321 flow->head = skb->next; 320 flow->head = skb->next;
322 skb->next = NULL; 321 skb_mark_not_on_list(skb);
323 flow->qlen--; 322 flow->qlen--;
324 qdisc_qstats_backlog_dec(sch, skb); 323 qdisc_qstats_backlog_dec(sch, skb);
325 sch->q.qlen--; 324 sch->q.qlen--;
@@ -327,62 +326,17 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
327 return skb; 326 return skb;
328} 327}
329 328
330/* We might add in the future detection of retransmits
331 * For the time being, just return false
332 */
333static bool skb_is_retransmit(struct sk_buff *skb)
334{
335 return false;
336}
337
338/* add skb to flow queue
339 * flow queue is a linked list, kind of FIFO, except for TCP retransmits
340 * We special case tcp retransmits to be transmitted before other packets.
341 * We rely on fact that TCP retransmits are unlikely, so we do not waste
342 * a separate queue or a pointer.
343 * head-> [retrans pkt 1]
344 * [retrans pkt 2]
345 * [ normal pkt 1]
346 * [ normal pkt 2]
347 * [ normal pkt 3]
348 * tail-> [ normal pkt 4]
349 */
350static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) 329static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
351{ 330{
352 struct sk_buff *prev, *head = flow->head; 331 struct sk_buff *head = flow->head;
353 332
354 skb->next = NULL; 333 skb->next = NULL;
355 if (!head) { 334 if (!head)
356 flow->head = skb; 335 flow->head = skb;
357 flow->tail = skb; 336 else
358 return;
359 }
360 if (likely(!skb_is_retransmit(skb))) {
361 flow->tail->next = skb; 337 flow->tail->next = skb;
362 flow->tail = skb;
363 return;
364 }
365 338
366 /* This skb is a tcp retransmit, 339 flow->tail = skb;
367 * find the last retrans packet in the queue
368 */
369 prev = NULL;
370 while (skb_is_retransmit(head)) {
371 prev = head;
372 head = head->next;
373 if (!head)
374 break;
375 }
376 if (!prev) { /* no rtx packet in queue, become the new head */
377 skb->next = flow->head;
378 flow->head = skb;
379 } else {
380 if (prev == flow->tail)
381 flow->tail = skb;
382 else
383 skb->next = prev->next;
384 prev->next = skb;
385 }
386} 340}
387 341
388static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, 342static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -401,8 +355,6 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
401 } 355 }
402 356
403 f->qlen++; 357 f->qlen++;
404 if (skb_is_retransmit(skb))
405 q->stat_tcp_retrans++;
406 qdisc_qstats_backlog_inc(sch, skb); 358 qdisc_qstats_backlog_inc(sch, skb);
407 if (fq_flow_is_detached(f)) { 359 if (fq_flow_is_detached(f)) {
408 struct sock *sk = skb->sk; 360 struct sock *sk = skb->sk;
@@ -464,7 +416,8 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
464 struct fq_flow_head *head; 416 struct fq_flow_head *head;
465 struct sk_buff *skb; 417 struct sk_buff *skb;
466 struct fq_flow *f; 418 struct fq_flow *f;
467 u32 rate, plen; 419 unsigned long rate;
420 u32 plen;
468 421
469 skb = fq_dequeue_head(sch, &q->internal); 422 skb = fq_dequeue_head(sch, &q->internal);
470 if (skb) 423 if (skb)
@@ -491,11 +444,16 @@ begin:
491 } 444 }
492 445
493 skb = f->head; 446 skb = f->head;
494 if (unlikely(skb && now < f->time_next_packet && 447 if (skb) {
495 !skb_is_tcp_pure_ack(skb))) { 448 u64 time_next_packet = max_t(u64, ktime_to_ns(skb->tstamp),
496 head->first = f->next; 449 f->time_next_packet);
497 fq_flow_set_throttled(q, f); 450
498 goto begin; 451 if (now < time_next_packet) {
452 head->first = f->next;
453 f->time_next_packet = time_next_packet;
454 fq_flow_set_throttled(q, f);
455 goto begin;
456 }
499 } 457 }
500 458
501 skb = fq_dequeue_head(sch, f); 459 skb = fq_dequeue_head(sch, f);
@@ -513,11 +471,7 @@ begin:
513 prefetch(&skb->end); 471 prefetch(&skb->end);
514 f->credit -= qdisc_pkt_len(skb); 472 f->credit -= qdisc_pkt_len(skb);
515 473
516 if (!q->rate_enable) 474 if (ktime_to_ns(skb->tstamp) || !q->rate_enable)
517 goto out;
518
519 /* Do not pace locally generated ack packets */
520 if (skb_is_tcp_pure_ack(skb))
521 goto out; 475 goto out;
522 476
523 rate = q->flow_max_rate; 477 rate = q->flow_max_rate;
@@ -532,11 +486,11 @@ begin:
532 if (f->credit > 0) 486 if (f->credit > 0)
533 goto out; 487 goto out;
534 } 488 }
535 if (rate != ~0U) { 489 if (rate != ~0UL) {
536 u64 len = (u64)plen * NSEC_PER_SEC; 490 u64 len = (u64)plen * NSEC_PER_SEC;
537 491
538 if (likely(rate)) 492 if (likely(rate))
539 do_div(len, rate); 493 len = div64_ul(len, rate);
540 /* Since socket rate can change later, 494 /* Since socket rate can change later,
541 * clamp the delay to 1 second. 495 * clamp the delay to 1 second.
542 * Really, providers of too big packets should be fixed ! 496 * Really, providers of too big packets should be fixed !
@@ -748,9 +702,11 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
748 pr_warn_ratelimited("sch_fq: defrate %u ignored.\n", 702 pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
749 nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE])); 703 nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
750 704
751 if (tb[TCA_FQ_FLOW_MAX_RATE]) 705 if (tb[TCA_FQ_FLOW_MAX_RATE]) {
752 q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); 706 u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
753 707
708 q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
709 }
754 if (tb[TCA_FQ_LOW_RATE_THRESHOLD]) 710 if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
755 q->low_rate_threshold = 711 q->low_rate_threshold =
756 nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]); 712 nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
@@ -813,7 +769,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
813 q->quantum = 2 * psched_mtu(qdisc_dev(sch)); 769 q->quantum = 2 * psched_mtu(qdisc_dev(sch));
814 q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); 770 q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
815 q->flow_refill_delay = msecs_to_jiffies(40); 771 q->flow_refill_delay = msecs_to_jiffies(40);
816 q->flow_max_rate = ~0U; 772 q->flow_max_rate = ~0UL;
817 q->time_next_delayed_flow = ~0ULL; 773 q->time_next_delayed_flow = ~0ULL;
818 q->rate_enable = 1; 774 q->rate_enable = 1;
819 q->new_flows.first = NULL; 775 q->new_flows.first = NULL;
@@ -823,7 +779,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
823 q->fq_trees_log = ilog2(1024); 779 q->fq_trees_log = ilog2(1024);
824 q->orphan_mask = 1024 - 1; 780 q->orphan_mask = 1024 - 1;
825 q->low_rate_threshold = 550000 / 8; 781 q->low_rate_threshold = 550000 / 8;
826 qdisc_watchdog_init(&q->watchdog, sch); 782 qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
827 783
828 if (opt) 784 if (opt)
829 err = fq_change(sch, opt, extack); 785 err = fq_change(sch, opt, extack);
@@ -849,7 +805,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
849 nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || 805 nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
850 nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || 806 nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
851 nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || 807 nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
852 nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) || 808 nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
809 min_t(unsigned long, q->flow_max_rate, ~0U)) ||
853 nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, 810 nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
854 jiffies_to_usecs(q->flow_refill_delay)) || 811 jiffies_to_usecs(q->flow_refill_delay)) ||
855 nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || 812 nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
@@ -873,7 +830,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
873 830
874 st.gc_flows = q->stat_gc_flows; 831 st.gc_flows = q->stat_gc_flows;
875 st.highprio_packets = q->stat_internal_packets; 832 st.highprio_packets = q->stat_internal_packets;
876 st.tcp_retrans = q->stat_tcp_retrans; 833 st.tcp_retrans = 0;
877 st.throttled = q->stat_throttled; 834 st.throttled = q->stat_throttled;
878 st.flows_plimit = q->stat_flows_plimit; 835 st.flows_plimit = q->stat_flows_plimit;
879 st.pkts_too_long = q->stat_pkts_too_long; 836 st.pkts_too_long = q->stat_pkts_too_long;
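
Two things change in sch_fq here: flow_max_rate becomes an unsigned long so the pacing divide uses div64_ul(), and the dequeue path honours skb->tstamp as an earliest departure time, throttling the flow until max(skb->tstamp, time_next_packet). The following is a rough user-space sketch of that arithmetic only; constants and function names are illustrative, not the kernel code.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

/* Pacing credit consumed by a packet of plen bytes at "rate"
 * bytes/sec, clamped to one second as the hunk above does. */
static uint64_t pacing_delay_ns(uint32_t plen, unsigned long rate)
{
        uint64_t len = (uint64_t)plen * NSEC_PER_SEC;

        if (rate)
                len /= rate;            /* stands in for div64_ul() */
        if (len > NSEC_PER_SEC)         /* clamp oversized packets */
                len = NSEC_PER_SEC;
        return len;
}

/* Earliest-departure-time gate: a flow may not transmit before the
 * later of its pacing horizon and the skb's requested timestamp. */
static uint64_t next_tx_time(uint64_t flow_next, uint64_t skb_tstamp)
{
        return flow_next > skb_tstamp ? flow_next : skb_tstamp;
}

int main(void)
{
        /* 1500 bytes at 125000 bytes/s -> 12 ms of pacing delay. */
        printf("%llu ns\n",
               (unsigned long long)pacing_delay_ns(1500, 125000));
        printf("%llu\n",
               (unsigned long long)next_tx_time(2000, 5000));
        return 0;
}
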
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 6c0a9d5dbf94..cd04d40c30b6 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -124,7 +124,7 @@ static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow)
124 struct sk_buff *skb = flow->head; 124 struct sk_buff *skb = flow->head;
125 125
126 flow->head = skb->next; 126 flow->head = skb->next;
127 skb->next = NULL; 127 skb_mark_not_on_list(skb);
128 return skb; 128 return skb;
129} 129}
130 130
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 69078c82963e..de1663f7d3ad 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -184,7 +184,7 @@ static void try_bulk_dequeue_skb(struct Qdisc *q,
184 skb = nskb; 184 skb = nskb;
185 (*packets)++; /* GSO counts as one pkt */ 185 (*packets)++; /* GSO counts as one pkt */
186 } 186 }
187 skb->next = NULL; 187 skb_mark_not_on_list(skb);
188} 188}
189 189
190/* This variant of try_bulk_dequeue_skb() makes sure 190/* This variant of try_bulk_dequeue_skb() makes sure
@@ -210,7 +210,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
210 skb = nskb; 210 skb = nskb;
211 } while (++cnt < 8); 211 } while (++cnt < 8);
212 (*packets) += cnt; 212 (*packets) += cnt;
213 skb->next = NULL; 213 skb_mark_not_on_list(skb);
214} 214}
215 215
216/* Note that dequeue_skb can possibly return a SKB list (via skb->next). 216/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
@@ -572,6 +572,18 @@ struct Qdisc noop_qdisc = {
572 .dev_queue = &noop_netdev_queue, 572 .dev_queue = &noop_netdev_queue,
573 .running = SEQCNT_ZERO(noop_qdisc.running), 573 .running = SEQCNT_ZERO(noop_qdisc.running),
574 .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), 574 .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
575 .gso_skb = {
576 .next = (struct sk_buff *)&noop_qdisc.gso_skb,
577 .prev = (struct sk_buff *)&noop_qdisc.gso_skb,
578 .qlen = 0,
579 .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock),
580 },
581 .skb_bad_txq = {
582 .next = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
583 .prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
584 .qlen = 0,
585 .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock),
586 },
575}; 587};
576EXPORT_SYMBOL(noop_qdisc); 588EXPORT_SYMBOL(noop_qdisc);
577 589
@@ -901,7 +913,7 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
901 if (!ops->init || ops->init(sch, NULL, extack) == 0) 913 if (!ops->init || ops->init(sch, NULL, extack) == 0)
902 return sch; 914 return sch;
903 915
904 qdisc_destroy(sch); 916 qdisc_put(sch);
905 return NULL; 917 return NULL;
906} 918}
907EXPORT_SYMBOL(qdisc_create_dflt); 919EXPORT_SYMBOL(qdisc_create_dflt);
@@ -941,15 +953,18 @@ void qdisc_free(struct Qdisc *qdisc)
941 kfree((char *) qdisc - qdisc->padded); 953 kfree((char *) qdisc - qdisc->padded);
942} 954}
943 955
944void qdisc_destroy(struct Qdisc *qdisc) 956static void qdisc_free_cb(struct rcu_head *head)
957{
958 struct Qdisc *q = container_of(head, struct Qdisc, rcu);
959
960 qdisc_free(q);
961}
962
963static void qdisc_destroy(struct Qdisc *qdisc)
945{ 964{
946 const struct Qdisc_ops *ops = qdisc->ops; 965 const struct Qdisc_ops *ops = qdisc->ops;
947 struct sk_buff *skb, *tmp; 966 struct sk_buff *skb, *tmp;
948 967
949 if (qdisc->flags & TCQ_F_BUILTIN ||
950 !refcount_dec_and_test(&qdisc->refcnt))
951 return;
952
953#ifdef CONFIG_NET_SCHED 968#ifdef CONFIG_NET_SCHED
954 qdisc_hash_del(qdisc); 969 qdisc_hash_del(qdisc);
955 970
@@ -974,9 +989,34 @@ void qdisc_destroy(struct Qdisc *qdisc)
974 kfree_skb_list(skb); 989 kfree_skb_list(skb);
975 } 990 }
976 991
977 qdisc_free(qdisc); 992 call_rcu(&qdisc->rcu, qdisc_free_cb);
993}
994
995void qdisc_put(struct Qdisc *qdisc)
996{
997 if (qdisc->flags & TCQ_F_BUILTIN ||
998 !refcount_dec_and_test(&qdisc->refcnt))
999 return;
1000
1001 qdisc_destroy(qdisc);
1002}
1003EXPORT_SYMBOL(qdisc_put);
1004
1005/* Version of qdisc_put() that is called with rtnl mutex unlocked.
1006 * Intended to be used as optimization, this function only takes rtnl lock if
1007 * qdisc reference counter reached zero.
1008 */
1009
1010void qdisc_put_unlocked(struct Qdisc *qdisc)
1011{
1012 if (qdisc->flags & TCQ_F_BUILTIN ||
1013 !refcount_dec_and_rtnl_lock(&qdisc->refcnt))
1014 return;
1015
1016 qdisc_destroy(qdisc);
1017 rtnl_unlock();
978} 1018}
979EXPORT_SYMBOL(qdisc_destroy); 1019EXPORT_SYMBOL(qdisc_put_unlocked);
980 1020
981/* Attach toplevel qdisc to device queue. */ 1021/* Attach toplevel qdisc to device queue. */
982struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, 1022struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
@@ -1245,8 +1285,6 @@ static void dev_init_scheduler_queue(struct net_device *dev,
1245 1285
1246 rcu_assign_pointer(dev_queue->qdisc, qdisc); 1286 rcu_assign_pointer(dev_queue->qdisc, qdisc);
1247 dev_queue->qdisc_sleeping = qdisc; 1287 dev_queue->qdisc_sleeping = qdisc;
1248 __skb_queue_head_init(&qdisc->gso_skb);
1249 __skb_queue_head_init(&qdisc->skb_bad_txq);
1250} 1288}
1251 1289
1252void dev_init_scheduler(struct net_device *dev) 1290void dev_init_scheduler(struct net_device *dev)
@@ -1270,7 +1308,7 @@ static void shutdown_scheduler_queue(struct net_device *dev,
1270 rcu_assign_pointer(dev_queue->qdisc, qdisc_default); 1308 rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
1271 dev_queue->qdisc_sleeping = qdisc_default; 1309 dev_queue->qdisc_sleeping = qdisc_default;
1272 1310
1273 qdisc_destroy(qdisc); 1311 qdisc_put(qdisc);
1274 } 1312 }
1275} 1313}
1276 1314
@@ -1279,7 +1317,7 @@ void dev_shutdown(struct net_device *dev)
1279 netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); 1317 netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
1280 if (dev_ingress_queue(dev)) 1318 if (dev_ingress_queue(dev))
1281 shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); 1319 shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
1282 qdisc_destroy(dev->qdisc); 1320 qdisc_put(dev->qdisc);
1283 dev->qdisc = &noop_qdisc; 1321 dev->qdisc = &noop_qdisc;
1284 1322
1285 WARN_ON(timer_pending(&dev->watchdog_timer)); 1323 WARN_ON(timer_pending(&dev->watchdog_timer));
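
This is the core of the series: qdisc_destroy() becomes a static helper with no reference counting of its own, callers drop references with qdisc_put(), only the last put tears the qdisc down, and the memory is released from an RCU callback; qdisc_put_unlocked() additionally takes the rtnl lock only when the count actually reaches zero. Below is a much-simplified user-space model of the put/destroy/deferred-free split. It is an analogue, not the kernel code: there is no real RCU here, the callback is invoked directly, and the TCQ_F_BUILTIN and rtnl-lock handling are omitted.

#include <stdio.h>
#include <stdlib.h>

struct obj {
        int refcnt;
        const char *name;
};

static void free_cb(struct obj *o)
{
        /* In the kernel this runs after an RCU grace period
         * (call_rcu() + qdisc_free_cb()). */
        printf("freeing %s\n", o->name);
        free(o);
}

static void destroy(struct obj *o)
{
        printf("tearing down %s\n", o->name);
        free_cb(o);
}

static void put(struct obj *o)
{
        if (--o->refcnt > 0)    /* kernel: refcount_dec_and_test() */
                return;
        destroy(o);             /* last reference: shut down + free */
}

int main(void)
{
        struct obj *o = malloc(sizeof(*o));

        o->refcnt = 2;
        o->name = "qdisc";
        put(o);                 /* still referenced elsewhere */
        put(o);                 /* last reference: destroy + free */
        return 0;
}
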
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 3278a76f6861..b18ec1f6de60 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1092,7 +1092,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
1092 struct hfsc_sched *q = qdisc_priv(sch); 1092 struct hfsc_sched *q = qdisc_priv(sch);
1093 1093
1094 tcf_block_put(cl->block); 1094 tcf_block_put(cl->block);
1095 qdisc_destroy(cl->qdisc); 1095 qdisc_put(cl->qdisc);
1096 gen_kill_estimator(&cl->rate_est); 1096 gen_kill_estimator(&cl->rate_est);
1097 if (cl != &q->root) 1097 if (cl != &q->root)
1098 kfree(cl); 1098 kfree(cl);
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index c3a8388dcdf6..9d6a47697406 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -330,7 +330,7 @@ static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket)
330 struct sk_buff *skb = bucket->head; 330 struct sk_buff *skb = bucket->head;
331 331
332 bucket->head = skb->next; 332 bucket->head = skb->next;
333 skb->next = NULL; 333 skb_mark_not_on_list(skb);
334 return skb; 334 return skb;
335} 335}
336 336
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 43c4bfe625a9..58b449490757 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -132,7 +132,7 @@ struct htb_class {
132 struct htb_class_inner { 132 struct htb_class_inner {
133 struct htb_prio clprio[TC_HTB_NUMPRIO]; 133 struct htb_prio clprio[TC_HTB_NUMPRIO];
134 } inner; 134 } inner;
135 } un; 135 };
136 s64 pq_key; 136 s64 pq_key;
137 137
138 int prio_activity; /* for which prios are we active */ 138 int prio_activity; /* for which prios are we active */
@@ -411,13 +411,13 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
411 int prio = ffz(~m); 411 int prio = ffz(~m);
412 m &= ~(1 << prio); 412 m &= ~(1 << prio);
413 413
414 if (p->un.inner.clprio[prio].feed.rb_node) 414 if (p->inner.clprio[prio].feed.rb_node)
415 /* parent already has its feed in use so that 415 /* parent already has its feed in use so that
416 * reset bit in mask as parent is already ok 416 * reset bit in mask as parent is already ok
417 */ 417 */
418 mask &= ~(1 << prio); 418 mask &= ~(1 << prio);
419 419
420 htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio); 420 htb_add_to_id_tree(&p->inner.clprio[prio].feed, cl, prio);
421 } 421 }
422 p->prio_activity |= mask; 422 p->prio_activity |= mask;
423 cl = p; 423 cl = p;
@@ -447,19 +447,19 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
447 int prio = ffz(~m); 447 int prio = ffz(~m);
448 m &= ~(1 << prio); 448 m &= ~(1 << prio);
449 449
450 if (p->un.inner.clprio[prio].ptr == cl->node + prio) { 450 if (p->inner.clprio[prio].ptr == cl->node + prio) {
451 /* we are removing child which is pointed to from 451 /* we are removing child which is pointed to from
452 * parent feed - forget the pointer but remember 452 * parent feed - forget the pointer but remember
453 * classid 453 * classid
454 */ 454 */
455 p->un.inner.clprio[prio].last_ptr_id = cl->common.classid; 455 p->inner.clprio[prio].last_ptr_id = cl->common.classid;
456 p->un.inner.clprio[prio].ptr = NULL; 456 p->inner.clprio[prio].ptr = NULL;
457 } 457 }
458 458
459 htb_safe_rb_erase(cl->node + prio, 459 htb_safe_rb_erase(cl->node + prio,
460 &p->un.inner.clprio[prio].feed); 460 &p->inner.clprio[prio].feed);
461 461
462 if (!p->un.inner.clprio[prio].feed.rb_node) 462 if (!p->inner.clprio[prio].feed.rb_node)
463 mask |= 1 << prio; 463 mask |= 1 << prio;
464 } 464 }
465 465
@@ -555,7 +555,7 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
555 */ 555 */
556static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) 556static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
557{ 557{
558 WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen); 558 WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen);
559 559
560 if (!cl->prio_activity) { 560 if (!cl->prio_activity) {
561 cl->prio_activity = 1 << cl->prio; 561 cl->prio_activity = 1 << cl->prio;
@@ -577,22 +577,6 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
577 cl->prio_activity = 0; 577 cl->prio_activity = 0;
578} 578}
579 579
580static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
581 struct qdisc_skb_head *qh)
582{
583 struct sk_buff *last = qh->tail;
584
585 if (last) {
586 skb->next = NULL;
587 last->next = skb;
588 qh->tail = skb;
589 } else {
590 qh->tail = skb;
591 qh->head = skb;
592 }
593 qh->qlen++;
594}
595
596static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, 580static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
597 struct sk_buff **to_free) 581 struct sk_buff **to_free)
598{ 582{
@@ -603,7 +587,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
603 if (cl == HTB_DIRECT) { 587 if (cl == HTB_DIRECT) {
604 /* enqueue to helper queue */ 588 /* enqueue to helper queue */
605 if (q->direct_queue.qlen < q->direct_qlen) { 589 if (q->direct_queue.qlen < q->direct_qlen) {
606 htb_enqueue_tail(skb, sch, &q->direct_queue); 590 __qdisc_enqueue_tail(skb, &q->direct_queue);
607 q->direct_pkts++; 591 q->direct_pkts++;
608 } else { 592 } else {
609 return qdisc_drop(skb, sch, to_free); 593 return qdisc_drop(skb, sch, to_free);
@@ -615,7 +599,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
615 __qdisc_drop(skb, to_free); 599 __qdisc_drop(skb, to_free);
616 return ret; 600 return ret;
617#endif 601#endif
618 } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q, 602 } else if ((ret = qdisc_enqueue(skb, cl->leaf.q,
619 to_free)) != NET_XMIT_SUCCESS) { 603 to_free)) != NET_XMIT_SUCCESS) {
620 if (net_xmit_drop_count(ret)) { 604 if (net_xmit_drop_count(ret)) {
621 qdisc_qstats_drop(sch); 605 qdisc_qstats_drop(sch);
@@ -823,7 +807,7 @@ static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
823 cl = rb_entry(*sp->pptr, struct htb_class, node[prio]); 807 cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
824 if (!cl->level) 808 if (!cl->level)
825 return cl; 809 return cl;
826 clp = &cl->un.inner.clprio[prio]; 810 clp = &cl->inner.clprio[prio];
827 (++sp)->root = clp->feed.rb_node; 811 (++sp)->root = clp->feed.rb_node;
828 sp->pptr = &clp->ptr; 812 sp->pptr = &clp->ptr;
829 sp->pid = &clp->last_ptr_id; 813 sp->pid = &clp->last_ptr_id;
@@ -857,7 +841,7 @@ next:
857 * graft operation on the leaf since last dequeue; 841 * graft operation on the leaf since last dequeue;
858 * simply deactivate and skip such class 842 * simply deactivate and skip such class
859 */ 843 */
860 if (unlikely(cl->un.leaf.q->q.qlen == 0)) { 844 if (unlikely(cl->leaf.q->q.qlen == 0)) {
861 struct htb_class *next; 845 struct htb_class *next;
862 htb_deactivate(q, cl); 846 htb_deactivate(q, cl);
863 847
@@ -873,12 +857,12 @@ next:
873 goto next; 857 goto next;
874 } 858 }
875 859
876 skb = cl->un.leaf.q->dequeue(cl->un.leaf.q); 860 skb = cl->leaf.q->dequeue(cl->leaf.q);
877 if (likely(skb != NULL)) 861 if (likely(skb != NULL))
878 break; 862 break;
879 863
880 qdisc_warn_nonwc("htb", cl->un.leaf.q); 864 qdisc_warn_nonwc("htb", cl->leaf.q);
881 htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr: 865 htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr:
882 &q->hlevel[0].hprio[prio].ptr); 866 &q->hlevel[0].hprio[prio].ptr);
883 cl = htb_lookup_leaf(hprio, prio); 867 cl = htb_lookup_leaf(hprio, prio);
884 868
@@ -886,16 +870,16 @@ next:
886 870
887 if (likely(skb != NULL)) { 871 if (likely(skb != NULL)) {
888 bstats_update(&cl->bstats, skb); 872 bstats_update(&cl->bstats, skb);
889 cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb); 873 cl->leaf.deficit[level] -= qdisc_pkt_len(skb);
890 if (cl->un.leaf.deficit[level] < 0) { 874 if (cl->leaf.deficit[level] < 0) {
891 cl->un.leaf.deficit[level] += cl->quantum; 875 cl->leaf.deficit[level] += cl->quantum;
892 htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr : 876 htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr :
893 &q->hlevel[0].hprio[prio].ptr); 877 &q->hlevel[0].hprio[prio].ptr);
894 } 878 }
895 /* this used to be after charge_class but this constellation 879 /* this used to be after charge_class but this constellation
896 * gives us slightly better performance 880 * gives us slightly better performance
897 */ 881 */
898 if (!cl->un.leaf.q->q.qlen) 882 if (!cl->leaf.q->q.qlen)
899 htb_deactivate(q, cl); 883 htb_deactivate(q, cl);
900 htb_charge_class(q, cl, level, skb); 884 htb_charge_class(q, cl, level, skb);
901 } 885 }
@@ -972,10 +956,10 @@ static void htb_reset(struct Qdisc *sch)
972 for (i = 0; i < q->clhash.hashsize; i++) { 956 for (i = 0; i < q->clhash.hashsize; i++) {
973 hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { 957 hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
974 if (cl->level) 958 if (cl->level)
975 memset(&cl->un.inner, 0, sizeof(cl->un.inner)); 959 memset(&cl->inner, 0, sizeof(cl->inner));
976 else { 960 else {
977 if (cl->un.leaf.q) 961 if (cl->leaf.q)
978 qdisc_reset(cl->un.leaf.q); 962 qdisc_reset(cl->leaf.q);
979 } 963 }
980 cl->prio_activity = 0; 964 cl->prio_activity = 0;
981 cl->cmode = HTB_CAN_SEND; 965 cl->cmode = HTB_CAN_SEND;
@@ -1098,8 +1082,8 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1098 */ 1082 */
1099 tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT; 1083 tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1100 tcm->tcm_handle = cl->common.classid; 1084 tcm->tcm_handle = cl->common.classid;
1101 if (!cl->level && cl->un.leaf.q) 1085 if (!cl->level && cl->leaf.q)
1102 tcm->tcm_info = cl->un.leaf.q->handle; 1086 tcm->tcm_info = cl->leaf.q->handle;
1103 1087
1104 nest = nla_nest_start(skb, TCA_OPTIONS); 1088 nest = nla_nest_start(skb, TCA_OPTIONS);
1105 if (nest == NULL) 1089 if (nest == NULL)
@@ -1142,9 +1126,9 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1142 }; 1126 };
1143 __u32 qlen = 0; 1127 __u32 qlen = 0;
1144 1128
1145 if (!cl->level && cl->un.leaf.q) { 1129 if (!cl->level && cl->leaf.q) {
1146 qlen = cl->un.leaf.q->q.qlen; 1130 qlen = cl->leaf.q->q.qlen;
1147 qs.backlog = cl->un.leaf.q->qstats.backlog; 1131 qs.backlog = cl->leaf.q->qstats.backlog;
1148 } 1132 }
1149 cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens), 1133 cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
1150 INT_MIN, INT_MAX); 1134 INT_MIN, INT_MAX);
@@ -1172,14 +1156,14 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1172 cl->common.classid, extack)) == NULL) 1156 cl->common.classid, extack)) == NULL)
1173 return -ENOBUFS; 1157 return -ENOBUFS;
1174 1158
1175 *old = qdisc_replace(sch, new, &cl->un.leaf.q); 1159 *old = qdisc_replace(sch, new, &cl->leaf.q);
1176 return 0; 1160 return 0;
1177} 1161}
1178 1162
1179static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg) 1163static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1180{ 1164{
1181 struct htb_class *cl = (struct htb_class *)arg; 1165 struct htb_class *cl = (struct htb_class *)arg;
1182 return !cl->level ? cl->un.leaf.q : NULL; 1166 return !cl->level ? cl->leaf.q : NULL;
1183} 1167}
1184 1168
1185static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg) 1169static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
@@ -1205,15 +1189,15 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1205{ 1189{
1206 struct htb_class *parent = cl->parent; 1190 struct htb_class *parent = cl->parent;
1207 1191
1208 WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity); 1192 WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity);
1209 1193
1210 if (parent->cmode != HTB_CAN_SEND) 1194 if (parent->cmode != HTB_CAN_SEND)
1211 htb_safe_rb_erase(&parent->pq_node, 1195 htb_safe_rb_erase(&parent->pq_node,
1212 &q->hlevel[parent->level].wait_pq); 1196 &q->hlevel[parent->level].wait_pq);
1213 1197
1214 parent->level = 0; 1198 parent->level = 0;
1215 memset(&parent->un.inner, 0, sizeof(parent->un.inner)); 1199 memset(&parent->inner, 0, sizeof(parent->inner));
1216 parent->un.leaf.q = new_q ? new_q : &noop_qdisc; 1200 parent->leaf.q = new_q ? new_q : &noop_qdisc;
1217 parent->tokens = parent->buffer; 1201 parent->tokens = parent->buffer;
1218 parent->ctokens = parent->cbuffer; 1202 parent->ctokens = parent->cbuffer;
1219 parent->t_c = ktime_get_ns(); 1203 parent->t_c = ktime_get_ns();
@@ -1223,8 +1207,8 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1223static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) 1207static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1224{ 1208{
1225 if (!cl->level) { 1209 if (!cl->level) {
1226 WARN_ON(!cl->un.leaf.q); 1210 WARN_ON(!cl->leaf.q);
1227 qdisc_destroy(cl->un.leaf.q); 1211 qdisc_put(cl->leaf.q);
1228 } 1212 }
1229 gen_kill_estimator(&cl->rate_est); 1213 gen_kill_estimator(&cl->rate_est);
1230 tcf_block_put(cl->block); 1214 tcf_block_put(cl->block);
@@ -1286,11 +1270,11 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
1286 sch_tree_lock(sch); 1270 sch_tree_lock(sch);
1287 1271
1288 if (!cl->level) { 1272 if (!cl->level) {
1289 unsigned int qlen = cl->un.leaf.q->q.qlen; 1273 unsigned int qlen = cl->leaf.q->q.qlen;
1290 unsigned int backlog = cl->un.leaf.q->qstats.backlog; 1274 unsigned int backlog = cl->leaf.q->qstats.backlog;
1291 1275
1292 qdisc_reset(cl->un.leaf.q); 1276 qdisc_reset(cl->leaf.q);
1293 qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog); 1277 qdisc_tree_reduce_backlog(cl->leaf.q, qlen, backlog);
1294 } 1278 }
1295 1279
1296 /* delete from hash and active; remainder in destroy_class */ 1280 /* delete from hash and active; remainder in destroy_class */
@@ -1419,13 +1403,13 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
1419 classid, NULL); 1403 classid, NULL);
1420 sch_tree_lock(sch); 1404 sch_tree_lock(sch);
1421 if (parent && !parent->level) { 1405 if (parent && !parent->level) {
1422 unsigned int qlen = parent->un.leaf.q->q.qlen; 1406 unsigned int qlen = parent->leaf.q->q.qlen;
1423 unsigned int backlog = parent->un.leaf.q->qstats.backlog; 1407 unsigned int backlog = parent->leaf.q->qstats.backlog;
1424 1408
1425 /* turn parent into inner node */ 1409 /* turn parent into inner node */
1426 qdisc_reset(parent->un.leaf.q); 1410 qdisc_reset(parent->leaf.q);
1427 qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog); 1411 qdisc_tree_reduce_backlog(parent->leaf.q, qlen, backlog);
1428 qdisc_destroy(parent->un.leaf.q); 1412 qdisc_put(parent->leaf.q);
1429 if (parent->prio_activity) 1413 if (parent->prio_activity)
1430 htb_deactivate(q, parent); 1414 htb_deactivate(q, parent);
1431 1415
@@ -1436,10 +1420,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
1436 } 1420 }
1437 parent->level = (parent->parent ? parent->parent->level 1421 parent->level = (parent->parent ? parent->parent->level
1438 : TC_HTB_MAXDEPTH) - 1; 1422 : TC_HTB_MAXDEPTH) - 1;
1439 memset(&parent->un.inner, 0, sizeof(parent->un.inner)); 1423 memset(&parent->inner, 0, sizeof(parent->inner));
1440 } 1424 }
1441 /* leaf (we) needs elementary qdisc */ 1425 /* leaf (we) needs elementary qdisc */
1442 cl->un.leaf.q = new_q ? new_q : &noop_qdisc; 1426 cl->leaf.q = new_q ? new_q : &noop_qdisc;
1443 1427
1444 cl->common.classid = classid; 1428 cl->common.classid = classid;
1445 cl->parent = parent; 1429 cl->parent = parent;
@@ -1455,8 +1439,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
1455 qdisc_class_hash_insert(&q->clhash, &cl->common); 1439 qdisc_class_hash_insert(&q->clhash, &cl->common);
1456 if (parent) 1440 if (parent)
1457 parent->children++; 1441 parent->children++;
1458 if (cl->un.leaf.q != &noop_qdisc) 1442 if (cl->leaf.q != &noop_qdisc)
1459 qdisc_hash_add(cl->un.leaf.q, true); 1443 qdisc_hash_add(cl->leaf.q, true);
1460 } else { 1444 } else {
1461 if (tca[TCA_RATE]) { 1445 if (tca[TCA_RATE]) {
1462 err = gen_replace_estimator(&cl->bstats, NULL, 1446 err = gen_replace_estimator(&cl->bstats, NULL,
@@ -1478,7 +1462,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
1478 psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64); 1462 psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
1479 1463
1480 /* it used to be a nasty bug here, we have to check that node 1464 /* it used to be a nasty bug here, we have to check that node
1481 * is really leaf before changing cl->un.leaf ! 1465 * is really leaf before changing cl->leaf !
1482 */ 1466 */
1483 if (!cl->level) { 1467 if (!cl->level) {
1484 u64 quantum = cl->rate.rate_bytes_ps; 1468 u64 quantum = cl->rate.rate_bytes_ps;
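
The s/un.leaf/leaf/ and s/un.inner/inner/ churn throughout this file comes from turning the named union "un" in struct htb_class into an anonymous union, so its members are addressed directly from the class. A tiny illustration of the language feature follows; the fields are invented and do not reflect the real htb_class layout.

#include <stdio.h>

/* The named union "un" becomes an anonymous union, so members are
 * reached as cl.leaf.* / cl.inner.* instead of cl.un.leaf.* etc. */
struct toy_class {
        int level;
        union {
                struct { int deficit; } leaf;   /* leaf classes (level 0) */
                struct { int feed; } inner;     /* inner classes */
        };                                      /* no name, so no ".un" */
};

int main(void)
{
        struct toy_class cl = { .level = 0 };

        cl.leaf.deficit = 1500;         /* previously cl.un.leaf.deficit */
        printf("%d\n", cl.leaf.deficit);
        return 0;
}
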
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index d6b8ae4ed7a3..f20f3a0f8424 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -65,7 +65,7 @@ static void mq_destroy(struct Qdisc *sch)
65 if (!priv->qdiscs) 65 if (!priv->qdiscs)
66 return; 66 return;
67 for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++) 67 for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
68 qdisc_destroy(priv->qdiscs[ntx]); 68 qdisc_put(priv->qdiscs[ntx]);
69 kfree(priv->qdiscs); 69 kfree(priv->qdiscs);
70} 70}
71 71
@@ -119,7 +119,7 @@ static void mq_attach(struct Qdisc *sch)
119 qdisc = priv->qdiscs[ntx]; 119 qdisc = priv->qdiscs[ntx];
120 old = dev_graft_qdisc(qdisc->dev_queue, qdisc); 120 old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
121 if (old) 121 if (old)
122 qdisc_destroy(old); 122 qdisc_put(old);
123#ifdef CONFIG_NET_SCHED 123#ifdef CONFIG_NET_SCHED
124 if (ntx < dev->real_num_tx_queues) 124 if (ntx < dev->real_num_tx_queues)
125 qdisc_hash_add(qdisc, false); 125 qdisc_hash_add(qdisc, false);
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 0e9d761cdd80..d364e63c396d 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -40,7 +40,7 @@ static void mqprio_destroy(struct Qdisc *sch)
40 for (ntx = 0; 40 for (ntx = 0;
41 ntx < dev->num_tx_queues && priv->qdiscs[ntx]; 41 ntx < dev->num_tx_queues && priv->qdiscs[ntx];
42 ntx++) 42 ntx++)
43 qdisc_destroy(priv->qdiscs[ntx]); 43 qdisc_put(priv->qdiscs[ntx]);
44 kfree(priv->qdiscs); 44 kfree(priv->qdiscs);
45 } 45 }
46 46
@@ -300,7 +300,7 @@ static void mqprio_attach(struct Qdisc *sch)
300 qdisc = priv->qdiscs[ntx]; 300 qdisc = priv->qdiscs[ntx];
301 old = dev_graft_qdisc(qdisc->dev_queue, qdisc); 301 old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
302 if (old) 302 if (old)
303 qdisc_destroy(old); 303 qdisc_put(old);
304 if (ntx < dev->real_num_tx_queues) 304 if (ntx < dev->real_num_tx_queues)
305 qdisc_hash_add(qdisc, false); 305 qdisc_hash_add(qdisc, false);
306 } 306 }
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 1da7ea8de0ad..7410ce4d0321 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -175,7 +175,7 @@ multiq_destroy(struct Qdisc *sch)
175 175
176 tcf_block_put(q->block); 176 tcf_block_put(q->block);
177 for (band = 0; band < q->bands; band++) 177 for (band = 0; band < q->bands; band++)
178 qdisc_destroy(q->queues[band]); 178 qdisc_put(q->queues[band]);
179 179
180 kfree(q->queues); 180 kfree(q->queues);
181} 181}
@@ -204,7 +204,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
204 q->queues[i] = &noop_qdisc; 204 q->queues[i] = &noop_qdisc;
205 qdisc_tree_reduce_backlog(child, child->q.qlen, 205 qdisc_tree_reduce_backlog(child, child->q.qlen,
206 child->qstats.backlog); 206 child->qstats.backlog);
207 qdisc_destroy(child); 207 qdisc_put(child);
208 } 208 }
209 } 209 }
210 210
@@ -228,7 +228,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
228 qdisc_tree_reduce_backlog(old, 228 qdisc_tree_reduce_backlog(old,
229 old->q.qlen, 229 old->q.qlen,
230 old->qstats.backlog); 230 old->qstats.backlog);
231 qdisc_destroy(old); 231 qdisc_put(old);
232 } 232 }
233 sch_tree_unlock(sch); 233 sch_tree_unlock(sch);
234 } 234 }
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index ad18a2052416..57b3ad9394ad 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -412,16 +412,6 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
412 return segs; 412 return segs;
413} 413}
414 414
415static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb)
416{
417 skb->next = qh->head;
418
419 if (!qh->head)
420 qh->tail = skb;
421 qh->head = skb;
422 qh->qlen++;
423}
424
425/* 415/*
426 * Insert one skb into qdisc. 416 * Insert one skb into qdisc.
427 * Note: parent depends on return value to account for queue length. 417 * Note: parent depends on return value to account for queue length.
@@ -570,7 +560,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
570 cb->time_to_send = ktime_get_ns(); 560 cb->time_to_send = ktime_get_ns();
571 q->counter = 0; 561 q->counter = 0;
572 562
573 netem_enqueue_skb_head(&sch->q, skb); 563 __qdisc_enqueue_head(skb, &sch->q);
574 sch->qstats.requeues++; 564 sch->qstats.requeues++;
575 } 565 }
576 566
@@ -578,7 +568,7 @@ finish_segs:
578 if (segs) { 568 if (segs) {
579 while (segs) { 569 while (segs) {
580 skb2 = segs->next; 570 skb2 = segs->next;
581 segs->next = NULL; 571 skb_mark_not_on_list(segs);
582 qdisc_skb_cb(segs)->pkt_len = segs->len; 572 qdisc_skb_cb(segs)->pkt_len = segs->len;
583 last_len = segs->len; 573 last_len = segs->len;
584 rc = qdisc_enqueue(segs, sch, to_free); 574 rc = qdisc_enqueue(segs, sch, to_free);
@@ -1032,7 +1022,7 @@ static void netem_destroy(struct Qdisc *sch)
1032 1022
1033 qdisc_watchdog_cancel(&q->watchdog); 1023 qdisc_watchdog_cancel(&q->watchdog);
1034 if (q->qdisc) 1024 if (q->qdisc)
1035 qdisc_destroy(q->qdisc); 1025 qdisc_put(q->qdisc);
1036 dist_free(q->delay_dist); 1026 dist_free(q->delay_dist);
1037 dist_free(q->slot_dist); 1027 dist_free(q->slot_dist);
1038} 1028}
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index 18d30bb86881..d1429371592f 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -110,8 +110,8 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size)
110 /* If current delay is less than half of target, and 110 /* If current delay is less than half of target, and
111 * if drop prob is low already, disable early_drop 111 * if drop prob is low already, disable early_drop
112 */ 112 */
113 if ((q->vars.qdelay < q->params.target / 2) 113 if ((q->vars.qdelay < q->params.target / 2) &&
114 && (q->vars.prob < MAX_PROB / 5)) 114 (q->vars.prob < MAX_PROB / 5))
115 return false; 115 return false;
116 116
117 /* If we have fewer than 2 mtu-sized packets, disable drop_early, 117 /* If we have fewer than 2 mtu-sized packets, disable drop_early,
@@ -209,7 +209,8 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt,
209 209
210 /* tupdate is in jiffies */ 210 /* tupdate is in jiffies */
211 if (tb[TCA_PIE_TUPDATE]) 211 if (tb[TCA_PIE_TUPDATE])
212 q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])); 212 q->params.tupdate =
213 usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE]));
213 214
214 if (tb[TCA_PIE_LIMIT]) { 215 if (tb[TCA_PIE_LIMIT]) {
215 u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]); 216 u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]);
@@ -247,7 +248,6 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt,
247 248
248static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) 249static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
249{ 250{
250
251 struct pie_sched_data *q = qdisc_priv(sch); 251 struct pie_sched_data *q = qdisc_priv(sch);
252 int qlen = sch->qstats.backlog; /* current queue size in bytes */ 252 int qlen = sch->qstats.backlog; /* current queue size in bytes */
253 253
@@ -294,9 +294,9 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
294 * dq_count to 0 to re-enter the if block when the next 294 * dq_count to 0 to re-enter the if block when the next
295 * packet is dequeued 295 * packet is dequeued
296 */ 296 */
297 if (qlen < QUEUE_THRESHOLD) 297 if (qlen < QUEUE_THRESHOLD) {
298 q->vars.dq_count = DQCOUNT_INVALID; 298 q->vars.dq_count = DQCOUNT_INVALID;
299 else { 299 } else {
300 q->vars.dq_count = 0; 300 q->vars.dq_count = 0;
301 q->vars.dq_tstamp = psched_get_time(); 301 q->vars.dq_tstamp = psched_get_time();
302 } 302 }
@@ -370,7 +370,7 @@ static void calculate_probability(struct Qdisc *sch)
370 oldprob = q->vars.prob; 370 oldprob = q->vars.prob;
371 371
372 /* to ensure we increase probability in steps of no more than 2% */ 372 /* to ensure we increase probability in steps of no more than 2% */
373 if (delta > (s32) (MAX_PROB / (100 / 2)) && 373 if (delta > (s32)(MAX_PROB / (100 / 2)) &&
374 q->vars.prob >= MAX_PROB / 10) 374 q->vars.prob >= MAX_PROB / 10)
375 delta = (MAX_PROB / 100) * 2; 375 delta = (MAX_PROB / 100) * 2;
376 376
@@ -405,7 +405,7 @@ static void calculate_probability(struct Qdisc *sch)
405 * delay is 0 for 2 consecutive Tupdate periods. 405 * delay is 0 for 2 consecutive Tupdate periods.
406 */ 406 */
407 407
408 if ((qdelay == 0) && (qdelay_old == 0) && update_prob) 408 if (qdelay == 0 && qdelay_old == 0 && update_prob)
409 q->vars.prob = (q->vars.prob * 98) / 100; 409 q->vars.prob = (q->vars.prob * 98) / 100;
410 410
411 q->vars.qdelay = qdelay; 411 q->vars.qdelay = qdelay;
@@ -419,8 +419,8 @@ static void calculate_probability(struct Qdisc *sch)
419 */ 419 */
420 if ((q->vars.qdelay < q->params.target / 2) && 420 if ((q->vars.qdelay < q->params.target / 2) &&
421 (q->vars.qdelay_old < q->params.target / 2) && 421 (q->vars.qdelay_old < q->params.target / 2) &&
422 (q->vars.prob == 0) && 422 q->vars.prob == 0 &&
423 (q->vars.avg_dq_rate > 0)) 423 q->vars.avg_dq_rate > 0)
424 pie_vars_init(&q->vars); 424 pie_vars_init(&q->vars);
425} 425}
426 426
@@ -437,7 +437,6 @@ static void pie_timer(struct timer_list *t)
437 if (q->params.tupdate) 437 if (q->params.tupdate)
438 mod_timer(&q->adapt_timer, jiffies + q->params.tupdate); 438 mod_timer(&q->adapt_timer, jiffies + q->params.tupdate);
439 spin_unlock(root_lock); 439 spin_unlock(root_lock);
440
441} 440}
442 441
443static int pie_init(struct Qdisc *sch, struct nlattr *opt, 442static int pie_init(struct Qdisc *sch, struct nlattr *opt,
@@ -469,15 +468,16 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb)
469 struct nlattr *opts; 468 struct nlattr *opts;
470 469
471 opts = nla_nest_start(skb, TCA_OPTIONS); 470 opts = nla_nest_start(skb, TCA_OPTIONS);
472 if (opts == NULL) 471 if (!opts)
473 goto nla_put_failure; 472 goto nla_put_failure;
474 473
475 /* convert target from pschedtime to us */ 474 /* convert target from pschedtime to us */
476 if (nla_put_u32(skb, TCA_PIE_TARGET, 475 if (nla_put_u32(skb, TCA_PIE_TARGET,
477 ((u32) PSCHED_TICKS2NS(q->params.target)) / 476 ((u32)PSCHED_TICKS2NS(q->params.target)) /
478 NSEC_PER_USEC) || 477 NSEC_PER_USEC) ||
479 nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) || 478 nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) ||
480 nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) || 479 nla_put_u32(skb, TCA_PIE_TUPDATE,
480 jiffies_to_usecs(q->params.tupdate)) ||
481 nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) || 481 nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) ||
482 nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) || 482 nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) ||
483 nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || 483 nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) ||
@@ -489,7 +489,6 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb)
489nla_put_failure: 489nla_put_failure:
490 nla_nest_cancel(skb, opts); 490 nla_nest_cancel(skb, opts);
491 return -1; 491 return -1;
492
493} 492}
494 493
495static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 494static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
@@ -497,7 +496,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
497 struct pie_sched_data *q = qdisc_priv(sch); 496 struct pie_sched_data *q = qdisc_priv(sch);
498 struct tc_pie_xstats st = { 497 struct tc_pie_xstats st = {
499 .prob = q->vars.prob, 498 .prob = q->vars.prob,
500 .delay = ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) / 499 .delay = ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) /
501 NSEC_PER_USEC, 500 NSEC_PER_USEC,
502 /* unscale and return dq_rate in bytes per sec */ 501 /* unscale and return dq_rate in bytes per sec */
503 .avg_dq_rate = q->vars.avg_dq_rate * 502 .avg_dq_rate = q->vars.avg_dq_rate *
@@ -514,8 +513,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
514 513
515static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) 514static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
516{ 515{
517 struct sk_buff *skb; 516 struct sk_buff *skb = qdisc_dequeue_head(sch);
518 skb = qdisc_dequeue_head(sch);
519 517
520 if (!skb) 518 if (!skb)
521 return NULL; 519 return NULL;
@@ -527,6 +525,7 @@ static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
527static void pie_reset(struct Qdisc *sch) 525static void pie_reset(struct Qdisc *sch)
528{ 526{
529 struct pie_sched_data *q = qdisc_priv(sch); 527 struct pie_sched_data *q = qdisc_priv(sch);
528
530 qdisc_reset_queue(sch); 529 qdisc_reset_queue(sch);
531 pie_vars_init(&q->vars); 530 pie_vars_init(&q->vars);
532} 531}
@@ -534,6 +533,7 @@ static void pie_reset(struct Qdisc *sch)
534static void pie_destroy(struct Qdisc *sch) 533static void pie_destroy(struct Qdisc *sch)
535{ 534{
536 struct pie_sched_data *q = qdisc_priv(sch); 535 struct pie_sched_data *q = qdisc_priv(sch);
536
537 q->params.tupdate = 0; 537 q->params.tupdate = 0;
538 del_timer_sync(&q->adapt_timer); 538 del_timer_sync(&q->adapt_timer);
539} 539}
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 222e53d3d27a..f8af98621179 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -175,7 +175,7 @@ prio_destroy(struct Qdisc *sch)
175 tcf_block_put(q->block); 175 tcf_block_put(q->block);
176 prio_offload(sch, NULL); 176 prio_offload(sch, NULL);
177 for (prio = 0; prio < q->bands; prio++) 177 for (prio = 0; prio < q->bands; prio++)
178 qdisc_destroy(q->queues[prio]); 178 qdisc_put(q->queues[prio]);
179} 179}
180 180
181static int prio_tune(struct Qdisc *sch, struct nlattr *opt, 181static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
@@ -205,7 +205,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
205 extack); 205 extack);
206 if (!queues[i]) { 206 if (!queues[i]) {
207 while (i > oldbands) 207 while (i > oldbands)
208 qdisc_destroy(queues[--i]); 208 qdisc_put(queues[--i]);
209 return -ENOMEM; 209 return -ENOMEM;
210 } 210 }
211 } 211 }
@@ -220,7 +220,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
220 220
221 qdisc_tree_reduce_backlog(child, child->q.qlen, 221 qdisc_tree_reduce_backlog(child, child->q.qlen,
222 child->qstats.backlog); 222 child->qstats.backlog);
223 qdisc_destroy(child); 223 qdisc_put(child);
224 } 224 }
225 225
226 for (i = oldbands; i < q->bands; i++) { 226 for (i = oldbands; i < q->bands; i++) {
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index bb1a9c11fc54..dc37c4ead439 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -526,7 +526,7 @@ set_change_agg:
526 return 0; 526 return 0;
527 527
528destroy_class: 528destroy_class:
529 qdisc_destroy(cl->qdisc); 529 qdisc_put(cl->qdisc);
530 kfree(cl); 530 kfree(cl);
531 return err; 531 return err;
532} 532}
@@ -537,7 +537,7 @@ static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
537 537
538 qfq_rm_from_agg(q, cl); 538 qfq_rm_from_agg(q, cl);
539 gen_kill_estimator(&cl->rate_est); 539 gen_kill_estimator(&cl->rate_est);
540 qdisc_destroy(cl->qdisc); 540 qdisc_put(cl->qdisc);
541 kfree(cl); 541 kfree(cl);
542} 542}
543 543
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 56c181c3feeb..3ce6c0a2c493 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -181,7 +181,7 @@ static void red_destroy(struct Qdisc *sch)
181 181
182 del_timer_sync(&q->adapt_timer); 182 del_timer_sync(&q->adapt_timer);
183 red_offload(sch, false); 183 red_offload(sch, false);
184 qdisc_destroy(q->qdisc); 184 qdisc_put(q->qdisc);
185} 185}
186 186
187static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { 187static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
@@ -233,7 +233,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
233 if (child) { 233 if (child) {
234 qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, 234 qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
235 q->qdisc->qstats.backlog); 235 q->qdisc->qstats.backlog);
236 qdisc_destroy(q->qdisc); 236 qdisc_put(q->qdisc);
237 q->qdisc = child; 237 q->qdisc = child;
238 } 238 }
239 239
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 7cbdad8419b7..bab506b01a32 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -469,7 +469,7 @@ static void sfb_destroy(struct Qdisc *sch)
469 struct sfb_sched_data *q = qdisc_priv(sch); 469 struct sfb_sched_data *q = qdisc_priv(sch);
470 470
471 tcf_block_put(q->block); 471 tcf_block_put(q->block);
472 qdisc_destroy(q->qdisc); 472 qdisc_put(q->qdisc);
473} 473}
474 474
475static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = { 475static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = {
@@ -523,7 +523,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt,
523 523
524 qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, 524 qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
525 q->qdisc->qstats.backlog); 525 q->qdisc->qstats.backlog);
526 qdisc_destroy(q->qdisc); 526 qdisc_put(q->qdisc);
527 q->qdisc = child; 527 q->qdisc = child;
528 528
529 q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval); 529 q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
new file mode 100644
index 000000000000..206e4dbed12f
--- /dev/null
+++ b/net/sched/sch_taprio.c
@@ -0,0 +1,962 @@
1// SPDX-License-Identifier: GPL-2.0
2
3/* net/sched/sch_taprio.c Time Aware Priority Scheduler
4 *
5 * Authors: Vinicius Costa Gomes <vinicius.gomes@intel.com>
6 *
7 */
8
9#include <linux/types.h>
10#include <linux/slab.h>
11#include <linux/kernel.h>
12#include <linux/string.h>
13#include <linux/list.h>
14#include <linux/errno.h>
15#include <linux/skbuff.h>
16#include <linux/module.h>
17#include <linux/spinlock.h>
18#include <net/netlink.h>
19#include <net/pkt_sched.h>
20#include <net/pkt_cls.h>
21#include <net/sch_generic.h>
22
23#define TAPRIO_ALL_GATES_OPEN -1
24
25struct sched_entry {
26 struct list_head list;
27
28 /* The instant that this entry "closes" and the next one
 29	 * should open; the qdisc will make some effort so that no
30 * packet leaves after this time.
31 */
32 ktime_t close_time;
33 atomic_t budget;
34 int index;
35 u32 gate_mask;
36 u32 interval;
37 u8 command;
38};
39
40struct taprio_sched {
41 struct Qdisc **qdiscs;
42 struct Qdisc *root;
43 s64 base_time;
44 int clockid;
45 int picos_per_byte; /* Using picoseconds because for 10Gbps+
46 * speeds it's sub-nanoseconds per byte
47 */
48 size_t num_entries;
49
50 /* Protects the update side of the RCU protected current_entry */
51 spinlock_t current_entry_lock;
52 struct sched_entry __rcu *current_entry;
53 struct list_head entries;
54 ktime_t (*get_time)(void);
55 struct hrtimer advance_timer;
56};
57
58static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
59 struct sk_buff **to_free)
60{
61 struct taprio_sched *q = qdisc_priv(sch);
62 struct Qdisc *child;
63 int queue;
64
65 queue = skb_get_queue_mapping(skb);
66
67 child = q->qdiscs[queue];
68 if (unlikely(!child))
69 return qdisc_drop(skb, sch, to_free);
70
71 qdisc_qstats_backlog_inc(sch, skb);
72 sch->q.qlen++;
73
74 return qdisc_enqueue(skb, child, to_free);
75}
76
77static struct sk_buff *taprio_peek(struct Qdisc *sch)
78{
79 struct taprio_sched *q = qdisc_priv(sch);
80 struct net_device *dev = qdisc_dev(sch);
81 struct sched_entry *entry;
82 struct sk_buff *skb;
83 u32 gate_mask;
84 int i;
85
86 rcu_read_lock();
87 entry = rcu_dereference(q->current_entry);
88 gate_mask = entry ? entry->gate_mask : -1;
89 rcu_read_unlock();
90
91 if (!gate_mask)
92 return NULL;
93
94 for (i = 0; i < dev->num_tx_queues; i++) {
95 struct Qdisc *child = q->qdiscs[i];
96 int prio;
97 u8 tc;
98
99 if (unlikely(!child))
100 continue;
101
102 skb = child->ops->peek(child);
103 if (!skb)
104 continue;
105
106 prio = skb->priority;
107 tc = netdev_get_prio_tc_map(dev, prio);
108
109 if (!(gate_mask & BIT(tc)))
110 return NULL;
111
112 return skb;
113 }
114
115 return NULL;
116}
117
118static inline int length_to_duration(struct taprio_sched *q, int len)
119{
120 return (len * q->picos_per_byte) / 1000;
121}
122
123static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
124{
125 struct taprio_sched *q = qdisc_priv(sch);
126 struct net_device *dev = qdisc_dev(sch);
127 struct sched_entry *entry;
128 struct sk_buff *skb;
129 u32 gate_mask;
130 int i;
131
132 rcu_read_lock();
133 entry = rcu_dereference(q->current_entry);
134 /* if there's no entry, it means that the schedule didn't
135	 * start yet, so force all gates to be open; this is in
136	 * accordance with IEEE 802.1Qbv-2015 Section 8.6.9.4.5
137	 * "AdminGateStates"
138 */
139 gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
140 rcu_read_unlock();
141
142 if (!gate_mask)
143 return NULL;
144
145 for (i = 0; i < dev->num_tx_queues; i++) {
146 struct Qdisc *child = q->qdiscs[i];
147 ktime_t guard;
148 int prio;
149 int len;
150 u8 tc;
151
152 if (unlikely(!child))
153 continue;
154
155 skb = child->ops->peek(child);
156 if (!skb)
157 continue;
158
159 prio = skb->priority;
160 tc = netdev_get_prio_tc_map(dev, prio);
161
162 if (!(gate_mask & BIT(tc)))
163 continue;
164
165 len = qdisc_pkt_len(skb);
166 guard = ktime_add_ns(q->get_time(),
167 length_to_duration(q, len));
168
169 /* In the case that there's no gate entry, there's no
170 * guard band ...
171 */
172 if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
173 ktime_after(guard, entry->close_time))
174 return NULL;
175
176 /* ... and no budget. */
177 if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
178 atomic_sub_return(len, &entry->budget) < 0)
179 return NULL;
180
181 skb = child->ops->dequeue(child);
182 if (unlikely(!skb))
183 return NULL;
184
185 qdisc_bstats_update(sch, skb);
186 qdisc_qstats_backlog_dec(sch, skb);
187 sch->q.qlen--;
188
189 return skb;
190 }
191
192 return NULL;
193}
194
195static bool should_restart_cycle(const struct taprio_sched *q,
196 const struct sched_entry *entry)
197{
198 WARN_ON(!entry);
199
200 return list_is_last(&entry->list, &q->entries);
201}
202
203static enum hrtimer_restart advance_sched(struct hrtimer *timer)
204{
205 struct taprio_sched *q = container_of(timer, struct taprio_sched,
206 advance_timer);
207 struct sched_entry *entry, *next;
208 struct Qdisc *sch = q->root;
209 ktime_t close_time;
210
211 spin_lock(&q->current_entry_lock);
212 entry = rcu_dereference_protected(q->current_entry,
213 lockdep_is_held(&q->current_entry_lock));
214
215	/* This is the first time the schedule runs, so it only
216	 * happens once per schedule. The first entry
217 * is pre-calculated during the schedule initialization.
218 */
219 if (unlikely(!entry)) {
220 next = list_first_entry(&q->entries, struct sched_entry,
221 list);
222 close_time = next->close_time;
223 goto first_run;
224 }
225
226 if (should_restart_cycle(q, entry))
227 next = list_first_entry(&q->entries, struct sched_entry,
228 list);
229 else
230 next = list_next_entry(entry, list);
231
232 close_time = ktime_add_ns(entry->close_time, next->interval);
233
234 next->close_time = close_time;
235 atomic_set(&next->budget,
236 (next->interval * 1000) / q->picos_per_byte);
237
238first_run:
239 rcu_assign_pointer(q->current_entry, next);
240 spin_unlock(&q->current_entry_lock);
241
242 hrtimer_set_expires(&q->advance_timer, close_time);
243
244 rcu_read_lock();
245 __netif_schedule(sch);
246 rcu_read_unlock();
247
248 return HRTIMER_RESTART;
249}
250
251static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
252 [TCA_TAPRIO_SCHED_ENTRY_INDEX] = { .type = NLA_U32 },
253 [TCA_TAPRIO_SCHED_ENTRY_CMD] = { .type = NLA_U8 },
254 [TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 },
255 [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 },
256};
257
258static const struct nla_policy entry_list_policy[TCA_TAPRIO_SCHED_MAX + 1] = {
259 [TCA_TAPRIO_SCHED_ENTRY] = { .type = NLA_NESTED },
260};
261
262static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
263 [TCA_TAPRIO_ATTR_PRIOMAP] = {
264 .len = sizeof(struct tc_mqprio_qopt)
265 },
266 [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED },
267 [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 },
268 [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED },
269 [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 },
270};
271
272static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry,
273 struct netlink_ext_ack *extack)
274{
275 u32 interval = 0;
276
277 if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD])
278 entry->command = nla_get_u8(
279 tb[TCA_TAPRIO_SCHED_ENTRY_CMD]);
280
281 if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK])
282 entry->gate_mask = nla_get_u32(
283 tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]);
284
285 if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL])
286 interval = nla_get_u32(
287 tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]);
288
289 if (interval == 0) {
290 NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry");
291 return -EINVAL;
292 }
293
294 entry->interval = interval;
295
296 return 0;
297}
298
299static int parse_sched_entry(struct nlattr *n, struct sched_entry *entry,
300 int index, struct netlink_ext_ack *extack)
301{
302 struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { };
303 int err;
304
305 err = nla_parse_nested(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n,
306 entry_policy, NULL);
307 if (err < 0) {
308 NL_SET_ERR_MSG(extack, "Could not parse nested entry");
309 return -EINVAL;
310 }
311
312 entry->index = index;
313
314 return fill_sched_entry(tb, entry, extack);
315}
316
317/* Returns the number of entries in case of success */
318static int parse_sched_single_entry(struct nlattr *n,
319 struct taprio_sched *q,
320 struct netlink_ext_ack *extack)
321{
322 struct nlattr *tb_entry[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { };
323 struct nlattr *tb_list[TCA_TAPRIO_SCHED_MAX + 1] = { };
324 struct sched_entry *entry;
325 bool found = false;
326 u32 index;
327 int err;
328
329 err = nla_parse_nested(tb_list, TCA_TAPRIO_SCHED_MAX,
330 n, entry_list_policy, NULL);
331 if (err < 0) {
332 NL_SET_ERR_MSG(extack, "Could not parse nested entry");
333 return -EINVAL;
334 }
335
336 if (!tb_list[TCA_TAPRIO_SCHED_ENTRY]) {
337 NL_SET_ERR_MSG(extack, "Single-entry must include an entry");
338 return -EINVAL;
339 }
340
341 err = nla_parse_nested(tb_entry, TCA_TAPRIO_SCHED_ENTRY_MAX,
342 tb_list[TCA_TAPRIO_SCHED_ENTRY],
343 entry_policy, NULL);
344 if (err < 0) {
345 NL_SET_ERR_MSG(extack, "Could not parse nested entry");
346 return -EINVAL;
347 }
348
349 if (!tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]) {
350		NL_SET_ERR_MSG(extack, "Entry must specify an index");
351 return -EINVAL;
352 }
353
354 index = nla_get_u32(tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]);
355 if (index >= q->num_entries) {
356 NL_SET_ERR_MSG(extack, "Index for single entry exceeds number of entries in schedule");
357 return -EINVAL;
358 }
359
360 list_for_each_entry(entry, &q->entries, list) {
361 if (entry->index == index) {
362 found = true;
363 break;
364 }
365 }
366
367 if (!found) {
368 NL_SET_ERR_MSG(extack, "Could not find entry");
369 return -ENOENT;
370 }
371
372 err = fill_sched_entry(tb_entry, entry, extack);
373 if (err < 0)
374 return err;
375
376 return q->num_entries;
377}
378
379static int parse_sched_list(struct nlattr *list,
380 struct taprio_sched *q,
381 struct netlink_ext_ack *extack)
382{
383 struct nlattr *n;
384 int err, rem;
385 int i = 0;
386
387 if (!list)
388 return -EINVAL;
389
390 nla_for_each_nested(n, list, rem) {
391 struct sched_entry *entry;
392
393 if (nla_type(n) != TCA_TAPRIO_SCHED_ENTRY) {
394 NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'");
395 continue;
396 }
397
398 entry = kzalloc(sizeof(*entry), GFP_KERNEL);
399 if (!entry) {
400 NL_SET_ERR_MSG(extack, "Not enough memory for entry");
401 return -ENOMEM;
402 }
403
404 err = parse_sched_entry(n, entry, i, extack);
405 if (err < 0) {
406 kfree(entry);
407 return err;
408 }
409
410 list_add_tail(&entry->list, &q->entries);
411 i++;
412 }
413
414 q->num_entries = i;
415
416 return i;
417}
418
419/* Returns the number of entries in case of success */
420static int parse_taprio_opt(struct nlattr **tb, struct taprio_sched *q,
421 struct netlink_ext_ack *extack)
422{
423 int err = 0;
424 int clockid;
425
426 if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] &&
427 tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY])
428 return -EINVAL;
429
430 if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] && q->num_entries == 0)
431 return -EINVAL;
432
433 if (q->clockid == -1 && !tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID])
434 return -EINVAL;
435
436 if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME])
437 q->base_time = nla_get_s64(
438 tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]);
439
440 if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
441 clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]);
442
443 /* We only support static clockids and we don't allow
444 * for it to be modified after the first init.
445 */
446 if (clockid < 0 || (q->clockid != -1 && q->clockid != clockid))
447 return -EINVAL;
448
449 q->clockid = clockid;
450 }
451
452 if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST])
453 err = parse_sched_list(
454 tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], q, extack);
455 else if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY])
456 err = parse_sched_single_entry(
457 tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY], q, extack);
458
459 /* parse_sched_* return the number of entries in the schedule,
460 * a schedule with zero entries is an error.
461 */
462 if (err == 0) {
463 NL_SET_ERR_MSG(extack, "The schedule should contain at least one entry");
464 return -EINVAL;
465 }
466
467 return err;
468}
469
470static int taprio_parse_mqprio_opt(struct net_device *dev,
471 struct tc_mqprio_qopt *qopt,
472 struct netlink_ext_ack *extack)
473{
474 int i, j;
475
476 if (!qopt) {
477 NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary");
478 return -EINVAL;
479 }
480
481 /* Verify num_tc is not out of max range */
482 if (qopt->num_tc > TC_MAX_QUEUE) {
483 NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range");
484 return -EINVAL;
485 }
486
487 /* taprio imposes that traffic classes map 1:n to tx queues */
488 if (qopt->num_tc > dev->num_tx_queues) {
489 NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues");
490 return -EINVAL;
491 }
492
493 /* Verify priority mapping uses valid tcs */
494 for (i = 0; i < TC_BITMASK + 1; i++) {
495 if (qopt->prio_tc_map[i] >= qopt->num_tc) {
496 NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping");
497 return -EINVAL;
498 }
499 }
500
501 for (i = 0; i < qopt->num_tc; i++) {
502 unsigned int last = qopt->offset[i] + qopt->count[i];
503
504		/* Verify the queue count is within the tx range; a count equal to
505		 * real_num_tx_queues indicates the last queue is in use.
506 */
507 if (qopt->offset[i] >= dev->num_tx_queues ||
508 !qopt->count[i] ||
509 last > dev->real_num_tx_queues) {
510 NL_SET_ERR_MSG(extack, "Invalid queue in traffic class to queue mapping");
511 return -EINVAL;
512 }
513
514 /* Verify that the offset and counts do not overlap */
515 for (j = i + 1; j < qopt->num_tc; j++) {
516 if (last > qopt->offset[j]) {
517 NL_SET_ERR_MSG(extack, "Detected overlap in the traffic class to queue mapping");
518 return -EINVAL;
519 }
520 }
521 }
522
523 return 0;
524}
525
526static ktime_t taprio_get_start_time(struct Qdisc *sch)
527{
528 struct taprio_sched *q = qdisc_priv(sch);
529 struct sched_entry *entry;
530 ktime_t now, base, cycle;
531 s64 n;
532
533 base = ns_to_ktime(q->base_time);
534 cycle = 0;
535
536	/* Calculate the cycle_time by summing all the intervals.
537 */
538 list_for_each_entry(entry, &q->entries, list)
539 cycle = ktime_add_ns(cycle, entry->interval);
540
541 if (!cycle)
542 return base;
543
544 now = q->get_time();
545
546 if (ktime_after(base, now))
547 return base;
548
549 /* Schedule the start time for the beginning of the next
550 * cycle.
551 */
552 n = div64_s64(ktime_sub_ns(now, base), cycle);
553
554 return ktime_add_ns(base, (n + 1) * cycle);
555}
556
557static void taprio_start_sched(struct Qdisc *sch, ktime_t start)
558{
559 struct taprio_sched *q = qdisc_priv(sch);
560 struct sched_entry *first;
561 unsigned long flags;
562
563 spin_lock_irqsave(&q->current_entry_lock, flags);
564
565 first = list_first_entry(&q->entries, struct sched_entry,
566 list);
567
568 first->close_time = ktime_add_ns(start, first->interval);
569 atomic_set(&first->budget,
570 (first->interval * 1000) / q->picos_per_byte);
571 rcu_assign_pointer(q->current_entry, NULL);
572
573 spin_unlock_irqrestore(&q->current_entry_lock, flags);
574
575 hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS);
576}
577
578static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
579 struct netlink_ext_ack *extack)
580{
581 struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { };
582 struct taprio_sched *q = qdisc_priv(sch);
583 struct net_device *dev = qdisc_dev(sch);
584 struct tc_mqprio_qopt *mqprio = NULL;
585 struct ethtool_link_ksettings ecmd;
586 int i, err, size;
587 s64 link_speed;
588 ktime_t start;
589
590 err = nla_parse_nested(tb, TCA_TAPRIO_ATTR_MAX, opt,
591 taprio_policy, extack);
592 if (err < 0)
593 return err;
594
595 err = -EINVAL;
596 if (tb[TCA_TAPRIO_ATTR_PRIOMAP])
597 mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]);
598
599 err = taprio_parse_mqprio_opt(dev, mqprio, extack);
600 if (err < 0)
601 return err;
602
603 /* A schedule with less than one entry is an error */
604 size = parse_taprio_opt(tb, q, extack);
605 if (size < 0)
606 return size;
607
608 hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS);
609 q->advance_timer.function = advance_sched;
610
611 switch (q->clockid) {
612 case CLOCK_REALTIME:
613 q->get_time = ktime_get_real;
614 break;
615 case CLOCK_MONOTONIC:
616 q->get_time = ktime_get;
617 break;
618 case CLOCK_BOOTTIME:
619 q->get_time = ktime_get_boottime;
620 break;
621 case CLOCK_TAI:
622 q->get_time = ktime_get_clocktai;
623 break;
624 default:
625 return -ENOTSUPP;
626 }
627
628 for (i = 0; i < dev->num_tx_queues; i++) {
629 struct netdev_queue *dev_queue;
630 struct Qdisc *qdisc;
631
632 dev_queue = netdev_get_tx_queue(dev, i);
633 qdisc = qdisc_create_dflt(dev_queue,
634 &pfifo_qdisc_ops,
635 TC_H_MAKE(TC_H_MAJ(sch->handle),
636 TC_H_MIN(i + 1)),
637 extack);
638 if (!qdisc)
639 return -ENOMEM;
640
641 if (i < dev->real_num_tx_queues)
642 qdisc_hash_add(qdisc, false);
643
644 q->qdiscs[i] = qdisc;
645 }
646
647 if (mqprio) {
648 netdev_set_num_tc(dev, mqprio->num_tc);
649 for (i = 0; i < mqprio->num_tc; i++)
650 netdev_set_tc_queue(dev, i,
651 mqprio->count[i],
652 mqprio->offset[i]);
653
654 /* Always use supplied priority mappings */
655 for (i = 0; i < TC_BITMASK + 1; i++)
656 netdev_set_prio_tc_map(dev, i,
657 mqprio->prio_tc_map[i]);
658 }
659
660 if (!__ethtool_get_link_ksettings(dev, &ecmd))
661 link_speed = ecmd.base.speed;
662 else
663 link_speed = SPEED_1000;
664
665 q->picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8,
666 link_speed * 1000 * 1000);
667
668 start = taprio_get_start_time(sch);
669 if (!start)
670 return 0;
671
672 taprio_start_sched(sch, start);
673
674 return 0;
675}
676
677static void taprio_destroy(struct Qdisc *sch)
678{
679 struct taprio_sched *q = qdisc_priv(sch);
680 struct net_device *dev = qdisc_dev(sch);
681 struct sched_entry *entry, *n;
682 unsigned int i;
683
684 hrtimer_cancel(&q->advance_timer);
685
686 if (q->qdiscs) {
687 for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++)
688 qdisc_put(q->qdiscs[i]);
689
690 kfree(q->qdiscs);
691 }
692 q->qdiscs = NULL;
693
694 netdev_set_num_tc(dev, 0);
695
696 list_for_each_entry_safe(entry, n, &q->entries, list) {
697 list_del(&entry->list);
698 kfree(entry);
699 }
700}
701
702static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
703 struct netlink_ext_ack *extack)
704{
705 struct taprio_sched *q = qdisc_priv(sch);
706 struct net_device *dev = qdisc_dev(sch);
707
708 INIT_LIST_HEAD(&q->entries);
709 spin_lock_init(&q->current_entry_lock);
710
711 /* We may overwrite the configuration later */
712 hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS);
713
714 q->root = sch;
715
716 /* We only support static clockids. Use an invalid value as default
717 * and get the valid one on taprio_change().
718 */
719 q->clockid = -1;
720
721 if (sch->parent != TC_H_ROOT)
722 return -EOPNOTSUPP;
723
724 if (!netif_is_multiqueue(dev))
725 return -EOPNOTSUPP;
726
727	/* pre-allocate qdiscs; attachment can't fail */
728 q->qdiscs = kcalloc(dev->num_tx_queues,
729 sizeof(q->qdiscs[0]),
730 GFP_KERNEL);
731
732 if (!q->qdiscs)
733 return -ENOMEM;
734
735 if (!opt)
736 return -EINVAL;
737
738 return taprio_change(sch, opt, extack);
739}
740
741static struct netdev_queue *taprio_queue_get(struct Qdisc *sch,
742 unsigned long cl)
743{
744 struct net_device *dev = qdisc_dev(sch);
745 unsigned long ntx = cl - 1;
746
747 if (ntx >= dev->num_tx_queues)
748 return NULL;
749
750 return netdev_get_tx_queue(dev, ntx);
751}
752
753static int taprio_graft(struct Qdisc *sch, unsigned long cl,
754 struct Qdisc *new, struct Qdisc **old,
755 struct netlink_ext_ack *extack)
756{
757 struct taprio_sched *q = qdisc_priv(sch);
758 struct net_device *dev = qdisc_dev(sch);
759 struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
760
761 if (!dev_queue)
762 return -EINVAL;
763
764 if (dev->flags & IFF_UP)
765 dev_deactivate(dev);
766
767 *old = q->qdiscs[cl - 1];
768 q->qdiscs[cl - 1] = new;
769
770 if (new)
771 new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
772
773 if (dev->flags & IFF_UP)
774 dev_activate(dev);
775
776 return 0;
777}
778
779static int dump_entry(struct sk_buff *msg,
780 const struct sched_entry *entry)
781{
782 struct nlattr *item;
783
784 item = nla_nest_start(msg, TCA_TAPRIO_SCHED_ENTRY);
785 if (!item)
786 return -ENOSPC;
787
788 if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index))
789 goto nla_put_failure;
790
791 if (nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command))
792 goto nla_put_failure;
793
794 if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK,
795 entry->gate_mask))
796 goto nla_put_failure;
797
798 if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL,
799 entry->interval))
800 goto nla_put_failure;
801
802 return nla_nest_end(msg, item);
803
804nla_put_failure:
805 nla_nest_cancel(msg, item);
806 return -1;
807}
808
809static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
810{
811 struct taprio_sched *q = qdisc_priv(sch);
812 struct net_device *dev = qdisc_dev(sch);
813 struct tc_mqprio_qopt opt = { 0 };
814 struct nlattr *nest, *entry_list;
815 struct sched_entry *entry;
816 unsigned int i;
817
818 opt.num_tc = netdev_get_num_tc(dev);
819 memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
820
821 for (i = 0; i < netdev_get_num_tc(dev); i++) {
822 opt.count[i] = dev->tc_to_txq[i].count;
823 opt.offset[i] = dev->tc_to_txq[i].offset;
824 }
825
826 nest = nla_nest_start(skb, TCA_OPTIONS);
827 if (!nest)
828 return -ENOSPC;
829
830 if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt))
831 goto options_error;
832
833 if (nla_put_s64(skb, TCA_TAPRIO_ATTR_SCHED_BASE_TIME,
834 q->base_time, TCA_TAPRIO_PAD))
835 goto options_error;
836
837 if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
838 goto options_error;
839
840 entry_list = nla_nest_start(skb, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST);
841 if (!entry_list)
842 goto options_error;
843
844 list_for_each_entry(entry, &q->entries, list) {
845 if (dump_entry(skb, entry) < 0)
846 goto options_error;
847 }
848
849 nla_nest_end(skb, entry_list);
850
851 return nla_nest_end(skb, nest);
852
853options_error:
854 nla_nest_cancel(skb, nest);
855 return -1;
856}
857
858static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl)
859{
860 struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
861
862 if (!dev_queue)
863 return NULL;
864
865 return dev_queue->qdisc_sleeping;
866}
867
868static unsigned long taprio_find(struct Qdisc *sch, u32 classid)
869{
870 unsigned int ntx = TC_H_MIN(classid);
871
872 if (!taprio_queue_get(sch, ntx))
873 return 0;
874 return ntx;
875}
876
877static int taprio_dump_class(struct Qdisc *sch, unsigned long cl,
878 struct sk_buff *skb, struct tcmsg *tcm)
879{
880 struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
881
882 tcm->tcm_parent = TC_H_ROOT;
883 tcm->tcm_handle |= TC_H_MIN(cl);
884 tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
885
886 return 0;
887}
888
889static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
890 struct gnet_dump *d)
891 __releases(d->lock)
892 __acquires(d->lock)
893{
894 struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
895
896 sch = dev_queue->qdisc_sleeping;
897 if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 ||
898 gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0)
899 return -1;
900 return 0;
901}
902
903static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
904{
905 struct net_device *dev = qdisc_dev(sch);
906 unsigned long ntx;
907
908 if (arg->stop)
909 return;
910
911 arg->count = arg->skip;
912 for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) {
913 if (arg->fn(sch, ntx + 1, arg) < 0) {
914 arg->stop = 1;
915 break;
916 }
917 arg->count++;
918 }
919}
920
921static struct netdev_queue *taprio_select_queue(struct Qdisc *sch,
922 struct tcmsg *tcm)
923{
924 return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent));
925}
926
927static const struct Qdisc_class_ops taprio_class_ops = {
928 .graft = taprio_graft,
929 .leaf = taprio_leaf,
930 .find = taprio_find,
931 .walk = taprio_walk,
932 .dump = taprio_dump_class,
933 .dump_stats = taprio_dump_class_stats,
934 .select_queue = taprio_select_queue,
935};
936
937static struct Qdisc_ops taprio_qdisc_ops __read_mostly = {
938 .cl_ops = &taprio_class_ops,
939 .id = "taprio",
940 .priv_size = sizeof(struct taprio_sched),
941 .init = taprio_init,
942 .destroy = taprio_destroy,
943 .peek = taprio_peek,
944 .dequeue = taprio_dequeue,
945 .enqueue = taprio_enqueue,
946 .dump = taprio_dump,
947 .owner = THIS_MODULE,
948};
949
950static int __init taprio_module_init(void)
951{
952 return register_qdisc(&taprio_qdisc_ops);
953}
954
955static void __exit taprio_module_exit(void)
956{
957 unregister_qdisc(&taprio_qdisc_ops);
958}
959
960module_init(taprio_module_init);
961module_exit(taprio_module_exit);
962MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 6f74a426f159..942dcca09cf2 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -162,7 +162,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
162 nb = 0; 162 nb = 0;
163 while (segs) { 163 while (segs) {
164 nskb = segs->next; 164 nskb = segs->next;
165 segs->next = NULL; 165 skb_mark_not_on_list(segs);
166 qdisc_skb_cb(segs)->pkt_len = segs->len; 166 qdisc_skb_cb(segs)->pkt_len = segs->len;
167 len += segs->len; 167 len += segs->len;
168 ret = qdisc_enqueue(segs, q->qdisc, to_free); 168 ret = qdisc_enqueue(segs, q->qdisc, to_free);
@@ -392,7 +392,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
392 if (child) { 392 if (child) {
393 qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, 393 qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
394 q->qdisc->qstats.backlog); 394 q->qdisc->qstats.backlog);
395 qdisc_destroy(q->qdisc); 395 qdisc_put(q->qdisc);
396 q->qdisc = child; 396 q->qdisc = child;
397 } 397 }
398 q->limit = qopt->limit; 398 q->limit = qopt->limit;
@@ -438,7 +438,7 @@ static void tbf_destroy(struct Qdisc *sch)
438 struct tbf_sched_data *q = qdisc_priv(sch); 438 struct tbf_sched_data *q = qdisc_priv(sch);
439 439
440 qdisc_watchdog_cancel(&q->watchdog); 440 qdisc_watchdog_cancel(&q->watchdog);
441 qdisc_destroy(q->qdisc); 441 qdisc_put(q->qdisc);
442} 442}
443 443
444static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) 444static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 42191ed9902b..9cb854b05342 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -385,9 +385,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc,
385 asoc->outqueue.outstanding_bytes -= sctp_data_size(chk); 385 asoc->outqueue.outstanding_bytes -= sctp_data_size(chk);
386 } 386 }
387 387
388 msg_len -= SCTP_DATA_SNDSIZE(chk) + 388 msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk);
389 sizeof(struct sk_buff) +
390 sizeof(struct sctp_chunk);
391 if (msg_len <= 0) 389 if (msg_len <= 0)
392 break; 390 break;
393 } 391 }
@@ -421,9 +419,7 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
421 streamout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; 419 streamout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
422 } 420 }
423 421
424 msg_len -= SCTP_DATA_SNDSIZE(chk) + 422 msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk);
425 sizeof(struct sk_buff) +
426 sizeof(struct sctp_chunk);
427 sctp_chunk_free(chk); 423 sctp_chunk_free(chk);
428 if (msg_len <= 0) 424 if (msg_len <= 0)
429 break; 425 break;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index c1c1bda334a4..fc0386e8ff23 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -83,7 +83,7 @@
83#include <net/sctp/stream_sched.h> 83#include <net/sctp/stream_sched.h>
84 84
85/* Forward declarations for internal helper functions. */ 85/* Forward declarations for internal helper functions. */
86static int sctp_writeable(struct sock *sk); 86static bool sctp_writeable(struct sock *sk);
87static void sctp_wfree(struct sk_buff *skb); 87static void sctp_wfree(struct sk_buff *skb);
88static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, 88static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
89 size_t msg_len); 89 size_t msg_len);
@@ -119,25 +119,10 @@ static void sctp_enter_memory_pressure(struct sock *sk)
119/* Get the sndbuf space available at the time on the association. */ 119/* Get the sndbuf space available at the time on the association. */
120static inline int sctp_wspace(struct sctp_association *asoc) 120static inline int sctp_wspace(struct sctp_association *asoc)
121{ 121{
122 int amt; 122 struct sock *sk = asoc->base.sk;
123 123
124 if (asoc->ep->sndbuf_policy) 124 return asoc->ep->sndbuf_policy ? sk->sk_sndbuf - asoc->sndbuf_used
125 amt = asoc->sndbuf_used; 125 : sk_stream_wspace(sk);
126 else
127 amt = sk_wmem_alloc_get(asoc->base.sk);
128
129 if (amt >= asoc->base.sk->sk_sndbuf) {
130 if (asoc->base.sk->sk_userlocks & SOCK_SNDBUF_LOCK)
131 amt = 0;
132 else {
133 amt = sk_stream_wspace(asoc->base.sk);
134 if (amt < 0)
135 amt = 0;
136 }
137 } else {
138 amt = asoc->base.sk->sk_sndbuf - amt;
139 }
140 return amt;
141} 126}
142 127
143/* Increment the used sndbuf space count of the corresponding association by 128/* Increment the used sndbuf space count of the corresponding association by
@@ -166,12 +151,9 @@ static inline void sctp_set_owner_w(struct sctp_chunk *chunk)
166 /* Save the chunk pointer in skb for sctp_wfree to use later. */ 151 /* Save the chunk pointer in skb for sctp_wfree to use later. */
167 skb_shinfo(chunk->skb)->destructor_arg = chunk; 152 skb_shinfo(chunk->skb)->destructor_arg = chunk;
168 153
169 asoc->sndbuf_used += SCTP_DATA_SNDSIZE(chunk) +
170 sizeof(struct sk_buff) +
171 sizeof(struct sctp_chunk);
172
173 refcount_add(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc); 154 refcount_add(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc);
174 sk->sk_wmem_queued += chunk->skb->truesize; 155 asoc->sndbuf_used += chunk->skb->truesize + sizeof(struct sctp_chunk);
156 sk->sk_wmem_queued += chunk->skb->truesize + sizeof(struct sctp_chunk);
175 sk_mem_charge(sk, chunk->skb->truesize); 157 sk_mem_charge(sk, chunk->skb->truesize);
176} 158}
177 159
@@ -1927,10 +1909,10 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
1927 asoc->pmtu_pending = 0; 1909 asoc->pmtu_pending = 0;
1928 } 1910 }
1929 1911
1930 if (sctp_wspace(asoc) < msg_len) 1912 if (sctp_wspace(asoc) < (int)msg_len)
1931 sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); 1913 sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));
1932 1914
1933 if (!sctp_wspace(asoc)) { 1915 if (sctp_wspace(asoc) <= 0) {
1934 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 1916 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1935 err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); 1917 err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
1936 if (err) 1918 if (err)
@@ -8461,17 +8443,11 @@ static void sctp_wfree(struct sk_buff *skb)
8461 struct sctp_association *asoc = chunk->asoc; 8443 struct sctp_association *asoc = chunk->asoc;
8462 struct sock *sk = asoc->base.sk; 8444 struct sock *sk = asoc->base.sk;
8463 8445
8464 asoc->sndbuf_used -= SCTP_DATA_SNDSIZE(chunk) +
8465 sizeof(struct sk_buff) +
8466 sizeof(struct sctp_chunk);
8467
8468 WARN_ON(refcount_sub_and_test(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc));
8469
8470 /*
8471 * This undoes what is done via sctp_set_owner_w and sk_mem_charge
8472 */
8473 sk->sk_wmem_queued -= skb->truesize;
8474 sk_mem_uncharge(sk, skb->truesize); 8446 sk_mem_uncharge(sk, skb->truesize);
8447 sk->sk_wmem_queued -= skb->truesize + sizeof(struct sctp_chunk);
8448 asoc->sndbuf_used -= skb->truesize + sizeof(struct sctp_chunk);
8449 WARN_ON(refcount_sub_and_test(sizeof(struct sctp_chunk),
8450 &sk->sk_wmem_alloc));
8475 8451
8476 if (chunk->shkey) { 8452 if (chunk->shkey) {
8477 struct sctp_shared_key *shkey = chunk->shkey; 8453 struct sctp_shared_key *shkey = chunk->shkey;
@@ -8545,7 +8521,7 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
8545 goto do_error; 8521 goto do_error;
8546 if (signal_pending(current)) 8522 if (signal_pending(current))
8547 goto do_interrupted; 8523 goto do_interrupted;
8548 if (msg_len <= sctp_wspace(asoc)) 8524 if ((int)msg_len <= sctp_wspace(asoc))
8549 break; 8525 break;
8550 8526
8551 /* Let another process have a go. Since we are going 8527 /* Let another process have a go. Since we are going
@@ -8620,14 +8596,9 @@ void sctp_write_space(struct sock *sk)
8620 * UDP-style sockets or TCP-style sockets, this code should work. 8596 * UDP-style sockets or TCP-style sockets, this code should work.
8621 * - Daisy 8597 * - Daisy
8622 */ 8598 */
8623static int sctp_writeable(struct sock *sk) 8599static bool sctp_writeable(struct sock *sk)
8624{ 8600{
8625 int amt = 0; 8601 return sk->sk_sndbuf > sk->sk_wmem_queued;
8626
8627 amt = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
8628 if (amt < 0)
8629 amt = 0;
8630 return amt;
8631} 8602}
8632 8603
8633/* Wait for an association to go into ESTABLISHED state. If timeout is 0, 8604/* Wait for an association to go into ESTABLISHED state. If timeout is 0,
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 0b427100b0d4..331cc734e3db 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -459,7 +459,7 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_ulpq *ul
459 * element in the queue, then count it towards 459 * element in the queue, then count it towards
460 * possible PD. 460 * possible PD.
461 */ 461 */
462 if (pos == ulpq->reasm.next) { 462 if (skb_queue_is_first(&ulpq->reasm, pos)) {
463 pd_first = pos; 463 pd_first = pos;
464 pd_last = pos; 464 pd_last = pos;
465 pd_len = pos->len; 465 pd_len = pos->len;
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 015231789ed2..80e2119f1c70 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1543,7 +1543,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
1543 mask |= EPOLLERR; 1543 mask |= EPOLLERR;
1544 } else { 1544 } else {
1545 if (sk->sk_state != SMC_CLOSED) 1545 if (sk->sk_state != SMC_CLOSED)
1546 sock_poll_wait(file, wait); 1546 sock_poll_wait(file, sock, wait);
1547 if (sk->sk_err) 1547 if (sk->sk_err)
1548 mask |= EPOLLERR; 1548 mask |= EPOLLERR;
1549 if ((sk->sk_shutdown == SHUTDOWN_MASK) || 1549 if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index e871368500e3..18daebcef181 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -122,22 +122,17 @@ static void __smc_lgr_unregister_conn(struct smc_connection *conn)
122 sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */ 122 sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
123} 123}
124 124
125/* Unregister connection and trigger lgr freeing if applicable 125/* Unregister connection from lgr
126 */ 126 */
127static void smc_lgr_unregister_conn(struct smc_connection *conn) 127static void smc_lgr_unregister_conn(struct smc_connection *conn)
128{ 128{
129 struct smc_link_group *lgr = conn->lgr; 129 struct smc_link_group *lgr = conn->lgr;
130 int reduced = 0;
131 130
132 write_lock_bh(&lgr->conns_lock); 131 write_lock_bh(&lgr->conns_lock);
133 if (conn->alert_token_local) { 132 if (conn->alert_token_local) {
134 reduced = 1;
135 __smc_lgr_unregister_conn(conn); 133 __smc_lgr_unregister_conn(conn);
136 } 134 }
137 write_unlock_bh(&lgr->conns_lock); 135 write_unlock_bh(&lgr->conns_lock);
138 if (!reduced || lgr->conns_num)
139 return;
140 smc_lgr_schedule_free_work(lgr);
141} 136}
142 137
143/* Send delete link, either as client to request the initiation 138/* Send delete link, either as client to request the initiation
@@ -291,7 +286,8 @@ out:
291 return rc; 286 return rc;
292} 287}
293 288
294static void smc_buf_unuse(struct smc_connection *conn) 289static void smc_buf_unuse(struct smc_connection *conn,
290 struct smc_link_group *lgr)
295{ 291{
296 if (conn->sndbuf_desc) 292 if (conn->sndbuf_desc)
297 conn->sndbuf_desc->used = 0; 293 conn->sndbuf_desc->used = 0;
@@ -301,8 +297,6 @@ static void smc_buf_unuse(struct smc_connection *conn)
301 conn->rmb_desc->used = 0; 297 conn->rmb_desc->used = 0;
302 } else { 298 } else {
303 /* buf registration failed, reuse not possible */ 299 /* buf registration failed, reuse not possible */
304 struct smc_link_group *lgr = conn->lgr;
305
306 write_lock_bh(&lgr->rmbs_lock); 300 write_lock_bh(&lgr->rmbs_lock);
307 list_del(&conn->rmb_desc->list); 301 list_del(&conn->rmb_desc->list);
308 write_unlock_bh(&lgr->rmbs_lock); 302 write_unlock_bh(&lgr->rmbs_lock);
@@ -315,16 +309,21 @@ static void smc_buf_unuse(struct smc_connection *conn)
315/* remove a finished connection from its link group */ 309/* remove a finished connection from its link group */
316void smc_conn_free(struct smc_connection *conn) 310void smc_conn_free(struct smc_connection *conn)
317{ 311{
318 if (!conn->lgr) 312 struct smc_link_group *lgr = conn->lgr;
313
314 if (!lgr)
319 return; 315 return;
320 if (conn->lgr->is_smcd) { 316 if (lgr->is_smcd) {
321 smc_ism_unset_conn(conn); 317 smc_ism_unset_conn(conn);
322 tasklet_kill(&conn->rx_tsklet); 318 tasklet_kill(&conn->rx_tsklet);
323 } else { 319 } else {
324 smc_cdc_tx_dismiss_slots(conn); 320 smc_cdc_tx_dismiss_slots(conn);
325 } 321 }
326 smc_lgr_unregister_conn(conn); 322 smc_lgr_unregister_conn(conn); /* unsets conn->lgr */
327 smc_buf_unuse(conn); 323 smc_buf_unuse(conn, lgr); /* allow buffer reuse */
324
325 if (!lgr->conns_num)
326 smc_lgr_schedule_free_work(lgr);
328} 327}
329 328
330static void smc_link_clear(struct smc_link *lnk) 329static void smc_link_clear(struct smc_link *lnk)
diff --git a/net/socket.c b/net/socket.c
index 390a8ecef4bf..99c96851469f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1475,7 +1475,7 @@ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
1475 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1475 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1476 if (sock) { 1476 if (sock) {
1477 err = move_addr_to_kernel(umyaddr, addrlen, &address); 1477 err = move_addr_to_kernel(umyaddr, addrlen, &address);
1478 if (err >= 0) { 1478 if (!err) {
1479 err = security_socket_bind(sock, 1479 err = security_socket_bind(sock,
1480 (struct sockaddr *)&address, 1480 (struct sockaddr *)&address,
1481 addrlen); 1481 addrlen);
@@ -2342,7 +2342,7 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg,
2342 */ 2342 */
2343 2343
2344int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, 2344int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
2345 unsigned int flags, struct timespec *timeout) 2345 unsigned int flags, struct timespec64 *timeout)
2346{ 2346{
2347 int fput_needed, err, datagrams; 2347 int fput_needed, err, datagrams;
2348 struct socket *sock; 2348 struct socket *sock;
@@ -2407,8 +2407,7 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
2407 2407
2408 if (timeout) { 2408 if (timeout) {
2409 ktime_get_ts64(&timeout64); 2409 ktime_get_ts64(&timeout64);
2410 *timeout = timespec64_to_timespec( 2410 *timeout = timespec64_sub(end_time, timeout64);
2411 timespec64_sub(end_time, timeout64));
2412 if (timeout->tv_sec < 0) { 2411 if (timeout->tv_sec < 0) {
2413 timeout->tv_sec = timeout->tv_nsec = 0; 2412 timeout->tv_sec = timeout->tv_nsec = 0;
2414 break; 2413 break;
@@ -2454,10 +2453,10 @@ out_put:
2454 2453
2455static int do_sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, 2454static int do_sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
2456 unsigned int vlen, unsigned int flags, 2455 unsigned int vlen, unsigned int flags,
2457 struct timespec __user *timeout) 2456 struct __kernel_timespec __user *timeout)
2458{ 2457{
2459 int datagrams; 2458 int datagrams;
2460 struct timespec timeout_sys; 2459 struct timespec64 timeout_sys;
2461 2460
2462 if (flags & MSG_CMSG_COMPAT) 2461 if (flags & MSG_CMSG_COMPAT)
2463 return -EINVAL; 2462 return -EINVAL;
@@ -2465,13 +2464,12 @@ static int do_sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
2465 if (!timeout) 2464 if (!timeout)
2466 return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL); 2465 return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL);
2467 2466
2468 if (copy_from_user(&timeout_sys, timeout, sizeof(timeout_sys))) 2467 if (get_timespec64(&timeout_sys, timeout))
2469 return -EFAULT; 2468 return -EFAULT;
2470 2469
2471 datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys); 2470 datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys);
2472 2471
2473 if (datagrams > 0 && 2472 if (datagrams > 0 && put_timespec64(&timeout_sys, timeout))
2474 copy_to_user(timeout, &timeout_sys, sizeof(timeout_sys)))
2475 datagrams = -EFAULT; 2473 datagrams = -EFAULT;
2476 2474
2477 return datagrams; 2475 return datagrams;
@@ -2479,7 +2477,7 @@ static int do_sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
2479 2477
2480SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, 2478SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
2481 unsigned int, vlen, unsigned int, flags, 2479 unsigned int, vlen, unsigned int, flags,
2482 struct timespec __user *, timeout) 2480 struct __kernel_timespec __user *, timeout)
2483{ 2481{
2484 return do_sys_recvmmsg(fd, mmsg, vlen, flags, timeout); 2482 return do_sys_recvmmsg(fd, mmsg, vlen, flags, timeout);
2485} 2483}
@@ -2603,7 +2601,7 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
2603 break; 2601 break;
2604 case SYS_RECVMMSG: 2602 case SYS_RECVMMSG:
2605 err = do_sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], 2603 err = do_sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2],
2606 a[3], (struct timespec __user *)a[4]); 2604 a[3], (struct __kernel_timespec __user *)a[4]);
2607 break; 2605 break;
2608 case SYS_ACCEPT4: 2606 case SYS_ACCEPT4:
2609 err = __sys_accept4(a0, (struct sockaddr __user *)a1, 2607 err = __sys_accept4(a0, (struct sockaddr __user *)a1,
diff --git a/net/strparser/Kconfig b/net/strparser/Kconfig
index 6cff3f6d0c3a..94da19a2a220 100644
--- a/net/strparser/Kconfig
+++ b/net/strparser/Kconfig
@@ -1,4 +1,2 @@
1
2config STREAM_PARSER 1config STREAM_PARSER
3 tristate 2 def_bool n
4 default n
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 305ecea92170..ad8ead738981 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -30,10 +30,9 @@ struct rpc_cred_cache {
30 30
31static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS; 31static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS;
32 32
33static DEFINE_SPINLOCK(rpc_authflavor_lock); 33static const struct rpc_authops __rcu *auth_flavors[RPC_AUTH_MAXFLAVOR] = {
34static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = { 34 [RPC_AUTH_NULL] = (const struct rpc_authops __force __rcu *)&authnull_ops,
35 &authnull_ops, /* AUTH_NULL */ 35 [RPC_AUTH_UNIX] = (const struct rpc_authops __force __rcu *)&authunix_ops,
36 &authunix_ops, /* AUTH_UNIX */
37 NULL, /* others can be loadable modules */ 36 NULL, /* others can be loadable modules */
38}; 37};
39 38
@@ -93,39 +92,65 @@ pseudoflavor_to_flavor(u32 flavor) {
93int 92int
94rpcauth_register(const struct rpc_authops *ops) 93rpcauth_register(const struct rpc_authops *ops)
95{ 94{
95 const struct rpc_authops *old;
96 rpc_authflavor_t flavor; 96 rpc_authflavor_t flavor;
97 int ret = -EPERM;
98 97
99 if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) 98 if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR)
100 return -EINVAL; 99 return -EINVAL;
101 spin_lock(&rpc_authflavor_lock); 100 old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], NULL, ops);
102 if (auth_flavors[flavor] == NULL) { 101 if (old == NULL || old == ops)
103 auth_flavors[flavor] = ops; 102 return 0;
104 ret = 0; 103 return -EPERM;
105 }
106 spin_unlock(&rpc_authflavor_lock);
107 return ret;
108} 104}
109EXPORT_SYMBOL_GPL(rpcauth_register); 105EXPORT_SYMBOL_GPL(rpcauth_register);
110 106
111int 107int
112rpcauth_unregister(const struct rpc_authops *ops) 108rpcauth_unregister(const struct rpc_authops *ops)
113{ 109{
110 const struct rpc_authops *old;
114 rpc_authflavor_t flavor; 111 rpc_authflavor_t flavor;
115 int ret = -EPERM;
116 112
117 if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) 113 if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR)
118 return -EINVAL; 114 return -EINVAL;
119 spin_lock(&rpc_authflavor_lock); 115
120 if (auth_flavors[flavor] == ops) { 116 old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], ops, NULL);
121 auth_flavors[flavor] = NULL; 117 if (old == ops || old == NULL)
122 ret = 0; 118 return 0;
123 } 119 return -EPERM;
124 spin_unlock(&rpc_authflavor_lock);
125 return ret;
126} 120}
127EXPORT_SYMBOL_GPL(rpcauth_unregister); 121EXPORT_SYMBOL_GPL(rpcauth_unregister);
128 122
123static const struct rpc_authops *
124rpcauth_get_authops(rpc_authflavor_t flavor)
125{
126 const struct rpc_authops *ops;
127
128 if (flavor >= RPC_AUTH_MAXFLAVOR)
129 return NULL;
130
131 rcu_read_lock();
132 ops = rcu_dereference(auth_flavors[flavor]);
133 if (ops == NULL) {
134 rcu_read_unlock();
135 request_module("rpc-auth-%u", flavor);
136 rcu_read_lock();
137 ops = rcu_dereference(auth_flavors[flavor]);
138 if (ops == NULL)
139 goto out;
140 }
141 if (!try_module_get(ops->owner))
142 ops = NULL;
143out:
144 rcu_read_unlock();
145 return ops;
146}
147
148static void
149rpcauth_put_authops(const struct rpc_authops *ops)
150{
151 module_put(ops->owner);
152}
153
129/** 154/**
130 * rpcauth_get_pseudoflavor - check if security flavor is supported 155 * rpcauth_get_pseudoflavor - check if security flavor is supported
131 * @flavor: a security flavor 156 * @flavor: a security flavor
@@ -138,25 +163,16 @@ EXPORT_SYMBOL_GPL(rpcauth_unregister);
138rpc_authflavor_t 163rpc_authflavor_t
139rpcauth_get_pseudoflavor(rpc_authflavor_t flavor, struct rpcsec_gss_info *info) 164rpcauth_get_pseudoflavor(rpc_authflavor_t flavor, struct rpcsec_gss_info *info)
140{ 165{
141 const struct rpc_authops *ops; 166 const struct rpc_authops *ops = rpcauth_get_authops(flavor);
142 rpc_authflavor_t pseudoflavor; 167 rpc_authflavor_t pseudoflavor;
143 168
144 ops = auth_flavors[flavor]; 169 if (!ops)
145 if (ops == NULL)
146 request_module("rpc-auth-%u", flavor);
147 spin_lock(&rpc_authflavor_lock);
148 ops = auth_flavors[flavor];
149 if (ops == NULL || !try_module_get(ops->owner)) {
150 spin_unlock(&rpc_authflavor_lock);
151 return RPC_AUTH_MAXFLAVOR; 170 return RPC_AUTH_MAXFLAVOR;
152 }
153 spin_unlock(&rpc_authflavor_lock);
154
155 pseudoflavor = flavor; 171 pseudoflavor = flavor;
156 if (ops->info2flavor != NULL) 172 if (ops->info2flavor != NULL)
157 pseudoflavor = ops->info2flavor(info); 173 pseudoflavor = ops->info2flavor(info);
158 174
159 module_put(ops->owner); 175 rpcauth_put_authops(ops);
160 return pseudoflavor; 176 return pseudoflavor;
161} 177}
162EXPORT_SYMBOL_GPL(rpcauth_get_pseudoflavor); 178EXPORT_SYMBOL_GPL(rpcauth_get_pseudoflavor);
@@ -176,25 +192,15 @@ rpcauth_get_gssinfo(rpc_authflavor_t pseudoflavor, struct rpcsec_gss_info *info)
176 const struct rpc_authops *ops; 192 const struct rpc_authops *ops;
177 int result; 193 int result;
178 194
179 if (flavor >= RPC_AUTH_MAXFLAVOR) 195 ops = rpcauth_get_authops(flavor);
180 return -EINVAL;
181
182 ops = auth_flavors[flavor];
183 if (ops == NULL) 196 if (ops == NULL)
184 request_module("rpc-auth-%u", flavor);
185 spin_lock(&rpc_authflavor_lock);
186 ops = auth_flavors[flavor];
187 if (ops == NULL || !try_module_get(ops->owner)) {
188 spin_unlock(&rpc_authflavor_lock);
189 return -ENOENT; 197 return -ENOENT;
190 }
191 spin_unlock(&rpc_authflavor_lock);
192 198
193 result = -ENOENT; 199 result = -ENOENT;
194 if (ops->flavor2info != NULL) 200 if (ops->flavor2info != NULL)
195 result = ops->flavor2info(pseudoflavor, info); 201 result = ops->flavor2info(pseudoflavor, info);
196 202
197 module_put(ops->owner); 203 rpcauth_put_authops(ops);
198 return result; 204 return result;
199} 205}
200EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo); 206EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo);
@@ -212,15 +218,13 @@ EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo);
212int 218int
213rpcauth_list_flavors(rpc_authflavor_t *array, int size) 219rpcauth_list_flavors(rpc_authflavor_t *array, int size)
214{ 220{
215 rpc_authflavor_t flavor; 221 const struct rpc_authops *ops;
216 int result = 0; 222 rpc_authflavor_t flavor, pseudos[4];
223 int i, len, result = 0;
217 224
218 spin_lock(&rpc_authflavor_lock); 225 rcu_read_lock();
219 for (flavor = 0; flavor < RPC_AUTH_MAXFLAVOR; flavor++) { 226 for (flavor = 0; flavor < RPC_AUTH_MAXFLAVOR; flavor++) {
220 const struct rpc_authops *ops = auth_flavors[flavor]; 227 ops = rcu_dereference(auth_flavors[flavor]);
221 rpc_authflavor_t pseudos[4];
222 int i, len;
223
224 if (result >= size) { 228 if (result >= size) {
225 result = -ENOMEM; 229 result = -ENOMEM;
226 break; 230 break;
@@ -245,7 +249,7 @@ rpcauth_list_flavors(rpc_authflavor_t *array, int size)
245 array[result++] = pseudos[i]; 249 array[result++] = pseudos[i];
246 } 250 }
247 } 251 }
248 spin_unlock(&rpc_authflavor_lock); 252 rcu_read_unlock();
249 253
250 dprintk("RPC: %s returns %d\n", __func__, result); 254 dprintk("RPC: %s returns %d\n", __func__, result);
251 return result; 255 return result;
@@ -255,25 +259,17 @@ EXPORT_SYMBOL_GPL(rpcauth_list_flavors);
255struct rpc_auth * 259struct rpc_auth *
256rpcauth_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) 260rpcauth_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
257{ 261{
258 struct rpc_auth *auth; 262 struct rpc_auth *auth = ERR_PTR(-EINVAL);
259 const struct rpc_authops *ops; 263 const struct rpc_authops *ops;
260 u32 flavor = pseudoflavor_to_flavor(args->pseudoflavor); 264 u32 flavor = pseudoflavor_to_flavor(args->pseudoflavor);
261 265
262 auth = ERR_PTR(-EINVAL); 266 ops = rpcauth_get_authops(flavor);
263 if (flavor >= RPC_AUTH_MAXFLAVOR) 267 if (ops == NULL)
264 goto out; 268 goto out;
265 269
266 if ((ops = auth_flavors[flavor]) == NULL)
267 request_module("rpc-auth-%u", flavor);
268 spin_lock(&rpc_authflavor_lock);
269 ops = auth_flavors[flavor];
270 if (ops == NULL || !try_module_get(ops->owner)) {
271 spin_unlock(&rpc_authflavor_lock);
272 goto out;
273 }
274 spin_unlock(&rpc_authflavor_lock);
275 auth = ops->create(args, clnt); 270 auth = ops->create(args, clnt);
276 module_put(ops->owner); 271
272 rpcauth_put_authops(ops);
277 if (IS_ERR(auth)) 273 if (IS_ERR(auth))
278 return auth; 274 return auth;
279 if (clnt->cl_auth) 275 if (clnt->cl_auth)
@@ -288,32 +284,37 @@ EXPORT_SYMBOL_GPL(rpcauth_create);
288void 284void
289rpcauth_release(struct rpc_auth *auth) 285rpcauth_release(struct rpc_auth *auth)
290{ 286{
291 if (!atomic_dec_and_test(&auth->au_count)) 287 if (!refcount_dec_and_test(&auth->au_count))
292 return; 288 return;
293 auth->au_ops->destroy(auth); 289 auth->au_ops->destroy(auth);
294} 290}
295 291
296static DEFINE_SPINLOCK(rpc_credcache_lock); 292static DEFINE_SPINLOCK(rpc_credcache_lock);
297 293
298static void 294/*
295 * On success, the caller is responsible for freeing the reference
296 * held by the hashtable
297 */
298static bool
299rpcauth_unhash_cred_locked(struct rpc_cred *cred) 299rpcauth_unhash_cred_locked(struct rpc_cred *cred)
300{ 300{
301 if (!test_and_clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags))
302 return false;
301 hlist_del_rcu(&cred->cr_hash); 303 hlist_del_rcu(&cred->cr_hash);
302 smp_mb__before_atomic(); 304 return true;
303 clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags);
304} 305}
305 306
306static int 307static bool
307rpcauth_unhash_cred(struct rpc_cred *cred) 308rpcauth_unhash_cred(struct rpc_cred *cred)
308{ 309{
309 spinlock_t *cache_lock; 310 spinlock_t *cache_lock;
310 int ret; 311 bool ret;
311 312
313 if (!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags))
314 return false;
312 cache_lock = &cred->cr_auth->au_credcache->lock; 315 cache_lock = &cred->cr_auth->au_credcache->lock;
313 spin_lock(cache_lock); 316 spin_lock(cache_lock);
314 ret = atomic_read(&cred->cr_count) == 0; 317 ret = rpcauth_unhash_cred_locked(cred);
315 if (ret)
316 rpcauth_unhash_cred_locked(cred);
317 spin_unlock(cache_lock); 318 spin_unlock(cache_lock);
318 return ret; 319 return ret;
319} 320}
@@ -392,6 +393,44 @@ void rpcauth_destroy_credlist(struct list_head *head)
392 } 393 }
393} 394}
394 395
396static void
397rpcauth_lru_add_locked(struct rpc_cred *cred)
398{
399 if (!list_empty(&cred->cr_lru))
400 return;
401 number_cred_unused++;
402 list_add_tail(&cred->cr_lru, &cred_unused);
403}
404
405static void
406rpcauth_lru_add(struct rpc_cred *cred)
407{
408 if (!list_empty(&cred->cr_lru))
409 return;
410 spin_lock(&rpc_credcache_lock);
411 rpcauth_lru_add_locked(cred);
412 spin_unlock(&rpc_credcache_lock);
413}
414
415static void
416rpcauth_lru_remove_locked(struct rpc_cred *cred)
417{
418 if (list_empty(&cred->cr_lru))
419 return;
420 number_cred_unused--;
421 list_del_init(&cred->cr_lru);
422}
423
424static void
425rpcauth_lru_remove(struct rpc_cred *cred)
426{
427 if (list_empty(&cred->cr_lru))
428 return;
429 spin_lock(&rpc_credcache_lock);
430 rpcauth_lru_remove_locked(cred);
431 spin_unlock(&rpc_credcache_lock);
432}
433
395/* 434/*
396 * Clear the RPC credential cache, and delete those credentials 435 * Clear the RPC credential cache, and delete those credentials
397 * that are not referenced. 436 * that are not referenced.
@@ -411,13 +450,10 @@ rpcauth_clear_credcache(struct rpc_cred_cache *cache)
411 head = &cache->hashtable[i]; 450 head = &cache->hashtable[i];
412 while (!hlist_empty(head)) { 451 while (!hlist_empty(head)) {
413 cred = hlist_entry(head->first, struct rpc_cred, cr_hash); 452 cred = hlist_entry(head->first, struct rpc_cred, cr_hash);
414 get_rpccred(cred);
415 if (!list_empty(&cred->cr_lru)) {
416 list_del(&cred->cr_lru);
417 number_cred_unused--;
418 }
419 list_add_tail(&cred->cr_lru, &free);
420 rpcauth_unhash_cred_locked(cred); 453 rpcauth_unhash_cred_locked(cred);
454 /* Note: We now hold a reference to cred */
455 rpcauth_lru_remove_locked(cred);
456 list_add_tail(&cred->cr_lru, &free);
421 } 457 }
422 } 458 }
423 spin_unlock(&cache->lock); 459 spin_unlock(&cache->lock);
@@ -451,7 +487,6 @@ EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache);
451static long 487static long
452rpcauth_prune_expired(struct list_head *free, int nr_to_scan) 488rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
453{ 489{
454 spinlock_t *cache_lock;
455 struct rpc_cred *cred, *next; 490 struct rpc_cred *cred, *next;
456 unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM; 491 unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM;
457 long freed = 0; 492 long freed = 0;
@@ -460,32 +495,24 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
460 495
461 if (nr_to_scan-- == 0) 496 if (nr_to_scan-- == 0)
462 break; 497 break;
498 if (refcount_read(&cred->cr_count) > 1) {
499 rpcauth_lru_remove_locked(cred);
500 continue;
501 }
463 /* 502 /*
464 * Enforce a 60 second garbage collection moratorium 503 * Enforce a 60 second garbage collection moratorium
465 * Note that the cred_unused list must be time-ordered. 504 * Note that the cred_unused list must be time-ordered.
466 */ 505 */
467 if (time_in_range(cred->cr_expire, expired, jiffies) && 506 if (!time_in_range(cred->cr_expire, expired, jiffies))
468 test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) { 507 continue;
469 freed = SHRINK_STOP; 508 if (!rpcauth_unhash_cred(cred))
470 break;
471 }
472
473 list_del_init(&cred->cr_lru);
474 number_cred_unused--;
475 freed++;
476 if (atomic_read(&cred->cr_count) != 0)
477 continue; 509 continue;
478 510
479 cache_lock = &cred->cr_auth->au_credcache->lock; 511 rpcauth_lru_remove_locked(cred);
480 spin_lock(cache_lock); 512 freed++;
481 if (atomic_read(&cred->cr_count) == 0) { 513 list_add_tail(&cred->cr_lru, free);
482 get_rpccred(cred);
483 list_add_tail(&cred->cr_lru, free);
484 rpcauth_unhash_cred_locked(cred);
485 }
486 spin_unlock(cache_lock);
487 } 514 }
488 return freed; 515 return freed ? freed : SHRINK_STOP;
489} 516}
490 517
491static unsigned long 518static unsigned long
@@ -561,19 +588,15 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
561 if (!entry->cr_ops->crmatch(acred, entry, flags)) 588 if (!entry->cr_ops->crmatch(acred, entry, flags))
562 continue; 589 continue;
563 if (flags & RPCAUTH_LOOKUP_RCU) { 590 if (flags & RPCAUTH_LOOKUP_RCU) {
564 if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) && 591 if (test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags) ||
565 !test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags)) 592 refcount_read(&entry->cr_count) == 0)
566 cred = entry; 593 continue;
594 cred = entry;
567 break; 595 break;
568 } 596 }
569 spin_lock(&cache->lock);
570 if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) {
571 spin_unlock(&cache->lock);
572 continue;
573 }
574 cred = get_rpccred(entry); 597 cred = get_rpccred(entry);
575 spin_unlock(&cache->lock); 598 if (cred)
576 break; 599 break;
577 } 600 }
578 rcu_read_unlock(); 601 rcu_read_unlock();
579 602
@@ -594,11 +617,13 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
594 if (!entry->cr_ops->crmatch(acred, entry, flags)) 617 if (!entry->cr_ops->crmatch(acred, entry, flags))
595 continue; 618 continue;
596 cred = get_rpccred(entry); 619 cred = get_rpccred(entry);
597 break; 620 if (cred)
621 break;
598 } 622 }
599 if (cred == NULL) { 623 if (cred == NULL) {
600 cred = new; 624 cred = new;
601 set_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags); 625 set_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags);
626 refcount_inc(&cred->cr_count);
602 hlist_add_head_rcu(&cred->cr_hash, &cache->hashtable[nr]); 627 hlist_add_head_rcu(&cred->cr_hash, &cache->hashtable[nr]);
603 } else 628 } else
604 list_add_tail(&new->cr_lru, &free); 629 list_add_tail(&new->cr_lru, &free);
@@ -645,7 +670,7 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
645{ 670{
646 INIT_HLIST_NODE(&cred->cr_hash); 671 INIT_HLIST_NODE(&cred->cr_hash);
647 INIT_LIST_HEAD(&cred->cr_lru); 672 INIT_LIST_HEAD(&cred->cr_lru);
648 atomic_set(&cred->cr_count, 1); 673 refcount_set(&cred->cr_count, 1);
649 cred->cr_auth = auth; 674 cred->cr_auth = auth;
650 cred->cr_ops = ops; 675 cred->cr_ops = ops;
651 cred->cr_expire = jiffies; 676 cred->cr_expire = jiffies;
@@ -713,36 +738,29 @@ put_rpccred(struct rpc_cred *cred)
713{ 738{
714 if (cred == NULL) 739 if (cred == NULL)
715 return; 740 return;
716 /* Fast path for unhashed credentials */ 741 rcu_read_lock();
717 if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) { 742 if (refcount_dec_and_test(&cred->cr_count))
718 if (atomic_dec_and_test(&cred->cr_count)) 743 goto destroy;
719 cred->cr_ops->crdestroy(cred); 744 if (refcount_read(&cred->cr_count) != 1 ||
720 return; 745 !test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags))
721 } 746 goto out;
722 747 if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) {
723 if (!atomic_dec_and_lock(&cred->cr_count, &rpc_credcache_lock)) 748 cred->cr_expire = jiffies;
724 return; 749 rpcauth_lru_add(cred);
725 if (!list_empty(&cred->cr_lru)) { 750 /* Race breaker */
726 number_cred_unused--; 751 if (unlikely(!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)))
727 list_del_init(&cred->cr_lru); 752 rpcauth_lru_remove(cred);
728 } 753 } else if (rpcauth_unhash_cred(cred)) {
729 if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) { 754 rpcauth_lru_remove(cred);
730 if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) { 755 if (refcount_dec_and_test(&cred->cr_count))
731 cred->cr_expire = jiffies; 756 goto destroy;
732 list_add_tail(&cred->cr_lru, &cred_unused);
733 number_cred_unused++;
734 goto out_nodestroy;
735 }
736 if (!rpcauth_unhash_cred(cred)) {
737 /* We were hashed and someone looked us up... */
738 goto out_nodestroy;
739 }
740 } 757 }
741 spin_unlock(&rpc_credcache_lock); 758out:
742 cred->cr_ops->crdestroy(cred); 759 rcu_read_unlock();
743 return; 760 return;
744out_nodestroy: 761destroy:
745 spin_unlock(&rpc_credcache_lock); 762 rcu_read_unlock();
763 cred->cr_ops->crdestroy(cred);
746} 764}
747EXPORT_SYMBOL_GPL(put_rpccred); 765EXPORT_SYMBOL_GPL(put_rpccred);
748 766
@@ -817,6 +835,16 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp,
817 return rpcauth_unwrap_req_decode(decode, rqstp, data, obj); 835 return rpcauth_unwrap_req_decode(decode, rqstp, data, obj);
818} 836}
819 837
838bool
839rpcauth_xmit_need_reencode(struct rpc_task *task)
840{
841 struct rpc_cred *cred = task->tk_rqstp->rq_cred;
842
843 if (!cred || !cred->cr_ops->crneed_reencode)
844 return false;
845 return cred->cr_ops->crneed_reencode(task);
846}
847
820int 848int
821rpcauth_refreshcred(struct rpc_task *task) 849rpcauth_refreshcred(struct rpc_task *task)
822{ 850{
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index f1df9837f1ac..d8831b988b1e 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -274,7 +274,7 @@ static const struct rpc_authops generic_auth_ops = {
274 274
275static struct rpc_auth generic_auth = { 275static struct rpc_auth generic_auth = {
276 .au_ops = &generic_auth_ops, 276 .au_ops = &generic_auth_ops,
277 .au_count = ATOMIC_INIT(0), 277 .au_count = REFCOUNT_INIT(1),
278}; 278};
279 279
280static bool generic_key_to_expire(struct rpc_cred *cred) 280static bool generic_key_to_expire(struct rpc_cred *cred)
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 21c0aa0a0d1d..30f970cdc7f6 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1058,7 +1058,7 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
1058 auth->au_flavor = flavor; 1058 auth->au_flavor = flavor;
1059 if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor)) 1059 if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor))
1060 auth->au_flags |= RPCAUTH_AUTH_DATATOUCH; 1060 auth->au_flags |= RPCAUTH_AUTH_DATATOUCH;
1061 atomic_set(&auth->au_count, 1); 1061 refcount_set(&auth->au_count, 1);
1062 kref_init(&gss_auth->kref); 1062 kref_init(&gss_auth->kref);
1063 1063
1064 err = rpcauth_init_credcache(auth); 1064 err = rpcauth_init_credcache(auth);
@@ -1187,7 +1187,7 @@ gss_auth_find_or_add_hashed(const struct rpc_auth_create_args *args,
1187 if (strcmp(gss_auth->target_name, args->target_name)) 1187 if (strcmp(gss_auth->target_name, args->target_name))
1188 continue; 1188 continue;
1189 } 1189 }
1190 if (!atomic_inc_not_zero(&gss_auth->rpc_auth.au_count)) 1190 if (!refcount_inc_not_zero(&gss_auth->rpc_auth.au_count))
1191 continue; 1191 continue;
1192 goto out; 1192 goto out;
1193 } 1193 }
@@ -1984,6 +1984,46 @@ gss_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp,
1984 return decode(rqstp, &xdr, obj); 1984 return decode(rqstp, &xdr, obj);
1985} 1985}
1986 1986
1987static bool
1988gss_seq_is_newer(u32 new, u32 old)
1989{
1990 return (s32)(new - old) > 0;
1991}
1992
1993static bool
1994gss_xmit_need_reencode(struct rpc_task *task)
1995{
1996 struct rpc_rqst *req = task->tk_rqstp;
1997 struct rpc_cred *cred = req->rq_cred;
1998 struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
1999 u32 win, seq_xmit;
2000 bool ret = true;
2001
2002 if (!ctx)
2003 return true;
2004
2005 if (gss_seq_is_newer(req->rq_seqno, READ_ONCE(ctx->gc_seq)))
2006 goto out;
2007
2008 seq_xmit = READ_ONCE(ctx->gc_seq_xmit);
2009 while (gss_seq_is_newer(req->rq_seqno, seq_xmit)) {
2010 u32 tmp = seq_xmit;
2011
2012 seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, req->rq_seqno);
2013 if (seq_xmit == tmp) {
2014 ret = false;
2015 goto out;
2016 }
2017 }
2018
2019 win = ctx->gc_win;
2020 if (win > 0)
2021 ret = !gss_seq_is_newer(req->rq_seqno, seq_xmit - win);
2022out:
2023 gss_put_ctx(ctx);
2024 return ret;
2025}
2026
1987static int 2027static int
1988gss_unwrap_resp(struct rpc_task *task, 2028gss_unwrap_resp(struct rpc_task *task,
1989 kxdrdproc_t decode, void *rqstp, __be32 *p, void *obj) 2029 kxdrdproc_t decode, void *rqstp, __be32 *p, void *obj)
@@ -2052,6 +2092,7 @@ static const struct rpc_credops gss_credops = {
2052 .crunwrap_resp = gss_unwrap_resp, 2092 .crunwrap_resp = gss_unwrap_resp,
2053 .crkey_timeout = gss_key_timeout, 2093 .crkey_timeout = gss_key_timeout,
2054 .crstringify_acceptor = gss_stringify_acceptor, 2094 .crstringify_acceptor = gss_stringify_acceptor,
2095 .crneed_reencode = gss_xmit_need_reencode,
2055}; 2096};
2056 2097
2057static const struct rpc_credops gss_nullops = { 2098static const struct rpc_credops gss_nullops = {
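The sequence-window test added in gss_xmit_need_reencode() above hinges on gss_seq_is_newer(), which compares 32-bit sequence numbers through a signed difference so the ordering survives wraparound. A small self-contained check of that comparison (userspace, illustrative only):

	#include <assert.h>
	#include <stdbool.h>
	#include <stdint.h>

	/* Same idiom as gss_seq_is_newer() above: a positive signed difference
	 * means 'a' was issued after 'b', even across the 2^32 wrap. */
	static bool seq_is_newer(uint32_t a, uint32_t b)
	{
		return (int32_t)(a - b) > 0;
	}

	int main(void)
	{
		assert(seq_is_newer(5, 4));
		assert(!seq_is_newer(4, 5));
		assert(seq_is_newer(3, UINT32_MAX - 2));	/* 3 comes 6 steps after 0xfffffffd */
		return 0;
	}
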
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 0220e1ca5280..4f43383971ba 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -53,7 +53,7 @@
53 53
54u32 54u32
55krb5_encrypt( 55krb5_encrypt(
56 struct crypto_skcipher *tfm, 56 struct crypto_sync_skcipher *tfm,
57 void * iv, 57 void * iv,
58 void * in, 58 void * in,
59 void * out, 59 void * out,
@@ -62,24 +62,24 @@ krb5_encrypt(
62 u32 ret = -EINVAL; 62 u32 ret = -EINVAL;
63 struct scatterlist sg[1]; 63 struct scatterlist sg[1];
64 u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0}; 64 u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0};
65 SKCIPHER_REQUEST_ON_STACK(req, tfm); 65 SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
66 66
67 if (length % crypto_skcipher_blocksize(tfm) != 0) 67 if (length % crypto_sync_skcipher_blocksize(tfm) != 0)
68 goto out; 68 goto out;
69 69
70 if (crypto_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { 70 if (crypto_sync_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) {
71 dprintk("RPC: gss_k5encrypt: tfm iv size too large %d\n", 71 dprintk("RPC: gss_k5encrypt: tfm iv size too large %d\n",
72 crypto_skcipher_ivsize(tfm)); 72 crypto_sync_skcipher_ivsize(tfm));
73 goto out; 73 goto out;
74 } 74 }
75 75
76 if (iv) 76 if (iv)
77 memcpy(local_iv, iv, crypto_skcipher_ivsize(tfm)); 77 memcpy(local_iv, iv, crypto_sync_skcipher_ivsize(tfm));
78 78
79 memcpy(out, in, length); 79 memcpy(out, in, length);
80 sg_init_one(sg, out, length); 80 sg_init_one(sg, out, length);
81 81
82 skcipher_request_set_tfm(req, tfm); 82 skcipher_request_set_sync_tfm(req, tfm);
83 skcipher_request_set_callback(req, 0, NULL, NULL); 83 skcipher_request_set_callback(req, 0, NULL, NULL);
84 skcipher_request_set_crypt(req, sg, sg, length, local_iv); 84 skcipher_request_set_crypt(req, sg, sg, length, local_iv);
85 85
@@ -92,7 +92,7 @@ out:
92 92
93u32 93u32
94krb5_decrypt( 94krb5_decrypt(
95 struct crypto_skcipher *tfm, 95 struct crypto_sync_skcipher *tfm,
96 void * iv, 96 void * iv,
97 void * in, 97 void * in,
98 void * out, 98 void * out,
@@ -101,23 +101,23 @@ krb5_decrypt(
101 u32 ret = -EINVAL; 101 u32 ret = -EINVAL;
102 struct scatterlist sg[1]; 102 struct scatterlist sg[1];
103 u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0}; 103 u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0};
104 SKCIPHER_REQUEST_ON_STACK(req, tfm); 104 SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
105 105
106 if (length % crypto_skcipher_blocksize(tfm) != 0) 106 if (length % crypto_sync_skcipher_blocksize(tfm) != 0)
107 goto out; 107 goto out;
108 108
109 if (crypto_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { 109 if (crypto_sync_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) {
110 dprintk("RPC: gss_k5decrypt: tfm iv size too large %d\n", 110 dprintk("RPC: gss_k5decrypt: tfm iv size too large %d\n",
111 crypto_skcipher_ivsize(tfm)); 111 crypto_sync_skcipher_ivsize(tfm));
112 goto out; 112 goto out;
113 } 113 }
114 if (iv) 114 if (iv)
115 memcpy(local_iv,iv, crypto_skcipher_ivsize(tfm)); 115 memcpy(local_iv, iv, crypto_sync_skcipher_ivsize(tfm));
116 116
117 memcpy(out, in, length); 117 memcpy(out, in, length);
118 sg_init_one(sg, out, length); 118 sg_init_one(sg, out, length);
119 119
120 skcipher_request_set_tfm(req, tfm); 120 skcipher_request_set_sync_tfm(req, tfm);
121 skcipher_request_set_callback(req, 0, NULL, NULL); 121 skcipher_request_set_callback(req, 0, NULL, NULL);
122 skcipher_request_set_crypt(req, sg, sg, length, local_iv); 122 skcipher_request_set_crypt(req, sg, sg, length, local_iv);
123 123
@@ -466,7 +466,8 @@ encryptor(struct scatterlist *sg, void *data)
466{ 466{
467 struct encryptor_desc *desc = data; 467 struct encryptor_desc *desc = data;
468 struct xdr_buf *outbuf = desc->outbuf; 468 struct xdr_buf *outbuf = desc->outbuf;
469 struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(desc->req); 469 struct crypto_sync_skcipher *tfm =
470 crypto_sync_skcipher_reqtfm(desc->req);
470 struct page *in_page; 471 struct page *in_page;
471 int thislen = desc->fraglen + sg->length; 472 int thislen = desc->fraglen + sg->length;
472 int fraglen, ret; 473 int fraglen, ret;
@@ -492,7 +493,7 @@ encryptor(struct scatterlist *sg, void *data)
492 desc->fraglen += sg->length; 493 desc->fraglen += sg->length;
493 desc->pos += sg->length; 494 desc->pos += sg->length;
494 495
495 fraglen = thislen & (crypto_skcipher_blocksize(tfm) - 1); 496 fraglen = thislen & (crypto_sync_skcipher_blocksize(tfm) - 1);
496 thislen -= fraglen; 497 thislen -= fraglen;
497 498
498 if (thislen == 0) 499 if (thislen == 0)
@@ -526,16 +527,16 @@ encryptor(struct scatterlist *sg, void *data)
526} 527}
527 528
528int 529int
529gss_encrypt_xdr_buf(struct crypto_skcipher *tfm, struct xdr_buf *buf, 530gss_encrypt_xdr_buf(struct crypto_sync_skcipher *tfm, struct xdr_buf *buf,
530 int offset, struct page **pages) 531 int offset, struct page **pages)
531{ 532{
532 int ret; 533 int ret;
533 struct encryptor_desc desc; 534 struct encryptor_desc desc;
534 SKCIPHER_REQUEST_ON_STACK(req, tfm); 535 SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
535 536
536 BUG_ON((buf->len - offset) % crypto_skcipher_blocksize(tfm) != 0); 537 BUG_ON((buf->len - offset) % crypto_sync_skcipher_blocksize(tfm) != 0);
537 538
538 skcipher_request_set_tfm(req, tfm); 539 skcipher_request_set_sync_tfm(req, tfm);
539 skcipher_request_set_callback(req, 0, NULL, NULL); 540 skcipher_request_set_callback(req, 0, NULL, NULL);
540 541
541 memset(desc.iv, 0, sizeof(desc.iv)); 542 memset(desc.iv, 0, sizeof(desc.iv));
@@ -567,7 +568,8 @@ decryptor(struct scatterlist *sg, void *data)
567{ 568{
568 struct decryptor_desc *desc = data; 569 struct decryptor_desc *desc = data;
569 int thislen = desc->fraglen + sg->length; 570 int thislen = desc->fraglen + sg->length;
570 struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(desc->req); 571 struct crypto_sync_skcipher *tfm =
572 crypto_sync_skcipher_reqtfm(desc->req);
571 int fraglen, ret; 573 int fraglen, ret;
572 574
573 /* Worst case is 4 fragments: head, end of page 1, start 575 /* Worst case is 4 fragments: head, end of page 1, start
@@ -578,7 +580,7 @@ decryptor(struct scatterlist *sg, void *data)
578 desc->fragno++; 580 desc->fragno++;
579 desc->fraglen += sg->length; 581 desc->fraglen += sg->length;
580 582
581 fraglen = thislen & (crypto_skcipher_blocksize(tfm) - 1); 583 fraglen = thislen & (crypto_sync_skcipher_blocksize(tfm) - 1);
582 thislen -= fraglen; 584 thislen -= fraglen;
583 585
584 if (thislen == 0) 586 if (thislen == 0)
@@ -608,17 +610,17 @@ decryptor(struct scatterlist *sg, void *data)
608} 610}
609 611
610int 612int
611gss_decrypt_xdr_buf(struct crypto_skcipher *tfm, struct xdr_buf *buf, 613gss_decrypt_xdr_buf(struct crypto_sync_skcipher *tfm, struct xdr_buf *buf,
612 int offset) 614 int offset)
613{ 615{
614 int ret; 616 int ret;
615 struct decryptor_desc desc; 617 struct decryptor_desc desc;
616 SKCIPHER_REQUEST_ON_STACK(req, tfm); 618 SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
617 619
618 /* XXXJBF: */ 620 /* XXXJBF: */
619 BUG_ON((buf->len - offset) % crypto_skcipher_blocksize(tfm) != 0); 621 BUG_ON((buf->len - offset) % crypto_sync_skcipher_blocksize(tfm) != 0);
620 622
621 skcipher_request_set_tfm(req, tfm); 623 skcipher_request_set_sync_tfm(req, tfm);
622 skcipher_request_set_callback(req, 0, NULL, NULL); 624 skcipher_request_set_callback(req, 0, NULL, NULL);
623 625
624 memset(desc.iv, 0, sizeof(desc.iv)); 626 memset(desc.iv, 0, sizeof(desc.iv));
@@ -672,12 +674,12 @@ xdr_extend_head(struct xdr_buf *buf, unsigned int base, unsigned int shiftlen)
672} 674}
673 675
674static u32 676static u32
675gss_krb5_cts_crypt(struct crypto_skcipher *cipher, struct xdr_buf *buf, 677gss_krb5_cts_crypt(struct crypto_sync_skcipher *cipher, struct xdr_buf *buf,
676 u32 offset, u8 *iv, struct page **pages, int encrypt) 678 u32 offset, u8 *iv, struct page **pages, int encrypt)
677{ 679{
678 u32 ret; 680 u32 ret;
679 struct scatterlist sg[1]; 681 struct scatterlist sg[1];
680 SKCIPHER_REQUEST_ON_STACK(req, cipher); 682 SYNC_SKCIPHER_REQUEST_ON_STACK(req, cipher);
681 u8 *data; 683 u8 *data;
682 struct page **save_pages; 684 struct page **save_pages;
683 u32 len = buf->len - offset; 685 u32 len = buf->len - offset;
@@ -706,7 +708,7 @@ gss_krb5_cts_crypt(struct crypto_skcipher *cipher, struct xdr_buf *buf,
706 708
707 sg_init_one(sg, data, len); 709 sg_init_one(sg, data, len);
708 710
709 skcipher_request_set_tfm(req, cipher); 711 skcipher_request_set_sync_tfm(req, cipher);
710 skcipher_request_set_callback(req, 0, NULL, NULL); 712 skcipher_request_set_callback(req, 0, NULL, NULL);
711 skcipher_request_set_crypt(req, sg, sg, len, iv); 713 skcipher_request_set_crypt(req, sg, sg, len, iv);
712 714
@@ -735,7 +737,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
735 struct xdr_netobj hmac; 737 struct xdr_netobj hmac;
736 u8 *cksumkey; 738 u8 *cksumkey;
737 u8 *ecptr; 739 u8 *ecptr;
738 struct crypto_skcipher *cipher, *aux_cipher; 740 struct crypto_sync_skcipher *cipher, *aux_cipher;
739 int blocksize; 741 int blocksize;
740 struct page **save_pages; 742 struct page **save_pages;
741 int nblocks, nbytes; 743 int nblocks, nbytes;
@@ -754,7 +756,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
754 cksumkey = kctx->acceptor_integ; 756 cksumkey = kctx->acceptor_integ;
755 usage = KG_USAGE_ACCEPTOR_SEAL; 757 usage = KG_USAGE_ACCEPTOR_SEAL;
756 } 758 }
757 blocksize = crypto_skcipher_blocksize(cipher); 759 blocksize = crypto_sync_skcipher_blocksize(cipher);
758 760
759 /* hide the gss token header and insert the confounder */ 761 /* hide the gss token header and insert the confounder */
760 offset += GSS_KRB5_TOK_HDR_LEN; 762 offset += GSS_KRB5_TOK_HDR_LEN;
@@ -807,7 +809,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
807 memset(desc.iv, 0, sizeof(desc.iv)); 809 memset(desc.iv, 0, sizeof(desc.iv));
808 810
809 if (cbcbytes) { 811 if (cbcbytes) {
810 SKCIPHER_REQUEST_ON_STACK(req, aux_cipher); 812 SYNC_SKCIPHER_REQUEST_ON_STACK(req, aux_cipher);
811 813
812 desc.pos = offset + GSS_KRB5_TOK_HDR_LEN; 814 desc.pos = offset + GSS_KRB5_TOK_HDR_LEN;
813 desc.fragno = 0; 815 desc.fragno = 0;
@@ -816,7 +818,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
816 desc.outbuf = buf; 818 desc.outbuf = buf;
817 desc.req = req; 819 desc.req = req;
818 820
819 skcipher_request_set_tfm(req, aux_cipher); 821 skcipher_request_set_sync_tfm(req, aux_cipher);
820 skcipher_request_set_callback(req, 0, NULL, NULL); 822 skcipher_request_set_callback(req, 0, NULL, NULL);
821 823
822 sg_init_table(desc.infrags, 4); 824 sg_init_table(desc.infrags, 4);
@@ -855,7 +857,7 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf,
855 struct xdr_buf subbuf; 857 struct xdr_buf subbuf;
856 u32 ret = 0; 858 u32 ret = 0;
857 u8 *cksum_key; 859 u8 *cksum_key;
858 struct crypto_skcipher *cipher, *aux_cipher; 860 struct crypto_sync_skcipher *cipher, *aux_cipher;
859 struct xdr_netobj our_hmac_obj; 861 struct xdr_netobj our_hmac_obj;
860 u8 our_hmac[GSS_KRB5_MAX_CKSUM_LEN]; 862 u8 our_hmac[GSS_KRB5_MAX_CKSUM_LEN];
861 u8 pkt_hmac[GSS_KRB5_MAX_CKSUM_LEN]; 863 u8 pkt_hmac[GSS_KRB5_MAX_CKSUM_LEN];
@@ -874,7 +876,7 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf,
874 cksum_key = kctx->initiator_integ; 876 cksum_key = kctx->initiator_integ;
875 usage = KG_USAGE_INITIATOR_SEAL; 877 usage = KG_USAGE_INITIATOR_SEAL;
876 } 878 }
877 blocksize = crypto_skcipher_blocksize(cipher); 879 blocksize = crypto_sync_skcipher_blocksize(cipher);
878 880
879 881
880 /* create a segment skipping the header and leaving out the checksum */ 882 /* create a segment skipping the header and leaving out the checksum */
@@ -891,13 +893,13 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf,
891 memset(desc.iv, 0, sizeof(desc.iv)); 893 memset(desc.iv, 0, sizeof(desc.iv));
892 894
893 if (cbcbytes) { 895 if (cbcbytes) {
894 SKCIPHER_REQUEST_ON_STACK(req, aux_cipher); 896 SYNC_SKCIPHER_REQUEST_ON_STACK(req, aux_cipher);
895 897
896 desc.fragno = 0; 898 desc.fragno = 0;
897 desc.fraglen = 0; 899 desc.fraglen = 0;
898 desc.req = req; 900 desc.req = req;
899 901
900 skcipher_request_set_tfm(req, aux_cipher); 902 skcipher_request_set_sync_tfm(req, aux_cipher);
901 skcipher_request_set_callback(req, 0, NULL, NULL); 903 skcipher_request_set_callback(req, 0, NULL, NULL);
902 904
903 sg_init_table(desc.frags, 4); 905 sg_init_table(desc.frags, 4);
@@ -946,7 +948,8 @@ out_err:
946 * Set the key of the given cipher. 948 * Set the key of the given cipher.
947 */ 949 */
948int 950int
949krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher, 951krb5_rc4_setup_seq_key(struct krb5_ctx *kctx,
952 struct crypto_sync_skcipher *cipher,
950 unsigned char *cksum) 953 unsigned char *cksum)
951{ 954{
952 struct crypto_shash *hmac; 955 struct crypto_shash *hmac;
@@ -994,7 +997,7 @@ krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher,
994 if (err) 997 if (err)
995 goto out_err; 998 goto out_err;
996 999
997 err = crypto_skcipher_setkey(cipher, Kseq, kctx->gk5e->keylength); 1000 err = crypto_sync_skcipher_setkey(cipher, Kseq, kctx->gk5e->keylength);
998 if (err) 1001 if (err)
999 goto out_err; 1002 goto out_err;
1000 1003
@@ -1012,7 +1015,8 @@ out_err:
1012 * Set the key of cipher kctx->enc. 1015 * Set the key of cipher kctx->enc.
1013 */ 1016 */
1014int 1017int
1015krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher, 1018krb5_rc4_setup_enc_key(struct krb5_ctx *kctx,
1019 struct crypto_sync_skcipher *cipher,
1016 s32 seqnum) 1020 s32 seqnum)
1017{ 1021{
1018 struct crypto_shash *hmac; 1022 struct crypto_shash *hmac;
@@ -1069,7 +1073,8 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher,
1069 if (err) 1073 if (err)
1070 goto out_err; 1074 goto out_err;
1071 1075
1072 err = crypto_skcipher_setkey(cipher, Kcrypt, kctx->gk5e->keylength); 1076 err = crypto_sync_skcipher_setkey(cipher, Kcrypt,
1077 kctx->gk5e->keylength);
1073 if (err) 1078 if (err)
1074 goto out_err; 1079 goto out_err;
1075 1080
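For reference, the calling convention the gss_krb5_crypto.c hunks above migrate to: allocate a crypto_sync_skcipher, key it, and drive it through an on-stack request. This is only a trimmed sketch of that shape — "cbc(aes)" is just an example algorithm and the error handling is minimal — not code from the patch:

	#include <linux/err.h>
	#include <linux/scatterlist.h>
	#include <crypto/skcipher.h>

	/* Encrypt 'len' bytes in place; len must be a multiple of the cipher
	 * block size, as in the gss_krb5 callers. */
	static int demo_sync_cbc_encrypt(const u8 *key, unsigned int keylen,
					 u8 *iv, u8 *data, unsigned int len)
	{
		struct crypto_sync_skcipher *tfm;
		struct scatterlist sg;
		int err;

		tfm = crypto_alloc_sync_skcipher("cbc(aes)", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		err = crypto_sync_skcipher_setkey(tfm, key, keylen);
		if (!err) {
			SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);

			skcipher_request_set_sync_tfm(req, tfm);
			skcipher_request_set_callback(req, 0, NULL, NULL);
			sg_init_one(&sg, data, len);
			skcipher_request_set_crypt(req, &sg, &sg, len, iv);
			err = crypto_skcipher_encrypt(req);
			skcipher_request_zero(req);
		}
		crypto_free_sync_skcipher(tfm);
		return err;
	}
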
diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c
index f7fe2d2b851f..550fdf18d3b3 100644
--- a/net/sunrpc/auth_gss/gss_krb5_keys.c
+++ b/net/sunrpc/auth_gss/gss_krb5_keys.c
@@ -147,7 +147,7 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e,
147 size_t blocksize, keybytes, keylength, n; 147 size_t blocksize, keybytes, keylength, n;
148 unsigned char *inblockdata, *outblockdata, *rawkey; 148 unsigned char *inblockdata, *outblockdata, *rawkey;
149 struct xdr_netobj inblock, outblock; 149 struct xdr_netobj inblock, outblock;
150 struct crypto_skcipher *cipher; 150 struct crypto_sync_skcipher *cipher;
151 u32 ret = EINVAL; 151 u32 ret = EINVAL;
152 152
153 blocksize = gk5e->blocksize; 153 blocksize = gk5e->blocksize;
@@ -157,11 +157,10 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e,
157 if ((inkey->len != keylength) || (outkey->len != keylength)) 157 if ((inkey->len != keylength) || (outkey->len != keylength))
158 goto err_return; 158 goto err_return;
159 159
160 cipher = crypto_alloc_skcipher(gk5e->encrypt_name, 0, 160 cipher = crypto_alloc_sync_skcipher(gk5e->encrypt_name, 0, 0);
161 CRYPTO_ALG_ASYNC);
162 if (IS_ERR(cipher)) 161 if (IS_ERR(cipher))
163 goto err_return; 162 goto err_return;
164 if (crypto_skcipher_setkey(cipher, inkey->data, inkey->len)) 163 if (crypto_sync_skcipher_setkey(cipher, inkey->data, inkey->len))
165 goto err_return; 164 goto err_return;
166 165
167 /* allocate and set up buffers */ 166 /* allocate and set up buffers */
@@ -238,7 +237,7 @@ err_free_in:
238 memset(inblockdata, 0, blocksize); 237 memset(inblockdata, 0, blocksize);
239 kfree(inblockdata); 238 kfree(inblockdata);
240err_free_cipher: 239err_free_cipher:
241 crypto_free_skcipher(cipher); 240 crypto_free_sync_skcipher(cipher);
242err_return: 241err_return:
243 return ret; 242 return ret;
244} 243}
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 7bb2514aadd9..7f0424dfa8f6 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -218,7 +218,7 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res)
218 218
219static inline const void * 219static inline const void *
220get_key(const void *p, const void *end, 220get_key(const void *p, const void *end,
221 struct krb5_ctx *ctx, struct crypto_skcipher **res) 221 struct krb5_ctx *ctx, struct crypto_sync_skcipher **res)
222{ 222{
223 struct xdr_netobj key; 223 struct xdr_netobj key;
224 int alg; 224 int alg;
@@ -246,15 +246,14 @@ get_key(const void *p, const void *end,
246 if (IS_ERR(p)) 246 if (IS_ERR(p))
247 goto out_err; 247 goto out_err;
248 248
249 *res = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0, 249 *res = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0);
250 CRYPTO_ALG_ASYNC);
251 if (IS_ERR(*res)) { 250 if (IS_ERR(*res)) {
252 printk(KERN_WARNING "gss_kerberos_mech: unable to initialize " 251 printk(KERN_WARNING "gss_kerberos_mech: unable to initialize "
253 "crypto algorithm %s\n", ctx->gk5e->encrypt_name); 252 "crypto algorithm %s\n", ctx->gk5e->encrypt_name);
254 *res = NULL; 253 *res = NULL;
255 goto out_err_free_key; 254 goto out_err_free_key;
256 } 255 }
257 if (crypto_skcipher_setkey(*res, key.data, key.len)) { 256 if (crypto_sync_skcipher_setkey(*res, key.data, key.len)) {
258 printk(KERN_WARNING "gss_kerberos_mech: error setting key for " 257 printk(KERN_WARNING "gss_kerberos_mech: error setting key for "
259 "crypto algorithm %s\n", ctx->gk5e->encrypt_name); 258 "crypto algorithm %s\n", ctx->gk5e->encrypt_name);
260 goto out_err_free_tfm; 259 goto out_err_free_tfm;
@@ -264,7 +263,7 @@ get_key(const void *p, const void *end,
264 return p; 263 return p;
265 264
266out_err_free_tfm: 265out_err_free_tfm:
267 crypto_free_skcipher(*res); 266 crypto_free_sync_skcipher(*res);
268out_err_free_key: 267out_err_free_key:
269 kfree(key.data); 268 kfree(key.data);
270 p = ERR_PTR(-EINVAL); 269 p = ERR_PTR(-EINVAL);
@@ -336,30 +335,30 @@ gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx)
336 return 0; 335 return 0;
337 336
338out_err_free_key2: 337out_err_free_key2:
339 crypto_free_skcipher(ctx->seq); 338 crypto_free_sync_skcipher(ctx->seq);
340out_err_free_key1: 339out_err_free_key1:
341 crypto_free_skcipher(ctx->enc); 340 crypto_free_sync_skcipher(ctx->enc);
342out_err_free_mech: 341out_err_free_mech:
343 kfree(ctx->mech_used.data); 342 kfree(ctx->mech_used.data);
344out_err: 343out_err:
345 return PTR_ERR(p); 344 return PTR_ERR(p);
346} 345}
347 346
348static struct crypto_skcipher * 347static struct crypto_sync_skcipher *
349context_v2_alloc_cipher(struct krb5_ctx *ctx, const char *cname, u8 *key) 348context_v2_alloc_cipher(struct krb5_ctx *ctx, const char *cname, u8 *key)
350{ 349{
351 struct crypto_skcipher *cp; 350 struct crypto_sync_skcipher *cp;
352 351
353 cp = crypto_alloc_skcipher(cname, 0, CRYPTO_ALG_ASYNC); 352 cp = crypto_alloc_sync_skcipher(cname, 0, 0);
354 if (IS_ERR(cp)) { 353 if (IS_ERR(cp)) {
355 dprintk("gss_kerberos_mech: unable to initialize " 354 dprintk("gss_kerberos_mech: unable to initialize "
356 "crypto algorithm %s\n", cname); 355 "crypto algorithm %s\n", cname);
357 return NULL; 356 return NULL;
358 } 357 }
359 if (crypto_skcipher_setkey(cp, key, ctx->gk5e->keylength)) { 358 if (crypto_sync_skcipher_setkey(cp, key, ctx->gk5e->keylength)) {
360 dprintk("gss_kerberos_mech: error setting key for " 359 dprintk("gss_kerberos_mech: error setting key for "
361 "crypto algorithm %s\n", cname); 360 "crypto algorithm %s\n", cname);
362 crypto_free_skcipher(cp); 361 crypto_free_sync_skcipher(cp);
363 return NULL; 362 return NULL;
364 } 363 }
365 return cp; 364 return cp;
@@ -413,9 +412,9 @@ context_derive_keys_des3(struct krb5_ctx *ctx, gfp_t gfp_mask)
413 return 0; 412 return 0;
414 413
415out_free_enc: 414out_free_enc:
416 crypto_free_skcipher(ctx->enc); 415 crypto_free_sync_skcipher(ctx->enc);
417out_free_seq: 416out_free_seq:
418 crypto_free_skcipher(ctx->seq); 417 crypto_free_sync_skcipher(ctx->seq);
419out_err: 418out_err:
420 return -EINVAL; 419 return -EINVAL;
421} 420}
@@ -469,17 +468,15 @@ context_derive_keys_rc4(struct krb5_ctx *ctx)
469 /* 468 /*
470 * allocate hash, and skciphers for data and seqnum encryption 469 * allocate hash, and skciphers for data and seqnum encryption
471 */ 470 */
472 ctx->enc = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0, 471 ctx->enc = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0);
473 CRYPTO_ALG_ASYNC);
474 if (IS_ERR(ctx->enc)) { 472 if (IS_ERR(ctx->enc)) {
475 err = PTR_ERR(ctx->enc); 473 err = PTR_ERR(ctx->enc);
476 goto out_err_free_hmac; 474 goto out_err_free_hmac;
477 } 475 }
478 476
479 ctx->seq = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0, 477 ctx->seq = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0);
480 CRYPTO_ALG_ASYNC);
481 if (IS_ERR(ctx->seq)) { 478 if (IS_ERR(ctx->seq)) {
482 crypto_free_skcipher(ctx->enc); 479 crypto_free_sync_skcipher(ctx->enc);
483 err = PTR_ERR(ctx->seq); 480 err = PTR_ERR(ctx->seq);
484 goto out_err_free_hmac; 481 goto out_err_free_hmac;
485 } 482 }
@@ -591,7 +588,7 @@ context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask)
591 context_v2_alloc_cipher(ctx, "cbc(aes)", 588 context_v2_alloc_cipher(ctx, "cbc(aes)",
592 ctx->acceptor_seal); 589 ctx->acceptor_seal);
593 if (ctx->acceptor_enc_aux == NULL) { 590 if (ctx->acceptor_enc_aux == NULL) {
594 crypto_free_skcipher(ctx->initiator_enc_aux); 591 crypto_free_sync_skcipher(ctx->initiator_enc_aux);
595 goto out_free_acceptor_enc; 592 goto out_free_acceptor_enc;
596 } 593 }
597 } 594 }
@@ -599,9 +596,9 @@ context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask)
599 return 0; 596 return 0;
600 597
601out_free_acceptor_enc: 598out_free_acceptor_enc:
602 crypto_free_skcipher(ctx->acceptor_enc); 599 crypto_free_sync_skcipher(ctx->acceptor_enc);
603out_free_initiator_enc: 600out_free_initiator_enc:
604 crypto_free_skcipher(ctx->initiator_enc); 601 crypto_free_sync_skcipher(ctx->initiator_enc);
605out_err: 602out_err:
606 return -EINVAL; 603 return -EINVAL;
607} 604}
@@ -713,12 +710,12 @@ static void
713gss_delete_sec_context_kerberos(void *internal_ctx) { 710gss_delete_sec_context_kerberos(void *internal_ctx) {
714 struct krb5_ctx *kctx = internal_ctx; 711 struct krb5_ctx *kctx = internal_ctx;
715 712
716 crypto_free_skcipher(kctx->seq); 713 crypto_free_sync_skcipher(kctx->seq);
717 crypto_free_skcipher(kctx->enc); 714 crypto_free_sync_skcipher(kctx->enc);
718 crypto_free_skcipher(kctx->acceptor_enc); 715 crypto_free_sync_skcipher(kctx->acceptor_enc);
719 crypto_free_skcipher(kctx->initiator_enc); 716 crypto_free_sync_skcipher(kctx->initiator_enc);
720 crypto_free_skcipher(kctx->acceptor_enc_aux); 717 crypto_free_sync_skcipher(kctx->acceptor_enc_aux);
721 crypto_free_skcipher(kctx->initiator_enc_aux); 718 crypto_free_sync_skcipher(kctx->initiator_enc_aux);
722 kfree(kctx->mech_used.data); 719 kfree(kctx->mech_used.data);
723 kfree(kctx); 720 kfree(kctx);
724} 721}
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index eaad9bc7a0bd..b4adeb06660b 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -63,13 +63,12 @@
63#include <linux/sunrpc/gss_krb5.h> 63#include <linux/sunrpc/gss_krb5.h>
64#include <linux/random.h> 64#include <linux/random.h>
65#include <linux/crypto.h> 65#include <linux/crypto.h>
66#include <linux/atomic.h>
66 67
67#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 68#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
68# define RPCDBG_FACILITY RPCDBG_AUTH 69# define RPCDBG_FACILITY RPCDBG_AUTH
69#endif 70#endif
70 71
71DEFINE_SPINLOCK(krb5_seq_lock);
72
73static void * 72static void *
74setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token) 73setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token)
75{ 74{
@@ -124,6 +123,30 @@ setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token)
124 return krb5_hdr; 123 return krb5_hdr;
125} 124}
126 125
126u32
127gss_seq_send_fetch_and_inc(struct krb5_ctx *ctx)
128{
129 u32 old, seq_send = READ_ONCE(ctx->seq_send);
130
131 do {
132 old = seq_send;
133 seq_send = cmpxchg(&ctx->seq_send, old, old + 1);
134 } while (old != seq_send);
135 return seq_send;
136}
137
138u64
139gss_seq_send64_fetch_and_inc(struct krb5_ctx *ctx)
140{
141 u64 old, seq_send = READ_ONCE(ctx->seq_send);
142
143 do {
144 old = seq_send;
145 seq_send = cmpxchg64(&ctx->seq_send64, old, old + 1);
146 } while (old != seq_send);
147 return seq_send;
148}
149
127static u32 150static u32
128gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, 151gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text,
129 struct xdr_netobj *token) 152 struct xdr_netobj *token)
@@ -154,9 +177,7 @@ gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text,
154 177
155 memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); 178 memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len);
156 179
157 spin_lock(&krb5_seq_lock); 180 seq_send = gss_seq_send_fetch_and_inc(ctx);
158 seq_send = ctx->seq_send++;
159 spin_unlock(&krb5_seq_lock);
160 181
161 if (krb5_make_seq_num(ctx, ctx->seq, ctx->initiate ? 0 : 0xff, 182 if (krb5_make_seq_num(ctx, ctx->seq, ctx->initiate ? 0 : 0xff,
162 seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)) 183 seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8))
@@ -174,7 +195,6 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text,
174 .data = cksumdata}; 195 .data = cksumdata};
175 void *krb5_hdr; 196 void *krb5_hdr;
176 s32 now; 197 s32 now;
177 u64 seq_send;
178 u8 *cksumkey; 198 u8 *cksumkey;
179 unsigned int cksum_usage; 199 unsigned int cksum_usage;
180 __be64 seq_send_be64; 200 __be64 seq_send_be64;
@@ -185,11 +205,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text,
185 205
186 /* Set up the sequence number. Now 64-bits in clear 206 /* Set up the sequence number. Now 64-bits in clear
187 * text and w/o direction indicator */ 207 * text and w/o direction indicator */
188 spin_lock(&krb5_seq_lock); 208 seq_send_be64 = cpu_to_be64(gss_seq_send64_fetch_and_inc(ctx));
189 seq_send = ctx->seq_send64++;
190 spin_unlock(&krb5_seq_lock);
191
192 seq_send_be64 = cpu_to_be64(seq_send);
193 memcpy(krb5_hdr + 8, (char *) &seq_send_be64, 8); 209 memcpy(krb5_hdr + 8, (char *) &seq_send_be64, 8);
194 210
195 if (ctx->initiate) { 211 if (ctx->initiate) {
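The two helpers added above replace krb5_seq_lock with a compare-and-swap loop: claim the current sequence number and advance it atomically, retrying if another sender raced in. A userspace analogue of the 32-bit variant, with C11 atomics standing in for the kernel's cmpxchg():

	#include <assert.h>
	#include <stdatomic.h>
	#include <stdint.h>

	/* Return the current value and bump it by one, lock-free. */
	static uint32_t seq_fetch_and_inc(_Atomic uint32_t *seq)
	{
		uint32_t old = atomic_load(seq);

		/* On failure the CAS reloads 'old' with the value that beat us,
		 * so the next attempt builds on the winner's increment. */
		while (!atomic_compare_exchange_weak(seq, &old, old + 1))
			;
		return old;
	}

	int main(void)
	{
		_Atomic uint32_t seq = 41;

		assert(seq_fetch_and_inc(&seq) == 41);
		assert(atomic_load(&seq) == 42);
		return 0;
	}
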
diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
index c8b9082f4a9d..fb6656295204 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
@@ -43,13 +43,12 @@ static s32
43krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum, 43krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum,
44 unsigned char *cksum, unsigned char *buf) 44 unsigned char *cksum, unsigned char *buf)
45{ 45{
46 struct crypto_skcipher *cipher; 46 struct crypto_sync_skcipher *cipher;
47 unsigned char plain[8]; 47 unsigned char plain[8];
48 s32 code; 48 s32 code;
49 49
50 dprintk("RPC: %s:\n", __func__); 50 dprintk("RPC: %s:\n", __func__);
51 cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0, 51 cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, 0, 0);
52 CRYPTO_ALG_ASYNC);
53 if (IS_ERR(cipher)) 52 if (IS_ERR(cipher))
54 return PTR_ERR(cipher); 53 return PTR_ERR(cipher);
55 54
@@ -68,12 +67,12 @@ krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum,
68 67
69 code = krb5_encrypt(cipher, cksum, plain, buf, 8); 68 code = krb5_encrypt(cipher, cksum, plain, buf, 8);
70out: 69out:
71 crypto_free_skcipher(cipher); 70 crypto_free_sync_skcipher(cipher);
72 return code; 71 return code;
73} 72}
74s32 73s32
75krb5_make_seq_num(struct krb5_ctx *kctx, 74krb5_make_seq_num(struct krb5_ctx *kctx,
76 struct crypto_skcipher *key, 75 struct crypto_sync_skcipher *key,
77 int direction, 76 int direction,
78 u32 seqnum, 77 u32 seqnum,
79 unsigned char *cksum, unsigned char *buf) 78 unsigned char *cksum, unsigned char *buf)
@@ -101,13 +100,12 @@ static s32
101krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum, 100krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum,
102 unsigned char *buf, int *direction, s32 *seqnum) 101 unsigned char *buf, int *direction, s32 *seqnum)
103{ 102{
104 struct crypto_skcipher *cipher; 103 struct crypto_sync_skcipher *cipher;
105 unsigned char plain[8]; 104 unsigned char plain[8];
106 s32 code; 105 s32 code;
107 106
108 dprintk("RPC: %s:\n", __func__); 107 dprintk("RPC: %s:\n", __func__);
109 cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0, 108 cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, 0, 0);
110 CRYPTO_ALG_ASYNC);
111 if (IS_ERR(cipher)) 109 if (IS_ERR(cipher))
112 return PTR_ERR(cipher); 110 return PTR_ERR(cipher);
113 111
@@ -130,7 +128,7 @@ krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum,
130 *seqnum = ((plain[0] << 24) | (plain[1] << 16) | 128 *seqnum = ((plain[0] << 24) | (plain[1] << 16) |
131 (plain[2] << 8) | (plain[3])); 129 (plain[2] << 8) | (plain[3]));
132out: 130out:
133 crypto_free_skcipher(cipher); 131 crypto_free_sync_skcipher(cipher);
134 return code; 132 return code;
135} 133}
136 134
@@ -142,7 +140,7 @@ krb5_get_seq_num(struct krb5_ctx *kctx,
142{ 140{
143 s32 code; 141 s32 code;
144 unsigned char plain[8]; 142 unsigned char plain[8];
145 struct crypto_skcipher *key = kctx->seq; 143 struct crypto_sync_skcipher *key = kctx->seq;
146 144
147 dprintk("RPC: krb5_get_seq_num:\n"); 145 dprintk("RPC: krb5_get_seq_num:\n");
148 146
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 39a2e672900b..962fa84e6db1 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -174,7 +174,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
174 174
175 now = get_seconds(); 175 now = get_seconds();
176 176
177 blocksize = crypto_skcipher_blocksize(kctx->enc); 177 blocksize = crypto_sync_skcipher_blocksize(kctx->enc);
178 gss_krb5_add_padding(buf, offset, blocksize); 178 gss_krb5_add_padding(buf, offset, blocksize);
179 BUG_ON((buf->len - offset) % blocksize); 179 BUG_ON((buf->len - offset) % blocksize);
180 plainlen = conflen + buf->len - offset; 180 plainlen = conflen + buf->len - offset;
@@ -228,9 +228,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
228 228
229 memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); 229 memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len);
230 230
231 spin_lock(&krb5_seq_lock); 231 seq_send = gss_seq_send_fetch_and_inc(kctx);
232 seq_send = kctx->seq_send++;
233 spin_unlock(&krb5_seq_lock);
234 232
235 /* XXX would probably be more efficient to compute checksum 233 /* XXX would probably be more efficient to compute checksum
236 * and encrypt at the same time: */ 234 * and encrypt at the same time: */
@@ -239,10 +237,10 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
239 return GSS_S_FAILURE; 237 return GSS_S_FAILURE;
240 238
241 if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) { 239 if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) {
242 struct crypto_skcipher *cipher; 240 struct crypto_sync_skcipher *cipher;
243 int err; 241 int err;
244 cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0, 242 cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name,
245 CRYPTO_ALG_ASYNC); 243 0, 0);
246 if (IS_ERR(cipher)) 244 if (IS_ERR(cipher))
247 return GSS_S_FAILURE; 245 return GSS_S_FAILURE;
248 246
@@ -250,7 +248,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
250 248
251 err = gss_encrypt_xdr_buf(cipher, buf, 249 err = gss_encrypt_xdr_buf(cipher, buf,
252 offset + headlen - conflen, pages); 250 offset + headlen - conflen, pages);
253 crypto_free_skcipher(cipher); 251 crypto_free_sync_skcipher(cipher);
254 if (err) 252 if (err)
255 return GSS_S_FAILURE; 253 return GSS_S_FAILURE;
256 } else { 254 } else {
@@ -327,18 +325,18 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
327 return GSS_S_BAD_SIG; 325 return GSS_S_BAD_SIG;
328 326
329 if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) { 327 if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) {
330 struct crypto_skcipher *cipher; 328 struct crypto_sync_skcipher *cipher;
331 int err; 329 int err;
332 330
333 cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0, 331 cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name,
334 CRYPTO_ALG_ASYNC); 332 0, 0);
335 if (IS_ERR(cipher)) 333 if (IS_ERR(cipher))
336 return GSS_S_FAILURE; 334 return GSS_S_FAILURE;
337 335
338 krb5_rc4_setup_enc_key(kctx, cipher, seqnum); 336 krb5_rc4_setup_enc_key(kctx, cipher, seqnum);
339 337
340 err = gss_decrypt_xdr_buf(cipher, buf, crypt_offset); 338 err = gss_decrypt_xdr_buf(cipher, buf, crypt_offset);
341 crypto_free_skcipher(cipher); 339 crypto_free_sync_skcipher(cipher);
342 if (err) 340 if (err)
343 return GSS_S_DEFECTIVE_TOKEN; 341 return GSS_S_DEFECTIVE_TOKEN;
344 } else { 342 } else {
@@ -371,7 +369,7 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
371 /* Copy the data back to the right position. XXX: Would probably be 369 /* Copy the data back to the right position. XXX: Would probably be
372 * better to copy and encrypt at the same time. */ 370 * better to copy and encrypt at the same time. */
373 371
374 blocksize = crypto_skcipher_blocksize(kctx->enc); 372 blocksize = crypto_sync_skcipher_blocksize(kctx->enc);
375 data_start = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) + 373 data_start = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) +
376 conflen; 374 conflen;
377 orig_start = buf->head[0].iov_base + offset; 375 orig_start = buf->head[0].iov_base + offset;
@@ -477,9 +475,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
477 *be16ptr++ = 0; 475 *be16ptr++ = 0;
478 476
479 be64ptr = (__be64 *)be16ptr; 477 be64ptr = (__be64 *)be16ptr;
480 spin_lock(&krb5_seq_lock); 478 *be64ptr = cpu_to_be64(gss_seq_send64_fetch_and_inc(kctx));
481 *be64ptr = cpu_to_be64(kctx->seq_send64++);
482 spin_unlock(&krb5_seq_lock);
483 479
484 err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages); 480 err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages);
485 if (err) 481 if (err)
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 5fec3abbe19b..16ac0f4cb7d8 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -117,7 +117,7 @@ int gss_mech_register(struct gss_api_mech *gm)
117 if (status) 117 if (status)
118 return status; 118 return status;
119 spin_lock(&registered_mechs_lock); 119 spin_lock(&registered_mechs_lock);
120 list_add(&gm->gm_list, &registered_mechs); 120 list_add_rcu(&gm->gm_list, &registered_mechs);
121 spin_unlock(&registered_mechs_lock); 121 spin_unlock(&registered_mechs_lock);
122 dprintk("RPC: registered gss mechanism %s\n", gm->gm_name); 122 dprintk("RPC: registered gss mechanism %s\n", gm->gm_name);
123 return 0; 123 return 0;
@@ -132,7 +132,7 @@ EXPORT_SYMBOL_GPL(gss_mech_register);
132void gss_mech_unregister(struct gss_api_mech *gm) 132void gss_mech_unregister(struct gss_api_mech *gm)
133{ 133{
134 spin_lock(&registered_mechs_lock); 134 spin_lock(&registered_mechs_lock);
135 list_del(&gm->gm_list); 135 list_del_rcu(&gm->gm_list);
136 spin_unlock(&registered_mechs_lock); 136 spin_unlock(&registered_mechs_lock);
137 dprintk("RPC: unregistered gss mechanism %s\n", gm->gm_name); 137 dprintk("RPC: unregistered gss mechanism %s\n", gm->gm_name);
138 gss_mech_free(gm); 138 gss_mech_free(gm);
@@ -151,15 +151,15 @@ _gss_mech_get_by_name(const char *name)
151{ 151{
152 struct gss_api_mech *pos, *gm = NULL; 152 struct gss_api_mech *pos, *gm = NULL;
153 153
154 spin_lock(&registered_mechs_lock); 154 rcu_read_lock();
155 list_for_each_entry(pos, &registered_mechs, gm_list) { 155 list_for_each_entry_rcu(pos, &registered_mechs, gm_list) {
156 if (0 == strcmp(name, pos->gm_name)) { 156 if (0 == strcmp(name, pos->gm_name)) {
157 if (try_module_get(pos->gm_owner)) 157 if (try_module_get(pos->gm_owner))
158 gm = pos; 158 gm = pos;
159 break; 159 break;
160 } 160 }
161 } 161 }
162 spin_unlock(&registered_mechs_lock); 162 rcu_read_unlock();
163 return gm; 163 return gm;
164 164
165} 165}
@@ -186,8 +186,8 @@ struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj)
186 dprintk("RPC: %s(%s)\n", __func__, buf); 186 dprintk("RPC: %s(%s)\n", __func__, buf);
187 request_module("rpc-auth-gss-%s", buf); 187 request_module("rpc-auth-gss-%s", buf);
188 188
189 spin_lock(&registered_mechs_lock); 189 rcu_read_lock();
190 list_for_each_entry(pos, &registered_mechs, gm_list) { 190 list_for_each_entry_rcu(pos, &registered_mechs, gm_list) {
191 if (obj->len == pos->gm_oid.len) { 191 if (obj->len == pos->gm_oid.len) {
192 if (0 == memcmp(obj->data, pos->gm_oid.data, obj->len)) { 192 if (0 == memcmp(obj->data, pos->gm_oid.data, obj->len)) {
193 if (try_module_get(pos->gm_owner)) 193 if (try_module_get(pos->gm_owner))
@@ -196,7 +196,7 @@ struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj)
196 } 196 }
197 } 197 }
198 } 198 }
199 spin_unlock(&registered_mechs_lock); 199 rcu_read_unlock();
200 return gm; 200 return gm;
201} 201}
202 202
@@ -216,15 +216,15 @@ static struct gss_api_mech *_gss_mech_get_by_pseudoflavor(u32 pseudoflavor)
216{ 216{
217 struct gss_api_mech *gm = NULL, *pos; 217 struct gss_api_mech *gm = NULL, *pos;
218 218
219 spin_lock(&registered_mechs_lock); 219 rcu_read_lock();
220 list_for_each_entry(pos, &registered_mechs, gm_list) { 220 list_for_each_entry_rcu(pos, &registered_mechs, gm_list) {
221 if (!mech_supports_pseudoflavor(pos, pseudoflavor)) 221 if (!mech_supports_pseudoflavor(pos, pseudoflavor))
222 continue; 222 continue;
223 if (try_module_get(pos->gm_owner)) 223 if (try_module_get(pos->gm_owner))
224 gm = pos; 224 gm = pos;
225 break; 225 break;
226 } 226 }
227 spin_unlock(&registered_mechs_lock); 227 rcu_read_unlock();
228 return gm; 228 return gm;
229} 229}
230 230
@@ -257,8 +257,8 @@ int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size)
257 struct gss_api_mech *pos = NULL; 257 struct gss_api_mech *pos = NULL;
258 int j, i = 0; 258 int j, i = 0;
259 259
260 spin_lock(&registered_mechs_lock); 260 rcu_read_lock();
261 list_for_each_entry(pos, &registered_mechs, gm_list) { 261 list_for_each_entry_rcu(pos, &registered_mechs, gm_list) {
262 for (j = 0; j < pos->gm_pf_num; j++) { 262 for (j = 0; j < pos->gm_pf_num; j++) {
263 if (i >= size) { 263 if (i >= size) {
264 spin_unlock(&registered_mechs_lock); 264 spin_unlock(&registered_mechs_lock);
@@ -267,7 +267,7 @@ int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size)
267 array_ptr[i++] = pos->gm_pfs[j].pseudoflavor; 267 array_ptr[i++] = pos->gm_pfs[j].pseudoflavor;
268 } 268 }
269 } 269 }
270 spin_unlock(&registered_mechs_lock); 270 rcu_read_unlock();
271 return i; 271 return i;
272} 272}
273 273
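The gss_mech_switch.c conversion above moves the registered_mechs readers from the spinlock to RCU: walkers run under rcu_read_lock() over list_for_each_entry_rcu() and pin a match with try_module_get() before leaving the critical section, while writers still take the spinlock and use list_add_rcu()/list_del_rcu(). A stripped-down sketch of that lookup shape — the struct and list names are placeholders, not the sunrpc symbols:

	#include <linux/module.h>
	#include <linux/rculist.h>
	#include <linux/string.h>

	struct demo_mech {
		struct list_head	list;	/* added with list_add_rcu() */
		struct module		*owner;
		const char		*name;
	};

	static LIST_HEAD(demo_mechs);

	static struct demo_mech *demo_mech_get_by_name(const char *name)
	{
		struct demo_mech *pos, *found = NULL;

		rcu_read_lock();
		list_for_each_entry_rcu(pos, &demo_mechs, list) {
			if (strcmp(name, pos->name))
				continue;
			if (try_module_get(pos->owner))
				found = pos;	/* module ref keeps it alive past unlock */
			break;
		}
		rcu_read_unlock();
		return found;
	}
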
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index 444380f968f1..006062ad5f58 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -784,6 +784,7 @@ void gssx_enc_accept_sec_context(struct rpc_rqst *req,
784 xdr_inline_pages(&req->rq_rcv_buf, 784 xdr_inline_pages(&req->rq_rcv_buf,
785 PAGE_SIZE/2 /* pretty arbitrary */, 785 PAGE_SIZE/2 /* pretty arbitrary */,
786 arg->pages, 0 /* page base */, arg->npages * PAGE_SIZE); 786 arg->pages, 0 /* page base */, arg->npages * PAGE_SIZE);
787 req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
787done: 788done:
788 if (err) 789 if (err)
789 dprintk("RPC: gssx_enc_accept_sec_context: %d\n", err); 790 dprintk("RPC: gssx_enc_accept_sec_context: %d\n", err);
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 4b48228ee8c7..2694a1bc026b 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -21,7 +21,7 @@ static struct rpc_cred null_cred;
21static struct rpc_auth * 21static struct rpc_auth *
22nul_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) 22nul_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
23{ 23{
24 atomic_inc(&null_auth.au_count); 24 refcount_inc(&null_auth.au_count);
25 return &null_auth; 25 return &null_auth;
26} 26}
27 27
@@ -119,7 +119,7 @@ struct rpc_auth null_auth = {
119 .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, 119 .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
120 .au_ops = &authnull_ops, 120 .au_ops = &authnull_ops,
121 .au_flavor = RPC_AUTH_NULL, 121 .au_flavor = RPC_AUTH_NULL,
122 .au_count = ATOMIC_INIT(0), 122 .au_count = REFCOUNT_INIT(1),
123}; 123};
124 124
125static 125static
@@ -138,6 +138,6 @@ struct rpc_cred null_cred = {
138 .cr_lru = LIST_HEAD_INIT(null_cred.cr_lru), 138 .cr_lru = LIST_HEAD_INIT(null_cred.cr_lru),
139 .cr_auth = &null_auth, 139 .cr_auth = &null_auth,
140 .cr_ops = &null_credops, 140 .cr_ops = &null_credops,
141 .cr_count = ATOMIC_INIT(1), 141 .cr_count = REFCOUNT_INIT(2),
142 .cr_flags = 1UL << RPCAUTH_CRED_UPTODATE, 142 .cr_flags = 1UL << RPCAUTH_CRED_UPTODATE,
143}; 143};
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 185e56d4f9ae..4c1c7e56288f 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -34,7 +34,7 @@ unx_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
34{ 34{
35 dprintk("RPC: creating UNIX authenticator for client %p\n", 35 dprintk("RPC: creating UNIX authenticator for client %p\n",
36 clnt); 36 clnt);
37 atomic_inc(&unix_auth.au_count); 37 refcount_inc(&unix_auth.au_count);
38 return &unix_auth; 38 return &unix_auth;
39} 39}
40 40
@@ -239,7 +239,7 @@ struct rpc_auth unix_auth = {
239 .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, 239 .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
240 .au_ops = &authunix_ops, 240 .au_ops = &authunix_ops,
241 .au_flavor = RPC_AUTH_UNIX, 241 .au_flavor = RPC_AUTH_UNIX,
242 .au_count = ATOMIC_INIT(0), 242 .au_count = REFCOUNT_INIT(1),
243}; 243};
244 244
245static 245static
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 3c15a99b9700..fa5ba6ed3197 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -91,7 +91,6 @@ struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt, gfp_t gfp_flags)
91 return NULL; 91 return NULL;
92 92
93 req->rq_xprt = xprt; 93 req->rq_xprt = xprt;
94 INIT_LIST_HEAD(&req->rq_list);
95 INIT_LIST_HEAD(&req->rq_bc_list); 94 INIT_LIST_HEAD(&req->rq_bc_list);
96 95
97 /* Preallocate one XDR receive buffer */ 96 /* Preallocate one XDR receive buffer */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 8ea2f5fadd96..ae3b8145da35 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -61,6 +61,7 @@ static void call_start(struct rpc_task *task);
61static void call_reserve(struct rpc_task *task); 61static void call_reserve(struct rpc_task *task);
62static void call_reserveresult(struct rpc_task *task); 62static void call_reserveresult(struct rpc_task *task);
63static void call_allocate(struct rpc_task *task); 63static void call_allocate(struct rpc_task *task);
64static void call_encode(struct rpc_task *task);
64static void call_decode(struct rpc_task *task); 65static void call_decode(struct rpc_task *task);
65static void call_bind(struct rpc_task *task); 66static void call_bind(struct rpc_task *task);
66static void call_bind_status(struct rpc_task *task); 67static void call_bind_status(struct rpc_task *task);
@@ -1137,10 +1138,10 @@ EXPORT_SYMBOL_GPL(rpc_call_async);
1137struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) 1138struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
1138{ 1139{
1139 struct rpc_task *task; 1140 struct rpc_task *task;
1140 struct xdr_buf *xbufp = &req->rq_snd_buf;
1141 struct rpc_task_setup task_setup_data = { 1141 struct rpc_task_setup task_setup_data = {
1142 .callback_ops = &rpc_default_ops, 1142 .callback_ops = &rpc_default_ops,
1143 .flags = RPC_TASK_SOFTCONN, 1143 .flags = RPC_TASK_SOFTCONN |
1144 RPC_TASK_NO_RETRANS_TIMEOUT,
1144 }; 1145 };
1145 1146
1146 dprintk("RPC: rpc_run_bc_task req= %p\n", req); 1147 dprintk("RPC: rpc_run_bc_task req= %p\n", req);
@@ -1148,14 +1149,7 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
1148 * Create an rpc_task to send the data 1149 * Create an rpc_task to send the data
1149 */ 1150 */
1150 task = rpc_new_task(&task_setup_data); 1151 task = rpc_new_task(&task_setup_data);
1151 task->tk_rqstp = req; 1152 xprt_init_bc_request(req, task);
1152
1153 /*
1154 * Set up the xdr_buf length.
1155 * This also indicates that the buffer is XDR encoded already.
1156 */
1157 xbufp->len = xbufp->head[0].iov_len + xbufp->page_len +
1158 xbufp->tail[0].iov_len;
1159 1153
1160 task->tk_action = call_bc_transmit; 1154 task->tk_action = call_bc_transmit;
1161 atomic_inc(&task->tk_count); 1155 atomic_inc(&task->tk_count);
@@ -1558,7 +1552,6 @@ call_reserveresult(struct rpc_task *task)
1558 task->tk_status = 0; 1552 task->tk_status = 0;
1559 if (status >= 0) { 1553 if (status >= 0) {
1560 if (task->tk_rqstp) { 1554 if (task->tk_rqstp) {
1561 xprt_request_init(task);
1562 task->tk_action = call_refresh; 1555 task->tk_action = call_refresh;
1563 return; 1556 return;
1564 } 1557 }
@@ -1680,7 +1673,7 @@ call_allocate(struct rpc_task *task)
1680 dprint_status(task); 1673 dprint_status(task);
1681 1674
1682 task->tk_status = 0; 1675 task->tk_status = 0;
1683 task->tk_action = call_bind; 1676 task->tk_action = call_encode;
1684 1677
1685 if (req->rq_buffer) 1678 if (req->rq_buffer)
1686 return; 1679 return;
@@ -1721,22 +1714,15 @@ call_allocate(struct rpc_task *task)
1721 rpc_exit(task, -ERESTARTSYS); 1714 rpc_exit(task, -ERESTARTSYS);
1722} 1715}
1723 1716
1724static inline int 1717static int
1725rpc_task_need_encode(struct rpc_task *task) 1718rpc_task_need_encode(struct rpc_task *task)
1726{ 1719{
1727 return task->tk_rqstp->rq_snd_buf.len == 0; 1720 return test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) == 0 &&
1721 (!(task->tk_flags & RPC_TASK_SENT) ||
1722 !(task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) ||
1723 xprt_request_need_retransmit(task));
1728} 1724}
1729 1725
1730static inline void
1731rpc_task_force_reencode(struct rpc_task *task)
1732{
1733 task->tk_rqstp->rq_snd_buf.len = 0;
1734 task->tk_rqstp->rq_bytes_sent = 0;
1735}
1736
1737/*
1738 * 3. Encode arguments of an RPC call
1739 */
1740static void 1726static void
1741rpc_xdr_encode(struct rpc_task *task) 1727rpc_xdr_encode(struct rpc_task *task)
1742{ 1728{
@@ -1752,6 +1738,7 @@ rpc_xdr_encode(struct rpc_task *task)
1752 xdr_buf_init(&req->rq_rcv_buf, 1738 xdr_buf_init(&req->rq_rcv_buf,
1753 req->rq_rbuffer, 1739 req->rq_rbuffer,
1754 req->rq_rcvsize); 1740 req->rq_rcvsize);
1741 req->rq_bytes_sent = 0;
1755 1742
1756 p = rpc_encode_header(task); 1743 p = rpc_encode_header(task);
1757 if (p == NULL) { 1744 if (p == NULL) {
@@ -1766,6 +1753,36 @@ rpc_xdr_encode(struct rpc_task *task)
1766 1753
1767 task->tk_status = rpcauth_wrap_req(task, encode, req, p, 1754 task->tk_status = rpcauth_wrap_req(task, encode, req, p,
1768 task->tk_msg.rpc_argp); 1755 task->tk_msg.rpc_argp);
1756 if (task->tk_status == 0)
1757 xprt_request_prepare(req);
1758}
1759
1760/*
1761 * 3. Encode arguments of an RPC call
1762 */
1763static void
1764call_encode(struct rpc_task *task)
1765{
1766 if (!rpc_task_need_encode(task))
1767 goto out;
1768 /* Encode here so that rpcsec_gss can use correct sequence number. */
1769 rpc_xdr_encode(task);
1770 /* Did the encode result in an error condition? */
1771 if (task->tk_status != 0) {
1772 /* Was the error nonfatal? */
1773 if (task->tk_status == -EAGAIN || task->tk_status == -ENOMEM)
1774 rpc_delay(task, HZ >> 4);
1775 else
1776 rpc_exit(task, task->tk_status);
1777 return;
1778 }
1779
1780 /* Add task to reply queue before transmission to avoid races */
1781 if (rpc_reply_expected(task))
1782 xprt_request_enqueue_receive(task);
1783 xprt_request_enqueue_transmit(task);
1784out:
1785 task->tk_action = call_bind;
1769} 1786}
1770 1787
1771/* 1788/*
@@ -1947,43 +1964,16 @@ call_connect_status(struct rpc_task *task)
1947static void 1964static void
1948call_transmit(struct rpc_task *task) 1965call_transmit(struct rpc_task *task)
1949{ 1966{
1950 int is_retrans = RPC_WAS_SENT(task);
1951
1952 dprint_status(task); 1967 dprint_status(task);
1953 1968
1954 task->tk_action = call_status; 1969 task->tk_status = 0;
1955 if (task->tk_status < 0) 1970 if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) {
1956 return; 1971 if (!xprt_prepare_transmit(task))
1957 if (!xprt_prepare_transmit(task))
1958 return;
1959 task->tk_action = call_transmit_status;
1960 /* Encode here so that rpcsec_gss can use correct sequence number. */
1961 if (rpc_task_need_encode(task)) {
1962 rpc_xdr_encode(task);
1963 /* Did the encode result in an error condition? */
1964 if (task->tk_status != 0) {
1965 /* Was the error nonfatal? */
1966 if (task->tk_status == -EAGAIN)
1967 rpc_delay(task, HZ >> 4);
1968 else
1969 rpc_exit(task, task->tk_status);
1970 return; 1972 return;
1971 } 1973 xprt_transmit(task);
1972 } 1974 }
1973 xprt_transmit(task); 1975 task->tk_action = call_transmit_status;
1974 if (task->tk_status < 0) 1976 xprt_end_transmit(task);
1975 return;
1976 if (is_retrans)
1977 task->tk_client->cl_stats->rpcretrans++;
1978 /*
1979 * On success, ensure that we call xprt_end_transmit() before sleeping
1980 * in order to allow access to the socket to other RPC requests.
1981 */
1982 call_transmit_status(task);
1983 if (rpc_reply_expected(task))
1984 return;
1985 task->tk_action = rpc_exit_task;
1986 rpc_wake_up_queued_task(&task->tk_rqstp->rq_xprt->pending, task);
1987} 1977}
1988 1978
1989/* 1979/*
@@ -1999,19 +1989,17 @@ call_transmit_status(struct rpc_task *task)
1999 * test first. 1989 * test first.
2000 */ 1990 */
2001 if (task->tk_status == 0) { 1991 if (task->tk_status == 0) {
2002 xprt_end_transmit(task); 1992 xprt_request_wait_receive(task);
2003 rpc_task_force_reencode(task);
2004 return; 1993 return;
2005 } 1994 }
2006 1995
2007 switch (task->tk_status) { 1996 switch (task->tk_status) {
2008 case -EAGAIN:
2009 case -ENOBUFS:
2010 break;
2011 default: 1997 default:
2012 dprint_status(task); 1998 dprint_status(task);
2013 xprt_end_transmit(task); 1999 break;
2014 rpc_task_force_reencode(task); 2000 case -EBADMSG:
2001 task->tk_status = 0;
2002 task->tk_action = call_encode;
2015 break; 2003 break;
2016 /* 2004 /*
2017 * Special cases: if we've been waiting on the 2005 * Special cases: if we've been waiting on the
@@ -2019,6 +2007,14 @@ call_transmit_status(struct rpc_task *task)
2019 * socket just returned a connection error, 2007 * socket just returned a connection error,
2020 * then hold onto the transport lock. 2008 * then hold onto the transport lock.
2021 */ 2009 */
2010 case -ENOBUFS:
2011 rpc_delay(task, HZ>>2);
2012 /* fall through */
2013 case -EBADSLT:
2014 case -EAGAIN:
2015 task->tk_action = call_transmit;
2016 task->tk_status = 0;
2017 break;
2022 case -ECONNREFUSED: 2018 case -ECONNREFUSED:
2023 case -EHOSTDOWN: 2019 case -EHOSTDOWN:
2024 case -ENETDOWN: 2020 case -ENETDOWN:
@@ -2026,7 +2022,6 @@ call_transmit_status(struct rpc_task *task)
2026 case -ENETUNREACH: 2022 case -ENETUNREACH:
2027 case -EPERM: 2023 case -EPERM:
2028 if (RPC_IS_SOFTCONN(task)) { 2024 if (RPC_IS_SOFTCONN(task)) {
2029 xprt_end_transmit(task);
2030 if (!task->tk_msg.rpc_proc->p_proc) 2025 if (!task->tk_msg.rpc_proc->p_proc)
2031 trace_xprt_ping(task->tk_xprt, 2026 trace_xprt_ping(task->tk_xprt,
2032 task->tk_status); 2027 task->tk_status);
@@ -2039,7 +2034,7 @@ call_transmit_status(struct rpc_task *task)
2039 case -EADDRINUSE: 2034 case -EADDRINUSE:
2040 case -ENOTCONN: 2035 case -ENOTCONN:
2041 case -EPIPE: 2036 case -EPIPE:
2042 rpc_task_force_reencode(task); 2037 break;
2043 } 2038 }
2044} 2039}
2045 2040
@@ -2053,6 +2048,11 @@ call_bc_transmit(struct rpc_task *task)
2053{ 2048{
2054 struct rpc_rqst *req = task->tk_rqstp; 2049 struct rpc_rqst *req = task->tk_rqstp;
2055 2050
2051 if (rpc_task_need_encode(task))
2052 xprt_request_enqueue_transmit(task);
2053 if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
2054 goto out_wakeup;
2055
2056 if (!xprt_prepare_transmit(task)) 2056 if (!xprt_prepare_transmit(task))
2057 goto out_retry; 2057 goto out_retry;
2058 2058
@@ -2061,14 +2061,9 @@ call_bc_transmit(struct rpc_task *task)
2061 "error: %d\n", task->tk_status); 2061 "error: %d\n", task->tk_status);
2062 goto out_done; 2062 goto out_done;
2063 } 2063 }
2064 if (req->rq_connect_cookie != req->rq_xprt->connect_cookie)
2065 req->rq_bytes_sent = 0;
2066 2064
2067 xprt_transmit(task); 2065 xprt_transmit(task);
2068 2066
2069 if (task->tk_status == -EAGAIN)
2070 goto out_nospace;
2071
2072 xprt_end_transmit(task); 2067 xprt_end_transmit(task);
2073 dprint_status(task); 2068 dprint_status(task);
2074 switch (task->tk_status) { 2069 switch (task->tk_status) {
@@ -2084,6 +2079,8 @@ call_bc_transmit(struct rpc_task *task)
2084 case -ENOTCONN: 2079 case -ENOTCONN:
2085 case -EPIPE: 2080 case -EPIPE:
2086 break; 2081 break;
2082 case -EAGAIN:
2083 goto out_retry;
2087 case -ETIMEDOUT: 2084 case -ETIMEDOUT:
2088 /* 2085 /*
2089 * Problem reaching the server. Disconnect and let the 2086 * Problem reaching the server. Disconnect and let the
@@ -2107,12 +2104,11 @@ call_bc_transmit(struct rpc_task *task)
2107 "error: %d\n", task->tk_status); 2104 "error: %d\n", task->tk_status);
2108 break; 2105 break;
2109 } 2106 }
2107out_wakeup:
2110 rpc_wake_up_queued_task(&req->rq_xprt->pending, task); 2108 rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
2111out_done: 2109out_done:
2112 task->tk_action = rpc_exit_task; 2110 task->tk_action = rpc_exit_task;
2113 return; 2111 return;
2114out_nospace:
2115 req->rq_connect_cookie = req->rq_xprt->connect_cookie;
2116out_retry: 2112out_retry:
2117 task->tk_status = 0; 2113 task->tk_status = 0;
2118} 2114}
@@ -2125,15 +2121,11 @@ static void
2125call_status(struct rpc_task *task) 2121call_status(struct rpc_task *task)
2126{ 2122{
2127 struct rpc_clnt *clnt = task->tk_client; 2123 struct rpc_clnt *clnt = task->tk_client;
2128 struct rpc_rqst *req = task->tk_rqstp;
2129 int status; 2124 int status;
2130 2125
2131 if (!task->tk_msg.rpc_proc->p_proc) 2126 if (!task->tk_msg.rpc_proc->p_proc)
2132 trace_xprt_ping(task->tk_xprt, task->tk_status); 2127 trace_xprt_ping(task->tk_xprt, task->tk_status);
2133 2128
2134 if (req->rq_reply_bytes_recvd > 0 && !req->rq_bytes_sent)
2135 task->tk_status = req->rq_reply_bytes_recvd;
2136
2137 dprint_status(task); 2129 dprint_status(task);
2138 2130
2139 status = task->tk_status; 2131 status = task->tk_status;
@@ -2173,13 +2165,8 @@ call_status(struct rpc_task *task)
2173 /* fall through */ 2165 /* fall through */
2174 case -EPIPE: 2166 case -EPIPE:
2175 case -ENOTCONN: 2167 case -ENOTCONN:
2176 task->tk_action = call_bind;
2177 break;
2178 case -ENOBUFS:
2179 rpc_delay(task, HZ>>2);
2180 /* fall through */
2181 case -EAGAIN: 2168 case -EAGAIN:
2182 task->tk_action = call_transmit; 2169 task->tk_action = call_encode;
2183 break; 2170 break;
2184 case -EIO: 2171 case -EIO:
2185 /* shutdown or soft timeout */ 2172 /* shutdown or soft timeout */
@@ -2244,7 +2231,7 @@ call_timeout(struct rpc_task *task)
2244 rpcauth_invalcred(task); 2231 rpcauth_invalcred(task);
2245 2232
2246retry: 2233retry:
2247 task->tk_action = call_bind; 2234 task->tk_action = call_encode;
2248 task->tk_status = 0; 2235 task->tk_status = 0;
2249} 2236}
2250 2237
@@ -2261,6 +2248,11 @@ call_decode(struct rpc_task *task)
2261 2248
2262 dprint_status(task); 2249 dprint_status(task);
2263 2250
2251 if (!decode) {
2252 task->tk_action = rpc_exit_task;
2253 return;
2254 }
2255
2264 if (task->tk_flags & RPC_CALL_MAJORSEEN) { 2256 if (task->tk_flags & RPC_CALL_MAJORSEEN) {
2265 if (clnt->cl_chatty) { 2257 if (clnt->cl_chatty) {
2266 printk(KERN_NOTICE "%s: server %s OK\n", 2258 printk(KERN_NOTICE "%s: server %s OK\n",
@@ -2283,7 +2275,7 @@ call_decode(struct rpc_task *task)
2283 2275
2284 if (req->rq_rcv_buf.len < 12) { 2276 if (req->rq_rcv_buf.len < 12) {
2285 if (!RPC_IS_SOFT(task)) { 2277 if (!RPC_IS_SOFT(task)) {
2286 task->tk_action = call_bind; 2278 task->tk_action = call_encode;
2287 goto out_retry; 2279 goto out_retry;
2288 } 2280 }
2289 dprintk("RPC: %s: too small RPC reply size (%d bytes)\n", 2281 dprintk("RPC: %s: too small RPC reply size (%d bytes)\n",
@@ -2298,13 +2290,11 @@ call_decode(struct rpc_task *task)
2298 goto out_retry; 2290 goto out_retry;
2299 return; 2291 return;
2300 } 2292 }
2301
2302 task->tk_action = rpc_exit_task; 2293 task->tk_action = rpc_exit_task;
2303 2294
2304 if (decode) { 2295 task->tk_status = rpcauth_unwrap_resp(task, decode, req, p,
2305 task->tk_status = rpcauth_unwrap_resp(task, decode, req, p, 2296 task->tk_msg.rpc_resp);
2306 task->tk_msg.rpc_resp); 2297
2307 }
2308 dprintk("RPC: %5u call_decode result %d\n", task->tk_pid, 2298 dprintk("RPC: %5u call_decode result %d\n", task->tk_pid,
2309 task->tk_status); 2299 task->tk_status);
2310 return; 2300 return;
@@ -2416,7 +2406,7 @@ rpc_verify_header(struct rpc_task *task)
2416 task->tk_garb_retry--; 2406 task->tk_garb_retry--;
2417 dprintk("RPC: %5u %s: retry garbled creds\n", 2407 dprintk("RPC: %5u %s: retry garbled creds\n",
2418 task->tk_pid, __func__); 2408 task->tk_pid, __func__);
2419 task->tk_action = call_bind; 2409 task->tk_action = call_encode;
2420 goto out_retry; 2410 goto out_retry;
2421 case RPC_AUTH_TOOWEAK: 2411 case RPC_AUTH_TOOWEAK:
2422 printk(KERN_NOTICE "RPC: server %s requires stronger " 2412 printk(KERN_NOTICE "RPC: server %s requires stronger "
@@ -2485,7 +2475,7 @@ out_garbage:
2485 task->tk_garb_retry--; 2475 task->tk_garb_retry--;
2486 dprintk("RPC: %5u %s: retrying\n", 2476 dprintk("RPC: %5u %s: retrying\n",
2487 task->tk_pid, __func__); 2477 task->tk_pid, __func__);
2488 task->tk_action = call_bind; 2478 task->tk_action = call_encode;
2489out_retry: 2479out_retry:
2490 return ERR_PTR(-EAGAIN); 2480 return ERR_PTR(-EAGAIN);
2491 } 2481 }
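
The clnt.c changes above split XDR encoding out of call_transmit() into a dedicated call_encode() step that runs before call_bind(), and the retry paths (call_status, call_timeout, rpc_verify_header) now re-enter the state machine at call_encode rather than call_bind. The sketch below is a simplified, hypothetical userspace model of that reordered state machine, not the kernel code; the state names and the encoded/retries fields are illustrative only.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative states only; the real client FSM lives in net/sunrpc/clnt.c. */
enum call_state { ALLOCATE, ENCODE, BIND, CONNECT, TRANSMIT, RECEIVE, DECODE, DONE };

struct call {
	enum call_state state;
	bool encoded;	/* stands in for the RPC_TASK_NEED_XMIT bookkeeping */
	int retries;	/* pretend timeouts that force a retransmission */
};

static enum call_state step(struct call *c)
{
	switch (c->state) {
	case ALLOCATE:	return ENCODE;		/* encode now precedes bind/connect */
	case ENCODE:	c->encoded = true;
			return BIND;
	case BIND:	return CONNECT;
	case CONNECT:	return TRANSMIT;
	case TRANSMIT:	return RECEIVE;
	case RECEIVE:	if (c->retries-- > 0)
				return ENCODE;	/* retransmit: back to the encode step */
			return DECODE;
	case DECODE:	return DONE;
	default:	return DONE;
	}
}

int main(void)
{
	struct call c = { .state = ALLOCATE, .retries = 1 };

	while (c.state != DONE) {
		printf("state %d\n", c.state);
		c.state = step(&c);
	}
	return 0;
}
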
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 3fe5d60ab0e2..57ca5bead1cb 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -99,65 +99,79 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
99 list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list); 99 list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list);
100} 100}
101 101
102static void rpc_rotate_queue_owner(struct rpc_wait_queue *queue)
103{
104 struct list_head *q = &queue->tasks[queue->priority];
105 struct rpc_task *task;
106
107 if (!list_empty(q)) {
108 task = list_first_entry(q, struct rpc_task, u.tk_wait.list);
109 if (task->tk_owner == queue->owner)
110 list_move_tail(&task->u.tk_wait.list, q);
111 }
112}
113
114static void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority) 102static void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority)
115{ 103{
116 if (queue->priority != priority) { 104 if (queue->priority != priority) {
117 /* Fairness: rotate the list when changing priority */
118 rpc_rotate_queue_owner(queue);
119 queue->priority = priority; 105 queue->priority = priority;
106 queue->nr = 1U << priority;
120 } 107 }
121} 108}
122 109
123static void rpc_set_waitqueue_owner(struct rpc_wait_queue *queue, pid_t pid)
124{
125 queue->owner = pid;
126 queue->nr = RPC_BATCH_COUNT;
127}
128
129static void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue) 110static void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue)
130{ 111{
131 rpc_set_waitqueue_priority(queue, queue->maxpriority); 112 rpc_set_waitqueue_priority(queue, queue->maxpriority);
132 rpc_set_waitqueue_owner(queue, 0);
133} 113}
134 114
135/* 115/*
136 * Add new request to a priority queue. 116 * Add a request to a queue list
137 */ 117 */
138static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, 118static void
139 struct rpc_task *task, 119__rpc_list_enqueue_task(struct list_head *q, struct rpc_task *task)
140 unsigned char queue_priority)
141{ 120{
142 struct list_head *q;
143 struct rpc_task *t; 121 struct rpc_task *t;
144 122
145 INIT_LIST_HEAD(&task->u.tk_wait.links);
146 if (unlikely(queue_priority > queue->maxpriority))
147 queue_priority = queue->maxpriority;
148 if (queue_priority > queue->priority)
149 rpc_set_waitqueue_priority(queue, queue_priority);
150 q = &queue->tasks[queue_priority];
151 list_for_each_entry(t, q, u.tk_wait.list) { 123 list_for_each_entry(t, q, u.tk_wait.list) {
152 if (t->tk_owner == task->tk_owner) { 124 if (t->tk_owner == task->tk_owner) {
153 list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links); 125 list_add_tail(&task->u.tk_wait.links,
126 &t->u.tk_wait.links);
127 /* Cache the queue head in task->u.tk_wait.list */
128 task->u.tk_wait.list.next = q;
129 task->u.tk_wait.list.prev = NULL;
154 return; 130 return;
155 } 131 }
156 } 132 }
133 INIT_LIST_HEAD(&task->u.tk_wait.links);
157 list_add_tail(&task->u.tk_wait.list, q); 134 list_add_tail(&task->u.tk_wait.list, q);
158} 135}
159 136
160/* 137/*
138 * Remove request from a queue list
139 */
140static void
141__rpc_list_dequeue_task(struct rpc_task *task)
142{
143 struct list_head *q;
144 struct rpc_task *t;
145
146 if (task->u.tk_wait.list.prev == NULL) {
147 list_del(&task->u.tk_wait.links);
148 return;
149 }
150 if (!list_empty(&task->u.tk_wait.links)) {
151 t = list_first_entry(&task->u.tk_wait.links,
152 struct rpc_task,
153 u.tk_wait.links);
154 /* Assume __rpc_list_enqueue_task() cached the queue head */
155 q = t->u.tk_wait.list.next;
156 list_add_tail(&t->u.tk_wait.list, q);
157 list_del(&task->u.tk_wait.links);
158 }
159 list_del(&task->u.tk_wait.list);
160}
161
162/*
163 * Add new request to a priority queue.
164 */
165static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue,
166 struct rpc_task *task,
167 unsigned char queue_priority)
168{
169 if (unlikely(queue_priority > queue->maxpriority))
170 queue_priority = queue->maxpriority;
171 __rpc_list_enqueue_task(&queue->tasks[queue_priority], task);
172}
173
174/*
161 * Add new request to wait queue. 175 * Add new request to wait queue.
162 * 176 *
163 * Swapper tasks always get inserted at the head of the queue. 177 * Swapper tasks always get inserted at the head of the queue.
@@ -194,13 +208,7 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue,
194 */ 208 */
195static void __rpc_remove_wait_queue_priority(struct rpc_task *task) 209static void __rpc_remove_wait_queue_priority(struct rpc_task *task)
196{ 210{
197 struct rpc_task *t; 211 __rpc_list_dequeue_task(task);
198
199 if (!list_empty(&task->u.tk_wait.links)) {
200 t = list_entry(task->u.tk_wait.links.next, struct rpc_task, u.tk_wait.list);
201 list_move(&t->u.tk_wait.list, &task->u.tk_wait.list);
202 list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links);
203 }
204} 212}
205 213
206/* 214/*
@@ -212,7 +220,8 @@ static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_tas
212 __rpc_disable_timer(queue, task); 220 __rpc_disable_timer(queue, task);
213 if (RPC_IS_PRIORITY(queue)) 221 if (RPC_IS_PRIORITY(queue))
214 __rpc_remove_wait_queue_priority(task); 222 __rpc_remove_wait_queue_priority(task);
215 list_del(&task->u.tk_wait.list); 223 else
224 list_del(&task->u.tk_wait.list);
216 queue->qlen--; 225 queue->qlen--;
217 dprintk("RPC: %5u removed from queue %p \"%s\"\n", 226 dprintk("RPC: %5u removed from queue %p \"%s\"\n",
218 task->tk_pid, queue, rpc_qname(queue)); 227 task->tk_pid, queue, rpc_qname(queue));
@@ -440,14 +449,28 @@ static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq,
440/* 449/*
441 * Wake up a queued task while the queue lock is being held 450 * Wake up a queued task while the queue lock is being held
442 */ 451 */
443static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, 452static struct rpc_task *
444 struct rpc_wait_queue *queue, struct rpc_task *task) 453rpc_wake_up_task_on_wq_queue_action_locked(struct workqueue_struct *wq,
454 struct rpc_wait_queue *queue, struct rpc_task *task,
455 bool (*action)(struct rpc_task *, void *), void *data)
445{ 456{
446 if (RPC_IS_QUEUED(task)) { 457 if (RPC_IS_QUEUED(task)) {
447 smp_rmb(); 458 smp_rmb();
448 if (task->tk_waitqueue == queue) 459 if (task->tk_waitqueue == queue) {
449 __rpc_do_wake_up_task_on_wq(wq, queue, task); 460 if (action == NULL || action(task, data)) {
461 __rpc_do_wake_up_task_on_wq(wq, queue, task);
462 return task;
463 }
464 }
450 } 465 }
466 return NULL;
467}
468
469static void
470rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq,
471 struct rpc_wait_queue *queue, struct rpc_task *task)
472{
473 rpc_wake_up_task_on_wq_queue_action_locked(wq, queue, task, NULL, NULL);
451} 474}
452 475
453/* 476/*
@@ -465,6 +488,8 @@ void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq,
465 struct rpc_wait_queue *queue, 488 struct rpc_wait_queue *queue,
466 struct rpc_task *task) 489 struct rpc_task *task)
467{ 490{
491 if (!RPC_IS_QUEUED(task))
492 return;
468 spin_lock_bh(&queue->lock); 493 spin_lock_bh(&queue->lock);
469 rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); 494 rpc_wake_up_task_on_wq_queue_locked(wq, queue, task);
470 spin_unlock_bh(&queue->lock); 495 spin_unlock_bh(&queue->lock);
@@ -475,12 +500,48 @@ void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq,
475 */ 500 */
476void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task) 501void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task)
477{ 502{
503 if (!RPC_IS_QUEUED(task))
504 return;
478 spin_lock_bh(&queue->lock); 505 spin_lock_bh(&queue->lock);
479 rpc_wake_up_task_queue_locked(queue, task); 506 rpc_wake_up_task_queue_locked(queue, task);
480 spin_unlock_bh(&queue->lock); 507 spin_unlock_bh(&queue->lock);
481} 508}
482EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task); 509EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task);
483 510
511static bool rpc_task_action_set_status(struct rpc_task *task, void *status)
512{
513 task->tk_status = *(int *)status;
514 return true;
515}
516
517static void
518rpc_wake_up_task_queue_set_status_locked(struct rpc_wait_queue *queue,
519 struct rpc_task *task, int status)
520{
521 rpc_wake_up_task_on_wq_queue_action_locked(rpciod_workqueue, queue,
522 task, rpc_task_action_set_status, &status);
523}
524
525/**
526 * rpc_wake_up_queued_task_set_status - wake up a task and set task->tk_status
527 * @queue: pointer to rpc_wait_queue
528 * @task: pointer to rpc_task
529 * @status: integer error value
530 *
531 * If @task is queued on @queue, then it is woken up, and @task->tk_status is
532 * set to the value of @status.
533 */
534void
535rpc_wake_up_queued_task_set_status(struct rpc_wait_queue *queue,
536 struct rpc_task *task, int status)
537{
538 if (!RPC_IS_QUEUED(task))
539 return;
540 spin_lock_bh(&queue->lock);
541 rpc_wake_up_task_queue_set_status_locked(queue, task, status);
542 spin_unlock_bh(&queue->lock);
543}
544
484/* 545/*
485 * Wake up the next task on a priority queue. 546 * Wake up the next task on a priority queue.
486 */ 547 */
@@ -493,17 +554,9 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q
493 * Service a batch of tasks from a single owner. 554 * Service a batch of tasks from a single owner.
494 */ 555 */
495 q = &queue->tasks[queue->priority]; 556 q = &queue->tasks[queue->priority];
496 if (!list_empty(q)) { 557 if (!list_empty(q) && --queue->nr) {
497 task = list_entry(q->next, struct rpc_task, u.tk_wait.list); 558 task = list_first_entry(q, struct rpc_task, u.tk_wait.list);
498 if (queue->owner == task->tk_owner) { 559 goto out;
499 if (--queue->nr)
500 goto out;
501 list_move_tail(&task->u.tk_wait.list, q);
502 }
503 /*
504 * Check if we need to switch queues.
505 */
506 goto new_owner;
507 } 560 }
508 561
509 /* 562 /*
@@ -515,7 +568,7 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q
515 else 568 else
516 q = q - 1; 569 q = q - 1;
517 if (!list_empty(q)) { 570 if (!list_empty(q)) {
518 task = list_entry(q->next, struct rpc_task, u.tk_wait.list); 571 task = list_first_entry(q, struct rpc_task, u.tk_wait.list);
519 goto new_queue; 572 goto new_queue;
520 } 573 }
521 } while (q != &queue->tasks[queue->priority]); 574 } while (q != &queue->tasks[queue->priority]);
@@ -525,8 +578,6 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q
525 578
526new_queue: 579new_queue:
527 rpc_set_waitqueue_priority(queue, (unsigned int)(q - &queue->tasks[0])); 580 rpc_set_waitqueue_priority(queue, (unsigned int)(q - &queue->tasks[0]));
528new_owner:
529 rpc_set_waitqueue_owner(queue, task->tk_owner);
530out: 581out:
531 return task; 582 return task;
532} 583}
@@ -553,12 +604,9 @@ struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq,
553 queue, rpc_qname(queue)); 604 queue, rpc_qname(queue));
554 spin_lock_bh(&queue->lock); 605 spin_lock_bh(&queue->lock);
555 task = __rpc_find_next_queued(queue); 606 task = __rpc_find_next_queued(queue);
556 if (task != NULL) { 607 if (task != NULL)
557 if (func(task, data)) 608 task = rpc_wake_up_task_on_wq_queue_action_locked(wq, queue,
558 rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); 609 task, func, data);
559 else
560 task = NULL;
561 }
562 spin_unlock_bh(&queue->lock); 610 spin_unlock_bh(&queue->lock);
563 611
564 return task; 612 return task;
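
The sched.c rework above drops the per-owner queue rotation and instead primes queue->nr to 1U << priority, so __rpc_find_next_queued_priority() keeps draining the current priority list until that batch budget runs out, then rotates to another non-empty level. Below is a small, hypothetical userspace analogue of that batching rule (counters instead of task lists, and a plain round-robin scan instead of the kernel's level walk); it sketches the idea rather than the scheduler itself.

#include <stdio.h>

#define NPRIO 3

struct prio_queue {
	int len[NPRIO];		/* tasks pending at each priority level */
	unsigned int priority;	/* level currently being served */
	unsigned int nr;	/* remaining batch budget at that level */
};

static int pick_next(struct prio_queue *q)
{
	unsigned int p = q->priority;

	/* Stay on the current level while the batch budget lasts. */
	if (q->len[p] && --q->nr) {
		q->len[p]--;
		return (int)p;
	}
	/* Budget spent or level empty: find another non-empty level. */
	for (unsigned int i = 0; i < NPRIO; i++) {
		p = (p + 1) % NPRIO;
		if (q->len[p]) {
			q->priority = p;
			q->nr = 1U << p;	/* higher level => larger batch */
			q->len[p]--;
			return (int)p;
		}
	}
	return -1;			/* nothing queued anywhere */
}

int main(void)
{
	struct prio_queue q = { .len = { 2, 3, 4 }, .priority = 2, .nr = 1U << 2 };
	int lvl;

	while ((lvl = pick_next(&q)) >= 0)
		printf("served a task at priority %d\n", lvl);
	return 0;
}
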
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index f217c348b341..9062967575c4 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -26,7 +26,8 @@
26 * Possibly called several times to iterate over an sk_buff and copy 26 * Possibly called several times to iterate over an sk_buff and copy
27 * data out of it. 27 * data out of it.
28 */ 28 */
29size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len) 29static size_t
30xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len)
30{ 31{
31 if (len > desc->count) 32 if (len > desc->count)
32 len = desc->count; 33 len = desc->count;
@@ -36,7 +37,6 @@ size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len)
36 desc->offset += len; 37 desc->offset += len;
37 return len; 38 return len;
38} 39}
39EXPORT_SYMBOL_GPL(xdr_skb_read_bits);
40 40
41/** 41/**
42 * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer 42 * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer
@@ -69,7 +69,8 @@ static size_t xdr_skb_read_and_csum_bits(struct xdr_skb_reader *desc, void *to,
69 * @copy_actor: virtual method for copying data 69 * @copy_actor: virtual method for copying data
70 * 70 *
71 */ 71 */
72ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor) 72static ssize_t
73xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor)
73{ 74{
74 struct page **ppage = xdr->pages; 75 struct page **ppage = xdr->pages;
75 unsigned int len, pglen = xdr->page_len; 76 unsigned int len, pglen = xdr->page_len;
@@ -104,7 +105,7 @@ ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct
104 105
105 /* ACL likes to be lazy in allocating pages - ACLs 106 /* ACL likes to be lazy in allocating pages - ACLs
106 * are small by default but can get huge. */ 107 * are small by default but can get huge. */
107 if (unlikely(*ppage == NULL)) { 108 if ((xdr->flags & XDRBUF_SPARSE_PAGES) && *ppage == NULL) {
108 *ppage = alloc_page(GFP_ATOMIC); 109 *ppage = alloc_page(GFP_ATOMIC);
109 if (unlikely(*ppage == NULL)) { 110 if (unlikely(*ppage == NULL)) {
110 if (copied == 0) 111 if (copied == 0)
@@ -140,7 +141,6 @@ copy_tail:
140out: 141out:
141 return copied; 142 return copied;
142} 143}
143EXPORT_SYMBOL_GPL(xdr_partial_copy_from_skb);
144 144
145/** 145/**
146 * csum_partial_copy_to_xdr - checksum and copy data 146 * csum_partial_copy_to_xdr - checksum and copy data
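
With the socklib.c change above, xdr_partial_copy_from_skb() only allocates missing receive pages on demand when the xdr_buf is marked XDRBUF_SPARSE_PAGES; previously any NULL page slot triggered an allocation. The snippet below is a hypothetical userspace sketch of that pattern (struct page_buf, BUF_SPARSE_PAGES and copy_to_pages() are made-up names); it treats a missing page in a non-sparse buffer as an error, whereas the kernel simply assumes non-sparse buffers already have their pages.

#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

#define PAGE_SIZE	4096
#define NR_PAGES	8
#define BUF_SPARSE_PAGES 0x1	/* made-up flag mirroring XDRBUF_SPARSE_PAGES */

struct page_buf {
	unsigned char *pages[NR_PAGES];	/* entries may be NULL when sparse */
	unsigned int flags;
};

/* Copy len bytes into the page array at offset, allocating missing pages
 * only when the buffer is marked sparse. */
static ssize_t copy_to_pages(struct page_buf *buf, size_t offset,
			     const void *src, size_t len)
{
	size_t copied = 0;

	while (len) {
		size_t i = offset / PAGE_SIZE;
		size_t off = offset % PAGE_SIZE;
		size_t chunk = PAGE_SIZE - off;

		if (i >= NR_PAGES)
			break;
		if (chunk > len)
			chunk = len;
		if (buf->pages[i] == NULL) {
			if (!(buf->flags & BUF_SPARSE_PAGES))
				break;	/* non-sparse buffers must be pre-populated */
			buf->pages[i] = malloc(PAGE_SIZE);
			if (buf->pages[i] == NULL)
				break;
		}
		memcpy(buf->pages[i] + off, (const char *)src + copied, chunk);
		copied += chunk;
		offset += chunk;
		len -= chunk;
	}
	return copied ? (ssize_t)copied : -1;
}
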
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 5185efb9027b..87533fbb96cf 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -171,7 +171,6 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl,
171 mutex_init(&xprt->xpt_mutex); 171 mutex_init(&xprt->xpt_mutex);
172 spin_lock_init(&xprt->xpt_lock); 172 spin_lock_init(&xprt->xpt_lock);
173 set_bit(XPT_BUSY, &xprt->xpt_flags); 173 set_bit(XPT_BUSY, &xprt->xpt_flags);
174 rpc_init_wait_queue(&xprt->xpt_bc_pending, "xpt_bc_pending");
175 xprt->xpt_net = get_net(net); 174 xprt->xpt_net = get_net(net);
176 strcpy(xprt->xpt_remotebuf, "uninitialized"); 175 strcpy(xprt->xpt_remotebuf, "uninitialized");
177} 176}
@@ -895,7 +894,6 @@ int svc_send(struct svc_rqst *rqstp)
895 else 894 else
896 len = xprt->xpt_ops->xpo_sendto(rqstp); 895 len = xprt->xpt_ops->xpo_sendto(rqstp);
897 mutex_unlock(&xprt->xpt_mutex); 896 mutex_unlock(&xprt->xpt_mutex);
898 rpc_wake_up(&xprt->xpt_bc_pending);
899 trace_svc_send(rqstp, len); 897 trace_svc_send(rqstp, len);
900 svc_xprt_release(rqstp); 898 svc_xprt_release(rqstp);
901 899
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 5445145e639c..db8bb6b3a2b0 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1004,7 +1004,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp)
1004 1004
1005 if (!bc_xprt) 1005 if (!bc_xprt)
1006 return -EAGAIN; 1006 return -EAGAIN;
1007 spin_lock(&bc_xprt->recv_lock); 1007 spin_lock(&bc_xprt->queue_lock);
1008 req = xprt_lookup_rqst(bc_xprt, xid); 1008 req = xprt_lookup_rqst(bc_xprt, xid);
1009 if (!req) 1009 if (!req)
1010 goto unlock_notfound; 1010 goto unlock_notfound;
@@ -1022,7 +1022,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp)
1022 memcpy(dst->iov_base, src->iov_base, src->iov_len); 1022 memcpy(dst->iov_base, src->iov_base, src->iov_len);
1023 xprt_complete_rqst(req->rq_task, rqstp->rq_arg.len); 1023 xprt_complete_rqst(req->rq_task, rqstp->rq_arg.len);
1024 rqstp->rq_arg.len = 0; 1024 rqstp->rq_arg.len = 0;
1025 spin_unlock(&bc_xprt->recv_lock); 1025 spin_unlock(&bc_xprt->queue_lock);
1026 return 0; 1026 return 0;
1027unlock_notfound: 1027unlock_notfound:
1028 printk(KERN_NOTICE 1028 printk(KERN_NOTICE
@@ -1031,7 +1031,7 @@ unlock_notfound:
1031 __func__, ntohl(calldir), 1031 __func__, ntohl(calldir),
1032 bc_xprt, ntohl(xid)); 1032 bc_xprt, ntohl(xid));
1033unlock_eagain: 1033unlock_eagain:
1034 spin_unlock(&bc_xprt->recv_lock); 1034 spin_unlock(&bc_xprt->queue_lock);
1035 return -EAGAIN; 1035 return -EAGAIN;
1036} 1036}
1037 1037
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 30afbd236656..2bbb8d38d2bf 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -15,6 +15,7 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/sunrpc/xdr.h> 16#include <linux/sunrpc/xdr.h>
17#include <linux/sunrpc/msg_prot.h> 17#include <linux/sunrpc/msg_prot.h>
18#include <linux/bvec.h>
18 19
19/* 20/*
20 * XDR functions for basic NFS types 21 * XDR functions for basic NFS types
@@ -128,6 +129,39 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len)
128} 129}
129EXPORT_SYMBOL_GPL(xdr_terminate_string); 130EXPORT_SYMBOL_GPL(xdr_terminate_string);
130 131
132size_t
133xdr_buf_pagecount(struct xdr_buf *buf)
134{
135 if (!buf->page_len)
136 return 0;
137 return (buf->page_base + buf->page_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
138}
139
140int
141xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp)
142{
143 size_t i, n = xdr_buf_pagecount(buf);
144
145 if (n != 0 && buf->bvec == NULL) {
146 buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp);
147 if (!buf->bvec)
148 return -ENOMEM;
149 for (i = 0; i < n; i++) {
150 buf->bvec[i].bv_page = buf->pages[i];
151 buf->bvec[i].bv_len = PAGE_SIZE;
152 buf->bvec[i].bv_offset = 0;
153 }
154 }
155 return 0;
156}
157
158void
159xdr_free_bvec(struct xdr_buf *buf)
160{
161 kfree(buf->bvec);
162 buf->bvec = NULL;
163}
164
131void 165void
132xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, 166xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
133 struct page **pages, unsigned int base, unsigned int len) 167 struct page **pages, unsigned int base, unsigned int len)
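
The new xdr.c helpers above size and populate a bio_vec array covering the pages of an xdr_buf. A hypothetical userspace analogue follows, using struct iovec where the kernel uses struct bio_vec (struct pg_buf and the pg_buf_* names are invented for illustration); the page-count arithmetic mirrors xdr_buf_pagecount().

#include <stdlib.h>
#include <sys/uio.h>

#define PAGE_SIZE  4096UL
#define PAGE_SHIFT 12

struct pg_buf {
	void **pages;
	size_t page_base;	/* offset of the data within pages[0] */
	size_t page_len;	/* total bytes described by the page array */
	struct iovec *vec;	/* built on demand, one entry per page */
};

static size_t pg_buf_pagecount(const struct pg_buf *buf)
{
	if (!buf->page_len)
		return 0;
	return (buf->page_base + buf->page_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

static int pg_buf_alloc_vec(struct pg_buf *buf)
{
	size_t n = pg_buf_pagecount(buf);

	if (n != 0 && buf->vec == NULL) {
		buf->vec = calloc(n, sizeof(buf->vec[0]));
		if (!buf->vec)
			return -1;
		for (size_t i = 0; i < n; i++) {
			buf->vec[i].iov_base = buf->pages[i];
			buf->vec[i].iov_len = PAGE_SIZE;
		}
	}
	return 0;
}

static void pg_buf_free_vec(struct pg_buf *buf)
{
	free(buf->vec);
	buf->vec = NULL;
}
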
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index a8db2e3f8904..86bea4520c4d 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -68,8 +68,6 @@
68static void xprt_init(struct rpc_xprt *xprt, struct net *net); 68static void xprt_init(struct rpc_xprt *xprt, struct net *net);
69static __be32 xprt_alloc_xid(struct rpc_xprt *xprt); 69static __be32 xprt_alloc_xid(struct rpc_xprt *xprt);
70static void xprt_connect_status(struct rpc_task *task); 70static void xprt_connect_status(struct rpc_task *task);
71static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
72static void __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *);
73static void xprt_destroy(struct rpc_xprt *xprt); 71static void xprt_destroy(struct rpc_xprt *xprt);
74 72
75static DEFINE_SPINLOCK(xprt_list_lock); 73static DEFINE_SPINLOCK(xprt_list_lock);
@@ -171,6 +169,17 @@ out:
171} 169}
172EXPORT_SYMBOL_GPL(xprt_load_transport); 170EXPORT_SYMBOL_GPL(xprt_load_transport);
173 171
172static void xprt_clear_locked(struct rpc_xprt *xprt)
173{
174 xprt->snd_task = NULL;
175 if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) {
176 smp_mb__before_atomic();
177 clear_bit(XPRT_LOCKED, &xprt->state);
178 smp_mb__after_atomic();
179 } else
180 queue_work(xprtiod_workqueue, &xprt->task_cleanup);
181}
182
174/** 183/**
175 * xprt_reserve_xprt - serialize write access to transports 184 * xprt_reserve_xprt - serialize write access to transports
176 * @task: task that is requesting access to the transport 185 * @task: task that is requesting access to the transport
@@ -183,44 +192,53 @@ EXPORT_SYMBOL_GPL(xprt_load_transport);
183int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) 192int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
184{ 193{
185 struct rpc_rqst *req = task->tk_rqstp; 194 struct rpc_rqst *req = task->tk_rqstp;
186 int priority;
187 195
188 if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { 196 if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
189 if (task == xprt->snd_task) 197 if (task == xprt->snd_task)
190 return 1; 198 return 1;
191 goto out_sleep; 199 goto out_sleep;
192 } 200 }
201 if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
202 goto out_unlock;
193 xprt->snd_task = task; 203 xprt->snd_task = task;
194 if (req != NULL)
195 req->rq_ntrans++;
196 204
197 return 1; 205 return 1;
198 206
207out_unlock:
208 xprt_clear_locked(xprt);
199out_sleep: 209out_sleep:
200 dprintk("RPC: %5u failed to lock transport %p\n", 210 dprintk("RPC: %5u failed to lock transport %p\n",
201 task->tk_pid, xprt); 211 task->tk_pid, xprt);
202 task->tk_timeout = 0; 212 task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0;
203 task->tk_status = -EAGAIN; 213 task->tk_status = -EAGAIN;
204 if (req == NULL) 214 rpc_sleep_on(&xprt->sending, task, NULL);
205 priority = RPC_PRIORITY_LOW;
206 else if (!req->rq_ntrans)
207 priority = RPC_PRIORITY_NORMAL;
208 else
209 priority = RPC_PRIORITY_HIGH;
210 rpc_sleep_on_priority(&xprt->sending, task, NULL, priority);
211 return 0; 215 return 0;
212} 216}
213EXPORT_SYMBOL_GPL(xprt_reserve_xprt); 217EXPORT_SYMBOL_GPL(xprt_reserve_xprt);
214 218
215static void xprt_clear_locked(struct rpc_xprt *xprt) 219static bool
220xprt_need_congestion_window_wait(struct rpc_xprt *xprt)
216{ 221{
217 xprt->snd_task = NULL; 222 return test_bit(XPRT_CWND_WAIT, &xprt->state);
218 if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { 223}
219 smp_mb__before_atomic(); 224
220 clear_bit(XPRT_LOCKED, &xprt->state); 225static void
221 smp_mb__after_atomic(); 226xprt_set_congestion_window_wait(struct rpc_xprt *xprt)
222 } else 227{
223 queue_work(xprtiod_workqueue, &xprt->task_cleanup); 228 if (!list_empty(&xprt->xmit_queue)) {
229 /* Peek at head of queue to see if it can make progress */
230 if (list_first_entry(&xprt->xmit_queue, struct rpc_rqst,
231 rq_xmit)->rq_cong)
232 return;
233 }
234 set_bit(XPRT_CWND_WAIT, &xprt->state);
235}
236
237static void
238xprt_test_and_clear_congestion_window_wait(struct rpc_xprt *xprt)
239{
240 if (!RPCXPRT_CONGESTED(xprt))
241 clear_bit(XPRT_CWND_WAIT, &xprt->state);
224} 242}
225 243
226/* 244/*
@@ -230,11 +248,11 @@ static void xprt_clear_locked(struct rpc_xprt *xprt)
230 * Same as xprt_reserve_xprt, but Van Jacobson congestion control is 248 * Same as xprt_reserve_xprt, but Van Jacobson congestion control is
231 * integrated into the decision of whether a request is allowed to be 249 * integrated into the decision of whether a request is allowed to be
232 * woken up and given access to the transport. 250 * woken up and given access to the transport.
251 * Note that the lock is only granted if we know there are free slots.
233 */ 252 */
234int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) 253int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
235{ 254{
236 struct rpc_rqst *req = task->tk_rqstp; 255 struct rpc_rqst *req = task->tk_rqstp;
237 int priority;
238 256
239 if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { 257 if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
240 if (task == xprt->snd_task) 258 if (task == xprt->snd_task)
@@ -245,25 +263,19 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
245 xprt->snd_task = task; 263 xprt->snd_task = task;
246 return 1; 264 return 1;
247 } 265 }
248 if (__xprt_get_cong(xprt, task)) { 266 if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
267 goto out_unlock;
268 if (!xprt_need_congestion_window_wait(xprt)) {
249 xprt->snd_task = task; 269 xprt->snd_task = task;
250 req->rq_ntrans++;
251 return 1; 270 return 1;
252 } 271 }
272out_unlock:
253 xprt_clear_locked(xprt); 273 xprt_clear_locked(xprt);
254out_sleep: 274out_sleep:
255 if (req)
256 __xprt_put_cong(xprt, req);
257 dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt); 275 dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt);
258 task->tk_timeout = 0; 276 task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0;
259 task->tk_status = -EAGAIN; 277 task->tk_status = -EAGAIN;
260 if (req == NULL) 278 rpc_sleep_on(&xprt->sending, task, NULL);
261 priority = RPC_PRIORITY_LOW;
262 else if (!req->rq_ntrans)
263 priority = RPC_PRIORITY_NORMAL;
264 else
265 priority = RPC_PRIORITY_HIGH;
266 rpc_sleep_on_priority(&xprt->sending, task, NULL, priority);
267 return 0; 279 return 0;
268} 280}
269EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong); 281EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong);
@@ -272,6 +284,8 @@ static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
272{ 284{
273 int retval; 285 int retval;
274 286
287 if (test_bit(XPRT_LOCKED, &xprt->state) && xprt->snd_task == task)
288 return 1;
275 spin_lock_bh(&xprt->transport_lock); 289 spin_lock_bh(&xprt->transport_lock);
276 retval = xprt->ops->reserve_xprt(xprt, task); 290 retval = xprt->ops->reserve_xprt(xprt, task);
277 spin_unlock_bh(&xprt->transport_lock); 291 spin_unlock_bh(&xprt->transport_lock);
@@ -281,12 +295,8 @@ static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
281static bool __xprt_lock_write_func(struct rpc_task *task, void *data) 295static bool __xprt_lock_write_func(struct rpc_task *task, void *data)
282{ 296{
283 struct rpc_xprt *xprt = data; 297 struct rpc_xprt *xprt = data;
284 struct rpc_rqst *req;
285 298
286 req = task->tk_rqstp;
287 xprt->snd_task = task; 299 xprt->snd_task = task;
288 if (req)
289 req->rq_ntrans++;
290 return true; 300 return true;
291} 301}
292 302
@@ -294,53 +304,30 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt)
294{ 304{
295 if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) 305 if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
296 return; 306 return;
297 307 if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
308 goto out_unlock;
298 if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, 309 if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
299 __xprt_lock_write_func, xprt)) 310 __xprt_lock_write_func, xprt))
300 return; 311 return;
312out_unlock:
301 xprt_clear_locked(xprt); 313 xprt_clear_locked(xprt);
302} 314}
303 315
304static bool __xprt_lock_write_cong_func(struct rpc_task *task, void *data)
305{
306 struct rpc_xprt *xprt = data;
307 struct rpc_rqst *req;
308
309 req = task->tk_rqstp;
310 if (req == NULL) {
311 xprt->snd_task = task;
312 return true;
313 }
314 if (__xprt_get_cong(xprt, task)) {
315 xprt->snd_task = task;
316 req->rq_ntrans++;
317 return true;
318 }
319 return false;
320}
321
322static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) 316static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt)
323{ 317{
324 if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) 318 if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
325 return; 319 return;
326 if (RPCXPRT_CONGESTED(xprt)) 320 if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
321 goto out_unlock;
322 if (xprt_need_congestion_window_wait(xprt))
327 goto out_unlock; 323 goto out_unlock;
328 if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, 324 if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
329 __xprt_lock_write_cong_func, xprt)) 325 __xprt_lock_write_func, xprt))
330 return; 326 return;
331out_unlock: 327out_unlock:
332 xprt_clear_locked(xprt); 328 xprt_clear_locked(xprt);
333} 329}
334 330
335static void xprt_task_clear_bytes_sent(struct rpc_task *task)
336{
337 if (task != NULL) {
338 struct rpc_rqst *req = task->tk_rqstp;
339 if (req != NULL)
340 req->rq_bytes_sent = 0;
341 }
342}
343
344/** 331/**
345 * xprt_release_xprt - allow other requests to use a transport 332 * xprt_release_xprt - allow other requests to use a transport
346 * @xprt: transport with other tasks potentially waiting 333 * @xprt: transport with other tasks potentially waiting
@@ -351,7 +338,6 @@ static void xprt_task_clear_bytes_sent(struct rpc_task *task)
351void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) 338void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
352{ 339{
353 if (xprt->snd_task == task) { 340 if (xprt->snd_task == task) {
354 xprt_task_clear_bytes_sent(task);
355 xprt_clear_locked(xprt); 341 xprt_clear_locked(xprt);
356 __xprt_lock_write_next(xprt); 342 __xprt_lock_write_next(xprt);
357 } 343 }
@@ -369,7 +355,6 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt);
369void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) 355void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
370{ 356{
371 if (xprt->snd_task == task) { 357 if (xprt->snd_task == task) {
372 xprt_task_clear_bytes_sent(task);
373 xprt_clear_locked(xprt); 358 xprt_clear_locked(xprt);
374 __xprt_lock_write_next_cong(xprt); 359 __xprt_lock_write_next_cong(xprt);
375 } 360 }
@@ -378,6 +363,8 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt_cong);
378 363
379static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) 364static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task)
380{ 365{
366 if (xprt->snd_task != task)
367 return;
381 spin_lock_bh(&xprt->transport_lock); 368 spin_lock_bh(&xprt->transport_lock);
382 xprt->ops->release_xprt(xprt, task); 369 xprt->ops->release_xprt(xprt, task);
383 spin_unlock_bh(&xprt->transport_lock); 370 spin_unlock_bh(&xprt->transport_lock);
@@ -388,16 +375,16 @@ static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *ta
388 * overflowed. Put the task to sleep if this is the case. 375 * overflowed. Put the task to sleep if this is the case.
389 */ 376 */
390static int 377static int
391__xprt_get_cong(struct rpc_xprt *xprt, struct rpc_task *task) 378__xprt_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
392{ 379{
393 struct rpc_rqst *req = task->tk_rqstp;
394
395 if (req->rq_cong) 380 if (req->rq_cong)
396 return 1; 381 return 1;
397 dprintk("RPC: %5u xprt_cwnd_limited cong = %lu cwnd = %lu\n", 382 dprintk("RPC: %5u xprt_cwnd_limited cong = %lu cwnd = %lu\n",
398 task->tk_pid, xprt->cong, xprt->cwnd); 383 req->rq_task->tk_pid, xprt->cong, xprt->cwnd);
399 if (RPCXPRT_CONGESTED(xprt)) 384 if (RPCXPRT_CONGESTED(xprt)) {
385 xprt_set_congestion_window_wait(xprt);
400 return 0; 386 return 0;
387 }
401 req->rq_cong = 1; 388 req->rq_cong = 1;
402 xprt->cong += RPC_CWNDSCALE; 389 xprt->cong += RPC_CWNDSCALE;
403 return 1; 390 return 1;
@@ -414,10 +401,32 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
414 return; 401 return;
415 req->rq_cong = 0; 402 req->rq_cong = 0;
416 xprt->cong -= RPC_CWNDSCALE; 403 xprt->cong -= RPC_CWNDSCALE;
404 xprt_test_and_clear_congestion_window_wait(xprt);
417 __xprt_lock_write_next_cong(xprt); 405 __xprt_lock_write_next_cong(xprt);
418} 406}
419 407
420/** 408/**
409 * xprt_request_get_cong - Request congestion control credits
410 * @xprt: pointer to transport
411 * @req: pointer to RPC request
412 *
413 * Useful for transports that require congestion control.
414 */
415bool
416xprt_request_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
417{
418 bool ret = false;
419
420 if (req->rq_cong)
421 return true;
422 spin_lock_bh(&xprt->transport_lock);
423 ret = __xprt_get_cong(xprt, req) != 0;
424 spin_unlock_bh(&xprt->transport_lock);
425 return ret;
426}
427EXPORT_SYMBOL_GPL(xprt_request_get_cong);
428
429/**
421 * xprt_release_rqst_cong - housekeeping when request is complete 430 * xprt_release_rqst_cong - housekeeping when request is complete
422 * @task: RPC request that recently completed 431 * @task: RPC request that recently completed
423 * 432 *
@@ -431,6 +440,20 @@ void xprt_release_rqst_cong(struct rpc_task *task)
431} 440}
432EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); 441EXPORT_SYMBOL_GPL(xprt_release_rqst_cong);
433 442
443/*
444 * Clear the congestion window wait flag and wake up the next
445 * entry on xprt->sending
446 */
447static void
448xprt_clear_congestion_window_wait(struct rpc_xprt *xprt)
449{
450 if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state)) {
451 spin_lock_bh(&xprt->transport_lock);
452 __xprt_lock_write_next_cong(xprt);
453 spin_unlock_bh(&xprt->transport_lock);
454 }
455}
456
434/** 457/**
435 * xprt_adjust_cwnd - adjust transport congestion window 458 * xprt_adjust_cwnd - adjust transport congestion window
436 * @xprt: pointer to xprt 459 * @xprt: pointer to xprt
@@ -488,39 +511,46 @@ EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks);
488 511
489/** 512/**
490 * xprt_wait_for_buffer_space - wait for transport output buffer to clear 513 * xprt_wait_for_buffer_space - wait for transport output buffer to clear
491 * @task: task to be put to sleep 514 * @xprt: transport
492 * @action: function pointer to be executed after wait
493 * 515 *
494 * Note that we only set the timer for the case of RPC_IS_SOFT(), since 516 * Note that we only set the timer for the case of RPC_IS_SOFT(), since
495 * we don't in general want to force a socket disconnection due to 517 * we don't in general want to force a socket disconnection due to
496 * an incomplete RPC call transmission. 518 * an incomplete RPC call transmission.
497 */ 519 */
498void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action) 520void xprt_wait_for_buffer_space(struct rpc_xprt *xprt)
499{ 521{
500 struct rpc_rqst *req = task->tk_rqstp; 522 set_bit(XPRT_WRITE_SPACE, &xprt->state);
501 struct rpc_xprt *xprt = req->rq_xprt;
502
503 task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0;
504 rpc_sleep_on(&xprt->pending, task, action);
505} 523}
506EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space); 524EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space);
507 525
526static bool
527xprt_clear_write_space_locked(struct rpc_xprt *xprt)
528{
529 if (test_and_clear_bit(XPRT_WRITE_SPACE, &xprt->state)) {
530 __xprt_lock_write_next(xprt);
531 dprintk("RPC: write space: waking waiting task on "
532 "xprt %p\n", xprt);
533 return true;
534 }
535 return false;
536}
537
508/** 538/**
509 * xprt_write_space - wake the task waiting for transport output buffer space 539 * xprt_write_space - wake the task waiting for transport output buffer space
510 * @xprt: transport with waiting tasks 540 * @xprt: transport with waiting tasks
511 * 541 *
512 * Can be called in a soft IRQ context, so xprt_write_space never sleeps. 542 * Can be called in a soft IRQ context, so xprt_write_space never sleeps.
513 */ 543 */
514void xprt_write_space(struct rpc_xprt *xprt) 544bool xprt_write_space(struct rpc_xprt *xprt)
515{ 545{
546 bool ret;
547
548 if (!test_bit(XPRT_WRITE_SPACE, &xprt->state))
549 return false;
516 spin_lock_bh(&xprt->transport_lock); 550 spin_lock_bh(&xprt->transport_lock);
517 if (xprt->snd_task) { 551 ret = xprt_clear_write_space_locked(xprt);
518 dprintk("RPC: write space: waking waiting task on "
519 "xprt %p\n", xprt);
520 rpc_wake_up_queued_task_on_wq(xprtiod_workqueue,
521 &xprt->pending, xprt->snd_task);
522 }
523 spin_unlock_bh(&xprt->transport_lock); 552 spin_unlock_bh(&xprt->transport_lock);
553 return ret;
524} 554}
525EXPORT_SYMBOL_GPL(xprt_write_space); 555EXPORT_SYMBOL_GPL(xprt_write_space);
526 556
@@ -631,6 +661,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt)
631 dprintk("RPC: disconnected transport %p\n", xprt); 661 dprintk("RPC: disconnected transport %p\n", xprt);
632 spin_lock_bh(&xprt->transport_lock); 662 spin_lock_bh(&xprt->transport_lock);
633 xprt_clear_connected(xprt); 663 xprt_clear_connected(xprt);
664 xprt_clear_write_space_locked(xprt);
634 xprt_wake_pending_tasks(xprt, -EAGAIN); 665 xprt_wake_pending_tasks(xprt, -EAGAIN);
635 spin_unlock_bh(&xprt->transport_lock); 666 spin_unlock_bh(&xprt->transport_lock);
636} 667}
@@ -654,6 +685,22 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
654} 685}
655EXPORT_SYMBOL_GPL(xprt_force_disconnect); 686EXPORT_SYMBOL_GPL(xprt_force_disconnect);
656 687
688static unsigned int
689xprt_connect_cookie(struct rpc_xprt *xprt)
690{
691 return READ_ONCE(xprt->connect_cookie);
692}
693
694static bool
695xprt_request_retransmit_after_disconnect(struct rpc_task *task)
696{
697 struct rpc_rqst *req = task->tk_rqstp;
698 struct rpc_xprt *xprt = req->rq_xprt;
699
700 return req->rq_connect_cookie != xprt_connect_cookie(xprt) ||
701 !xprt_connected(xprt);
702}
703
657/** 704/**
658 * xprt_conditional_disconnect - force a transport to disconnect 705 * xprt_conditional_disconnect - force a transport to disconnect
659 * @xprt: transport to disconnect 706 * @xprt: transport to disconnect
@@ -692,7 +739,7 @@ static void
692xprt_schedule_autodisconnect(struct rpc_xprt *xprt) 739xprt_schedule_autodisconnect(struct rpc_xprt *xprt)
693 __must_hold(&xprt->transport_lock) 740 __must_hold(&xprt->transport_lock)
694{ 741{
695 if (list_empty(&xprt->recv) && xprt_has_timer(xprt)) 742 if (RB_EMPTY_ROOT(&xprt->recv_queue) && xprt_has_timer(xprt))
696 mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout); 743 mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout);
697} 744}
698 745
@@ -702,7 +749,7 @@ xprt_init_autodisconnect(struct timer_list *t)
702 struct rpc_xprt *xprt = from_timer(xprt, t, timer); 749 struct rpc_xprt *xprt = from_timer(xprt, t, timer);
703 750
704 spin_lock(&xprt->transport_lock); 751 spin_lock(&xprt->transport_lock);
705 if (!list_empty(&xprt->recv)) 752 if (!RB_EMPTY_ROOT(&xprt->recv_queue))
706 goto out_abort; 753 goto out_abort;
707 /* Reset xprt->last_used to avoid connect/autodisconnect cycling */ 754 /* Reset xprt->last_used to avoid connect/autodisconnect cycling */
708 xprt->last_used = jiffies; 755 xprt->last_used = jiffies;
@@ -726,7 +773,6 @@ bool xprt_lock_connect(struct rpc_xprt *xprt,
726 goto out; 773 goto out;
727 if (xprt->snd_task != task) 774 if (xprt->snd_task != task)
728 goto out; 775 goto out;
729 xprt_task_clear_bytes_sent(task);
730 xprt->snd_task = cookie; 776 xprt->snd_task = cookie;
731 ret = true; 777 ret = true;
732out: 778out:
@@ -772,7 +818,6 @@ void xprt_connect(struct rpc_task *task)
772 xprt->ops->close(xprt); 818 xprt->ops->close(xprt);
773 819
774 if (!xprt_connected(xprt)) { 820 if (!xprt_connected(xprt)) {
775 task->tk_rqstp->rq_bytes_sent = 0;
776 task->tk_timeout = task->tk_rqstp->rq_timeout; 821 task->tk_timeout = task->tk_rqstp->rq_timeout;
777 task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie; 822 task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie;
778 rpc_sleep_on(&xprt->pending, task, xprt_connect_status); 823 rpc_sleep_on(&xprt->pending, task, xprt_connect_status);
@@ -789,17 +834,11 @@ void xprt_connect(struct rpc_task *task)
789 834
790static void xprt_connect_status(struct rpc_task *task) 835static void xprt_connect_status(struct rpc_task *task)
791{ 836{
792 struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; 837 switch (task->tk_status) {
793 838 case 0:
794 if (task->tk_status == 0) {
795 xprt->stat.connect_count++;
796 xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start;
797 dprintk("RPC: %5u xprt_connect_status: connection established\n", 839 dprintk("RPC: %5u xprt_connect_status: connection established\n",
798 task->tk_pid); 840 task->tk_pid);
799 return; 841 break;
800 }
801
802 switch (task->tk_status) {
803 case -ECONNREFUSED: 842 case -ECONNREFUSED:
804 case -ECONNRESET: 843 case -ECONNRESET:
805 case -ECONNABORTED: 844 case -ECONNABORTED:
@@ -816,28 +855,97 @@ static void xprt_connect_status(struct rpc_task *task)
816 default: 855 default:
817 dprintk("RPC: %5u xprt_connect_status: error %d connecting to " 856 dprintk("RPC: %5u xprt_connect_status: error %d connecting to "
818 "server %s\n", task->tk_pid, -task->tk_status, 857 "server %s\n", task->tk_pid, -task->tk_status,
819 xprt->servername); 858 task->tk_rqstp->rq_xprt->servername);
820 task->tk_status = -EIO; 859 task->tk_status = -EIO;
821 } 860 }
822} 861}
823 862
863enum xprt_xid_rb_cmp {
864 XID_RB_EQUAL,
865 XID_RB_LEFT,
866 XID_RB_RIGHT,
867};
868static enum xprt_xid_rb_cmp
869xprt_xid_cmp(__be32 xid1, __be32 xid2)
870{
871 if (xid1 == xid2)
872 return XID_RB_EQUAL;
873 if ((__force u32)xid1 < (__force u32)xid2)
874 return XID_RB_LEFT;
875 return XID_RB_RIGHT;
876}
877
878static struct rpc_rqst *
879xprt_request_rb_find(struct rpc_xprt *xprt, __be32 xid)
880{
881 struct rb_node *n = xprt->recv_queue.rb_node;
882 struct rpc_rqst *req;
883
884 while (n != NULL) {
885 req = rb_entry(n, struct rpc_rqst, rq_recv);
886 switch (xprt_xid_cmp(xid, req->rq_xid)) {
887 case XID_RB_LEFT:
888 n = n->rb_left;
889 break;
890 case XID_RB_RIGHT:
891 n = n->rb_right;
892 break;
893 case XID_RB_EQUAL:
894 return req;
895 }
896 }
897 return NULL;
898}
899
900static void
901xprt_request_rb_insert(struct rpc_xprt *xprt, struct rpc_rqst *new)
902{
903 struct rb_node **p = &xprt->recv_queue.rb_node;
904 struct rb_node *n = NULL;
905 struct rpc_rqst *req;
906
907 while (*p != NULL) {
908 n = *p;
909 req = rb_entry(n, struct rpc_rqst, rq_recv);
910 switch(xprt_xid_cmp(new->rq_xid, req->rq_xid)) {
911 case XID_RB_LEFT:
912 p = &n->rb_left;
913 break;
914 case XID_RB_RIGHT:
915 p = &n->rb_right;
916 break;
917 case XID_RB_EQUAL:
918 WARN_ON_ONCE(new != req);
919 return;
920 }
921 }
922 rb_link_node(&new->rq_recv, n, p);
923 rb_insert_color(&new->rq_recv, &xprt->recv_queue);
924}
925
926static void
927xprt_request_rb_remove(struct rpc_xprt *xprt, struct rpc_rqst *req)
928{
929 rb_erase(&req->rq_recv, &xprt->recv_queue);
930}
931
824/** 932/**
825 * xprt_lookup_rqst - find an RPC request corresponding to an XID 933 * xprt_lookup_rqst - find an RPC request corresponding to an XID
826 * @xprt: transport on which the original request was transmitted 934 * @xprt: transport on which the original request was transmitted
827 * @xid: RPC XID of incoming reply 935 * @xid: RPC XID of incoming reply
828 * 936 *
829 * Caller holds xprt->recv_lock. 937 * Caller holds xprt->queue_lock.
830 */ 938 */
831struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) 939struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
832{ 940{
833 struct rpc_rqst *entry; 941 struct rpc_rqst *entry;
834 942
835 list_for_each_entry(entry, &xprt->recv, rq_list) 943 entry = xprt_request_rb_find(xprt, xid);
836 if (entry->rq_xid == xid) { 944 if (entry != NULL) {
837 trace_xprt_lookup_rqst(xprt, xid, 0); 945 trace_xprt_lookup_rqst(xprt, xid, 0);
838 entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime); 946 entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime);
839 return entry; 947 return entry;
840 } 948 }
841 949
842 dprintk("RPC: xprt_lookup_rqst did not find xid %08x\n", 950 dprintk("RPC: xprt_lookup_rqst did not find xid %08x\n",
843 ntohl(xid)); 951 ntohl(xid));
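
The hunk above replaces the linear walk of the old xprt->recv list with an rbtree keyed by XID (xprt->recv_queue), so reply matching in xprt_lookup_rqst() becomes a logarithmic search under queue_lock. The sketch below shows the same keyed-lookup shape with a plain, unbalanced binary search tree for brevity (struct rqst and the rqst_* names are hypothetical; the kernel uses rb_node/rb_entry and keeps the tree balanced).

#include <stdlib.h>
#include <stdint.h>

struct rqst {
	uint32_t xid;
	struct rqst *left, *right;
};

static struct rqst *rqst_find(struct rqst *root, uint32_t xid)
{
	while (root) {
		if (xid == root->xid)
			return root;
		root = (xid < root->xid) ? root->left : root->right;
	}
	return NULL;			/* no request waiting for this reply */
}

static void rqst_insert(struct rqst **root, struct rqst *new)
{
	while (*root) {
		if (new->xid == (*root)->xid)
			return;		/* already queued for receive */
		root = (new->xid < (*root)->xid) ? &(*root)->left
						 : &(*root)->right;
	}
	new->left = new->right = NULL;
	*root = new;
}
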
@@ -847,16 +955,22 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
847} 955}
848EXPORT_SYMBOL_GPL(xprt_lookup_rqst); 956EXPORT_SYMBOL_GPL(xprt_lookup_rqst);
849 957
958static bool
959xprt_is_pinned_rqst(struct rpc_rqst *req)
960{
961 return atomic_read(&req->rq_pin) != 0;
962}
963
850/** 964/**
851 * xprt_pin_rqst - Pin a request on the transport receive list 965 * xprt_pin_rqst - Pin a request on the transport receive list
852 * @req: Request to pin 966 * @req: Request to pin
853 * 967 *
854 * Caller must ensure this is atomic with the call to xprt_lookup_rqst() 968 * Caller must ensure this is atomic with the call to xprt_lookup_rqst()
855 * so should be holding the xprt transport lock. 969 * so should be holding the xprt receive lock.
856 */ 970 */
857void xprt_pin_rqst(struct rpc_rqst *req) 971void xprt_pin_rqst(struct rpc_rqst *req)
858{ 972{
859 set_bit(RPC_TASK_MSG_RECV, &req->rq_task->tk_runstate); 973 atomic_inc(&req->rq_pin);
860} 974}
861EXPORT_SYMBOL_GPL(xprt_pin_rqst); 975EXPORT_SYMBOL_GPL(xprt_pin_rqst);
862 976
@@ -864,38 +978,87 @@ EXPORT_SYMBOL_GPL(xprt_pin_rqst);
864 * xprt_unpin_rqst - Unpin a request on the transport receive list 978 * xprt_unpin_rqst - Unpin a request on the transport receive list
865 * @req: Request to pin 979 * @req: Request to pin
866 * 980 *
867 * Caller should be holding the xprt transport lock. 981 * Caller should be holding the xprt receive lock.
868 */ 982 */
869void xprt_unpin_rqst(struct rpc_rqst *req) 983void xprt_unpin_rqst(struct rpc_rqst *req)
870{ 984{
871 struct rpc_task *task = req->rq_task; 985 if (!test_bit(RPC_TASK_MSG_PIN_WAIT, &req->rq_task->tk_runstate)) {
872 986 atomic_dec(&req->rq_pin);
873 clear_bit(RPC_TASK_MSG_RECV, &task->tk_runstate); 987 return;
874 if (test_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate)) 988 }
875 wake_up_bit(&task->tk_runstate, RPC_TASK_MSG_RECV); 989 if (atomic_dec_and_test(&req->rq_pin))
990 wake_up_var(&req->rq_pin);
876} 991}
877EXPORT_SYMBOL_GPL(xprt_unpin_rqst); 992EXPORT_SYMBOL_GPL(xprt_unpin_rqst);
878 993
879static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req) 994static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req)
880__must_hold(&req->rq_xprt->recv_lock)
881{ 995{
882 struct rpc_task *task = req->rq_task; 996 wait_var_event(&req->rq_pin, !xprt_is_pinned_rqst(req));
997}
883 998
884 if (task && test_bit(RPC_TASK_MSG_RECV, &task->tk_runstate)) { 999static bool
885 spin_unlock(&req->rq_xprt->recv_lock); 1000xprt_request_data_received(struct rpc_task *task)
886 set_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); 1001{
887 wait_on_bit(&task->tk_runstate, RPC_TASK_MSG_RECV, 1002 return !test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) &&
888 TASK_UNINTERRUPTIBLE); 1003 READ_ONCE(task->tk_rqstp->rq_reply_bytes_recvd) != 0;
889 clear_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); 1004}
890 spin_lock(&req->rq_xprt->recv_lock); 1005
891 } 1006static bool
1007xprt_request_need_enqueue_receive(struct rpc_task *task, struct rpc_rqst *req)
1008{
1009 return !test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) &&
1010 READ_ONCE(task->tk_rqstp->rq_reply_bytes_recvd) == 0;
1011}
1012
1013/**
1014 * xprt_request_enqueue_receive - Add a request to the receive queue
1015 * @task: RPC task
1016 *
1017 */
1018void
1019xprt_request_enqueue_receive(struct rpc_task *task)
1020{
1021 struct rpc_rqst *req = task->tk_rqstp;
1022 struct rpc_xprt *xprt = req->rq_xprt;
1023
1024 if (!xprt_request_need_enqueue_receive(task, req))
1025 return;
1026 spin_lock(&xprt->queue_lock);
1027
1028 /* Update the softirq receive buffer */
1029 memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
1030 sizeof(req->rq_private_buf));
1031
1032 /* Add request to the receive list */
1033 xprt_request_rb_insert(xprt, req);
1034 set_bit(RPC_TASK_NEED_RECV, &task->tk_runstate);
1035 spin_unlock(&xprt->queue_lock);
1036
1037 xprt_reset_majortimeo(req);
1038 /* Turn off autodisconnect */
1039 del_singleshot_timer_sync(&xprt->timer);
1040}
1041
1042/**
1043 * xprt_request_dequeue_receive_locked - Remove a request from the receive queue
1044 * @task: RPC task
1045 *
1046 * Caller must hold xprt->queue_lock.
1047 */
1048static void
1049xprt_request_dequeue_receive_locked(struct rpc_task *task)
1050{
1051 struct rpc_rqst *req = task->tk_rqstp;
1052
1053 if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate))
1054 xprt_request_rb_remove(req->rq_xprt, req);
892} 1055}
893 1056
894/** 1057/**
895 * xprt_update_rtt - Update RPC RTT statistics 1058 * xprt_update_rtt - Update RPC RTT statistics
896 * @task: RPC request that recently completed 1059 * @task: RPC request that recently completed
897 * 1060 *
898 * Caller holds xprt->recv_lock. 1061 * Caller holds xprt->queue_lock.
899 */ 1062 */
900void xprt_update_rtt(struct rpc_task *task) 1063void xprt_update_rtt(struct rpc_task *task)
901{ 1064{
@@ -917,7 +1080,7 @@ EXPORT_SYMBOL_GPL(xprt_update_rtt);
917 * @task: RPC request that recently completed 1080 * @task: RPC request that recently completed
918 * @copied: actual number of bytes received from the transport 1081 * @copied: actual number of bytes received from the transport
919 * 1082 *
920 * Caller holds xprt->recv_lock. 1083 * Caller holds xprt->queue_lock.
921 */ 1084 */
922void xprt_complete_rqst(struct rpc_task *task, int copied) 1085void xprt_complete_rqst(struct rpc_task *task, int copied)
923{ 1086{
@@ -930,12 +1093,12 @@ void xprt_complete_rqst(struct rpc_task *task, int copied)
930 1093
931 xprt->stat.recvs++; 1094 xprt->stat.recvs++;
932 1095
933 list_del_init(&req->rq_list);
934 req->rq_private_buf.len = copied; 1096 req->rq_private_buf.len = copied;
935 /* Ensure all writes are done before we update */ 1097 /* Ensure all writes are done before we update */
936 /* req->rq_reply_bytes_recvd */ 1098 /* req->rq_reply_bytes_recvd */
937 smp_wmb(); 1099 smp_wmb();
938 req->rq_reply_bytes_recvd = copied; 1100 req->rq_reply_bytes_recvd = copied;
1101 xprt_request_dequeue_receive_locked(task);
939 rpc_wake_up_queued_task(&xprt->pending, task); 1102 rpc_wake_up_queued_task(&xprt->pending, task);
940} 1103}
941EXPORT_SYMBOL_GPL(xprt_complete_rqst); 1104EXPORT_SYMBOL_GPL(xprt_complete_rqst);
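Taken together, xprt_lookup_rqst(), xprt_pin_rqst(), xprt_complete_rqst() and xprt_unpin_rqst() define the pattern a transport receive path is expected to follow under the new queue_lock. The fragment below is an illustrative sketch only; copy_reply_data() is a placeholder, not a real kernel function.

static void copy_reply_data(struct rpc_rqst *req)
{
        /* placeholder: a real transport copies the reply into rq_private_buf */
}

static void example_receive_reply(struct rpc_xprt *xprt, __be32 xid, int copied)
{
        struct rpc_rqst *req;

        spin_lock(&xprt->queue_lock);
        req = xprt_lookup_rqst(xprt, xid);
        if (!req) {
                spin_unlock(&xprt->queue_lock);
                return;                 /* no matching request; drop the reply */
        }
        xprt_pin_rqst(req);             /* keep req stable while the lock is dropped */
        spin_unlock(&xprt->queue_lock);

        copy_reply_data(req);           /* transport-specific copy, done unlocked */

        spin_lock(&xprt->queue_lock);
        xprt_complete_rqst(req->rq_task, copied);
        xprt_unpin_rqst(req);
        spin_unlock(&xprt->queue_lock);
}

The smp_wmb() in xprt_complete_rqst() above pairs with an smp_rmb() on the task side before it parses rq_private_buf, so the copied reply data is visible by the time rq_reply_bytes_recvd reads as non-zero.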
@@ -957,6 +1120,172 @@ static void xprt_timer(struct rpc_task *task)
957} 1120}
958 1121
959/** 1122/**
1123 * xprt_request_wait_receive - wait for the reply to an RPC request
1124 * @task: RPC task that is waiting to receive a reply
1125 *
1126 */
1127void xprt_request_wait_receive(struct rpc_task *task)
1128{
1129 struct rpc_rqst *req = task->tk_rqstp;
1130 struct rpc_xprt *xprt = req->rq_xprt;
1131
1132 if (!test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate))
1133 return;
1134 /*
1135 * Sleep on the pending queue if we're expecting a reply.
1136 * The spinlock ensures atomicity between the test of
1137 * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on().
1138 */
1139 spin_lock(&xprt->queue_lock);
1140 if (test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) {
1141 xprt->ops->set_retrans_timeout(task);
1142 rpc_sleep_on(&xprt->pending, task, xprt_timer);
1143 /*
1144 * Send an extra queue wakeup call if the
1145 * connection was dropped in case the call to
1146 * rpc_sleep_on() raced.
1147 */
1148 if (xprt_request_retransmit_after_disconnect(task))
1149 rpc_wake_up_queued_task_set_status(&xprt->pending,
1150 task, -ENOTCONN);
1151 }
1152 spin_unlock(&xprt->queue_lock);
1153}
1154
1155static bool
1156xprt_request_need_enqueue_transmit(struct rpc_task *task, struct rpc_rqst *req)
1157{
1158 return !test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate);
1159}
1160
1161/**
1162 * xprt_request_enqueue_transmit - queue a task for transmission
1163 * @task: pointer to rpc_task
1164 *
1165 * Add a task to the transmission queue.
1166 */
1167void
1168xprt_request_enqueue_transmit(struct rpc_task *task)
1169{
1170 struct rpc_rqst *pos, *req = task->tk_rqstp;
1171 struct rpc_xprt *xprt = req->rq_xprt;
1172
1173 if (xprt_request_need_enqueue_transmit(task, req)) {
1174 spin_lock(&xprt->queue_lock);
1175 /*
1176 * Requests that carry congestion control credits are added
1177 * to the head of the list to avoid starvation issues.
1178 */
1179 if (req->rq_cong) {
1180 xprt_clear_congestion_window_wait(xprt);
1181 list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) {
1182 if (pos->rq_cong)
1183 continue;
1184 /* Note: req is added _before_ pos */
1185 list_add_tail(&req->rq_xmit, &pos->rq_xmit);
1186 INIT_LIST_HEAD(&req->rq_xmit2);
1187 goto out;
1188 }
1189 } else if (RPC_IS_SWAPPER(task)) {
1190 list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) {
1191 if (pos->rq_cong || pos->rq_bytes_sent)
1192 continue;
1193 if (RPC_IS_SWAPPER(pos->rq_task))
1194 continue;
1195 /* Note: req is added _before_ pos */
1196 list_add_tail(&req->rq_xmit, &pos->rq_xmit);
1197 INIT_LIST_HEAD(&req->rq_xmit2);
1198 goto out;
1199 }
1200 } else {
1201 list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) {
1202 if (pos->rq_task->tk_owner != task->tk_owner)
1203 continue;
1204 list_add_tail(&req->rq_xmit2, &pos->rq_xmit2);
1205 INIT_LIST_HEAD(&req->rq_xmit);
1206 goto out;
1207 }
1208 }
1209 list_add_tail(&req->rq_xmit, &xprt->xmit_queue);
1210 INIT_LIST_HEAD(&req->rq_xmit2);
1211out:
1212 set_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate);
1213 spin_unlock(&xprt->queue_lock);
1214 }
1215}
1216
1217/**
1218 * xprt_request_dequeue_transmit_locked - remove a task from the transmission queue
1219 * @task: pointer to rpc_task
1220 *
1221 * Remove a task from the transmission queue
1222 * Caller must hold xprt->queue_lock
1223 */
1224static void
1225xprt_request_dequeue_transmit_locked(struct rpc_task *task)
1226{
1227 struct rpc_rqst *req = task->tk_rqstp;
1228
1229 if (!test_and_clear_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
1230 return;
1231 if (!list_empty(&req->rq_xmit)) {
1232 list_del(&req->rq_xmit);
1233 if (!list_empty(&req->rq_xmit2)) {
1234 struct rpc_rqst *next = list_first_entry(&req->rq_xmit2,
1235 struct rpc_rqst, rq_xmit2);
1236 list_del(&req->rq_xmit2);
1237 list_add_tail(&next->rq_xmit, &next->rq_xprt->xmit_queue);
1238 }
1239 } else
1240 list_del(&req->rq_xmit2);
1241}
1242
1243/**
1244 * xprt_request_dequeue_transmit - remove a task from the transmission queue
1245 * @task: pointer to rpc_task
1246 *
1247 * Remove a task from the transmission queue
1248 */
1249static void
1250xprt_request_dequeue_transmit(struct rpc_task *task)
1251{
1252 struct rpc_rqst *req = task->tk_rqstp;
1253 struct rpc_xprt *xprt = req->rq_xprt;
1254
1255 spin_lock(&xprt->queue_lock);
1256 xprt_request_dequeue_transmit_locked(task);
1257 spin_unlock(&xprt->queue_lock);
1258}
1259
1260/**
1261 * xprt_request_prepare - prepare an encoded request for transport
1262 * @req: pointer to rpc_rqst
1263 *
1264 * Calls into the transport layer to do whatever is needed to prepare
1265 * the request for transmission or receive.
1266 */
1267void
1268xprt_request_prepare(struct rpc_rqst *req)
1269{
1270 struct rpc_xprt *xprt = req->rq_xprt;
1271
1272 if (xprt->ops->prepare_request)
1273 xprt->ops->prepare_request(req);
1274}
1275
1276/**
1277 * xprt_request_need_retransmit - Test if a task needs retransmission
1278 * @task: pointer to rpc_task
1279 *
1280 * Test for whether a connection breakage requires the task to retransmit
1281 */
1282bool
1283xprt_request_need_retransmit(struct rpc_task *task)
1284{
1285 return xprt_request_retransmit_after_disconnect(task);
1286}
1287
1288/**
960 * xprt_prepare_transmit - reserve the transport before sending a request 1289 * xprt_prepare_transmit - reserve the transport before sending a request
961 * @task: RPC task about to send a request 1290 * @task: RPC task about to send a request
962 * 1291 *
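xprt_request_enqueue_transmit() above builds a two-level transmit queue: one request per group is linked into xprt->xmit_queue through rq_xmit, and further requests sharing the same tk_owner hang off that leader through rq_xmit2, while congestion-credited and swapper requests are sorted toward the head of the main list. A rough picture, with a traversal sketch for illustration only (handle() is a placeholder):

/*
 *   xprt->xmit_queue --> reqA --> reqC --> reqE        (linked via rq_xmit)
 *                         |        |
 *                        reqB     reqD                 (linked via rq_xmit2,
 *                                                       same tk_owner as leader)
 */
static void handle(struct rpc_rqst *req)
{
        /* placeholder: process one queued request */
}

static void example_walk_xmit_queue(struct rpc_xprt *xprt)
{
        struct rpc_rqst *pos, *grp;

        list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) {
                handle(pos);                            /* group leader */
                list_for_each_entry(grp, &pos->rq_xmit2, rq_xmit2)
                        handle(grp);                    /* rest of the group */
        }
}

When a leader is dequeued, xprt_request_dequeue_transmit_locked() below promotes the next member of its group to the tail of xmit_queue, so other owners' requests get a turn before the next request of the same group.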
@@ -965,32 +1294,18 @@ bool xprt_prepare_transmit(struct rpc_task *task)
965{ 1294{
966 struct rpc_rqst *req = task->tk_rqstp; 1295 struct rpc_rqst *req = task->tk_rqstp;
967 struct rpc_xprt *xprt = req->rq_xprt; 1296 struct rpc_xprt *xprt = req->rq_xprt;
968 bool ret = false;
969 1297
970 dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid); 1298 dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid);
971 1299
972 spin_lock_bh(&xprt->transport_lock); 1300 if (!xprt_lock_write(xprt, task)) {
973 if (!req->rq_bytes_sent) { 1301 /* Race breaker: someone may have transmitted us */
974 if (req->rq_reply_bytes_recvd) { 1302 if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
975 task->tk_status = req->rq_reply_bytes_recvd; 1303 rpc_wake_up_queued_task_set_status(&xprt->sending,
976 goto out_unlock; 1304 task, 0);
977 } 1305 return false;
978 if ((task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) 1306
979 && xprt_connected(xprt)
980 && req->rq_connect_cookie == xprt->connect_cookie) {
981 xprt->ops->set_retrans_timeout(task);
982 rpc_sleep_on(&xprt->pending, task, xprt_timer);
983 goto out_unlock;
984 }
985 }
986 if (!xprt->ops->reserve_xprt(xprt, task)) {
987 task->tk_status = -EAGAIN;
988 goto out_unlock;
989 } 1307 }
990 ret = true; 1308 return true;
991out_unlock:
992 spin_unlock_bh(&xprt->transport_lock);
993 return ret;
994} 1309}
995 1310
996void xprt_end_transmit(struct rpc_task *task) 1311void xprt_end_transmit(struct rpc_task *task)
@@ -999,54 +1314,62 @@ void xprt_end_transmit(struct rpc_task *task)
999} 1314}
1000 1315
1001/** 1316/**
1002 * xprt_transmit - send an RPC request on a transport 1317 * xprt_request_transmit - send an RPC request on a transport
1003 * @task: controlling RPC task 1318 * @req: pointer to request to transmit
1319 * @snd_task: RPC task that owns the transport lock
1004 * 1320 *
1005 * We have to copy the iovec because sendmsg fiddles with its contents. 1321 * This performs the transmission of a single request.
1322 * Note that if the request is not the same as snd_task, then it
1323 * does need to be pinned.
1324 * Returns '0' on success.
1006 */ 1325 */
1007void xprt_transmit(struct rpc_task *task) 1326static int
1327xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task)
1008{ 1328{
1009 struct rpc_rqst *req = task->tk_rqstp; 1329 struct rpc_xprt *xprt = req->rq_xprt;
1010 struct rpc_xprt *xprt = req->rq_xprt; 1330 struct rpc_task *task = req->rq_task;
1011 unsigned int connect_cookie; 1331 unsigned int connect_cookie;
1332 int is_retrans = RPC_WAS_SENT(task);
1012 int status; 1333 int status;
1013 1334
1014 dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); 1335 dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen);
1015 1336
1016 if (!req->rq_reply_bytes_recvd) { 1337 if (!req->rq_bytes_sent) {
1017 if (list_empty(&req->rq_list) && rpc_reply_expected(task)) { 1338 if (xprt_request_data_received(task)) {
1018 /* 1339 status = 0;
1019 * Add to the list only if we're expecting a reply 1340 goto out_dequeue;
1020 */
1021 /* Update the softirq receive buffer */
1022 memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
1023 sizeof(req->rq_private_buf));
1024 /* Add request to the receive list */
1025 spin_lock(&xprt->recv_lock);
1026 list_add_tail(&req->rq_list, &xprt->recv);
1027 spin_unlock(&xprt->recv_lock);
1028 xprt_reset_majortimeo(req);
1029 /* Turn off autodisconnect */
1030 del_singleshot_timer_sync(&xprt->timer);
1031 } 1341 }
1032 } else if (!req->rq_bytes_sent) 1342 /* Verify that our message lies in the RPCSEC_GSS window */
1033 return; 1343 if (rpcauth_xmit_need_reencode(task)) {
1344 status = -EBADMSG;
1345 goto out_dequeue;
1346 }
1347 }
1348
1349 /*
1350 * Update req->rq_ntrans before transmitting to avoid races with
1351 * xprt_update_rtt(), which needs to know that it is recording a
1352 * reply to the first transmission.
1353 */
1354 req->rq_ntrans++;
1034 1355
1035 connect_cookie = xprt->connect_cookie; 1356 connect_cookie = xprt->connect_cookie;
1036 status = xprt->ops->send_request(task); 1357 status = xprt->ops->send_request(req);
1037 trace_xprt_transmit(xprt, req->rq_xid, status); 1358 trace_xprt_transmit(xprt, req->rq_xid, status);
1038 if (status != 0) { 1359 if (status != 0) {
1039 task->tk_status = status; 1360 req->rq_ntrans--;
1040 return; 1361 return status;
1041 } 1362 }
1363
1364 if (is_retrans)
1365 task->tk_client->cl_stats->rpcretrans++;
1366
1042 xprt_inject_disconnect(xprt); 1367 xprt_inject_disconnect(xprt);
1043 1368
1044 dprintk("RPC: %5u xmit complete\n", task->tk_pid); 1369 dprintk("RPC: %5u xmit complete\n", task->tk_pid);
1045 task->tk_flags |= RPC_TASK_SENT; 1370 task->tk_flags |= RPC_TASK_SENT;
1046 spin_lock_bh(&xprt->transport_lock); 1371 spin_lock_bh(&xprt->transport_lock);
1047 1372
1048 xprt->ops->set_retrans_timeout(task);
1049
1050 xprt->stat.sends++; 1373 xprt->stat.sends++;
1051 xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; 1374 xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs;
1052 xprt->stat.bklog_u += xprt->backlog.qlen; 1375 xprt->stat.bklog_u += xprt->backlog.qlen;
@@ -1055,25 +1378,49 @@ void xprt_transmit(struct rpc_task *task)
1055 spin_unlock_bh(&xprt->transport_lock); 1378 spin_unlock_bh(&xprt->transport_lock);
1056 1379
1057 req->rq_connect_cookie = connect_cookie; 1380 req->rq_connect_cookie = connect_cookie;
1058 if (rpc_reply_expected(task) && !READ_ONCE(req->rq_reply_bytes_recvd)) { 1381out_dequeue:
1059 /* 1382 xprt_request_dequeue_transmit(task);
1060 * Sleep on the pending queue if we're expecting a reply. 1383 rpc_wake_up_queued_task_set_status(&xprt->sending, task, status);
1061 * The spinlock ensures atomicity between the test of 1384 return status;
1062 * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on(). 1385}
1063 */ 1386
1064 spin_lock(&xprt->recv_lock); 1387/**
1065 if (!req->rq_reply_bytes_recvd) { 1388 * xprt_transmit - send an RPC request on a transport
1066 rpc_sleep_on(&xprt->pending, task, xprt_timer); 1389 * @task: controlling RPC task
1067 /* 1390 *
1068 * Send an extra queue wakeup call if the 1391 * Attempts to drain the transmit queue. On exit, either the transport
1069 * connection was dropped in case the call to 1392 * signalled an error that needs to be handled before transmission can
1070 * rpc_sleep_on() raced. 1393 * resume, or @task finished transmitting, and detected that it already
1071 */ 1394 * received a reply.
1072 if (!xprt_connected(xprt)) 1395 */
1073 xprt_wake_pending_tasks(xprt, -ENOTCONN); 1396void
1074 } 1397xprt_transmit(struct rpc_task *task)
1075 spin_unlock(&xprt->recv_lock); 1398{
1399 struct rpc_rqst *next, *req = task->tk_rqstp;
1400 struct rpc_xprt *xprt = req->rq_xprt;
1401 int status;
1402
1403 spin_lock(&xprt->queue_lock);
1404 while (!list_empty(&xprt->xmit_queue)) {
1405 next = list_first_entry(&xprt->xmit_queue,
1406 struct rpc_rqst, rq_xmit);
1407 xprt_pin_rqst(next);
1408 spin_unlock(&xprt->queue_lock);
1409 status = xprt_request_transmit(next, task);
1410 if (status == -EBADMSG && next != req)
1411 status = 0;
1412 cond_resched();
1413 spin_lock(&xprt->queue_lock);
1414 xprt_unpin_rqst(next);
1415 if (status == 0) {
1416 if (!xprt_request_data_received(task) ||
1417 test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
1418 continue;
1419 } else if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
1420 task->tk_status = status;
1421 break;
1076 } 1422 }
1423 spin_unlock(&xprt->queue_lock);
1077} 1424}
1078 1425
1079static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task) 1426static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task)
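How these pieces are meant to be driven lives in the client state machine (net/sunrpc/clnt.c), which is not part of this diff. The sketch below shows one plausible ordering for a request that expects a reply, based only on the semantics of the helpers added here; the real call sites and error handling differ.

static void example_send_and_wait(struct rpc_task *task)
{
        xprt_request_enqueue_receive(task);     /* be ready for the reply */
        xprt_request_enqueue_transmit(task);    /* queue the encoded request */

        if (xprt_prepare_transmit(task)) {      /* take the send lock */
                xprt_transmit(task);            /* drain xmit_queue */
                xprt_end_transmit(task);        /* drop the send lock */
        }
        xprt_request_wait_receive(task);        /* sleep on xprt->pending */
}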
@@ -1170,20 +1517,6 @@ out_init_req:
1170} 1517}
1171EXPORT_SYMBOL_GPL(xprt_alloc_slot); 1518EXPORT_SYMBOL_GPL(xprt_alloc_slot);
1172 1519
1173void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
1174{
1175 /* Note: grabbing the xprt_lock_write() ensures that we throttle
1176 * new slot allocation if the transport is congested (i.e. when
1177 * reconnecting a stream transport or when out of socket write
1178 * buffer space).
1179 */
1180 if (xprt_lock_write(xprt, task)) {
1181 xprt_alloc_slot(xprt, task);
1182 xprt_release_write(xprt, task);
1183 }
1184}
1185EXPORT_SYMBOL_GPL(xprt_lock_and_alloc_slot);
1186
1187void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) 1520void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
1188{ 1521{
1189 spin_lock(&xprt->reserve_lock); 1522 spin_lock(&xprt->reserve_lock);
@@ -1250,6 +1583,60 @@ void xprt_free(struct rpc_xprt *xprt)
1250} 1583}
1251EXPORT_SYMBOL_GPL(xprt_free); 1584EXPORT_SYMBOL_GPL(xprt_free);
1252 1585
1586static void
1587xprt_init_connect_cookie(struct rpc_rqst *req, struct rpc_xprt *xprt)
1588{
1589 req->rq_connect_cookie = xprt_connect_cookie(xprt) - 1;
1590}
1591
1592static __be32
1593xprt_alloc_xid(struct rpc_xprt *xprt)
1594{
1595 __be32 xid;
1596
1597 spin_lock(&xprt->reserve_lock);
1598 xid = (__force __be32)xprt->xid++;
1599 spin_unlock(&xprt->reserve_lock);
1600 return xid;
1601}
1602
1603static void
1604xprt_init_xid(struct rpc_xprt *xprt)
1605{
1606 xprt->xid = prandom_u32();
1607}
1608
1609static void
1610xprt_request_init(struct rpc_task *task)
1611{
1612 struct rpc_xprt *xprt = task->tk_xprt;
1613 struct rpc_rqst *req = task->tk_rqstp;
1614
1615 req->rq_timeout = task->tk_client->cl_timeout->to_initval;
1616 req->rq_task = task;
1617 req->rq_xprt = xprt;
1618 req->rq_buffer = NULL;
1619 req->rq_xid = xprt_alloc_xid(xprt);
1620 xprt_init_connect_cookie(req, xprt);
1621 req->rq_bytes_sent = 0;
1622 req->rq_snd_buf.len = 0;
1623 req->rq_snd_buf.buflen = 0;
1624 req->rq_rcv_buf.len = 0;
1625 req->rq_rcv_buf.buflen = 0;
1626 req->rq_release_snd_buf = NULL;
1627 xprt_reset_majortimeo(req);
1628 dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
1629 req, ntohl(req->rq_xid));
1630}
1631
1632static void
1633xprt_do_reserve(struct rpc_xprt *xprt, struct rpc_task *task)
1634{
1635 xprt->ops->alloc_slot(xprt, task);
1636 if (task->tk_rqstp != NULL)
1637 xprt_request_init(task);
1638}
1639
1253/** 1640/**
1254 * xprt_reserve - allocate an RPC request slot 1641 * xprt_reserve - allocate an RPC request slot
1255 * @task: RPC task requesting a slot allocation 1642 * @task: RPC task requesting a slot allocation
@@ -1269,7 +1656,7 @@ void xprt_reserve(struct rpc_task *task)
1269 task->tk_timeout = 0; 1656 task->tk_timeout = 0;
1270 task->tk_status = -EAGAIN; 1657 task->tk_status = -EAGAIN;
1271 if (!xprt_throttle_congested(xprt, task)) 1658 if (!xprt_throttle_congested(xprt, task))
1272 xprt->ops->alloc_slot(xprt, task); 1659 xprt_do_reserve(xprt, task);
1273} 1660}
1274 1661
1275/** 1662/**
@@ -1291,45 +1678,29 @@ void xprt_retry_reserve(struct rpc_task *task)
1291 1678
1292 task->tk_timeout = 0; 1679 task->tk_timeout = 0;
1293 task->tk_status = -EAGAIN; 1680 task->tk_status = -EAGAIN;
1294 xprt->ops->alloc_slot(xprt, task); 1681 xprt_do_reserve(xprt, task);
1295}
1296
1297static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt)
1298{
1299 __be32 xid;
1300
1301 spin_lock(&xprt->reserve_lock);
1302 xid = (__force __be32)xprt->xid++;
1303 spin_unlock(&xprt->reserve_lock);
1304 return xid;
1305} 1682}
1306 1683
1307static inline void xprt_init_xid(struct rpc_xprt *xprt) 1684static void
1308{ 1685xprt_request_dequeue_all(struct rpc_task *task, struct rpc_rqst *req)
1309 xprt->xid = prandom_u32();
1310}
1311
1312void xprt_request_init(struct rpc_task *task)
1313{ 1686{
1314 struct rpc_xprt *xprt = task->tk_xprt; 1687 struct rpc_xprt *xprt = req->rq_xprt;
1315 struct rpc_rqst *req = task->tk_rqstp;
1316 1688
1317 INIT_LIST_HEAD(&req->rq_list); 1689 if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) ||
1318 req->rq_timeout = task->tk_client->cl_timeout->to_initval; 1690 test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) ||
1319 req->rq_task = task; 1691 xprt_is_pinned_rqst(req)) {
1320 req->rq_xprt = xprt; 1692 spin_lock(&xprt->queue_lock);
1321 req->rq_buffer = NULL; 1693 xprt_request_dequeue_transmit_locked(task);
1322 req->rq_xid = xprt_alloc_xid(xprt); 1694 xprt_request_dequeue_receive_locked(task);
1323 req->rq_connect_cookie = xprt->connect_cookie - 1; 1695 while (xprt_is_pinned_rqst(req)) {
1324 req->rq_bytes_sent = 0; 1696 set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
1325 req->rq_snd_buf.len = 0; 1697 spin_unlock(&xprt->queue_lock);
1326 req->rq_snd_buf.buflen = 0; 1698 xprt_wait_on_pinned_rqst(req);
1327 req->rq_rcv_buf.len = 0; 1699 spin_lock(&xprt->queue_lock);
1328 req->rq_rcv_buf.buflen = 0; 1700 clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
1329 req->rq_release_snd_buf = NULL; 1701 }
1330 xprt_reset_majortimeo(req); 1702 spin_unlock(&xprt->queue_lock);
1331 dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid, 1703 }
1332 req, ntohl(req->rq_xid));
1333} 1704}
1334 1705
1335/** 1706/**
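xprt_request_dequeue_all() above cooperates with xprt_unpin_rqst() through the RPC_TASK_MSG_PIN_WAIT bit: the releasing task only pays for a sleeping wait while a receive path still holds a pin. A rough timeline of that handshake, for illustration only:

/*
 *   releasing task (xprt_request_dequeue_all)    receive path
 *   -----------------------------------------    ---------------------------
 *   lock queue_lock, dequeue xmit/recv entries   xprt_pin_rqst(req)
 *   sees rq_pin != 0                             ... copying reply data ...
 *   set_bit(RPC_TASK_MSG_PIN_WAIT)
 *   unlock queue_lock
 *   wait_var_event(&req->rq_pin, ...)            xprt_unpin_rqst(req):
 *                                                  PIN_WAIT is set, so
 *                                                  atomic_dec_and_test()
 *                                                  -> wake_up_var(&req->rq_pin)
 *   woken; relock, clear PIN_WAIT, re-check
 */

When PIN_WAIT is not set, xprt_unpin_rqst() takes the cheap path and skips the wake-up entirely.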
@@ -1345,8 +1716,7 @@ void xprt_release(struct rpc_task *task)
1345 if (req == NULL) { 1716 if (req == NULL) {
1346 if (task->tk_client) { 1717 if (task->tk_client) {
1347 xprt = task->tk_xprt; 1718 xprt = task->tk_xprt;
1348 if (xprt->snd_task == task) 1719 xprt_release_write(xprt, task);
1349 xprt_release_write(xprt, task);
1350 } 1720 }
1351 return; 1721 return;
1352 } 1722 }
@@ -1356,12 +1726,7 @@ void xprt_release(struct rpc_task *task)
1356 task->tk_ops->rpc_count_stats(task, task->tk_calldata); 1726 task->tk_ops->rpc_count_stats(task, task->tk_calldata);
1357 else if (task->tk_client) 1727 else if (task->tk_client)
1358 rpc_count_iostats(task, task->tk_client->cl_metrics); 1728 rpc_count_iostats(task, task->tk_client->cl_metrics);
1359 spin_lock(&xprt->recv_lock); 1729 xprt_request_dequeue_all(task, req);
1360 if (!list_empty(&req->rq_list)) {
1361 list_del_init(&req->rq_list);
1362 xprt_wait_on_pinned_rqst(req);
1363 }
1364 spin_unlock(&xprt->recv_lock);
1365 spin_lock_bh(&xprt->transport_lock); 1730 spin_lock_bh(&xprt->transport_lock);
1366 xprt->ops->release_xprt(xprt, task); 1731 xprt->ops->release_xprt(xprt, task);
1367 if (xprt->ops->release_request) 1732 if (xprt->ops->release_request)
@@ -1372,6 +1737,7 @@ void xprt_release(struct rpc_task *task)
1372 if (req->rq_buffer) 1737 if (req->rq_buffer)
1373 xprt->ops->buf_free(task); 1738 xprt->ops->buf_free(task);
1374 xprt_inject_disconnect(xprt); 1739 xprt_inject_disconnect(xprt);
1740 xdr_free_bvec(&req->rq_rcv_buf);
1375 if (req->rq_cred != NULL) 1741 if (req->rq_cred != NULL)
1376 put_rpccred(req->rq_cred); 1742 put_rpccred(req->rq_cred);
1377 task->tk_rqstp = NULL; 1743 task->tk_rqstp = NULL;
@@ -1385,16 +1751,36 @@ void xprt_release(struct rpc_task *task)
1385 xprt_free_bc_request(req); 1751 xprt_free_bc_request(req);
1386} 1752}
1387 1753
1754#ifdef CONFIG_SUNRPC_BACKCHANNEL
1755void
1756xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task)
1757{
1758 struct xdr_buf *xbufp = &req->rq_snd_buf;
1759
1760 task->tk_rqstp = req;
1761 req->rq_task = task;
1762 xprt_init_connect_cookie(req, req->rq_xprt);
1763 /*
1764 * Set up the xdr_buf length.
1765 * This also indicates that the buffer is XDR encoded already.
1766 */
1767 xbufp->len = xbufp->head[0].iov_len + xbufp->page_len +
1768 xbufp->tail[0].iov_len;
1769 req->rq_bytes_sent = 0;
1770}
1771#endif
1772
1388static void xprt_init(struct rpc_xprt *xprt, struct net *net) 1773static void xprt_init(struct rpc_xprt *xprt, struct net *net)
1389{ 1774{
1390 kref_init(&xprt->kref); 1775 kref_init(&xprt->kref);
1391 1776
1392 spin_lock_init(&xprt->transport_lock); 1777 spin_lock_init(&xprt->transport_lock);
1393 spin_lock_init(&xprt->reserve_lock); 1778 spin_lock_init(&xprt->reserve_lock);
1394 spin_lock_init(&xprt->recv_lock); 1779 spin_lock_init(&xprt->queue_lock);
1395 1780
1396 INIT_LIST_HEAD(&xprt->free); 1781 INIT_LIST_HEAD(&xprt->free);
1397 INIT_LIST_HEAD(&xprt->recv); 1782 xprt->recv_queue = RB_ROOT;
1783 INIT_LIST_HEAD(&xprt->xmit_queue);
1398#if defined(CONFIG_SUNRPC_BACKCHANNEL) 1784#if defined(CONFIG_SUNRPC_BACKCHANNEL)
1399 spin_lock_init(&xprt->bc_pa_lock); 1785 spin_lock_init(&xprt->bc_pa_lock);
1400 INIT_LIST_HEAD(&xprt->bc_pa_list); 1786 INIT_LIST_HEAD(&xprt->bc_pa_list);
@@ -1407,7 +1793,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net)
1407 1793
1408 rpc_init_wait_queue(&xprt->binding, "xprt_binding"); 1794 rpc_init_wait_queue(&xprt->binding, "xprt_binding");
1409 rpc_init_wait_queue(&xprt->pending, "xprt_pending"); 1795 rpc_init_wait_queue(&xprt->pending, "xprt_pending");
1410 rpc_init_priority_wait_queue(&xprt->sending, "xprt_sending"); 1796 rpc_init_wait_queue(&xprt->sending, "xprt_sending");
1411 rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog"); 1797 rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog");
1412 1798
1413 xprt_init_xid(xprt); 1799 xprt_init_xid(xprt);
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 90adeff4c06b..e5b367a3e517 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -51,12 +51,11 @@ static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt,
51 rqst = &req->rl_slot; 51 rqst = &req->rl_slot;
52 52
53 rqst->rq_xprt = xprt; 53 rqst->rq_xprt = xprt;
54 INIT_LIST_HEAD(&rqst->rq_list);
55 INIT_LIST_HEAD(&rqst->rq_bc_list); 54 INIT_LIST_HEAD(&rqst->rq_bc_list);
56 __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); 55 __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
57 spin_lock_bh(&xprt->bc_pa_lock); 56 spin_lock(&xprt->bc_pa_lock);
58 list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); 57 list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list);
59 spin_unlock_bh(&xprt->bc_pa_lock); 58 spin_unlock(&xprt->bc_pa_lock);
60 59
61 size = r_xprt->rx_data.inline_rsize; 60 size = r_xprt->rx_data.inline_rsize;
62 rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); 61 rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL);
@@ -201,6 +200,9 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst)
201 if (!xprt_connected(rqst->rq_xprt)) 200 if (!xprt_connected(rqst->rq_xprt))
202 goto drop_connection; 201 goto drop_connection;
203 202
203 if (!xprt_request_get_cong(rqst->rq_xprt, rqst))
204 return -EBADSLT;
205
204 rc = rpcrdma_bc_marshal_reply(rqst); 206 rc = rpcrdma_bc_marshal_reply(rqst);
205 if (rc < 0) 207 if (rc < 0)
206 goto failed_marshal; 208 goto failed_marshal;
@@ -228,16 +230,16 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs)
228 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 230 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
229 struct rpc_rqst *rqst, *tmp; 231 struct rpc_rqst *rqst, *tmp;
230 232
231 spin_lock_bh(&xprt->bc_pa_lock); 233 spin_lock(&xprt->bc_pa_lock);
232 list_for_each_entry_safe(rqst, tmp, &xprt->bc_pa_list, rq_bc_pa_list) { 234 list_for_each_entry_safe(rqst, tmp, &xprt->bc_pa_list, rq_bc_pa_list) {
233 list_del(&rqst->rq_bc_pa_list); 235 list_del(&rqst->rq_bc_pa_list);
234 spin_unlock_bh(&xprt->bc_pa_lock); 236 spin_unlock(&xprt->bc_pa_lock);
235 237
236 rpcrdma_bc_free_rqst(r_xprt, rqst); 238 rpcrdma_bc_free_rqst(r_xprt, rqst);
237 239
238 spin_lock_bh(&xprt->bc_pa_lock); 240 spin_lock(&xprt->bc_pa_lock);
239 } 241 }
240 spin_unlock_bh(&xprt->bc_pa_lock); 242 spin_unlock(&xprt->bc_pa_lock);
241} 243}
242 244
243/** 245/**
@@ -255,9 +257,9 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
255 rpcrdma_recv_buffer_put(req->rl_reply); 257 rpcrdma_recv_buffer_put(req->rl_reply);
256 req->rl_reply = NULL; 258 req->rl_reply = NULL;
257 259
258 spin_lock_bh(&xprt->bc_pa_lock); 260 spin_lock(&xprt->bc_pa_lock);
259 list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); 261 list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list);
260 spin_unlock_bh(&xprt->bc_pa_lock); 262 spin_unlock(&xprt->bc_pa_lock);
261} 263}
262 264
263/** 265/**
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 0f7c465d9a5a..7f5632cd5a48 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -49,46 +49,7 @@ fmr_is_supported(struct rpcrdma_ia *ia)
49 return true; 49 return true;
50} 50}
51 51
52static int 52static void
53fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
54{
55 static struct ib_fmr_attr fmr_attr = {
56 .max_pages = RPCRDMA_MAX_FMR_SGES,
57 .max_maps = 1,
58 .page_shift = PAGE_SHIFT
59 };
60
61 mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
62 sizeof(u64), GFP_KERNEL);
63 if (!mr->fmr.fm_physaddrs)
64 goto out_free;
65
66 mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
67 sizeof(*mr->mr_sg), GFP_KERNEL);
68 if (!mr->mr_sg)
69 goto out_free;
70
71 sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES);
72
73 mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
74 &fmr_attr);
75 if (IS_ERR(mr->fmr.fm_mr))
76 goto out_fmr_err;
77
78 INIT_LIST_HEAD(&mr->mr_list);
79 return 0;
80
81out_fmr_err:
82 dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__,
83 PTR_ERR(mr->fmr.fm_mr));
84
85out_free:
86 kfree(mr->mr_sg);
87 kfree(mr->fmr.fm_physaddrs);
88 return -ENOMEM;
89}
90
91static int
92__fmr_unmap(struct rpcrdma_mr *mr) 53__fmr_unmap(struct rpcrdma_mr *mr)
93{ 54{
94 LIST_HEAD(l); 55 LIST_HEAD(l);
@@ -97,13 +58,16 @@ __fmr_unmap(struct rpcrdma_mr *mr)
97 list_add(&mr->fmr.fm_mr->list, &l); 58 list_add(&mr->fmr.fm_mr->list, &l);
98 rc = ib_unmap_fmr(&l); 59 rc = ib_unmap_fmr(&l);
99 list_del(&mr->fmr.fm_mr->list); 60 list_del(&mr->fmr.fm_mr->list);
100 return rc; 61 if (rc)
62 pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
63 mr, rc);
101} 64}
102 65
66/* Release an MR.
67 */
103static void 68static void
104fmr_op_release_mr(struct rpcrdma_mr *mr) 69fmr_op_release_mr(struct rpcrdma_mr *mr)
105{ 70{
106 LIST_HEAD(unmap_list);
107 int rc; 71 int rc;
108 72
109 kfree(mr->fmr.fm_physaddrs); 73 kfree(mr->fmr.fm_physaddrs);
@@ -112,10 +76,7 @@ fmr_op_release_mr(struct rpcrdma_mr *mr)
112 /* In case this one was left mapped, try to unmap it 76 /* In case this one was left mapped, try to unmap it
113 * to prevent dealloc_fmr from failing with EBUSY 77 * to prevent dealloc_fmr from failing with EBUSY
114 */ 78 */
115 rc = __fmr_unmap(mr); 79 __fmr_unmap(mr);
116 if (rc)
117 pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
118 mr, rc);
119 80
120 rc = ib_dealloc_fmr(mr->fmr.fm_mr); 81 rc = ib_dealloc_fmr(mr->fmr.fm_mr);
121 if (rc) 82 if (rc)
@@ -125,40 +86,68 @@ fmr_op_release_mr(struct rpcrdma_mr *mr)
125 kfree(mr); 86 kfree(mr);
126} 87}
127 88
128/* Reset of a single FMR. 89/* MRs are dynamically allocated, so simply clean up and release the MR.
90 * A replacement MR will subsequently be allocated on demand.
129 */ 91 */
130static void 92static void
131fmr_op_recover_mr(struct rpcrdma_mr *mr) 93fmr_mr_recycle_worker(struct work_struct *work)
132{ 94{
95 struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle);
133 struct rpcrdma_xprt *r_xprt = mr->mr_xprt; 96 struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
134 int rc;
135 97
136 /* ORDER: invalidate first */ 98 trace_xprtrdma_mr_recycle(mr);
137 rc = __fmr_unmap(mr);
138 if (rc)
139 goto out_release;
140
141 /* ORDER: then DMA unmap */
142 rpcrdma_mr_unmap_and_put(mr);
143 99
144 r_xprt->rx_stats.mrs_recovered++; 100 trace_xprtrdma_mr_unmap(mr);
145 return;
146
147out_release:
148 pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr);
149 r_xprt->rx_stats.mrs_orphaned++;
150
151 trace_xprtrdma_dma_unmap(mr);
152 ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, 101 ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
153 mr->mr_sg, mr->mr_nents, mr->mr_dir); 102 mr->mr_sg, mr->mr_nents, mr->mr_dir);
154 103
155 spin_lock(&r_xprt->rx_buf.rb_mrlock); 104 spin_lock(&r_xprt->rx_buf.rb_mrlock);
156 list_del(&mr->mr_all); 105 list_del(&mr->mr_all);
106 r_xprt->rx_stats.mrs_recycled++;
157 spin_unlock(&r_xprt->rx_buf.rb_mrlock); 107 spin_unlock(&r_xprt->rx_buf.rb_mrlock);
158
159 fmr_op_release_mr(mr); 108 fmr_op_release_mr(mr);
160} 109}
161 110
111static int
112fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
113{
114 static struct ib_fmr_attr fmr_attr = {
115 .max_pages = RPCRDMA_MAX_FMR_SGES,
116 .max_maps = 1,
117 .page_shift = PAGE_SHIFT
118 };
119
120 mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
121 sizeof(u64), GFP_KERNEL);
122 if (!mr->fmr.fm_physaddrs)
123 goto out_free;
124
125 mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
126 sizeof(*mr->mr_sg), GFP_KERNEL);
127 if (!mr->mr_sg)
128 goto out_free;
129
130 sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES);
131
132 mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
133 &fmr_attr);
134 if (IS_ERR(mr->fmr.fm_mr))
135 goto out_fmr_err;
136
137 INIT_LIST_HEAD(&mr->mr_list);
138 INIT_WORK(&mr->mr_recycle, fmr_mr_recycle_worker);
139 return 0;
140
141out_fmr_err:
142 dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__,
143 PTR_ERR(mr->fmr.fm_mr));
144
145out_free:
146 kfree(mr->mr_sg);
147 kfree(mr->fmr.fm_physaddrs);
148 return -ENOMEM;
149}
150
162/* On success, sets: 151/* On success, sets:
163 * ep->rep_attr.cap.max_send_wr 152 * ep->rep_attr.cap.max_send_wr
164 * ep->rep_attr.cap.max_recv_wr 153 * ep->rep_attr.cap.max_recv_wr
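fmr_mr_recycle_worker() above runs from the work item initialized in fmr_op_init_mr(), but the rpcrdma_mr_recycle() helper that the call sites use (for example in fmr_op_unmap_sync() below) is not part of these hunks. Presumably it just queues that work, along the lines of this sketch:

static inline void
rpcrdma_mr_recycle(struct rpcrdma_mr *mr)
{
        /* defer cleanup to process context; the worker does the DMA unmap */
        schedule_work(&mr->mr_recycle);
}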
@@ -187,6 +176,7 @@ fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
187 176
188 ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / 177 ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
189 RPCRDMA_MAX_FMR_SGES); 178 RPCRDMA_MAX_FMR_SGES);
179 ia->ri_max_segs += 2; /* segments for head and tail buffers */
190 return 0; 180 return 0;
191} 181}
192 182
@@ -244,7 +234,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
244 mr->mr_sg, i, mr->mr_dir); 234 mr->mr_sg, i, mr->mr_dir);
245 if (!mr->mr_nents) 235 if (!mr->mr_nents)
246 goto out_dmamap_err; 236 goto out_dmamap_err;
247 trace_xprtrdma_dma_map(mr); 237 trace_xprtrdma_mr_map(mr);
248 238
249 for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++) 239 for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++)
250 dma_pages[i] = sg_dma_address(&mr->mr_sg[i]); 240 dma_pages[i] = sg_dma_address(&mr->mr_sg[i]);
@@ -305,13 +295,13 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
305 list_for_each_entry(mr, mrs, mr_list) { 295 list_for_each_entry(mr, mrs, mr_list) {
306 dprintk("RPC: %s: unmapping fmr %p\n", 296 dprintk("RPC: %s: unmapping fmr %p\n",
307 __func__, &mr->fmr); 297 __func__, &mr->fmr);
308 trace_xprtrdma_localinv(mr); 298 trace_xprtrdma_mr_localinv(mr);
309 list_add_tail(&mr->fmr.fm_mr->list, &unmap_list); 299 list_add_tail(&mr->fmr.fm_mr->list, &unmap_list);
310 } 300 }
311 r_xprt->rx_stats.local_inv_needed++; 301 r_xprt->rx_stats.local_inv_needed++;
312 rc = ib_unmap_fmr(&unmap_list); 302 rc = ib_unmap_fmr(&unmap_list);
313 if (rc) 303 if (rc)
314 goto out_reset; 304 goto out_release;
315 305
316 /* ORDER: Now DMA unmap all of the req's MRs, and return 306 /* ORDER: Now DMA unmap all of the req's MRs, and return
317 * them to the free MW list. 307 * them to the free MW list.
@@ -324,13 +314,13 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
324 314
325 return; 315 return;
326 316
327out_reset: 317out_release:
328 pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); 318 pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc);
329 319
330 while (!list_empty(mrs)) { 320 while (!list_empty(mrs)) {
331 mr = rpcrdma_mr_pop(mrs); 321 mr = rpcrdma_mr_pop(mrs);
332 list_del(&mr->fmr.fm_mr->list); 322 list_del(&mr->fmr.fm_mr->list);
333 fmr_op_recover_mr(mr); 323 rpcrdma_mr_recycle(mr);
334 } 324 }
335} 325}
336 326
@@ -338,7 +328,6 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
338 .ro_map = fmr_op_map, 328 .ro_map = fmr_op_map,
339 .ro_send = fmr_op_send, 329 .ro_send = fmr_op_send,
340 .ro_unmap_sync = fmr_op_unmap_sync, 330 .ro_unmap_sync = fmr_op_unmap_sync,
341 .ro_recover_mr = fmr_op_recover_mr,
342 .ro_open = fmr_op_open, 331 .ro_open = fmr_op_open,
343 .ro_maxpages = fmr_op_maxpages, 332 .ro_maxpages = fmr_op_maxpages,
344 .ro_init_mr = fmr_op_init_mr, 333 .ro_init_mr = fmr_op_init_mr,
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 1bb00dd6ccdb..fc6378cc0c1c 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -97,6 +97,44 @@ out_not_supported:
97 return false; 97 return false;
98} 98}
99 99
100static void
101frwr_op_release_mr(struct rpcrdma_mr *mr)
102{
103 int rc;
104
105 rc = ib_dereg_mr(mr->frwr.fr_mr);
106 if (rc)
107 pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n",
108 mr, rc);
109 kfree(mr->mr_sg);
110 kfree(mr);
111}
112
113/* MRs are dynamically allocated, so simply clean up and release the MR.
114 * A replacement MR will subsequently be allocated on demand.
115 */
116static void
117frwr_mr_recycle_worker(struct work_struct *work)
118{
119 struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle);
120 enum rpcrdma_frwr_state state = mr->frwr.fr_state;
121 struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
122
123 trace_xprtrdma_mr_recycle(mr);
124
125 if (state != FRWR_FLUSHED_LI) {
126 trace_xprtrdma_mr_unmap(mr);
127 ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
128 mr->mr_sg, mr->mr_nents, mr->mr_dir);
129 }
130
131 spin_lock(&r_xprt->rx_buf.rb_mrlock);
132 list_del(&mr->mr_all);
133 r_xprt->rx_stats.mrs_recycled++;
134 spin_unlock(&r_xprt->rx_buf.rb_mrlock);
135 frwr_op_release_mr(mr);
136}
137
100static int 138static int
101frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) 139frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
102{ 140{
@@ -113,6 +151,7 @@ frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
113 goto out_list_err; 151 goto out_list_err;
114 152
115 INIT_LIST_HEAD(&mr->mr_list); 153 INIT_LIST_HEAD(&mr->mr_list);
154 INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker);
116 sg_init_table(mr->mr_sg, depth); 155 sg_init_table(mr->mr_sg, depth);
117 init_completion(&frwr->fr_linv_done); 156 init_completion(&frwr->fr_linv_done);
118 return 0; 157 return 0;
@@ -131,79 +170,6 @@ out_list_err:
131 return rc; 170 return rc;
132} 171}
133 172
134static void
135frwr_op_release_mr(struct rpcrdma_mr *mr)
136{
137 int rc;
138
139 rc = ib_dereg_mr(mr->frwr.fr_mr);
140 if (rc)
141 pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n",
142 mr, rc);
143 kfree(mr->mr_sg);
144 kfree(mr);
145}
146
147static int
148__frwr_mr_reset(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
149{
150 struct rpcrdma_frwr *frwr = &mr->frwr;
151 int rc;
152
153 rc = ib_dereg_mr(frwr->fr_mr);
154 if (rc) {
155 pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n",
156 rc, mr);
157 return rc;
158 }
159
160 frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype,
161 ia->ri_max_frwr_depth);
162 if (IS_ERR(frwr->fr_mr)) {
163 pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n",
164 PTR_ERR(frwr->fr_mr), mr);
165 return PTR_ERR(frwr->fr_mr);
166 }
167
168 dprintk("RPC: %s: recovered FRWR %p\n", __func__, frwr);
169 frwr->fr_state = FRWR_IS_INVALID;
170 return 0;
171}
172
173/* Reset of a single FRWR. Generate a fresh rkey by replacing the MR.
174 */
175static void
176frwr_op_recover_mr(struct rpcrdma_mr *mr)
177{
178 enum rpcrdma_frwr_state state = mr->frwr.fr_state;
179 struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
180 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
181 int rc;
182
183 rc = __frwr_mr_reset(ia, mr);
184 if (state != FRWR_FLUSHED_LI) {
185 trace_xprtrdma_dma_unmap(mr);
186 ib_dma_unmap_sg(ia->ri_device,
187 mr->mr_sg, mr->mr_nents, mr->mr_dir);
188 }
189 if (rc)
190 goto out_release;
191
192 rpcrdma_mr_put(mr);
193 r_xprt->rx_stats.mrs_recovered++;
194 return;
195
196out_release:
197 pr_err("rpcrdma: FRWR reset failed %d, %p released\n", rc, mr);
198 r_xprt->rx_stats.mrs_orphaned++;
199
200 spin_lock(&r_xprt->rx_buf.rb_mrlock);
201 list_del(&mr->mr_all);
202 spin_unlock(&r_xprt->rx_buf.rb_mrlock);
203
204 frwr_op_release_mr(mr);
205}
206
207/* On success, sets: 173/* On success, sets:
208 * ep->rep_attr.cap.max_send_wr 174 * ep->rep_attr.cap.max_send_wr
209 * ep->rep_attr.cap.max_recv_wr 175 * ep->rep_attr.cap.max_recv_wr
@@ -276,6 +242,7 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
276 242
277 ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / 243 ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
278 ia->ri_max_frwr_depth); 244 ia->ri_max_frwr_depth);
245 ia->ri_max_segs += 2; /* segments for head and tail buffers */
279 return 0; 246 return 0;
280} 247}
281 248
@@ -384,7 +351,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
384 mr = NULL; 351 mr = NULL;
385 do { 352 do {
386 if (mr) 353 if (mr)
387 rpcrdma_mr_defer_recovery(mr); 354 rpcrdma_mr_recycle(mr);
388 mr = rpcrdma_mr_get(r_xprt); 355 mr = rpcrdma_mr_get(r_xprt);
389 if (!mr) 356 if (!mr)
390 return ERR_PTR(-EAGAIN); 357 return ERR_PTR(-EAGAIN);
@@ -417,7 +384,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
417 mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir); 384 mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir);
418 if (!mr->mr_nents) 385 if (!mr->mr_nents)
419 goto out_dmamap_err; 386 goto out_dmamap_err;
420 trace_xprtrdma_dma_map(mr); 387 trace_xprtrdma_mr_map(mr);
421 388
422 ibmr = frwr->fr_mr; 389 ibmr = frwr->fr_mr;
423 n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); 390 n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE);
@@ -451,7 +418,7 @@ out_dmamap_err:
451out_mapmr_err: 418out_mapmr_err:
452 pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", 419 pr_err("rpcrdma: failed to map mr %p (%d/%d)\n",
453 frwr->fr_mr, n, mr->mr_nents); 420 frwr->fr_mr, n, mr->mr_nents);
454 rpcrdma_mr_defer_recovery(mr); 421 rpcrdma_mr_recycle(mr);
455 return ERR_PTR(-EIO); 422 return ERR_PTR(-EIO);
456} 423}
457 424
@@ -499,7 +466,7 @@ frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
499 list_for_each_entry(mr, mrs, mr_list) 466 list_for_each_entry(mr, mrs, mr_list)
500 if (mr->mr_handle == rep->rr_inv_rkey) { 467 if (mr->mr_handle == rep->rr_inv_rkey) {
501 list_del_init(&mr->mr_list); 468 list_del_init(&mr->mr_list);
502 trace_xprtrdma_remoteinv(mr); 469 trace_xprtrdma_mr_remoteinv(mr);
503 mr->frwr.fr_state = FRWR_IS_INVALID; 470 mr->frwr.fr_state = FRWR_IS_INVALID;
504 rpcrdma_mr_unmap_and_put(mr); 471 rpcrdma_mr_unmap_and_put(mr);
505 break; /* only one invalidated MR per RPC */ 472 break; /* only one invalidated MR per RPC */
@@ -536,7 +503,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
536 mr->frwr.fr_state = FRWR_IS_INVALID; 503 mr->frwr.fr_state = FRWR_IS_INVALID;
537 504
538 frwr = &mr->frwr; 505 frwr = &mr->frwr;
539 trace_xprtrdma_localinv(mr); 506 trace_xprtrdma_mr_localinv(mr);
540 507
541 frwr->fr_cqe.done = frwr_wc_localinv; 508 frwr->fr_cqe.done = frwr_wc_localinv;
542 last = &frwr->fr_invwr; 509 last = &frwr->fr_invwr;
@@ -570,7 +537,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
570 if (bad_wr != first) 537 if (bad_wr != first)
571 wait_for_completion(&frwr->fr_linv_done); 538 wait_for_completion(&frwr->fr_linv_done);
572 if (rc) 539 if (rc)
573 goto reset_mrs; 540 goto out_release;
574 541
575 /* ORDER: Now DMA unmap all of the MRs, and return 542 /* ORDER: Now DMA unmap all of the MRs, and return
576 * them to the free MR list. 543 * them to the free MR list.
@@ -582,22 +549,21 @@ unmap:
582 } 549 }
583 return; 550 return;
584 551
585reset_mrs: 552out_release:
586 pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc); 553 pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc);
587 554
588 /* Find and reset the MRs in the LOCAL_INV WRs that did not 555 /* Unmap and release the MRs in the LOCAL_INV WRs that did not
589 * get posted. 556 * get posted.
590 */ 557 */
591 while (bad_wr) { 558 while (bad_wr) {
592 frwr = container_of(bad_wr, struct rpcrdma_frwr, 559 frwr = container_of(bad_wr, struct rpcrdma_frwr,
593 fr_invwr); 560 fr_invwr);
594 mr = container_of(frwr, struct rpcrdma_mr, frwr); 561 mr = container_of(frwr, struct rpcrdma_mr, frwr);
595
596 __frwr_mr_reset(ia, mr);
597
598 bad_wr = bad_wr->next; 562 bad_wr = bad_wr->next;
563
564 list_del(&mr->mr_list);
565 frwr_op_release_mr(mr);
599 } 566 }
600 goto unmap;
601} 567}
602 568
603const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { 569const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
@@ -605,7 +571,6 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
605 .ro_send = frwr_op_send, 571 .ro_send = frwr_op_send,
606 .ro_reminv = frwr_op_reminv, 572 .ro_reminv = frwr_op_reminv,
607 .ro_unmap_sync = frwr_op_unmap_sync, 573 .ro_unmap_sync = frwr_op_unmap_sync,
608 .ro_recover_mr = frwr_op_recover_mr,
609 .ro_open = frwr_op_open, 574 .ro_open = frwr_op_open,
610 .ro_maxpages = frwr_op_maxpages, 575 .ro_maxpages = frwr_op_maxpages,
611 .ro_init_mr = frwr_op_init_mr, 576 .ro_init_mr = frwr_op_init_mr,
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index c8ae983c6cc0..9f53e0240035 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -71,7 +71,6 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
71 size = RPCRDMA_HDRLEN_MIN; 71 size = RPCRDMA_HDRLEN_MIN;
72 72
73 /* Maximum Read list size */ 73 /* Maximum Read list size */
74 maxsegs += 2; /* segment for head and tail buffers */
75 size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); 74 size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32);
76 75
77 /* Minimal Read chunk size */ 76 /* Minimal Read chunk size */
@@ -97,7 +96,6 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
97 size = RPCRDMA_HDRLEN_MIN; 96 size = RPCRDMA_HDRLEN_MIN;
98 97
99 /* Maximum Write list size */ 98 /* Maximum Write list size */
100 maxsegs += 2; /* segment for head and tail buffers */
101 size = sizeof(__be32); /* segment count */ 99 size = sizeof(__be32); /* segment count */
102 size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); 100 size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32);
103 size += sizeof(__be32); /* list discriminator */ 101 size += sizeof(__be32); /* list discriminator */
@@ -805,7 +803,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
805 struct rpcrdma_mr *mr; 803 struct rpcrdma_mr *mr;
806 804
807 mr = rpcrdma_mr_pop(&req->rl_registered); 805 mr = rpcrdma_mr_pop(&req->rl_registered);
808 rpcrdma_mr_defer_recovery(mr); 806 rpcrdma_mr_recycle(mr);
809 } 807 }
810 808
811 /* This implementation supports the following combinations 809 /* This implementation supports the following combinations
@@ -866,7 +864,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
866out_err: 864out_err:
867 switch (ret) { 865 switch (ret) {
868 case -EAGAIN: 866 case -EAGAIN:
869 xprt_wait_for_buffer_space(rqst->rq_task, NULL); 867 xprt_wait_for_buffer_space(rqst->rq_xprt);
870 break; 868 break;
871 case -ENOBUFS: 869 case -ENOBUFS:
872 break; 870 break;
@@ -1216,7 +1214,6 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
1216 struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; 1214 struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
1217 struct rpc_xprt *xprt = &r_xprt->rx_xprt; 1215 struct rpc_xprt *xprt = &r_xprt->rx_xprt;
1218 struct rpc_rqst *rqst = rep->rr_rqst; 1216 struct rpc_rqst *rqst = rep->rr_rqst;
1219 unsigned long cwnd;
1220 int status; 1217 int status;
1221 1218
1222 xprt->reestablish_timeout = 0; 1219 xprt->reestablish_timeout = 0;
@@ -1238,15 +1235,10 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
1238 goto out_badheader; 1235 goto out_badheader;
1239 1236
1240out: 1237out:
1241 spin_lock(&xprt->recv_lock); 1238 spin_lock(&xprt->queue_lock);
1242 cwnd = xprt->cwnd;
1243 xprt->cwnd = r_xprt->rx_buf.rb_credits << RPC_CWNDSHIFT;
1244 if (xprt->cwnd > cwnd)
1245 xprt_release_rqst_cong(rqst->rq_task);
1246
1247 xprt_complete_rqst(rqst->rq_task, status); 1239 xprt_complete_rqst(rqst->rq_task, status);
1248 xprt_unpin_rqst(rqst); 1240 xprt_unpin_rqst(rqst);
1249 spin_unlock(&xprt->recv_lock); 1241 spin_unlock(&xprt->queue_lock);
1250 return; 1242 return;
1251 1243
1252/* If the incoming reply terminated a pending RPC, the next 1244/* If the incoming reply terminated a pending RPC, the next
@@ -1345,19 +1337,23 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
1345 /* Match incoming rpcrdma_rep to an rpcrdma_req to 1337 /* Match incoming rpcrdma_rep to an rpcrdma_req to
1346 * get context for handling any incoming chunks. 1338 * get context for handling any incoming chunks.
1347 */ 1339 */
1348 spin_lock(&xprt->recv_lock); 1340 spin_lock(&xprt->queue_lock);
1349 rqst = xprt_lookup_rqst(xprt, rep->rr_xid); 1341 rqst = xprt_lookup_rqst(xprt, rep->rr_xid);
1350 if (!rqst) 1342 if (!rqst)
1351 goto out_norqst; 1343 goto out_norqst;
1352 xprt_pin_rqst(rqst); 1344 xprt_pin_rqst(rqst);
1345 spin_unlock(&xprt->queue_lock);
1353 1346
1354 if (credits == 0) 1347 if (credits == 0)
1355 credits = 1; /* don't deadlock */ 1348 credits = 1; /* don't deadlock */
1356 else if (credits > buf->rb_max_requests) 1349 else if (credits > buf->rb_max_requests)
1357 credits = buf->rb_max_requests; 1350 credits = buf->rb_max_requests;
1358 buf->rb_credits = credits; 1351 if (buf->rb_credits != credits) {
1359 1352 spin_lock_bh(&xprt->transport_lock);
1360 spin_unlock(&xprt->recv_lock); 1353 buf->rb_credits = credits;
1354 xprt->cwnd = credits << RPC_CWNDSHIFT;
1355 spin_unlock_bh(&xprt->transport_lock);
1356 }
1361 1357
1362 req = rpcr_to_rdmar(rqst); 1358 req = rpcr_to_rdmar(rqst);
1363 req->rl_reply = rep; 1359 req->rl_reply = rep;
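The credit-to-congestion-window update above now happens once per change in the server's credit grant, under transport_lock, instead of on every completed RPC. A worked example of the conversion, assuming RPC_CWNDSHIFT is 8 (so RPC_CWNDSCALE is 256), as in current kernels:

/*
 *   server grants 128 credits
 *   xprt->cwnd = 128 << RPC_CWNDSHIFT = 128 * 256 = 32768
 *
 * The generic congestion code charges RPC_CWNDSCALE per request in
 * flight, so this cwnd admits exactly 128 congestion-controlled
 * requests at a time; taking transport_lock keeps the cwnd update
 * atomic with respect to that accounting.
 */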
@@ -1378,7 +1374,7 @@ out_badversion:
1378 * is corrupt. 1374 * is corrupt.
1379 */ 1375 */
1380out_norqst: 1376out_norqst:
1381 spin_unlock(&xprt->recv_lock); 1377 spin_unlock(&xprt->queue_lock);
1382 trace_xprtrdma_reply_rqst(rep); 1378 trace_xprtrdma_reply_rqst(rep);
1383 goto repost; 1379 goto repost;
1384 1380
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index a68180090554..d3a1a237cee6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -56,7 +56,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp,
56 if (src->iov_len < 24) 56 if (src->iov_len < 24)
57 goto out_shortreply; 57 goto out_shortreply;
58 58
59 spin_lock(&xprt->recv_lock); 59 spin_lock(&xprt->queue_lock);
60 req = xprt_lookup_rqst(xprt, xid); 60 req = xprt_lookup_rqst(xprt, xid);
61 if (!req) 61 if (!req)
62 goto out_notfound; 62 goto out_notfound;
@@ -86,7 +86,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp,
86 rcvbuf->len = 0; 86 rcvbuf->len = 0;
87 87
88out_unlock: 88out_unlock:
89 spin_unlock(&xprt->recv_lock); 89 spin_unlock(&xprt->queue_lock);
90out: 90out:
91 return ret; 91 return ret;
92 92
@@ -215,9 +215,8 @@ drop_connection:
215 * connection. 215 * connection.
216 */ 216 */
217static int 217static int
218xprt_rdma_bc_send_request(struct rpc_task *task) 218xprt_rdma_bc_send_request(struct rpc_rqst *rqst)
219{ 219{
220 struct rpc_rqst *rqst = task->tk_rqstp;
221 struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; 220 struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
222 struct svcxprt_rdma *rdma; 221 struct svcxprt_rdma *rdma;
223 int ret; 222 int ret;
@@ -225,12 +224,7 @@ xprt_rdma_bc_send_request(struct rpc_task *task)
225 dprintk("svcrdma: sending bc call with xid: %08x\n", 224 dprintk("svcrdma: sending bc call with xid: %08x\n",
226 be32_to_cpu(rqst->rq_xid)); 225 be32_to_cpu(rqst->rq_xid));
227 226
228 if (!mutex_trylock(&sxprt->xpt_mutex)) { 227 mutex_lock(&sxprt->xpt_mutex);
229 rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL);
230 if (!mutex_trylock(&sxprt->xpt_mutex))
231 return -EAGAIN;
232 rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task);
233 }
234 228
235 ret = -ENOTCONN; 229 ret = -ENOTCONN;
236 rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); 230 rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
@@ -248,6 +242,7 @@ static void
248xprt_rdma_bc_close(struct rpc_xprt *xprt) 242xprt_rdma_bc_close(struct rpc_xprt *xprt)
249{ 243{
250 dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); 244 dprintk("svcrdma: %s: xprt %p\n", __func__, xprt);
245 xprt->cwnd = RPC_CWNDSHIFT;
251} 246}
252 247
253static void 248static void
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 143ce2579ba9..ae2a83828953 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -225,69 +225,59 @@ xprt_rdma_free_addresses(struct rpc_xprt *xprt)
225 } 225 }
226} 226}
227 227
228void 228/**
229rpcrdma_conn_func(struct rpcrdma_ep *ep) 229 * xprt_rdma_connect_worker - establish connection in the background
230{ 230 * @work: worker thread context
231 schedule_delayed_work(&ep->rep_connect_worker, 0); 231 *
232} 232 * Requester holds the xprt's send lock to prevent activity on this
233 233 * transport while a fresh connection is being established. RPC tasks
234void 234 * sleep on the xprt's pending queue waiting for connect to complete.
235rpcrdma_connect_worker(struct work_struct *work) 235 */
236{
237 struct rpcrdma_ep *ep =
238 container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
239 struct rpcrdma_xprt *r_xprt =
240 container_of(ep, struct rpcrdma_xprt, rx_ep);
241 struct rpc_xprt *xprt = &r_xprt->rx_xprt;
242
243 spin_lock_bh(&xprt->transport_lock);
244 if (ep->rep_connected > 0) {
245 if (!xprt_test_and_set_connected(xprt))
246 xprt_wake_pending_tasks(xprt, 0);
247 } else {
248 if (xprt_test_and_clear_connected(xprt))
249 xprt_wake_pending_tasks(xprt, -ENOTCONN);
250 }
251 spin_unlock_bh(&xprt->transport_lock);
252}
253
254static void 236static void
255xprt_rdma_connect_worker(struct work_struct *work) 237xprt_rdma_connect_worker(struct work_struct *work)
256{ 238{
257 struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt, 239 struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt,
258 rx_connect_worker.work); 240 rx_connect_worker.work);
259 struct rpc_xprt *xprt = &r_xprt->rx_xprt; 241 struct rpc_xprt *xprt = &r_xprt->rx_xprt;
260 int rc = 0; 242 int rc;
261
262 xprt_clear_connected(xprt);
263 243
264 rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); 244 rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
265 if (rc)
266 xprt_wake_pending_tasks(xprt, rc);
267
268 xprt_clear_connecting(xprt); 245 xprt_clear_connecting(xprt);
246 if (r_xprt->rx_ep.rep_connected > 0) {
247 if (!xprt_test_and_set_connected(xprt)) {
248 xprt->stat.connect_count++;
249 xprt->stat.connect_time += (long)jiffies -
250 xprt->stat.connect_start;
251 xprt_wake_pending_tasks(xprt, -EAGAIN);
252 }
253 } else {
254 if (xprt_test_and_clear_connected(xprt))
255 xprt_wake_pending_tasks(xprt, rc);
256 }
269} 257}
270 258
259/**
260 * xprt_rdma_inject_disconnect - inject a connection fault
261 * @xprt: transport context
262 *
263 * If @xprt is connected, disconnect it to simulate spurious connection
264 * loss.
265 */
271static void 266static void
272xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) 267xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
273{ 268{
274 struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt, 269 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
275 rx_xprt);
276 270
277 trace_xprtrdma_inject_dsc(r_xprt); 271 trace_xprtrdma_inject_dsc(r_xprt);
278 rdma_disconnect(r_xprt->rx_ia.ri_id); 272 rdma_disconnect(r_xprt->rx_ia.ri_id);
279} 273}
280 274
281/* 275/**
282 * xprt_rdma_destroy 276 * xprt_rdma_destroy - Full tear down of transport
277 * @xprt: doomed transport context
283 * 278 *
284 * Destroy the xprt. 279 * Caller guarantees there will be no more calls to us with
285 * Free all memory associated with the object, including its own. 280 * this @xprt.
286 * NOTE: none of the *destroy methods free memory for their top-level
287 * objects, even though they may have allocated it (they do free
288 * private memory). It's up to the caller to handle it. In this
289 * case (RDMA transport), all structure memory is inlined with the
290 * struct rpcrdma_xprt.
291 */ 281 */
292static void 282static void
293xprt_rdma_destroy(struct rpc_xprt *xprt) 283xprt_rdma_destroy(struct rpc_xprt *xprt)
@@ -298,8 +288,6 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
298 288
299 cancel_delayed_work_sync(&r_xprt->rx_connect_worker); 289 cancel_delayed_work_sync(&r_xprt->rx_connect_worker);
300 290
301 xprt_clear_connected(xprt);
302
303 rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); 291 rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
304 rpcrdma_buffer_destroy(&r_xprt->rx_buf); 292 rpcrdma_buffer_destroy(&r_xprt->rx_buf);
305 rpcrdma_ia_close(&r_xprt->rx_ia); 293 rpcrdma_ia_close(&r_xprt->rx_ia);
@@ -442,11 +430,12 @@ out1:
442} 430}
443 431
444/** 432/**
445 * xprt_rdma_close - Close down RDMA connection 433 * xprt_rdma_close - close a transport connection
446 * @xprt: generic transport to be closed 434 * @xprt: transport context
447 * 435 *
448 * Called during transport shutdown reconnect, or device 436 * Called during transport shutdown, reconnect, or device removal.
449 * removal. Caller holds the transport's write lock. 437 * Caller holds @xprt's send lock to prevent activity on this
438 * transport while the connection is torn down.
450 */ 439 */
451static void 440static void
452xprt_rdma_close(struct rpc_xprt *xprt) 441xprt_rdma_close(struct rpc_xprt *xprt)
@@ -468,6 +457,12 @@ xprt_rdma_close(struct rpc_xprt *xprt)
468 xprt->reestablish_timeout = 0; 457 xprt->reestablish_timeout = 0;
469 xprt_disconnect_done(xprt); 458 xprt_disconnect_done(xprt);
470 rpcrdma_ep_disconnect(ep, ia); 459 rpcrdma_ep_disconnect(ep, ia);
460
461 /* Prepare @xprt for the next connection by reinitializing
462 * its credit grant to one (see RFC 8166, Section 3.3.3).
463 */
464 r_xprt->rx_buf.rb_credits = 1;
465 xprt->cwnd = RPC_CWNDSHIFT;
471} 466}
472 467
473/** 468/**
@@ -519,6 +514,12 @@ xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task)
519 xprt_force_disconnect(xprt); 514 xprt_force_disconnect(xprt);
520} 515}
521 516
517/**
518 * xprt_rdma_connect - try to establish a transport connection
519 * @xprt: transport state
520 * @task: RPC scheduler context
521 *
522 */
522static void 523static void
523xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) 524xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
524{ 525{
@@ -638,13 +639,6 @@ rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
638 * 0: Success; rq_buffer points to RPC buffer to use 639 * 0: Success; rq_buffer points to RPC buffer to use
639 * ENOMEM: Out of memory, call again later 640 * ENOMEM: Out of memory, call again later
640 * EIO: A permanent error occurred, do not retry 641 * EIO: A permanent error occurred, do not retry
641 *
642 * The RDMA allocate/free functions need the task structure as a place
643 * to hide the struct rpcrdma_req, which is necessary for the actual
644 * send/recv sequence.
645 *
646 * xprt_rdma_allocate provides buffers that are already mapped for
647 * DMA, and a local DMA lkey is provided for each.
648 */ 642 */
649static int 643static int
650xprt_rdma_allocate(struct rpc_task *task) 644xprt_rdma_allocate(struct rpc_task *task)
@@ -693,7 +687,7 @@ xprt_rdma_free(struct rpc_task *task)
693 687
694/** 688/**
695 * xprt_rdma_send_request - marshal and send an RPC request 689 * xprt_rdma_send_request - marshal and send an RPC request
696 * @task: RPC task with an RPC message in rq_snd_buf 690 * @rqst: RPC message in rq_snd_buf
697 * 691 *
698 * Caller holds the transport's write lock. 692 * Caller holds the transport's write lock.
699 * 693 *
@@ -706,9 +700,8 @@ xprt_rdma_free(struct rpc_task *task)
706 * sent. Do not try to send this message again. 700 * sent. Do not try to send this message again.
707 */ 701 */
708static int 702static int
709xprt_rdma_send_request(struct rpc_task *task) 703xprt_rdma_send_request(struct rpc_rqst *rqst)
710{ 704{
711 struct rpc_rqst *rqst = task->tk_rqstp;
712 struct rpc_xprt *xprt = rqst->rq_xprt; 705 struct rpc_xprt *xprt = rqst->rq_xprt;
713 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 706 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
714 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 707 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
@@ -722,6 +715,9 @@ xprt_rdma_send_request(struct rpc_task *task)
722 if (!xprt_connected(xprt)) 715 if (!xprt_connected(xprt))
723 goto drop_connection; 716 goto drop_connection;
724 717
718 if (!xprt_request_get_cong(xprt, rqst))
719 return -EBADSLT;
720
725 rc = rpcrdma_marshal_req(r_xprt, rqst); 721 rc = rpcrdma_marshal_req(r_xprt, rqst);
726 if (rc < 0) 722 if (rc < 0)
727 goto failed_marshal; 723 goto failed_marshal;
@@ -741,7 +737,7 @@ xprt_rdma_send_request(struct rpc_task *task)
741 /* An RPC with no reply will throw off credit accounting, 737 /* An RPC with no reply will throw off credit accounting,
742 * so drop the connection to reset the credit grant. 738 * so drop the connection to reset the credit grant.
743 */ 739 */
744 if (!rpc_reply_expected(task)) 740 if (!rpc_reply_expected(rqst->rq_task))
745 goto drop_connection; 741 goto drop_connection;
746 return 0; 742 return 0;
747 743
@@ -766,7 +762,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
766 0, /* need a local port? */ 762 0, /* need a local port? */
767 xprt->stat.bind_count, 763 xprt->stat.bind_count,
768 xprt->stat.connect_count, 764 xprt->stat.connect_count,
769 xprt->stat.connect_time, 765 xprt->stat.connect_time / HZ,
770 idle_time, 766 idle_time,
771 xprt->stat.sends, 767 xprt->stat.sends,
772 xprt->stat.recvs, 768 xprt->stat.recvs,
@@ -786,7 +782,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
786 r_xprt->rx_stats.bad_reply_count, 782 r_xprt->rx_stats.bad_reply_count,
787 r_xprt->rx_stats.nomsg_call_count); 783 r_xprt->rx_stats.nomsg_call_count);
788 seq_printf(seq, "%lu %lu %lu %lu %lu %lu\n", 784 seq_printf(seq, "%lu %lu %lu %lu %lu %lu\n",
789 r_xprt->rx_stats.mrs_recovered, 785 r_xprt->rx_stats.mrs_recycled,
790 r_xprt->rx_stats.mrs_orphaned, 786 r_xprt->rx_stats.mrs_orphaned,
791 r_xprt->rx_stats.mrs_allocated, 787 r_xprt->rx_stats.mrs_allocated,
792 r_xprt->rx_stats.local_inv_needed, 788 r_xprt->rx_stats.local_inv_needed,
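
The transport.c hunks above make two related flow-control changes: xprt_rdma_close() now resets the transport's credit grant to a single credit and re-seeds xprt->cwnd (per RFC 8166, Section 3.3.3, a fresh RPC-over-RDMA connection starts with one credit until the server's first reply advertises a larger grant), and the connect_time statistic is now kept in jiffies and divided by HZ only when printed. The short userspace sketch below illustrates the credit-to-window relationship only; the constant values and helper name are assumptions modelled on the kernel's scaled-window convention, not the in-tree code.

    #include <stdio.h>

    /*
     * Illustrative constants only: they copy the naming style of the RPC
     * congestion-window macros but are assumptions made for this sketch.
     */
    #define DEMO_CWNDSHIFT  8U
    #define DEMO_CWNDSCALE  (1UL << DEMO_CWNDSHIFT)

    /*
     * One credit permits one RPC in flight; the window is kept scaled so
     * that congestion-avoidance arithmetic can adjust it later in
     * sub-request increments.
     */
    static unsigned long demo_cwnd_from_credits(unsigned int credits)
    {
            return (unsigned long)credits * DEMO_CWNDSCALE;
    }

    int main(void)
    {
            /* Immediately after a close or disconnect: a single credit. */
            printf("cwnd after close: %lu\n", demo_cwnd_from_credits(1));

            /* After the server's first reply advertises a larger grant. */
            printf("cwnd in steady state: %lu\n", demo_cwnd_from_credits(128));
            return 0;
    }
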
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 956a5ea47b58..3ddba94c939f 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -108,20 +108,48 @@ rpcrdma_destroy_wq(void)
108 } 108 }
109} 109}
110 110
111/**
112 * rpcrdma_disconnect_worker - Force a disconnect
113 * @work: endpoint to be disconnected
114 *
115 * Provider callbacks can possibly run in an IRQ context. This function
116 * is invoked in a worker thread to guarantee that disconnect wake-up
117 * calls are always done in process context.
118 */
119static void
120rpcrdma_disconnect_worker(struct work_struct *work)
121{
122 struct rpcrdma_ep *ep = container_of(work, struct rpcrdma_ep,
123 rep_disconnect_worker.work);
124 struct rpcrdma_xprt *r_xprt =
125 container_of(ep, struct rpcrdma_xprt, rx_ep);
126
127 xprt_force_disconnect(&r_xprt->rx_xprt);
128}
129
130/**
131 * rpcrdma_qp_event_handler - Handle one QP event (error notification)
132 * @event: details of the event
133 * @context: ep that owns QP where event occurred
134 *
135 * Called from the RDMA provider (device driver) possibly in an interrupt
136 * context.
137 */
111static void 138static void
112rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) 139rpcrdma_qp_event_handler(struct ib_event *event, void *context)
113{ 140{
114 struct rpcrdma_ep *ep = context; 141 struct rpcrdma_ep *ep = context;
115 struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, 142 struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
116 rx_ep); 143 rx_ep);
117 144
118 trace_xprtrdma_qp_error(r_xprt, event); 145 trace_xprtrdma_qp_event(r_xprt, event);
119 pr_err("rpcrdma: %s on device %s ep %p\n", 146 pr_err("rpcrdma: %s on device %s connected to %s:%s\n",
120 ib_event_msg(event->event), event->device->name, context); 147 ib_event_msg(event->event), event->device->name,
148 rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt));
121 149
122 if (ep->rep_connected == 1) { 150 if (ep->rep_connected == 1) {
123 ep->rep_connected = -EIO; 151 ep->rep_connected = -EIO;
124 rpcrdma_conn_func(ep); 152 schedule_delayed_work(&ep->rep_disconnect_worker, 0);
125 wake_up_all(&ep->rep_connect_wait); 153 wake_up_all(&ep->rep_connect_wait);
126 } 154 }
127} 155}
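
The hunk above is worth a second look: because the RDMA provider may deliver QP events in interrupt context, the handler no longer calls into the transport directly but schedules rep_disconnect_worker, and the actual xprt_force_disconnect() happens later in process context. A rough userspace analogue of that split, using POSIX threads, is sketched below; every name in it is invented for illustration and nothing here is the kernel implementation.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <unistd.h>

    /*
     * Shared state between the (pretend) interrupt-context upcall and the
     * worker thread that is allowed to sleep.
     */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  kick = PTHREAD_COND_INITIALIZER;
    static bool disconnect_requested;
    static bool shutting_down;

    /*
     * Called from "interrupt" context: must not sleep, so it only records
     * the request and wakes the worker.
     */
    static void fake_qp_event_upcall(void)
    {
            pthread_mutex_lock(&lock);
            disconnect_requested = true;
            pthread_cond_signal(&kick);
            pthread_mutex_unlock(&lock);
    }

    /*
     * Worker thread: runs in process context and may perform the blocking
     * teardown (the analogue of xprt_force_disconnect()).
     */
    static void *disconnect_worker(void *arg)
    {
            (void)arg;
            pthread_mutex_lock(&lock);
            while (!shutting_down) {
                    while (!disconnect_requested && !shutting_down)
                            pthread_cond_wait(&kick, &lock);
                    if (disconnect_requested) {
                            disconnect_requested = false;
                            pthread_mutex_unlock(&lock);
                            printf("worker: tearing down connection (may sleep)\n");
                            pthread_mutex_lock(&lock);
                    }
            }
            pthread_mutex_unlock(&lock);
            return NULL;
    }

    int main(void)
    {
            pthread_t tid;

            pthread_create(&tid, NULL, disconnect_worker, NULL);
            fake_qp_event_upcall();     /* simulate the provider callback */
            sleep(1);

            pthread_mutex_lock(&lock);
            shutting_down = true;
            pthread_cond_signal(&kick);
            pthread_mutex_unlock(&lock);
            pthread_join(tid, NULL);
            return 0;
    }
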
@@ -219,38 +247,48 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
219 rpcrdma_set_max_header_sizes(r_xprt); 247 rpcrdma_set_max_header_sizes(r_xprt);
220} 248}
221 249
250/**
251 * rpcrdma_cm_event_handler - Handle RDMA CM events
252 * @id: rdma_cm_id on which an event has occurred
253 * @event: details of the event
254 *
255 * Called with @id's mutex held. Returns 1 if caller should
256 * destroy @id, otherwise 0.
257 */
222static int 258static int
223rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) 259rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
224{ 260{
225 struct rpcrdma_xprt *xprt = id->context; 261 struct rpcrdma_xprt *r_xprt = id->context;
226 struct rpcrdma_ia *ia = &xprt->rx_ia; 262 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
227 struct rpcrdma_ep *ep = &xprt->rx_ep; 263 struct rpcrdma_ep *ep = &r_xprt->rx_ep;
228 int connstate = 0; 264 struct rpc_xprt *xprt = &r_xprt->rx_xprt;
265
266 might_sleep();
229 267
230 trace_xprtrdma_conn_upcall(xprt, event); 268 trace_xprtrdma_cm_event(r_xprt, event);
231 switch (event->event) { 269 switch (event->event) {
232 case RDMA_CM_EVENT_ADDR_RESOLVED: 270 case RDMA_CM_EVENT_ADDR_RESOLVED:
233 case RDMA_CM_EVENT_ROUTE_RESOLVED: 271 case RDMA_CM_EVENT_ROUTE_RESOLVED:
234 ia->ri_async_rc = 0; 272 ia->ri_async_rc = 0;
235 complete(&ia->ri_done); 273 complete(&ia->ri_done);
236 break; 274 return 0;
237 case RDMA_CM_EVENT_ADDR_ERROR: 275 case RDMA_CM_EVENT_ADDR_ERROR:
238 ia->ri_async_rc = -EPROTO; 276 ia->ri_async_rc = -EPROTO;
239 complete(&ia->ri_done); 277 complete(&ia->ri_done);
240 break; 278 return 0;
241 case RDMA_CM_EVENT_ROUTE_ERROR: 279 case RDMA_CM_EVENT_ROUTE_ERROR:
242 ia->ri_async_rc = -ENETUNREACH; 280 ia->ri_async_rc = -ENETUNREACH;
243 complete(&ia->ri_done); 281 complete(&ia->ri_done);
244 break; 282 return 0;
245 case RDMA_CM_EVENT_DEVICE_REMOVAL: 283 case RDMA_CM_EVENT_DEVICE_REMOVAL:
246#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 284#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
247 pr_info("rpcrdma: removing device %s for %s:%s\n", 285 pr_info("rpcrdma: removing device %s for %s:%s\n",
248 ia->ri_device->name, 286 ia->ri_device->name,
249 rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt)); 287 rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt));
250#endif 288#endif
251 set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); 289 set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
252 ep->rep_connected = -ENODEV; 290 ep->rep_connected = -ENODEV;
253 xprt_force_disconnect(&xprt->rx_xprt); 291 xprt_force_disconnect(xprt);
254 wait_for_completion(&ia->ri_remove_done); 292 wait_for_completion(&ia->ri_remove_done);
255 293
256 ia->ri_id = NULL; 294 ia->ri_id = NULL;
@@ -258,41 +296,40 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
258 /* Return 1 to ensure the core destroys the id. */ 296 /* Return 1 to ensure the core destroys the id. */
259 return 1; 297 return 1;
260 case RDMA_CM_EVENT_ESTABLISHED: 298 case RDMA_CM_EVENT_ESTABLISHED:
261 ++xprt->rx_xprt.connect_cookie; 299 ++xprt->connect_cookie;
262 connstate = 1; 300 ep->rep_connected = 1;
263 rpcrdma_update_connect_private(xprt, &event->param.conn); 301 rpcrdma_update_connect_private(r_xprt, &event->param.conn);
264 goto connected; 302 wake_up_all(&ep->rep_connect_wait);
303 break;
265 case RDMA_CM_EVENT_CONNECT_ERROR: 304 case RDMA_CM_EVENT_CONNECT_ERROR:
266 connstate = -ENOTCONN; 305 ep->rep_connected = -ENOTCONN;
267 goto connected; 306 goto disconnected;
268 case RDMA_CM_EVENT_UNREACHABLE: 307 case RDMA_CM_EVENT_UNREACHABLE:
269 connstate = -ENETUNREACH; 308 ep->rep_connected = -ENETUNREACH;
270 goto connected; 309 goto disconnected;
271 case RDMA_CM_EVENT_REJECTED: 310 case RDMA_CM_EVENT_REJECTED:
272 dprintk("rpcrdma: connection to %s:%s rejected: %s\n", 311 dprintk("rpcrdma: connection to %s:%s rejected: %s\n",
273 rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), 312 rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
274 rdma_reject_msg(id, event->status)); 313 rdma_reject_msg(id, event->status));
275 connstate = -ECONNREFUSED; 314 ep->rep_connected = -ECONNREFUSED;
276 if (event->status == IB_CM_REJ_STALE_CONN) 315 if (event->status == IB_CM_REJ_STALE_CONN)
277 connstate = -EAGAIN; 316 ep->rep_connected = -EAGAIN;
278 goto connected; 317 goto disconnected;
279 case RDMA_CM_EVENT_DISCONNECTED: 318 case RDMA_CM_EVENT_DISCONNECTED:
280 ++xprt->rx_xprt.connect_cookie; 319 ++xprt->connect_cookie;
281 connstate = -ECONNABORTED; 320 ep->rep_connected = -ECONNABORTED;
282connected: 321disconnected:
283 ep->rep_connected = connstate; 322 xprt_force_disconnect(xprt);
284 rpcrdma_conn_func(ep);
285 wake_up_all(&ep->rep_connect_wait); 323 wake_up_all(&ep->rep_connect_wait);
286 /*FALLTHROUGH*/ 324 break;
287 default: 325 default:
288 dprintk("RPC: %s: %s:%s on %s/%s (ep 0x%p): %s\n",
289 __func__,
290 rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt),
291 ia->ri_device->name, ia->ri_ops->ro_displayname,
292 ep, rdma_event_msg(event->event));
293 break; 326 break;
294 } 327 }
295 328
329 dprintk("RPC: %s: %s:%s on %s/%s: %s\n", __func__,
330 rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
331 ia->ri_device->name, ia->ri_ops->ro_displayname,
332 rdma_event_msg(event->event));
296 return 0; 333 return 0;
297} 334}
298 335
@@ -308,7 +345,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia)
308 init_completion(&ia->ri_done); 345 init_completion(&ia->ri_done);
309 init_completion(&ia->ri_remove_done); 346 init_completion(&ia->ri_remove_done);
310 347
311 id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_conn_upcall, 348 id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler,
312 xprt, RDMA_PS_TCP, IB_QPT_RC); 349 xprt, RDMA_PS_TCP, IB_QPT_RC);
313 if (IS_ERR(id)) { 350 if (IS_ERR(id)) {
314 rc = PTR_ERR(id); 351 rc = PTR_ERR(id);
@@ -519,7 +556,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
519 if (rc) 556 if (rc)
520 return rc; 557 return rc;
521 558
522 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; 559 ep->rep_attr.event_handler = rpcrdma_qp_event_handler;
523 ep->rep_attr.qp_context = ep; 560 ep->rep_attr.qp_context = ep;
524 ep->rep_attr.srq = NULL; 561 ep->rep_attr.srq = NULL;
525 ep->rep_attr.cap.max_send_sge = max_sge; 562 ep->rep_attr.cap.max_send_sge = max_sge;
@@ -542,7 +579,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
542 cdata->max_requests >> 2); 579 cdata->max_requests >> 2);
543 ep->rep_send_count = ep->rep_send_batch; 580 ep->rep_send_count = ep->rep_send_batch;
544 init_waitqueue_head(&ep->rep_connect_wait); 581 init_waitqueue_head(&ep->rep_connect_wait);
545 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); 582 INIT_DELAYED_WORK(&ep->rep_disconnect_worker,
583 rpcrdma_disconnect_worker);
546 584
547 sendcq = ib_alloc_cq(ia->ri_device, NULL, 585 sendcq = ib_alloc_cq(ia->ri_device, NULL,
548 ep->rep_attr.cap.max_send_wr + 1, 586 ep->rep_attr.cap.max_send_wr + 1,
@@ -615,7 +653,7 @@ out1:
615void 653void
616rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 654rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
617{ 655{
618 cancel_delayed_work_sync(&ep->rep_connect_worker); 656 cancel_delayed_work_sync(&ep->rep_disconnect_worker);
619 657
620 if (ia->ri_id && ia->ri_id->qp) { 658 if (ia->ri_id && ia->ri_id->qp) {
621 rpcrdma_ep_disconnect(ep, ia); 659 rpcrdma_ep_disconnect(ep, ia);
@@ -728,6 +766,7 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
728{ 766{
729 struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, 767 struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
730 rx_ia); 768 rx_ia);
769 struct rpc_xprt *xprt = &r_xprt->rx_xprt;
731 int rc; 770 int rc;
732 771
733retry: 772retry:
@@ -754,6 +793,8 @@ retry:
754 } 793 }
755 794
756 ep->rep_connected = 0; 795 ep->rep_connected = 0;
796 xprt_clear_connected(xprt);
797
757 rpcrdma_post_recvs(r_xprt, true); 798 rpcrdma_post_recvs(r_xprt, true);
758 799
759 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); 800 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
@@ -877,7 +918,6 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt)
877 sc->sc_xprt = r_xprt; 918 sc->sc_xprt = r_xprt;
878 buf->rb_sc_ctxs[i] = sc; 919 buf->rb_sc_ctxs[i] = sc;
879 } 920 }
880 buf->rb_flags = 0;
881 921
882 return 0; 922 return 0;
883 923
@@ -978,39 +1018,6 @@ rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc)
978} 1018}
979 1019
980static void 1020static void
981rpcrdma_mr_recovery_worker(struct work_struct *work)
982{
983 struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
984 rb_recovery_worker.work);
985 struct rpcrdma_mr *mr;
986
987 spin_lock(&buf->rb_recovery_lock);
988 while (!list_empty(&buf->rb_stale_mrs)) {
989 mr = rpcrdma_mr_pop(&buf->rb_stale_mrs);
990 spin_unlock(&buf->rb_recovery_lock);
991
992 trace_xprtrdma_recover_mr(mr);
993 mr->mr_xprt->rx_ia.ri_ops->ro_recover_mr(mr);
994
995 spin_lock(&buf->rb_recovery_lock);
996 }
997 spin_unlock(&buf->rb_recovery_lock);
998}
999
1000void
1001rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr)
1002{
1003 struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
1004 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1005
1006 spin_lock(&buf->rb_recovery_lock);
1007 rpcrdma_mr_push(mr, &buf->rb_stale_mrs);
1008 spin_unlock(&buf->rb_recovery_lock);
1009
1010 schedule_delayed_work(&buf->rb_recovery_worker, 0);
1011}
1012
1013static void
1014rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) 1021rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
1015{ 1022{
1016 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1023 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
@@ -1019,7 +1026,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
1019 LIST_HEAD(free); 1026 LIST_HEAD(free);
1020 LIST_HEAD(all); 1027 LIST_HEAD(all);
1021 1028
1022 for (count = 0; count < 3; count++) { 1029 for (count = 0; count < ia->ri_max_segs; count++) {
1023 struct rpcrdma_mr *mr; 1030 struct rpcrdma_mr *mr;
1024 int rc; 1031 int rc;
1025 1032
@@ -1138,18 +1145,15 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1138 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1145 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1139 int i, rc; 1146 int i, rc;
1140 1147
1148 buf->rb_flags = 0;
1141 buf->rb_max_requests = r_xprt->rx_data.max_requests; 1149 buf->rb_max_requests = r_xprt->rx_data.max_requests;
1142 buf->rb_bc_srv_max_requests = 0; 1150 buf->rb_bc_srv_max_requests = 0;
1143 spin_lock_init(&buf->rb_mrlock); 1151 spin_lock_init(&buf->rb_mrlock);
1144 spin_lock_init(&buf->rb_lock); 1152 spin_lock_init(&buf->rb_lock);
1145 spin_lock_init(&buf->rb_recovery_lock);
1146 INIT_LIST_HEAD(&buf->rb_mrs); 1153 INIT_LIST_HEAD(&buf->rb_mrs);
1147 INIT_LIST_HEAD(&buf->rb_all); 1154 INIT_LIST_HEAD(&buf->rb_all);
1148 INIT_LIST_HEAD(&buf->rb_stale_mrs);
1149 INIT_DELAYED_WORK(&buf->rb_refresh_worker, 1155 INIT_DELAYED_WORK(&buf->rb_refresh_worker,
1150 rpcrdma_mr_refresh_worker); 1156 rpcrdma_mr_refresh_worker);
1151 INIT_DELAYED_WORK(&buf->rb_recovery_worker,
1152 rpcrdma_mr_recovery_worker);
1153 1157
1154 rpcrdma_mrs_create(r_xprt); 1158 rpcrdma_mrs_create(r_xprt);
1155 1159
@@ -1233,7 +1237,6 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf)
1233void 1237void
1234rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) 1238rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1235{ 1239{
1236 cancel_delayed_work_sync(&buf->rb_recovery_worker);
1237 cancel_delayed_work_sync(&buf->rb_refresh_worker); 1240 cancel_delayed_work_sync(&buf->rb_refresh_worker);
1238 1241
1239 rpcrdma_sendctxs_destroy(buf); 1242 rpcrdma_sendctxs_destroy(buf);
@@ -1326,7 +1329,7 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr)
1326{ 1329{
1327 struct rpcrdma_xprt *r_xprt = mr->mr_xprt; 1330 struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
1328 1331
1329 trace_xprtrdma_dma_unmap(mr); 1332 trace_xprtrdma_mr_unmap(mr);
1330 ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, 1333 ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
1331 mr->mr_sg, mr->mr_nents, mr->mr_dir); 1334 mr->mr_sg, mr->mr_nents, mr->mr_dir);
1332 __rpcrdma_mr_put(&r_xprt->rx_buf, mr); 1335 __rpcrdma_mr_put(&r_xprt->rx_buf, mr);
@@ -1518,9 +1521,11 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
1518 struct ib_recv_wr *wr, *bad_wr; 1521 struct ib_recv_wr *wr, *bad_wr;
1519 int needed, count, rc; 1522 int needed, count, rc;
1520 1523
1524 rc = 0;
1525 count = 0;
1521 needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); 1526 needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
1522 if (buf->rb_posted_receives > needed) 1527 if (buf->rb_posted_receives > needed)
1523 return; 1528 goto out;
1524 needed -= buf->rb_posted_receives; 1529 needed -= buf->rb_posted_receives;
1525 1530
1526 count = 0; 1531 count = 0;
@@ -1556,7 +1561,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
1556 --needed; 1561 --needed;
1557 } 1562 }
1558 if (!count) 1563 if (!count)
1559 return; 1564 goto out;
1560 1565
1561 rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr, 1566 rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr,
1562 (const struct ib_recv_wr **)&bad_wr); 1567 (const struct ib_recv_wr **)&bad_wr);
@@ -1570,5 +1575,6 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
1570 } 1575 }
1571 } 1576 }
1572 buf->rb_posted_receives += count; 1577 buf->rb_posted_receives += count;
1578out:
1573 trace_xprtrdma_post_recvs(r_xprt, count, rc); 1579 trace_xprtrdma_post_recvs(r_xprt, count, rc);
1574} 1580}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 2ca14f7c2d51..a13ccb643ce0 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -101,7 +101,7 @@ struct rpcrdma_ep {
101 wait_queue_head_t rep_connect_wait; 101 wait_queue_head_t rep_connect_wait;
102 struct rpcrdma_connect_private rep_cm_private; 102 struct rpcrdma_connect_private rep_cm_private;
103 struct rdma_conn_param rep_remote_cma; 103 struct rdma_conn_param rep_remote_cma;
104 struct delayed_work rep_connect_worker; 104 struct delayed_work rep_disconnect_worker;
105}; 105};
106 106
107/* Pre-allocate extra Work Requests for handling backward receives 107/* Pre-allocate extra Work Requests for handling backward receives
@@ -280,6 +280,7 @@ struct rpcrdma_mr {
280 u32 mr_handle; 280 u32 mr_handle;
281 u32 mr_length; 281 u32 mr_length;
282 u64 mr_offset; 282 u64 mr_offset;
283 struct work_struct mr_recycle;
283 struct list_head mr_all; 284 struct list_head mr_all;
284}; 285};
285 286
@@ -411,9 +412,6 @@ struct rpcrdma_buffer {
411 412
412 u32 rb_bc_max_requests; 413 u32 rb_bc_max_requests;
413 414
414 spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */
415 struct list_head rb_stale_mrs;
416 struct delayed_work rb_recovery_worker;
417 struct delayed_work rb_refresh_worker; 415 struct delayed_work rb_refresh_worker;
418}; 416};
419#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) 417#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
@@ -452,7 +450,7 @@ struct rpcrdma_stats {
452 unsigned long hardway_register_count; 450 unsigned long hardway_register_count;
453 unsigned long failed_marshal_count; 451 unsigned long failed_marshal_count;
454 unsigned long bad_reply_count; 452 unsigned long bad_reply_count;
455 unsigned long mrs_recovered; 453 unsigned long mrs_recycled;
456 unsigned long mrs_orphaned; 454 unsigned long mrs_orphaned;
457 unsigned long mrs_allocated; 455 unsigned long mrs_allocated;
458 unsigned long empty_sendctx_q; 456 unsigned long empty_sendctx_q;
@@ -481,7 +479,6 @@ struct rpcrdma_memreg_ops {
481 struct list_head *mrs); 479 struct list_head *mrs);
482 void (*ro_unmap_sync)(struct rpcrdma_xprt *, 480 void (*ro_unmap_sync)(struct rpcrdma_xprt *,
483 struct list_head *); 481 struct list_head *);
484 void (*ro_recover_mr)(struct rpcrdma_mr *mr);
485 int (*ro_open)(struct rpcrdma_ia *, 482 int (*ro_open)(struct rpcrdma_ia *,
486 struct rpcrdma_ep *, 483 struct rpcrdma_ep *,
487 struct rpcrdma_create_data_internal *); 484 struct rpcrdma_create_data_internal *);
@@ -559,7 +556,6 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
559 struct rpcrdma_create_data_internal *); 556 struct rpcrdma_create_data_internal *);
560void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); 557void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
561int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); 558int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
562void rpcrdma_conn_func(struct rpcrdma_ep *ep);
563void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); 559void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
564 560
565int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, 561int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
@@ -578,7 +574,12 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf);
578struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); 574struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt);
579void rpcrdma_mr_put(struct rpcrdma_mr *mr); 575void rpcrdma_mr_put(struct rpcrdma_mr *mr);
580void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr); 576void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr);
581void rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr); 577
578static inline void
579rpcrdma_mr_recycle(struct rpcrdma_mr *mr)
580{
581 schedule_work(&mr->mr_recycle);
582}
582 583
583struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); 584struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
584void rpcrdma_buffer_put(struct rpcrdma_req *); 585void rpcrdma_buffer_put(struct rpcrdma_req *);
@@ -652,7 +653,6 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
652extern unsigned int xprt_rdma_max_inline_read; 653extern unsigned int xprt_rdma_max_inline_read;
653void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); 654void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
654void xprt_rdma_free_addresses(struct rpc_xprt *xprt); 655void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
655void rpcrdma_connect_worker(struct work_struct *work);
656void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq); 656void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq);
657int xprt_rdma_init(void); 657int xprt_rdma_init(void);
658void xprt_rdma_cleanup(void); 658void xprt_rdma_cleanup(void);
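
The header changes above retire the global stale-MR list, its spinlock, and the recovery worker; each rpcrdma_mr now embeds its own work_struct, and rpcrdma_mr_recycle() simply schedules it. The standalone sketch below shows the underlying idiom, embedding the work item in the object and recovering the object with container_of(); the structure and function names are invented, and the deferral is replaced by a direct call so the example stays self-contained.

    #include <stddef.h>
    #include <stdio.h>

    /* Minimal stand-ins for the kernel's work-queue types. */
    struct demo_work {
            void (*func)(struct demo_work *work);
    };

    static void demo_schedule_work(struct demo_work *work)
    {
            /* A real workqueue would defer this; the sketch runs it inline. */
            work->func(work);
    }

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    /*
     * The object of interest embeds its own work item, so no global list
     * or lock is needed to find it later.
     */
    struct demo_mr {
            int              handle;
            struct demo_work recycle;
    };

    static void demo_mr_recycle_worker(struct demo_work *work)
    {
            struct demo_mr *mr = container_of(work, struct demo_mr, recycle);

            printf("recycling MR handle %d\n", mr->handle);
    }

    int main(void)
    {
            struct demo_mr mr = {
                    .handle  = 42,
                    .recycle = { .func = demo_mr_recycle_worker },
            };

            /* Equivalent in spirit to scheduling the embedded work item. */
            demo_schedule_work(&mr.recycle);
            return 0;
    }
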
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 6b7539c0466e..1b51e04d3566 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -47,13 +47,13 @@
47#include <net/checksum.h> 47#include <net/checksum.h>
48#include <net/udp.h> 48#include <net/udp.h>
49#include <net/tcp.h> 49#include <net/tcp.h>
50#include <linux/bvec.h>
51#include <linux/uio.h>
50 52
51#include <trace/events/sunrpc.h> 53#include <trace/events/sunrpc.h>
52 54
53#include "sunrpc.h" 55#include "sunrpc.h"
54 56
55#define RPC_TCP_READ_CHUNK_SZ (3*512*1024)
56
57static void xs_close(struct rpc_xprt *xprt); 57static void xs_close(struct rpc_xprt *xprt);
58static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, 58static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
59 struct socket *sock); 59 struct socket *sock);
@@ -129,7 +129,7 @@ static struct ctl_table xs_tunables_table[] = {
129 .mode = 0644, 129 .mode = 0644,
130 .proc_handler = proc_dointvec_minmax, 130 .proc_handler = proc_dointvec_minmax,
131 .extra1 = &xprt_min_resvport_limit, 131 .extra1 = &xprt_min_resvport_limit,
132 .extra2 = &xprt_max_resvport 132 .extra2 = &xprt_max_resvport_limit
133 }, 133 },
134 { 134 {
135 .procname = "max_resvport", 135 .procname = "max_resvport",
@@ -137,7 +137,7 @@ static struct ctl_table xs_tunables_table[] = {
137 .maxlen = sizeof(unsigned int), 137 .maxlen = sizeof(unsigned int),
138 .mode = 0644, 138 .mode = 0644,
139 .proc_handler = proc_dointvec_minmax, 139 .proc_handler = proc_dointvec_minmax,
140 .extra1 = &xprt_min_resvport, 140 .extra1 = &xprt_min_resvport_limit,
141 .extra2 = &xprt_max_resvport_limit 141 .extra2 = &xprt_max_resvport_limit
142 }, 142 },
143 { 143 {
@@ -325,6 +325,362 @@ static void xs_free_peer_addresses(struct rpc_xprt *xprt)
325 } 325 }
326} 326}
327 327
328static size_t
329xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp)
330{
331 size_t i,n;
332
333 if (!(buf->flags & XDRBUF_SPARSE_PAGES))
334 return want;
335 if (want > buf->page_len)
336 want = buf->page_len;
337 n = (buf->page_base + want + PAGE_SIZE - 1) >> PAGE_SHIFT;
338 for (i = 0; i < n; i++) {
339 if (buf->pages[i])
340 continue;
341 buf->bvec[i].bv_page = buf->pages[i] = alloc_page(gfp);
342 if (!buf->pages[i]) {
343 buf->page_len = (i * PAGE_SIZE) - buf->page_base;
344 return buf->page_len;
345 }
346 }
347 return want;
348}
349
350static ssize_t
351xs_sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags, size_t seek)
352{
353 ssize_t ret;
354 if (seek != 0)
355 iov_iter_advance(&msg->msg_iter, seek);
356 ret = sock_recvmsg(sock, msg, flags);
357 return ret > 0 ? ret + seek : ret;
358}
359
360static ssize_t
361xs_read_kvec(struct socket *sock, struct msghdr *msg, int flags,
362 struct kvec *kvec, size_t count, size_t seek)
363{
364 iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, kvec, 1, count);
365 return xs_sock_recvmsg(sock, msg, flags, seek);
366}
367
368static ssize_t
369xs_read_bvec(struct socket *sock, struct msghdr *msg, int flags,
370 struct bio_vec *bvec, unsigned long nr, size_t count,
371 size_t seek)
372{
373 iov_iter_bvec(&msg->msg_iter, READ | ITER_BVEC, bvec, nr, count);
374 return xs_sock_recvmsg(sock, msg, flags, seek);
375}
376
377static ssize_t
378xs_read_discard(struct socket *sock, struct msghdr *msg, int flags,
379 size_t count)
380{
381 struct kvec kvec = { 0 };
382 return xs_read_kvec(sock, msg, flags | MSG_TRUNC, &kvec, count, 0);
383}
384
385static ssize_t
386xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags,
387 struct xdr_buf *buf, size_t count, size_t seek, size_t *read)
388{
389 size_t want, seek_init = seek, offset = 0;
390 ssize_t ret;
391
392 if (seek < buf->head[0].iov_len) {
393 want = min_t(size_t, count, buf->head[0].iov_len);
394 ret = xs_read_kvec(sock, msg, flags, &buf->head[0], want, seek);
395 if (ret <= 0)
396 goto sock_err;
397 offset += ret;
398 if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC))
399 goto out;
400 if (ret != want)
401 goto eagain;
402 seek = 0;
403 } else {
404 seek -= buf->head[0].iov_len;
405 offset += buf->head[0].iov_len;
406 }
407 if (seek < buf->page_len) {
408 want = xs_alloc_sparse_pages(buf,
409 min_t(size_t, count - offset, buf->page_len),
410 GFP_NOWAIT);
411 ret = xs_read_bvec(sock, msg, flags, buf->bvec,
412 xdr_buf_pagecount(buf),
413 want + buf->page_base,
414 seek + buf->page_base);
415 if (ret <= 0)
416 goto sock_err;
417 offset += ret - buf->page_base;
418 if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC))
419 goto out;
420 if (ret != want)
421 goto eagain;
422 seek = 0;
423 } else {
424 seek -= buf->page_len;
425 offset += buf->page_len;
426 }
427 if (seek < buf->tail[0].iov_len) {
428 want = min_t(size_t, count - offset, buf->tail[0].iov_len);
429 ret = xs_read_kvec(sock, msg, flags, &buf->tail[0], want, seek);
430 if (ret <= 0)
431 goto sock_err;
432 offset += ret;
433 if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC))
434 goto out;
435 if (ret != want)
436 goto eagain;
437 } else
438 offset += buf->tail[0].iov_len;
439 ret = -EMSGSIZE;
440 msg->msg_flags |= MSG_TRUNC;
441out:
442 *read = offset - seek_init;
443 return ret;
444eagain:
445 ret = -EAGAIN;
446 goto out;
447sock_err:
448 offset += seek;
449 goto out;
450}
451
452static void
453xs_read_header(struct sock_xprt *transport, struct xdr_buf *buf)
454{
455 if (!transport->recv.copied) {
456 if (buf->head[0].iov_len >= transport->recv.offset)
457 memcpy(buf->head[0].iov_base,
458 &transport->recv.xid,
459 transport->recv.offset);
460 transport->recv.copied = transport->recv.offset;
461 }
462}
463
464static bool
465xs_read_stream_request_done(struct sock_xprt *transport)
466{
467 return transport->recv.fraghdr & cpu_to_be32(RPC_LAST_STREAM_FRAGMENT);
468}
469
470static ssize_t
471xs_read_stream_request(struct sock_xprt *transport, struct msghdr *msg,
472 int flags, struct rpc_rqst *req)
473{
474 struct xdr_buf *buf = &req->rq_private_buf;
475 size_t want, read;
476 ssize_t ret;
477
478 xs_read_header(transport, buf);
479
480 want = transport->recv.len - transport->recv.offset;
481 ret = xs_read_xdr_buf(transport->sock, msg, flags, buf,
482 transport->recv.copied + want, transport->recv.copied,
483 &read);
484 transport->recv.offset += read;
485 transport->recv.copied += read;
486 if (transport->recv.offset == transport->recv.len) {
487 if (xs_read_stream_request_done(transport))
488 msg->msg_flags |= MSG_EOR;
489 return transport->recv.copied;
490 }
491
492 switch (ret) {
493 case -EMSGSIZE:
494 return transport->recv.copied;
495 case 0:
496 return -ESHUTDOWN;
497 default:
498 if (ret < 0)
499 return ret;
500 }
501 return -EAGAIN;
502}
503
504static size_t
505xs_read_stream_headersize(bool isfrag)
506{
507 if (isfrag)
508 return sizeof(__be32);
509 return 3 * sizeof(__be32);
510}
511
512static ssize_t
513xs_read_stream_header(struct sock_xprt *transport, struct msghdr *msg,
514 int flags, size_t want, size_t seek)
515{
516 struct kvec kvec = {
517 .iov_base = &transport->recv.fraghdr,
518 .iov_len = want,
519 };
520 return xs_read_kvec(transport->sock, msg, flags, &kvec, want, seek);
521}
522
523#if defined(CONFIG_SUNRPC_BACKCHANNEL)
524static ssize_t
525xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags)
526{
527 struct rpc_xprt *xprt = &transport->xprt;
528 struct rpc_rqst *req;
529 ssize_t ret;
530
531 /* Look up and lock the request corresponding to the given XID */
532 req = xprt_lookup_bc_request(xprt, transport->recv.xid);
533 if (!req) {
534 printk(KERN_WARNING "Callback slot table overflowed\n");
535 return -ESHUTDOWN;
536 }
537
538 ret = xs_read_stream_request(transport, msg, flags, req);
539 if (msg->msg_flags & (MSG_EOR|MSG_TRUNC))
540 xprt_complete_bc_request(req, ret);
541
542 return ret;
543}
544#else /* CONFIG_SUNRPC_BACKCHANNEL */
545static ssize_t
546xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags)
547{
548 return -ESHUTDOWN;
549}
550#endif /* CONFIG_SUNRPC_BACKCHANNEL */
551
552static ssize_t
553xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags)
554{
555 struct rpc_xprt *xprt = &transport->xprt;
556 struct rpc_rqst *req;
557 ssize_t ret = 0;
558
559 /* Look up and lock the request corresponding to the given XID */
560 spin_lock(&xprt->queue_lock);
561 req = xprt_lookup_rqst(xprt, transport->recv.xid);
562 if (!req) {
563 msg->msg_flags |= MSG_TRUNC;
564 goto out;
565 }
566 xprt_pin_rqst(req);
567 spin_unlock(&xprt->queue_lock);
568
569 ret = xs_read_stream_request(transport, msg, flags, req);
570
571 spin_lock(&xprt->queue_lock);
572 if (msg->msg_flags & (MSG_EOR|MSG_TRUNC))
573 xprt_complete_rqst(req->rq_task, ret);
574 xprt_unpin_rqst(req);
575out:
576 spin_unlock(&xprt->queue_lock);
577 return ret;
578}
579
580static ssize_t
581xs_read_stream(struct sock_xprt *transport, int flags)
582{
583 struct msghdr msg = { 0 };
584 size_t want, read = 0;
585 ssize_t ret = 0;
586
587 if (transport->recv.len == 0) {
588 want = xs_read_stream_headersize(transport->recv.copied != 0);
589 ret = xs_read_stream_header(transport, &msg, flags, want,
590 transport->recv.offset);
591 if (ret <= 0)
592 goto out_err;
593 transport->recv.offset = ret;
594 if (ret != want) {
595 ret = -EAGAIN;
596 goto out_err;
597 }
598 transport->recv.len = be32_to_cpu(transport->recv.fraghdr) &
599 RPC_FRAGMENT_SIZE_MASK;
600 transport->recv.offset -= sizeof(transport->recv.fraghdr);
601 read = ret;
602 }
603
604 switch (be32_to_cpu(transport->recv.calldir)) {
605 case RPC_CALL:
606 ret = xs_read_stream_call(transport, &msg, flags);
607 break;
608 case RPC_REPLY:
609 ret = xs_read_stream_reply(transport, &msg, flags);
610 }
611 if (msg.msg_flags & MSG_TRUNC) {
612 transport->recv.calldir = cpu_to_be32(-1);
613 transport->recv.copied = -1;
614 }
615 if (ret < 0)
616 goto out_err;
617 read += ret;
618 if (transport->recv.offset < transport->recv.len) {
619 ret = xs_read_discard(transport->sock, &msg, flags,
620 transport->recv.len - transport->recv.offset);
621 if (ret <= 0)
622 goto out_err;
623 transport->recv.offset += ret;
624 read += ret;
625 if (transport->recv.offset != transport->recv.len)
626 return -EAGAIN;
627 }
628 if (xs_read_stream_request_done(transport)) {
629 trace_xs_stream_read_request(transport);
630 transport->recv.copied = 0;
631 }
632 transport->recv.offset = 0;
633 transport->recv.len = 0;
634 return read;
635out_err:
636 switch (ret) {
637 case 0:
638 case -ESHUTDOWN:
639 xprt_force_disconnect(&transport->xprt);
640 return -ESHUTDOWN;
641 }
642 return ret;
643}
644
645static void xs_stream_data_receive(struct sock_xprt *transport)
646{
647 size_t read = 0;
648 ssize_t ret = 0;
649
650 mutex_lock(&transport->recv_mutex);
651 if (transport->sock == NULL)
652 goto out;
653 clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
654 for (;;) {
655 ret = xs_read_stream(transport, MSG_DONTWAIT);
656 if (ret <= 0)
657 break;
658 read += ret;
659 cond_resched();
660 }
661out:
662 mutex_unlock(&transport->recv_mutex);
663 trace_xs_stream_read_data(&transport->xprt, ret, read);
664}
665
666static void xs_stream_data_receive_workfn(struct work_struct *work)
667{
668 struct sock_xprt *transport =
669 container_of(work, struct sock_xprt, recv_worker);
670 xs_stream_data_receive(transport);
671}
672
673static void
674xs_stream_reset_connect(struct sock_xprt *transport)
675{
676 transport->recv.offset = 0;
677 transport->recv.len = 0;
678 transport->recv.copied = 0;
679 transport->xmit.offset = 0;
680 transport->xprt.stat.connect_count++;
681 transport->xprt.stat.connect_start = jiffies;
682}
683
328#define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) 684#define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL)
329 685
330static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen, struct kvec *vec, unsigned int base, int more) 686static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen, struct kvec *vec, unsigned int base, int more)
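
The receive path added above reads a 4-byte record-marking header first (plus the XID and call direction when a new message starts) and splits it with RPC_LAST_STREAM_FRAGMENT and RPC_FRAGMENT_SIZE_MASK. Record marking for RPC over stream transports (RFC 5531, Section 11) packs a last-fragment flag into the top bit and the fragment length into the remaining 31 bits of a big-endian word. A small decoding sketch, independent of the kernel code:

    #include <stdint.h>
    #include <stdio.h>
    #include <arpa/inet.h>          /* ntohl(), htonl() */

    #define LAST_FRAGMENT_BIT       0x80000000UL
    #define FRAGMENT_SIZE_MASK      0x7fffffffUL

    /* Decode one on-the-wire record marker (4 bytes, big-endian). */
    static void decode_record_marker(uint32_t wire, uint32_t *len, int *last)
    {
            uint32_t host = ntohl(wire);

            *last = (host & LAST_FRAGMENT_BIT) != 0;
            *len  = host & FRAGMENT_SIZE_MASK;
    }

    int main(void)
    {
            /* Example: a 0x100-byte final fragment as it appears on the wire. */
            uint32_t wire = htonl(LAST_FRAGMENT_BIT | 0x100);
            uint32_t len;
            int last;

            decode_record_marker(wire, &len, &last);
            printf("fragment length %u, last=%d\n", len, last);
            return 0;
    }
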
@@ -440,28 +796,21 @@ out:
440 return err; 796 return err;
441} 797}
442 798
443static void xs_nospace_callback(struct rpc_task *task)
444{
445 struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt);
446
447 transport->inet->sk_write_pending--;
448}
449
450/** 799/**
451 * xs_nospace - place task on wait queue if transmit was incomplete 800 * xs_nospace - handle transmit was incomplete
452 * @task: task to put to sleep 801 * @req: pointer to RPC request
453 * 802 *
454 */ 803 */
455static int xs_nospace(struct rpc_task *task) 804static int xs_nospace(struct rpc_rqst *req)
456{ 805{
457 struct rpc_rqst *req = task->tk_rqstp;
458 struct rpc_xprt *xprt = req->rq_xprt; 806 struct rpc_xprt *xprt = req->rq_xprt;
459 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 807 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
460 struct sock *sk = transport->inet; 808 struct sock *sk = transport->inet;
461 int ret = -EAGAIN; 809 int ret = -EAGAIN;
462 810
463 dprintk("RPC: %5u xmit incomplete (%u left of %u)\n", 811 dprintk("RPC: %5u xmit incomplete (%u left of %u)\n",
464 task->tk_pid, req->rq_slen - req->rq_bytes_sent, 812 req->rq_task->tk_pid,
813 req->rq_slen - transport->xmit.offset,
465 req->rq_slen); 814 req->rq_slen);
466 815
467 /* Protect against races with write_space */ 816 /* Protect against races with write_space */
@@ -471,7 +820,7 @@ static int xs_nospace(struct rpc_task *task)
471 if (xprt_connected(xprt)) { 820 if (xprt_connected(xprt)) {
472 /* wait for more buffer space */ 821 /* wait for more buffer space */
473 sk->sk_write_pending++; 822 sk->sk_write_pending++;
474 xprt_wait_for_buffer_space(task, xs_nospace_callback); 823 xprt_wait_for_buffer_space(xprt);
475 } else 824 } else
476 ret = -ENOTCONN; 825 ret = -ENOTCONN;
477 826
@@ -491,6 +840,22 @@ static int xs_nospace(struct rpc_task *task)
491 return ret; 840 return ret;
492} 841}
493 842
843static void
844xs_stream_prepare_request(struct rpc_rqst *req)
845{
846 req->rq_task->tk_status = xdr_alloc_bvec(&req->rq_rcv_buf, GFP_NOIO);
847}
848
849/*
850 * Determine if the previous message in the stream was aborted before it
851 * could complete transmission.
852 */
853static bool
854xs_send_request_was_aborted(struct sock_xprt *transport, struct rpc_rqst *req)
855{
856 return transport->xmit.offset != 0 && req->rq_bytes_sent == 0;
857}
858
494/* 859/*
495 * Construct a stream transport record marker in @buf. 860 * Construct a stream transport record marker in @buf.
496 */ 861 */
@@ -503,7 +868,7 @@ static inline void xs_encode_stream_record_marker(struct xdr_buf *buf)
503 868
504/** 869/**
505 * xs_local_send_request - write an RPC request to an AF_LOCAL socket 870 * xs_local_send_request - write an RPC request to an AF_LOCAL socket
506 * @task: RPC task that manages the state of an RPC request 871 * @req: pointer to RPC request
507 * 872 *
508 * Return values: 873 * Return values:
509 * 0: The request has been sent 874 * 0: The request has been sent
@@ -512,9 +877,8 @@ static inline void xs_encode_stream_record_marker(struct xdr_buf *buf)
512 * ENOTCONN: Caller needs to invoke connect logic then call again 877 * ENOTCONN: Caller needs to invoke connect logic then call again
 513 * other: Some other error occurred, the request was not sent 878 * other: Some other error occurred, the request was not sent
514 */ 879 */
515static int xs_local_send_request(struct rpc_task *task) 880static int xs_local_send_request(struct rpc_rqst *req)
516{ 881{
517 struct rpc_rqst *req = task->tk_rqstp;
518 struct rpc_xprt *xprt = req->rq_xprt; 882 struct rpc_xprt *xprt = req->rq_xprt;
519 struct sock_xprt *transport = 883 struct sock_xprt *transport =
520 container_of(xprt, struct sock_xprt, xprt); 884 container_of(xprt, struct sock_xprt, xprt);
@@ -522,25 +886,34 @@ static int xs_local_send_request(struct rpc_task *task)
522 int status; 886 int status;
523 int sent = 0; 887 int sent = 0;
524 888
889 /* Close the stream if the previous transmission was incomplete */
890 if (xs_send_request_was_aborted(transport, req)) {
891 xs_close(xprt);
892 return -ENOTCONN;
893 }
894
525 xs_encode_stream_record_marker(&req->rq_snd_buf); 895 xs_encode_stream_record_marker(&req->rq_snd_buf);
526 896
527 xs_pktdump("packet data:", 897 xs_pktdump("packet data:",
528 req->rq_svec->iov_base, req->rq_svec->iov_len); 898 req->rq_svec->iov_base, req->rq_svec->iov_len);
529 899
530 req->rq_xtime = ktime_get(); 900 req->rq_xtime = ktime_get();
531 status = xs_sendpages(transport->sock, NULL, 0, xdr, req->rq_bytes_sent, 901 status = xs_sendpages(transport->sock, NULL, 0, xdr,
902 transport->xmit.offset,
532 true, &sent); 903 true, &sent);
533 dprintk("RPC: %s(%u) = %d\n", 904 dprintk("RPC: %s(%u) = %d\n",
534 __func__, xdr->len - req->rq_bytes_sent, status); 905 __func__, xdr->len - transport->xmit.offset, status);
535 906
536 if (status == -EAGAIN && sock_writeable(transport->inet)) 907 if (status == -EAGAIN && sock_writeable(transport->inet))
537 status = -ENOBUFS; 908 status = -ENOBUFS;
538 909
539 if (likely(sent > 0) || status == 0) { 910 if (likely(sent > 0) || status == 0) {
540 req->rq_bytes_sent += sent; 911 transport->xmit.offset += sent;
541 req->rq_xmit_bytes_sent += sent; 912 req->rq_bytes_sent = transport->xmit.offset;
542 if (likely(req->rq_bytes_sent >= req->rq_slen)) { 913 if (likely(req->rq_bytes_sent >= req->rq_slen)) {
914 req->rq_xmit_bytes_sent += transport->xmit.offset;
543 req->rq_bytes_sent = 0; 915 req->rq_bytes_sent = 0;
916 transport->xmit.offset = 0;
544 return 0; 917 return 0;
545 } 918 }
546 status = -EAGAIN; 919 status = -EAGAIN;
@@ -550,7 +923,7 @@ static int xs_local_send_request(struct rpc_task *task)
550 case -ENOBUFS: 923 case -ENOBUFS:
551 break; 924 break;
552 case -EAGAIN: 925 case -EAGAIN:
553 status = xs_nospace(task); 926 status = xs_nospace(req);
554 break; 927 break;
555 default: 928 default:
556 dprintk("RPC: sendmsg returned unrecognized error %d\n", 929 dprintk("RPC: sendmsg returned unrecognized error %d\n",
@@ -566,7 +939,7 @@ static int xs_local_send_request(struct rpc_task *task)
566 939
567/** 940/**
568 * xs_udp_send_request - write an RPC request to a UDP socket 941 * xs_udp_send_request - write an RPC request to a UDP socket
569 * @task: address of RPC task that manages the state of an RPC request 942 * @req: pointer to RPC request
570 * 943 *
571 * Return values: 944 * Return values:
572 * 0: The request has been sent 945 * 0: The request has been sent
@@ -575,9 +948,8 @@ static int xs_local_send_request(struct rpc_task *task)
575 * ENOTCONN: Caller needs to invoke connect logic then call again 948 * ENOTCONN: Caller needs to invoke connect logic then call again
576 * other: Some other error occurred, the request was not sent 949 * other: Some other error occurred, the request was not sent
577 */ 950 */
578static int xs_udp_send_request(struct rpc_task *task) 951static int xs_udp_send_request(struct rpc_rqst *req)
579{ 952{
580 struct rpc_rqst *req = task->tk_rqstp;
581 struct rpc_xprt *xprt = req->rq_xprt; 953 struct rpc_xprt *xprt = req->rq_xprt;
582 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 954 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
583 struct xdr_buf *xdr = &req->rq_snd_buf; 955 struct xdr_buf *xdr = &req->rq_snd_buf;
@@ -590,12 +962,16 @@ static int xs_udp_send_request(struct rpc_task *task)
590 962
591 if (!xprt_bound(xprt)) 963 if (!xprt_bound(xprt))
592 return -ENOTCONN; 964 return -ENOTCONN;
965
966 if (!xprt_request_get_cong(xprt, req))
967 return -EBADSLT;
968
593 req->rq_xtime = ktime_get(); 969 req->rq_xtime = ktime_get();
594 status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen, 970 status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen,
595 xdr, req->rq_bytes_sent, true, &sent); 971 xdr, 0, true, &sent);
596 972
597 dprintk("RPC: xs_udp_send_request(%u) = %d\n", 973 dprintk("RPC: xs_udp_send_request(%u) = %d\n",
598 xdr->len - req->rq_bytes_sent, status); 974 xdr->len, status);
599 975
600 /* firewall is blocking us, don't return -EAGAIN or we end up looping */ 976 /* firewall is blocking us, don't return -EAGAIN or we end up looping */
601 if (status == -EPERM) 977 if (status == -EPERM)
@@ -619,7 +995,7 @@ process_status:
619 /* Should we call xs_close() here? */ 995 /* Should we call xs_close() here? */
620 break; 996 break;
621 case -EAGAIN: 997 case -EAGAIN:
622 status = xs_nospace(task); 998 status = xs_nospace(req);
623 break; 999 break;
624 case -ENETUNREACH: 1000 case -ENETUNREACH:
625 case -ENOBUFS: 1001 case -ENOBUFS:
@@ -639,7 +1015,7 @@ process_status:
639 1015
640/** 1016/**
641 * xs_tcp_send_request - write an RPC request to a TCP socket 1017 * xs_tcp_send_request - write an RPC request to a TCP socket
642 * @task: address of RPC task that manages the state of an RPC request 1018 * @req: pointer to RPC request
643 * 1019 *
644 * Return values: 1020 * Return values:
645 * 0: The request has been sent 1021 * 0: The request has been sent
@@ -651,9 +1027,8 @@ process_status:
651 * XXX: In the case of soft timeouts, should we eventually give up 1027 * XXX: In the case of soft timeouts, should we eventually give up
652 * if sendmsg is not able to make progress? 1028 * if sendmsg is not able to make progress?
653 */ 1029 */
654static int xs_tcp_send_request(struct rpc_task *task) 1030static int xs_tcp_send_request(struct rpc_rqst *req)
655{ 1031{
656 struct rpc_rqst *req = task->tk_rqstp;
657 struct rpc_xprt *xprt = req->rq_xprt; 1032 struct rpc_xprt *xprt = req->rq_xprt;
658 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 1033 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
659 struct xdr_buf *xdr = &req->rq_snd_buf; 1034 struct xdr_buf *xdr = &req->rq_snd_buf;
@@ -662,6 +1037,13 @@ static int xs_tcp_send_request(struct rpc_task *task)
662 int status; 1037 int status;
663 int sent; 1038 int sent;
664 1039
1040 /* Close the stream if the previous transmission was incomplete */
1041 if (xs_send_request_was_aborted(transport, req)) {
1042 if (transport->sock != NULL)
1043 kernel_sock_shutdown(transport->sock, SHUT_RDWR);
1044 return -ENOTCONN;
1045 }
1046
665 xs_encode_stream_record_marker(&req->rq_snd_buf); 1047 xs_encode_stream_record_marker(&req->rq_snd_buf);
666 1048
667 xs_pktdump("packet data:", 1049 xs_pktdump("packet data:",
@@ -671,7 +1053,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
671 * completes while the socket holds a reference to the pages, 1053 * completes while the socket holds a reference to the pages,
672 * then we may end up resending corrupted data. 1054 * then we may end up resending corrupted data.
673 */ 1055 */
674 if (task->tk_flags & RPC_TASK_SENT) 1056 if (req->rq_task->tk_flags & RPC_TASK_SENT)
675 zerocopy = false; 1057 zerocopy = false;
676 1058
677 if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state)) 1059 if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state))
@@ -684,17 +1066,20 @@ static int xs_tcp_send_request(struct rpc_task *task)
684 while (1) { 1066 while (1) {
685 sent = 0; 1067 sent = 0;
686 status = xs_sendpages(transport->sock, NULL, 0, xdr, 1068 status = xs_sendpages(transport->sock, NULL, 0, xdr,
687 req->rq_bytes_sent, zerocopy, &sent); 1069 transport->xmit.offset,
1070 zerocopy, &sent);
688 1071
689 dprintk("RPC: xs_tcp_send_request(%u) = %d\n", 1072 dprintk("RPC: xs_tcp_send_request(%u) = %d\n",
690 xdr->len - req->rq_bytes_sent, status); 1073 xdr->len - transport->xmit.offset, status);
691 1074
692 /* If we've sent the entire packet, immediately 1075 /* If we've sent the entire packet, immediately
693 * reset the count of bytes sent. */ 1076 * reset the count of bytes sent. */
694 req->rq_bytes_sent += sent; 1077 transport->xmit.offset += sent;
695 req->rq_xmit_bytes_sent += sent; 1078 req->rq_bytes_sent = transport->xmit.offset;
696 if (likely(req->rq_bytes_sent >= req->rq_slen)) { 1079 if (likely(req->rq_bytes_sent >= req->rq_slen)) {
1080 req->rq_xmit_bytes_sent += transport->xmit.offset;
697 req->rq_bytes_sent = 0; 1081 req->rq_bytes_sent = 0;
1082 transport->xmit.offset = 0;
698 return 0; 1083 return 0;
699 } 1084 }
700 1085
@@ -732,7 +1117,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
732 /* Should we call xs_close() here? */ 1117 /* Should we call xs_close() here? */
733 break; 1118 break;
734 case -EAGAIN: 1119 case -EAGAIN:
735 status = xs_nospace(task); 1120 status = xs_nospace(req);
736 break; 1121 break;
737 case -ECONNRESET: 1122 case -ECONNRESET:
738 case -ECONNREFUSED: 1123 case -ECONNREFUSED:
@@ -749,35 +1134,6 @@ static int xs_tcp_send_request(struct rpc_task *task)
749 return status; 1134 return status;
750} 1135}
751 1136
752/**
753 * xs_tcp_release_xprt - clean up after a tcp transmission
754 * @xprt: transport
755 * @task: rpc task
756 *
757 * This cleans up if an error causes us to abort the transmission of a request.
758 * In this case, the socket may need to be reset in order to avoid confusing
759 * the server.
760 */
761static void xs_tcp_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
762{
763 struct rpc_rqst *req;
764
765 if (task != xprt->snd_task)
766 return;
767 if (task == NULL)
768 goto out_release;
769 req = task->tk_rqstp;
770 if (req == NULL)
771 goto out_release;
772 if (req->rq_bytes_sent == 0)
773 goto out_release;
774 if (req->rq_bytes_sent == req->rq_snd_buf.len)
775 goto out_release;
776 set_bit(XPRT_CLOSE_WAIT, &xprt->state);
777out_release:
778 xprt_release_xprt(xprt, task);
779}
780
781static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk) 1137static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk)
782{ 1138{
783 transport->old_data_ready = sk->sk_data_ready; 1139 transport->old_data_ready = sk->sk_data_ready;
@@ -921,114 +1277,6 @@ static void xs_destroy(struct rpc_xprt *xprt)
921 module_put(THIS_MODULE); 1277 module_put(THIS_MODULE);
922} 1278}
923 1279
924static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
925{
926 struct xdr_skb_reader desc = {
927 .skb = skb,
928 .offset = sizeof(rpc_fraghdr),
929 .count = skb->len - sizeof(rpc_fraghdr),
930 };
931
932 if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0)
933 return -1;
934 if (desc.count)
935 return -1;
936 return 0;
937}
938
939/**
940 * xs_local_data_read_skb
941 * @xprt: transport
942 * @sk: socket
943 * @skb: skbuff
944 *
945 * Currently this assumes we can read the whole reply in a single gulp.
946 */
947static void xs_local_data_read_skb(struct rpc_xprt *xprt,
948 struct sock *sk,
949 struct sk_buff *skb)
950{
951 struct rpc_task *task;
952 struct rpc_rqst *rovr;
953 int repsize, copied;
954 u32 _xid;
955 __be32 *xp;
956
957 repsize = skb->len - sizeof(rpc_fraghdr);
958 if (repsize < 4) {
959 dprintk("RPC: impossible RPC reply size %d\n", repsize);
960 return;
961 }
962
963 /* Copy the XID from the skb... */
964 xp = skb_header_pointer(skb, sizeof(rpc_fraghdr), sizeof(_xid), &_xid);
965 if (xp == NULL)
966 return;
967
968 /* Look up and lock the request corresponding to the given XID */
969 spin_lock(&xprt->recv_lock);
970 rovr = xprt_lookup_rqst(xprt, *xp);
971 if (!rovr)
972 goto out_unlock;
973 xprt_pin_rqst(rovr);
974 spin_unlock(&xprt->recv_lock);
975 task = rovr->rq_task;
976
977 copied = rovr->rq_private_buf.buflen;
978 if (copied > repsize)
979 copied = repsize;
980
981 if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) {
982 dprintk("RPC: sk_buff copy failed\n");
983 spin_lock(&xprt->recv_lock);
984 goto out_unpin;
985 }
986
987 spin_lock(&xprt->recv_lock);
988 xprt_complete_rqst(task, copied);
989out_unpin:
990 xprt_unpin_rqst(rovr);
991 out_unlock:
992 spin_unlock(&xprt->recv_lock);
993}
994
995static void xs_local_data_receive(struct sock_xprt *transport)
996{
997 struct sk_buff *skb;
998 struct sock *sk;
999 int err;
1000
1001restart:
1002 mutex_lock(&transport->recv_mutex);
1003 sk = transport->inet;
1004 if (sk == NULL)
1005 goto out;
1006 for (;;) {
1007 skb = skb_recv_datagram(sk, 0, 1, &err);
1008 if (skb != NULL) {
1009 xs_local_data_read_skb(&transport->xprt, sk, skb);
1010 skb_free_datagram(sk, skb);
1011 continue;
1012 }
1013 if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
1014 break;
1015 if (need_resched()) {
1016 mutex_unlock(&transport->recv_mutex);
1017 cond_resched();
1018 goto restart;
1019 }
1020 }
1021out:
1022 mutex_unlock(&transport->recv_mutex);
1023}
1024
1025static void xs_local_data_receive_workfn(struct work_struct *work)
1026{
1027 struct sock_xprt *transport =
1028 container_of(work, struct sock_xprt, recv_worker);
1029 xs_local_data_receive(transport);
1030}
1031
1032/** 1280/**
1033 * xs_udp_data_read_skb - receive callback for UDP sockets 1281 * xs_udp_data_read_skb - receive callback for UDP sockets
1034 * @xprt: transport 1282 * @xprt: transport
@@ -1058,13 +1306,13 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt,
1058 return; 1306 return;
1059 1307
1060 /* Look up and lock the request corresponding to the given XID */ 1308 /* Look up and lock the request corresponding to the given XID */
1061 spin_lock(&xprt->recv_lock); 1309 spin_lock(&xprt->queue_lock);
1062 rovr = xprt_lookup_rqst(xprt, *xp); 1310 rovr = xprt_lookup_rqst(xprt, *xp);
1063 if (!rovr) 1311 if (!rovr)
1064 goto out_unlock; 1312 goto out_unlock;
1065 xprt_pin_rqst(rovr); 1313 xprt_pin_rqst(rovr);
1066 xprt_update_rtt(rovr->rq_task); 1314 xprt_update_rtt(rovr->rq_task);
1067 spin_unlock(&xprt->recv_lock); 1315 spin_unlock(&xprt->queue_lock);
1068 task = rovr->rq_task; 1316 task = rovr->rq_task;
1069 1317
1070 if ((copied = rovr->rq_private_buf.buflen) > repsize) 1318 if ((copied = rovr->rq_private_buf.buflen) > repsize)
@@ -1072,7 +1320,7 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt,
1072 1320
1073 /* Suck it into the iovec, verify checksum if not done by hw. */ 1321 /* Suck it into the iovec, verify checksum if not done by hw. */
1074 if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) { 1322 if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) {
1075 spin_lock(&xprt->recv_lock); 1323 spin_lock(&xprt->queue_lock);
1076 __UDPX_INC_STATS(sk, UDP_MIB_INERRORS); 1324 __UDPX_INC_STATS(sk, UDP_MIB_INERRORS);
1077 goto out_unpin; 1325 goto out_unpin;
1078 } 1326 }
@@ -1081,13 +1329,13 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt,
1081 spin_lock_bh(&xprt->transport_lock); 1329 spin_lock_bh(&xprt->transport_lock);
1082 xprt_adjust_cwnd(xprt, task, copied); 1330 xprt_adjust_cwnd(xprt, task, copied);
1083 spin_unlock_bh(&xprt->transport_lock); 1331 spin_unlock_bh(&xprt->transport_lock);
1084 spin_lock(&xprt->recv_lock); 1332 spin_lock(&xprt->queue_lock);
1085 xprt_complete_rqst(task, copied); 1333 xprt_complete_rqst(task, copied);
1086 __UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS); 1334 __UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS);
1087out_unpin: 1335out_unpin:
1088 xprt_unpin_rqst(rovr); 1336 xprt_unpin_rqst(rovr);
1089 out_unlock: 1337 out_unlock:
1090 spin_unlock(&xprt->recv_lock); 1338 spin_unlock(&xprt->queue_lock);
1091} 1339}
1092 1340
1093static void xs_udp_data_receive(struct sock_xprt *transport) 1341static void xs_udp_data_receive(struct sock_xprt *transport)
@@ -1096,25 +1344,18 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
1096 struct sock *sk; 1344 struct sock *sk;
1097 int err; 1345 int err;
1098 1346
1099restart:
1100 mutex_lock(&transport->recv_mutex); 1347 mutex_lock(&transport->recv_mutex);
1101 sk = transport->inet; 1348 sk = transport->inet;
1102 if (sk == NULL) 1349 if (sk == NULL)
1103 goto out; 1350 goto out;
1351 clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
1104 for (;;) { 1352 for (;;) {
1105 skb = skb_recv_udp(sk, 0, 1, &err); 1353 skb = skb_recv_udp(sk, 0, 1, &err);
1106 if (skb != NULL) { 1354 if (skb == NULL)
1107 xs_udp_data_read_skb(&transport->xprt, sk, skb);
1108 consume_skb(skb);
1109 continue;
1110 }
1111 if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
1112 break; 1355 break;
1113 if (need_resched()) { 1356 xs_udp_data_read_skb(&transport->xprt, sk, skb);
1114 mutex_unlock(&transport->recv_mutex); 1357 consume_skb(skb);
1115 cond_resched(); 1358 cond_resched();
1116 goto restart;
1117 }
1118 } 1359 }
1119out: 1360out:
1120 mutex_unlock(&transport->recv_mutex); 1361 mutex_unlock(&transport->recv_mutex);
@@ -1163,263 +1404,7 @@ static void xs_tcp_force_close(struct rpc_xprt *xprt)
1163 xprt_force_disconnect(xprt); 1404 xprt_force_disconnect(xprt);
1164} 1405}
1165 1406
1166static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc)
1167{
1168 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
1169 size_t len, used;
1170 char *p;
1171
1172 p = ((char *) &transport->tcp_fraghdr) + transport->tcp_offset;
1173 len = sizeof(transport->tcp_fraghdr) - transport->tcp_offset;
1174 used = xdr_skb_read_bits(desc, p, len);
1175 transport->tcp_offset += used;
1176 if (used != len)
1177 return;
1178
1179 transport->tcp_reclen = ntohl(transport->tcp_fraghdr);
1180 if (transport->tcp_reclen & RPC_LAST_STREAM_FRAGMENT)
1181 transport->tcp_flags |= TCP_RCV_LAST_FRAG;
1182 else
1183 transport->tcp_flags &= ~TCP_RCV_LAST_FRAG;
1184 transport->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK;
1185
1186 transport->tcp_flags &= ~TCP_RCV_COPY_FRAGHDR;
1187 transport->tcp_offset = 0;
1188
1189 /* Sanity check of the record length */
1190 if (unlikely(transport->tcp_reclen < 8)) {
1191 dprintk("RPC: invalid TCP record fragment length\n");
1192 xs_tcp_force_close(xprt);
1193 return;
1194 }
1195 dprintk("RPC: reading TCP record fragment of length %d\n",
1196 transport->tcp_reclen);
1197}
1198
1199static void xs_tcp_check_fraghdr(struct sock_xprt *transport)
1200{
1201 if (transport->tcp_offset == transport->tcp_reclen) {
1202 transport->tcp_flags |= TCP_RCV_COPY_FRAGHDR;
1203 transport->tcp_offset = 0;
1204 if (transport->tcp_flags & TCP_RCV_LAST_FRAG) {
1205 transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
1206 transport->tcp_flags |= TCP_RCV_COPY_XID;
1207 transport->tcp_copied = 0;
1208 }
1209 }
1210}
1211
1212static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_reader *desc)
1213{
1214 size_t len, used;
1215 char *p;
1216
1217 len = sizeof(transport->tcp_xid) - transport->tcp_offset;
1218 dprintk("RPC: reading XID (%zu bytes)\n", len);
1219 p = ((char *) &transport->tcp_xid) + transport->tcp_offset;
1220 used = xdr_skb_read_bits(desc, p, len);
1221 transport->tcp_offset += used;
1222 if (used != len)
1223 return;
1224 transport->tcp_flags &= ~TCP_RCV_COPY_XID;
1225 transport->tcp_flags |= TCP_RCV_READ_CALLDIR;
1226 transport->tcp_copied = 4;
1227 dprintk("RPC: reading %s XID %08x\n",
1228 (transport->tcp_flags & TCP_RPC_REPLY) ? "reply for"
1229 : "request with",
1230 ntohl(transport->tcp_xid));
1231 xs_tcp_check_fraghdr(transport);
1232}
1233
1234static inline void xs_tcp_read_calldir(struct sock_xprt *transport,
1235 struct xdr_skb_reader *desc)
1236{
1237 size_t len, used;
1238 u32 offset;
1239 char *p;
1240
1241 /*
1242 * We want transport->tcp_offset to be 8 at the end of this routine
1243 * (4 bytes for the xid and 4 bytes for the call/reply flag).
1244 * When this function is called for the first time,
1245 * transport->tcp_offset is 4 (after having already read the xid).
1246 */
1247 offset = transport->tcp_offset - sizeof(transport->tcp_xid);
1248 len = sizeof(transport->tcp_calldir) - offset;
1249 dprintk("RPC: reading CALL/REPLY flag (%zu bytes)\n", len);
1250 p = ((char *) &transport->tcp_calldir) + offset;
1251 used = xdr_skb_read_bits(desc, p, len);
1252 transport->tcp_offset += used;
1253 if (used != len)
1254 return;
1255 transport->tcp_flags &= ~TCP_RCV_READ_CALLDIR;
1256 /*
1257 * We don't yet have the XDR buffer, so we will write the calldir
1258 * out after we get the buffer from the 'struct rpc_rqst'
1259 */
1260 switch (ntohl(transport->tcp_calldir)) {
1261 case RPC_REPLY:
1262 transport->tcp_flags |= TCP_RCV_COPY_CALLDIR;
1263 transport->tcp_flags |= TCP_RCV_COPY_DATA;
1264 transport->tcp_flags |= TCP_RPC_REPLY;
1265 break;
1266 case RPC_CALL:
1267 transport->tcp_flags |= TCP_RCV_COPY_CALLDIR;
1268 transport->tcp_flags |= TCP_RCV_COPY_DATA;
1269 transport->tcp_flags &= ~TCP_RPC_REPLY;
1270 break;
1271 default:
1272 dprintk("RPC: invalid request message type\n");
1273 xs_tcp_force_close(&transport->xprt);
1274 }
1275 xs_tcp_check_fraghdr(transport);
1276}
1277
1278static inline void xs_tcp_read_common(struct rpc_xprt *xprt,
1279 struct xdr_skb_reader *desc,
1280 struct rpc_rqst *req)
1281{
1282 struct sock_xprt *transport =
1283 container_of(xprt, struct sock_xprt, xprt);
1284 struct xdr_buf *rcvbuf;
1285 size_t len;
1286 ssize_t r;
1287
1288 rcvbuf = &req->rq_private_buf;
1289
1290 if (transport->tcp_flags & TCP_RCV_COPY_CALLDIR) {
1291 /*
1292 * Save the RPC direction in the XDR buffer
1293 */
1294 memcpy(rcvbuf->head[0].iov_base + transport->tcp_copied,
1295 &transport->tcp_calldir,
1296 sizeof(transport->tcp_calldir));
1297 transport->tcp_copied += sizeof(transport->tcp_calldir);
1298 transport->tcp_flags &= ~TCP_RCV_COPY_CALLDIR;
1299 }
1300
1301 len = desc->count;
1302 if (len > transport->tcp_reclen - transport->tcp_offset)
1303 desc->count = transport->tcp_reclen - transport->tcp_offset;
1304 r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied,
1305 desc, xdr_skb_read_bits);
1306
1307 if (desc->count) {
1308 /* Error when copying to the receive buffer,
1309 * usually because we weren't able to allocate
1310 * additional buffer pages. All we can do now
1311 * is turn off TCP_RCV_COPY_DATA, so the request
1312 * will not receive any additional updates,
1313 * and time out.
1314 * Any remaining data from this record will
1315 * be discarded.
1316 */
1317 transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
1318 dprintk("RPC: XID %08x truncated request\n",
1319 ntohl(transport->tcp_xid));
1320 dprintk("RPC: xprt = %p, tcp_copied = %lu, "
1321 "tcp_offset = %u, tcp_reclen = %u\n",
1322 xprt, transport->tcp_copied,
1323 transport->tcp_offset, transport->tcp_reclen);
1324 return;
1325 }
1326
1327 transport->tcp_copied += r;
1328 transport->tcp_offset += r;
1329 desc->count = len - r;
1330
1331 dprintk("RPC: XID %08x read %zd bytes\n",
1332 ntohl(transport->tcp_xid), r);
1333 dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, "
1334 "tcp_reclen = %u\n", xprt, transport->tcp_copied,
1335 transport->tcp_offset, transport->tcp_reclen);
1336
1337 if (transport->tcp_copied == req->rq_private_buf.buflen)
1338 transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
1339 else if (transport->tcp_offset == transport->tcp_reclen) {
1340 if (transport->tcp_flags & TCP_RCV_LAST_FRAG)
1341 transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
1342 }
1343}
1344
1345/*
1346 * Finds the request corresponding to the RPC xid and invokes the common
1347 * tcp read code to read the data.
1348 */
1349static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
1350 struct xdr_skb_reader *desc)
1351{
1352 struct sock_xprt *transport =
1353 container_of(xprt, struct sock_xprt, xprt);
1354 struct rpc_rqst *req;
1355
1356 dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid));
1357
1358 /* Find and lock the request corresponding to this xid */
1359 spin_lock(&xprt->recv_lock);
1360 req = xprt_lookup_rqst(xprt, transport->tcp_xid);
1361 if (!req) {
1362 dprintk("RPC: XID %08x request not found!\n",
1363 ntohl(transport->tcp_xid));
1364 spin_unlock(&xprt->recv_lock);
1365 return -1;
1366 }
1367 xprt_pin_rqst(req);
1368 spin_unlock(&xprt->recv_lock);
1369
1370 xs_tcp_read_common(xprt, desc, req);
1371
1372 spin_lock(&xprt->recv_lock);
1373 if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
1374 xprt_complete_rqst(req->rq_task, transport->tcp_copied);
1375 xprt_unpin_rqst(req);
1376 spin_unlock(&xprt->recv_lock);
1377 return 0;
1378}
1379
1380#if defined(CONFIG_SUNRPC_BACKCHANNEL) 1407#if defined(CONFIG_SUNRPC_BACKCHANNEL)
1381/*
1382 * Obtains an rpc_rqst previously allocated and invokes the common
1383 * tcp read code to read the data. The result is placed in the callback
1384 * queue.
1385 * If we're unable to obtain the rpc_rqst we schedule the closing of the
1386 * connection and return -1.
1387 */
1388static int xs_tcp_read_callback(struct rpc_xprt *xprt,
1389 struct xdr_skb_reader *desc)
1390{
1391 struct sock_xprt *transport =
1392 container_of(xprt, struct sock_xprt, xprt);
1393 struct rpc_rqst *req;
1394
1395 /* Look up the request corresponding to the given XID */
1396 req = xprt_lookup_bc_request(xprt, transport->tcp_xid);
1397 if (req == NULL) {
1398 printk(KERN_WARNING "Callback slot table overflowed\n");
1399 xprt_force_disconnect(xprt);
1400 return -1;
1401 }
1402
1403 dprintk("RPC: read callback XID %08x\n", ntohl(req->rq_xid));
1404 xs_tcp_read_common(xprt, desc, req);
1405
1406 if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
1407 xprt_complete_bc_request(req, transport->tcp_copied);
1408
1409 return 0;
1410}
1411
1412static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
1413 struct xdr_skb_reader *desc)
1414{
1415 struct sock_xprt *transport =
1416 container_of(xprt, struct sock_xprt, xprt);
1417
1418 return (transport->tcp_flags & TCP_RPC_REPLY) ?
1419 xs_tcp_read_reply(xprt, desc) :
1420 xs_tcp_read_callback(xprt, desc);
1421}
1422
1423static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net) 1408static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net)
1424{ 1409{
1425 int ret; 1410 int ret;
@@ -1435,145 +1420,8 @@ static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt)
1435{ 1420{
1436 return PAGE_SIZE; 1421 return PAGE_SIZE;
1437} 1422}
1438#else
1439static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
1440 struct xdr_skb_reader *desc)
1441{
1442 return xs_tcp_read_reply(xprt, desc);
1443}
1444#endif /* CONFIG_SUNRPC_BACKCHANNEL */ 1423#endif /* CONFIG_SUNRPC_BACKCHANNEL */
1445 1424
1446/*
1447 * Read data off the transport. This can be either an RPC_CALL or an
1448 * RPC_REPLY. Relay the processing to helper functions.
1449 */
1450static void xs_tcp_read_data(struct rpc_xprt *xprt,
1451 struct xdr_skb_reader *desc)
1452{
1453 struct sock_xprt *transport =
1454 container_of(xprt, struct sock_xprt, xprt);
1455
1456 if (_xs_tcp_read_data(xprt, desc) == 0)
1457 xs_tcp_check_fraghdr(transport);
1458 else {
1459 /*
1460 * The transport_lock protects the request handling.
1461 * There's no need to hold it to update the tcp_flags.
1462 */
1463 transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
1464 }
1465}
1466
1467static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct xdr_skb_reader *desc)
1468{
1469 size_t len;
1470
1471 len = transport->tcp_reclen - transport->tcp_offset;
1472 if (len > desc->count)
1473 len = desc->count;
1474 desc->count -= len;
1475 desc->offset += len;
1476 transport->tcp_offset += len;
1477 dprintk("RPC: discarded %zu bytes\n", len);
1478 xs_tcp_check_fraghdr(transport);
1479}
1480
1481static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len)
1482{
1483 struct rpc_xprt *xprt = rd_desc->arg.data;
1484 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
1485 struct xdr_skb_reader desc = {
1486 .skb = skb,
1487 .offset = offset,
1488 .count = len,
1489 };
1490 size_t ret;
1491
1492 dprintk("RPC: xs_tcp_data_recv started\n");
1493 do {
1494 trace_xs_tcp_data_recv(transport);
1495 /* Read in a new fragment marker if necessary */
1496 /* Can we ever really expect to get completely empty fragments? */
1497 if (transport->tcp_flags & TCP_RCV_COPY_FRAGHDR) {
1498 xs_tcp_read_fraghdr(xprt, &desc);
1499 continue;
1500 }
1501 /* Read in the xid if necessary */
1502 if (transport->tcp_flags & TCP_RCV_COPY_XID) {
1503 xs_tcp_read_xid(transport, &desc);
1504 continue;
1505 }
1506 /* Read in the call/reply flag */
1507 if (transport->tcp_flags & TCP_RCV_READ_CALLDIR) {
1508 xs_tcp_read_calldir(transport, &desc);
1509 continue;
1510 }
1511 /* Read in the request data */
1512 if (transport->tcp_flags & TCP_RCV_COPY_DATA) {
1513 xs_tcp_read_data(xprt, &desc);
1514 continue;
1515 }
1516 /* Skip over any trailing bytes on short reads */
1517 xs_tcp_read_discard(transport, &desc);
1518 } while (desc.count);
1519 ret = len - desc.count;
1520 if (ret < rd_desc->count)
1521 rd_desc->count -= ret;
1522 else
1523 rd_desc->count = 0;
1524 trace_xs_tcp_data_recv(transport);
1525 dprintk("RPC: xs_tcp_data_recv done\n");
1526 return ret;
1527}
1528
1529static void xs_tcp_data_receive(struct sock_xprt *transport)
1530{
1531 struct rpc_xprt *xprt = &transport->xprt;
1532 struct sock *sk;
1533 read_descriptor_t rd_desc = {
1534 .arg.data = xprt,
1535 };
1536 unsigned long total = 0;
1537 int read = 0;
1538
1539restart:
1540 mutex_lock(&transport->recv_mutex);
1541 sk = transport->inet;
1542 if (sk == NULL)
1543 goto out;
1544
1545 /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
1546 for (;;) {
1547 rd_desc.count = RPC_TCP_READ_CHUNK_SZ;
1548 lock_sock(sk);
1549 read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
1550 if (rd_desc.count != 0 || read < 0) {
1551 clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
1552 release_sock(sk);
1553 break;
1554 }
1555 release_sock(sk);
1556 total += read;
1557 if (need_resched()) {
1558 mutex_unlock(&transport->recv_mutex);
1559 cond_resched();
1560 goto restart;
1561 }
1562 }
1563 if (test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
1564 queue_work(xprtiod_workqueue, &transport->recv_worker);
1565out:
1566 mutex_unlock(&transport->recv_mutex);
1567 trace_xs_tcp_data_ready(xprt, read, total);
1568}
1569
1570static void xs_tcp_data_receive_workfn(struct work_struct *work)
1571{
1572 struct sock_xprt *transport =
1573 container_of(work, struct sock_xprt, recv_worker);
1574 xs_tcp_data_receive(transport);
1575}
1576
1577/** 1425/**
1578 * xs_tcp_state_change - callback to handle TCP socket state changes 1426 * xs_tcp_state_change - callback to handle TCP socket state changes
1579 * @sk: socket whose state has changed 1427 * @sk: socket whose state has changed
@@ -1600,17 +1448,13 @@ static void xs_tcp_state_change(struct sock *sk)
1600 case TCP_ESTABLISHED: 1448 case TCP_ESTABLISHED:
1601 spin_lock(&xprt->transport_lock); 1449 spin_lock(&xprt->transport_lock);
1602 if (!xprt_test_and_set_connected(xprt)) { 1450 if (!xprt_test_and_set_connected(xprt)) {
1603
1604 /* Reset TCP record info */
1605 transport->tcp_offset = 0;
1606 transport->tcp_reclen = 0;
1607 transport->tcp_copied = 0;
1608 transport->tcp_flags =
1609 TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID;
1610 xprt->connect_cookie++; 1451 xprt->connect_cookie++;
1611 clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); 1452 clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state);
1612 xprt_clear_connecting(xprt); 1453 xprt_clear_connecting(xprt);
1613 1454
1455 xprt->stat.connect_count++;
1456 xprt->stat.connect_time += (long)jiffies -
1457 xprt->stat.connect_start;
1614 xprt_wake_pending_tasks(xprt, -EAGAIN); 1458 xprt_wake_pending_tasks(xprt, -EAGAIN);
1615 } 1459 }
1616 spin_unlock(&xprt->transport_lock); 1460 spin_unlock(&xprt->transport_lock);
@@ -1675,7 +1519,8 @@ static void xs_write_space(struct sock *sk)
1675 if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0) 1519 if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0)
1676 goto out; 1520 goto out;
1677 1521
1678 xprt_write_space(xprt); 1522 if (xprt_write_space(xprt))
1523 sk->sk_write_pending--;
1679out: 1524out:
1680 rcu_read_unlock(); 1525 rcu_read_unlock();
1681} 1526}
@@ -1773,11 +1618,17 @@ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task)
1773 spin_unlock_bh(&xprt->transport_lock); 1618 spin_unlock_bh(&xprt->transport_lock);
1774} 1619}
1775 1620
1776static unsigned short xs_get_random_port(void) 1621static int xs_get_random_port(void)
1777{ 1622{
1778 unsigned short range = xprt_max_resvport - xprt_min_resvport + 1; 1623 unsigned short min = xprt_min_resvport, max = xprt_max_resvport;
1779 unsigned short rand = (unsigned short) prandom_u32() % range; 1624 unsigned short range;
1780 return rand + xprt_min_resvport; 1625 unsigned short rand;
1626
1627 if (max < min)
1628 return -EADDRINUSE;
1629 range = max - min + 1;
1630 rand = (unsigned short) prandom_u32() % range;
1631 return rand + min;
1781} 1632}
1782 1633
1783/** 1634/**
@@ -1833,9 +1684,9 @@ static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock)
1833 transport->srcport = xs_sock_getport(sock); 1684 transport->srcport = xs_sock_getport(sock);
1834} 1685}
1835 1686
1836static unsigned short xs_get_srcport(struct sock_xprt *transport) 1687static int xs_get_srcport(struct sock_xprt *transport)
1837{ 1688{
1838 unsigned short port = transport->srcport; 1689 int port = transport->srcport;
1839 1690
1840 if (port == 0 && transport->xprt.resvport) 1691 if (port == 0 && transport->xprt.resvport)
1841 port = xs_get_random_port(); 1692 port = xs_get_random_port();
@@ -1856,7 +1707,7 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock)
1856{ 1707{
1857 struct sockaddr_storage myaddr; 1708 struct sockaddr_storage myaddr;
1858 int err, nloop = 0; 1709 int err, nloop = 0;
1859 unsigned short port = xs_get_srcport(transport); 1710 int port = xs_get_srcport(transport);
1860 unsigned short last; 1711 unsigned short last;
1861 1712
1862 /* 1713 /*
@@ -1874,8 +1725,8 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock)
1874 * transport->xprt.resvport == 1) xs_get_srcport above will 1725 * transport->xprt.resvport == 1) xs_get_srcport above will
1875 * ensure that port is non-zero and we will bind as needed. 1726 * ensure that port is non-zero and we will bind as needed.
1876 */ 1727 */
1877 if (port == 0) 1728 if (port <= 0)
1878 return 0; 1729 return port;
1879 1730
1880 memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen); 1731 memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen);
1881 do { 1732 do {
@@ -2028,9 +1879,8 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
2028 write_unlock_bh(&sk->sk_callback_lock); 1879 write_unlock_bh(&sk->sk_callback_lock);
2029 } 1880 }
2030 1881
2031 /* Tell the socket layer to start connecting... */ 1882 xs_stream_reset_connect(transport);
2032 xprt->stat.connect_count++; 1883
2033 xprt->stat.connect_start = jiffies;
2034 return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0); 1884 return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0);
2035} 1885}
2036 1886
@@ -2062,6 +1912,9 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
2062 case 0: 1912 case 0:
2063 dprintk("RPC: xprt %p connected to %s\n", 1913 dprintk("RPC: xprt %p connected to %s\n",
2064 xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); 1914 xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
1915 xprt->stat.connect_count++;
1916 xprt->stat.connect_time += (long)jiffies -
1917 xprt->stat.connect_start;
2065 xprt_set_connected(xprt); 1918 xprt_set_connected(xprt);
2066 case -ENOBUFS: 1919 case -ENOBUFS:
2067 break; 1920 break;
@@ -2386,9 +2239,10 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
2386 2239
2387 xs_set_memalloc(xprt); 2240 xs_set_memalloc(xprt);
2388 2241
2242 /* Reset TCP record info */
2243 xs_stream_reset_connect(transport);
2244
2389 /* Tell the socket layer to start connecting... */ 2245 /* Tell the socket layer to start connecting... */
2390 xprt->stat.connect_count++;
2391 xprt->stat.connect_start = jiffies;
2392 set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); 2246 set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state);
2393 ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); 2247 ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK);
2394 switch (ret) { 2248 switch (ret) {
@@ -2561,7 +2415,7 @@ static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
2561 "%llu %llu %lu %llu %llu\n", 2415 "%llu %llu %lu %llu %llu\n",
2562 xprt->stat.bind_count, 2416 xprt->stat.bind_count,
2563 xprt->stat.connect_count, 2417 xprt->stat.connect_count,
2564 xprt->stat.connect_time, 2418 xprt->stat.connect_time / HZ,
2565 idle_time, 2419 idle_time,
2566 xprt->stat.sends, 2420 xprt->stat.sends,
2567 xprt->stat.recvs, 2421 xprt->stat.recvs,
@@ -2616,7 +2470,7 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
2616 transport->srcport, 2470 transport->srcport,
2617 xprt->stat.bind_count, 2471 xprt->stat.bind_count,
2618 xprt->stat.connect_count, 2472 xprt->stat.connect_count,
2619 xprt->stat.connect_time, 2473 xprt->stat.connect_time / HZ,
2620 idle_time, 2474 idle_time,
2621 xprt->stat.sends, 2475 xprt->stat.sends,
2622 xprt->stat.recvs, 2476 xprt->stat.recvs,
@@ -2704,9 +2558,8 @@ static int bc_sendto(struct rpc_rqst *req)
2704/* 2558/*
2705 * The send routine. Borrows from svc_send 2559 * The send routine. Borrows from svc_send
2706 */ 2560 */
2707static int bc_send_request(struct rpc_task *task) 2561static int bc_send_request(struct rpc_rqst *req)
2708{ 2562{
2709 struct rpc_rqst *req = task->tk_rqstp;
2710 struct svc_xprt *xprt; 2563 struct svc_xprt *xprt;
2711 int len; 2564 int len;
2712 2565
@@ -2720,12 +2573,7 @@ static int bc_send_request(struct rpc_task *task)
2720 * Grab the mutex to serialize data as the connection is shared 2573 * Grab the mutex to serialize data as the connection is shared
2721 * with the fore channel 2574 * with the fore channel
2722 */ 2575 */
2723 if (!mutex_trylock(&xprt->xpt_mutex)) { 2576 mutex_lock(&xprt->xpt_mutex);
2724 rpc_sleep_on(&xprt->xpt_bc_pending, task, NULL);
2725 if (!mutex_trylock(&xprt->xpt_mutex))
2726 return -EAGAIN;
2727 rpc_wake_up_queued_task(&xprt->xpt_bc_pending, task);
2728 }
2729 if (test_bit(XPT_DEAD, &xprt->xpt_flags)) 2577 if (test_bit(XPT_DEAD, &xprt->xpt_flags))
2730 len = -ENOTCONN; 2578 len = -ENOTCONN;
2731 else 2579 else
@@ -2761,7 +2609,7 @@ static void bc_destroy(struct rpc_xprt *xprt)
2761 2609
2762static const struct rpc_xprt_ops xs_local_ops = { 2610static const struct rpc_xprt_ops xs_local_ops = {
2763 .reserve_xprt = xprt_reserve_xprt, 2611 .reserve_xprt = xprt_reserve_xprt,
2764 .release_xprt = xs_tcp_release_xprt, 2612 .release_xprt = xprt_release_xprt,
2765 .alloc_slot = xprt_alloc_slot, 2613 .alloc_slot = xprt_alloc_slot,
2766 .free_slot = xprt_free_slot, 2614 .free_slot = xprt_free_slot,
2767 .rpcbind = xs_local_rpcbind, 2615 .rpcbind = xs_local_rpcbind,
@@ -2769,6 +2617,7 @@ static const struct rpc_xprt_ops xs_local_ops = {
2769 .connect = xs_local_connect, 2617 .connect = xs_local_connect,
2770 .buf_alloc = rpc_malloc, 2618 .buf_alloc = rpc_malloc,
2771 .buf_free = rpc_free, 2619 .buf_free = rpc_free,
2620 .prepare_request = xs_stream_prepare_request,
2772 .send_request = xs_local_send_request, 2621 .send_request = xs_local_send_request,
2773 .set_retrans_timeout = xprt_set_retrans_timeout_def, 2622 .set_retrans_timeout = xprt_set_retrans_timeout_def,
2774 .close = xs_close, 2623 .close = xs_close,
@@ -2803,14 +2652,15 @@ static const struct rpc_xprt_ops xs_udp_ops = {
2803 2652
2804static const struct rpc_xprt_ops xs_tcp_ops = { 2653static const struct rpc_xprt_ops xs_tcp_ops = {
2805 .reserve_xprt = xprt_reserve_xprt, 2654 .reserve_xprt = xprt_reserve_xprt,
2806 .release_xprt = xs_tcp_release_xprt, 2655 .release_xprt = xprt_release_xprt,
2807 .alloc_slot = xprt_lock_and_alloc_slot, 2656 .alloc_slot = xprt_alloc_slot,
2808 .free_slot = xprt_free_slot, 2657 .free_slot = xprt_free_slot,
2809 .rpcbind = rpcb_getport_async, 2658 .rpcbind = rpcb_getport_async,
2810 .set_port = xs_set_port, 2659 .set_port = xs_set_port,
2811 .connect = xs_connect, 2660 .connect = xs_connect,
2812 .buf_alloc = rpc_malloc, 2661 .buf_alloc = rpc_malloc,
2813 .buf_free = rpc_free, 2662 .buf_free = rpc_free,
2663 .prepare_request = xs_stream_prepare_request,
2814 .send_request = xs_tcp_send_request, 2664 .send_request = xs_tcp_send_request,
2815 .set_retrans_timeout = xprt_set_retrans_timeout_def, 2665 .set_retrans_timeout = xprt_set_retrans_timeout_def,
2816 .close = xs_tcp_shutdown, 2666 .close = xs_tcp_shutdown,
@@ -2952,9 +2802,8 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
2952 xprt->ops = &xs_local_ops; 2802 xprt->ops = &xs_local_ops;
2953 xprt->timeout = &xs_local_default_timeout; 2803 xprt->timeout = &xs_local_default_timeout;
2954 2804
2955 INIT_WORK(&transport->recv_worker, xs_local_data_receive_workfn); 2805 INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn);
2956 INIT_DELAYED_WORK(&transport->connect_worker, 2806 INIT_DELAYED_WORK(&transport->connect_worker, xs_dummy_setup_socket);
2957 xs_dummy_setup_socket);
2958 2807
2959 switch (sun->sun_family) { 2808 switch (sun->sun_family) {
2960 case AF_LOCAL: 2809 case AF_LOCAL:
@@ -3106,7 +2955,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
3106 xprt->connect_timeout = xprt->timeout->to_initval * 2955 xprt->connect_timeout = xprt->timeout->to_initval *
3107 (xprt->timeout->to_retries + 1); 2956 (xprt->timeout->to_retries + 1);
3108 2957
3109 INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn); 2958 INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn);
3110 INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket); 2959 INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket);
3111 2960
3112 switch (addr->sa_family) { 2961 switch (addr->sa_family) {
@@ -3317,12 +3166,8 @@ static int param_set_uint_minmax(const char *val,
3317 3166
3318static int param_set_portnr(const char *val, const struct kernel_param *kp) 3167static int param_set_portnr(const char *val, const struct kernel_param *kp)
3319{ 3168{
3320 if (kp->arg == &xprt_min_resvport)
3321 return param_set_uint_minmax(val, kp,
3322 RPC_MIN_RESVPORT,
3323 xprt_max_resvport);
3324 return param_set_uint_minmax(val, kp, 3169 return param_set_uint_minmax(val, kp,
3325 xprt_min_resvport, 3170 RPC_MIN_RESVPORT,
3326 RPC_MAX_RESVPORT); 3171 RPC_MAX_RESVPORT);
3327} 3172}
3328 3173
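The reworked xs_get_random_port() in the hunks above now validates the reserved-port range before picking a source port and returns -EADDRINUSE when the range is inverted, and xs_get_srcport()/xs_bind() were widened to int so that error can propagate. The following is a minimal userspace sketch of that range check and modulo pick only; rand() stands in for the kernel's prandom_u32() and the two static variables stand in for the xprt_min_resvport/xprt_max_resvport module parameters, so this is an illustration of the arithmetic, not the kernel implementation.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Stand-ins for the xprt_min_resvport/xprt_max_resvport module parameters. */
static unsigned short min_resvport = 665;
static unsigned short max_resvport = 1023;

/* Pick a source port in [min, max], or fail if the range is inverted. */
static int get_random_port(void)
{
	unsigned short min = min_resvport, max = max_resvport;
	unsigned short range, pick;

	if (max < min)
		return -EADDRINUSE;	/* mirrors the patch's error return */
	range = max - min + 1;
	pick = (unsigned short)(rand() % range);
	return pick + min;
}

int main(void)
{
	int port;

	srand((unsigned int)time(NULL));
	port = get_random_port();
	if (port < 0)
		fprintf(stderr, "no usable reserved port range\n");
	else
		printf("picked reserved port %d\n", port);
	return 0;
}

Returning a negative value (instead of silently clamping) lets the caller in xs_bind() skip binding entirely when the administrator has configured an empty range.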
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 645c16052052..e65c3a8551e4 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -577,7 +577,7 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev,
577 rcu_dereference_rtnl(orig_dev->tipc_ptr); 577 rcu_dereference_rtnl(orig_dev->tipc_ptr);
578 if (likely(b && test_bit(0, &b->up) && 578 if (likely(b && test_bit(0, &b->up) &&
579 (skb->pkt_type <= PACKET_MULTICAST))) { 579 (skb->pkt_type <= PACKET_MULTICAST))) {
580 skb->next = NULL; 580 skb_mark_not_on_list(skb);
581 tipc_rcv(dev_net(b->pt.dev), skb, b); 581 tipc_rcv(dev_net(b->pt.dev), skb, b);
582 rcu_read_unlock(); 582 rcu_read_unlock();
583 return NET_RX_SUCCESS; 583 return NET_RX_SUCCESS;
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index b61891054709..f48e5857210f 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -499,54 +499,56 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg,
499/** 499/**
500 * tipc_msg_reverse(): swap source and destination addresses and add error code 500 * tipc_msg_reverse(): swap source and destination addresses and add error code
501 * @own_node: originating node id for reversed message 501 * @own_node: originating node id for reversed message
502 * @skb: buffer containing message to be reversed; may be replaced. 502 * @skb: buffer containing message to be reversed; will be consumed
503 * @err: error code to be set in message, if any 503 * @err: error code to be set in message, if any
504 * Consumes buffer at failure 504 * Replaces consumed buffer with new one when successful
505 * Returns true if success, otherwise false 505 * Returns true if success, otherwise false
506 */ 506 */
507bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err) 507bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err)
508{ 508{
509 struct sk_buff *_skb = *skb; 509 struct sk_buff *_skb = *skb;
510 struct tipc_msg *hdr; 510 struct tipc_msg *_hdr, *hdr;
511 struct tipc_msg ohdr; 511 int hlen, dlen;
512 int dlen;
513 512
514 if (skb_linearize(_skb)) 513 if (skb_linearize(_skb))
515 goto exit; 514 goto exit;
516 hdr = buf_msg(_skb); 515 _hdr = buf_msg(_skb);
517 dlen = min_t(uint, msg_data_sz(hdr), MAX_FORWARD_SIZE); 516 dlen = min_t(uint, msg_data_sz(_hdr), MAX_FORWARD_SIZE);
518 if (msg_dest_droppable(hdr)) 517 hlen = msg_hdr_sz(_hdr);
518
519 if (msg_dest_droppable(_hdr))
519 goto exit; 520 goto exit;
520 if (msg_errcode(hdr)) 521 if (msg_errcode(_hdr))
521 goto exit; 522 goto exit;
522 523
523 /* Take a copy of original header before altering message */ 524 /* Never return SHORT header */
524 memcpy(&ohdr, hdr, msg_hdr_sz(hdr)); 525 if (hlen == SHORT_H_SIZE)
525 526 hlen = BASIC_H_SIZE;
526 /* Never return SHORT header; expand by replacing buffer if necessary */ 527
527 if (msg_short(hdr)) { 528 /* Don't return data along with SYN+, - sender has a clone */
528 *skb = tipc_buf_acquire(BASIC_H_SIZE + dlen, GFP_ATOMIC); 529 if (msg_is_syn(_hdr) && err == TIPC_ERR_OVERLOAD)
529 if (!*skb) 530 dlen = 0;
530 goto exit; 531
531 memcpy((*skb)->data + BASIC_H_SIZE, msg_data(hdr), dlen); 532 /* Allocate new buffer to return */
532 kfree_skb(_skb); 533 *skb = tipc_buf_acquire(hlen + dlen, GFP_ATOMIC);
533 _skb = *skb; 534 if (!*skb)
534 hdr = buf_msg(_skb); 535 goto exit;
535 memcpy(hdr, &ohdr, BASIC_H_SIZE); 536 memcpy((*skb)->data, _skb->data, msg_hdr_sz(_hdr));
536 msg_set_hdr_sz(hdr, BASIC_H_SIZE); 537 memcpy((*skb)->data + hlen, msg_data(_hdr), dlen);
537 }
538 538
539 /* Now reverse the concerned fields */ 539 /* Build reverse header in new buffer */
540 hdr = buf_msg(*skb);
541 msg_set_hdr_sz(hdr, hlen);
540 msg_set_errcode(hdr, err); 542 msg_set_errcode(hdr, err);
541 msg_set_non_seq(hdr, 0); 543 msg_set_non_seq(hdr, 0);
542 msg_set_origport(hdr, msg_destport(&ohdr)); 544 msg_set_origport(hdr, msg_destport(_hdr));
543 msg_set_destport(hdr, msg_origport(&ohdr)); 545 msg_set_destport(hdr, msg_origport(_hdr));
544 msg_set_destnode(hdr, msg_prevnode(&ohdr)); 546 msg_set_destnode(hdr, msg_prevnode(_hdr));
545 msg_set_prevnode(hdr, own_node); 547 msg_set_prevnode(hdr, own_node);
546 msg_set_orignode(hdr, own_node); 548 msg_set_orignode(hdr, own_node);
547 msg_set_size(hdr, msg_hdr_sz(hdr) + dlen); 549 msg_set_size(hdr, hlen + dlen);
548 skb_trim(_skb, msg_size(hdr));
549 skb_orphan(_skb); 550 skb_orphan(_skb);
551 kfree_skb(_skb);
550 return true; 552 return true;
551exit: 553exit:
552 kfree_skb(_skb); 554 kfree_skb(_skb);
@@ -554,6 +556,22 @@ exit:
554 return false; 556 return false;
555} 557}
556 558
559bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy)
560{
561 struct sk_buff *skb, *_skb;
562
563 skb_queue_walk(msg, skb) {
564 _skb = skb_clone(skb, GFP_ATOMIC);
565 if (!_skb) {
566 __skb_queue_purge(cpy);
567 pr_err_ratelimited("Failed to clone buffer chain\n");
568 return false;
569 }
570 __skb_queue_tail(cpy, _skb);
571 }
572 return true;
573}
574
557/** 575/**
558 * tipc_msg_lookup_dest(): try to find new destination for named message 576 * tipc_msg_lookup_dest(): try to find new destination for named message
559 * @skb: the buffer containing the message. 577 * @skb: the buffer containing the message.
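The rewritten tipc_msg_reverse() above always builds the returned/rejected message in a freshly allocated buffer: it copies the original header (expanded to at least the basic header size), copies at most MAX_FORWARD_SIZE bytes of data (or none for an overloaded SYN), swaps origin and destination fields, sets the error code, then frees the original skb. Below is a simplified userspace sketch of that "copy header, swap endpoints, set error" pattern; the struct layout and helper names are invented for illustration and do not match TIPC's wire format.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy message header: just enough fields to show the reversal pattern. */
struct msg_hdr {
	unsigned int orig_node, orig_port;
	unsigned int dest_node, dest_port;
	unsigned int errcode;
	unsigned int data_len;
};

struct msg {
	struct msg_hdr hdr;
	unsigned char data[];
};

/* Build a new message answering *old*: endpoints swapped, error set,
 * payload truncated to max_data bytes. Caller owns the returned buffer. */
static struct msg *msg_reverse(const struct msg *old, unsigned int own_node,
			       unsigned int err, unsigned int max_data)
{
	unsigned int dlen = old->hdr.data_len;
	struct msg *rev;

	if (dlen > max_data)
		dlen = max_data;
	rev = malloc(sizeof(*rev) + dlen);
	if (!rev)
		return NULL;

	rev->hdr = old->hdr;			/* copy, then fix up */
	rev->hdr.orig_node = own_node;
	rev->hdr.orig_port = old->hdr.dest_port;
	rev->hdr.dest_node = old->hdr.orig_node;
	rev->hdr.dest_port = old->hdr.orig_port;
	rev->hdr.errcode   = err;
	rev->hdr.data_len  = dlen;
	memcpy(rev->data, old->data, dlen);
	return rev;
}

int main(void)
{
	struct msg *m = malloc(sizeof(*m) + 4);
	struct msg *rej;

	if (!m)
		return 1;
	m->hdr = (struct msg_hdr){ .orig_node = 1, .orig_port = 100,
				   .dest_node = 2, .dest_port = 200,
				   .errcode = 0, .data_len = 4 };
	memcpy(m->data, "ping", 4);

	rej = msg_reverse(m, 2, 1, 4);
	if (rej)
		printf("reply goes to node %u port %u\n",
		       rej->hdr.dest_node, rej->hdr.dest_port);
	free(rej);
	free(m);
	return 0;
}

Allocating a fresh buffer sidesteps the old in-place scheme, which had to special-case short headers and keep a saved copy of the original header around.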
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index a4e944d59394..a2879e6ec5b6 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -216,6 +216,16 @@ static inline void msg_set_non_seq(struct tipc_msg *m, u32 n)
216 msg_set_bits(m, 0, 20, 1, n); 216 msg_set_bits(m, 0, 20, 1, n);
217} 217}
218 218
219static inline int msg_is_syn(struct tipc_msg *m)
220{
221 return msg_bits(m, 0, 17, 1);
222}
223
224static inline void msg_set_syn(struct tipc_msg *m, u32 d)
225{
226 msg_set_bits(m, 0, 17, 1, d);
227}
228
219static inline int msg_dest_droppable(struct tipc_msg *m) 229static inline int msg_dest_droppable(struct tipc_msg *m)
220{ 230{
221 return msg_bits(m, 0, 19, 1); 231 return msg_bits(m, 0, 19, 1);
@@ -970,6 +980,7 @@ bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg,
970 struct sk_buff_head *cpy); 980 struct sk_buff_head *cpy);
971void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, 981void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno,
972 struct sk_buff *skb); 982 struct sk_buff *skb);
983bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy);
973 984
974static inline u16 buf_seqno(struct sk_buff *skb) 985static inline u16 buf_seqno(struct sk_buff *skb)
975{ 986{
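The new msg_is_syn()/msg_set_syn() accessors above follow TIPC's usual pattern of packing boolean flags into word 0 of the header via msg_bits()/msg_set_bits(), here using bit 17. A generic mask-and-shift sketch of that pattern is given below; the helper names are made up for the example, and the real accessors also deal with byte order, which this sketch ignores.

#include <stdio.h>

/* Read a field of width described by mask, starting at bit pos of a word. */
static unsigned int get_bits(unsigned int word, unsigned int pos,
			     unsigned int mask)
{
	return (word >> pos) & mask;
}

/* Overwrite that field with val (masked to the field width). */
static unsigned int set_bits(unsigned int word, unsigned int pos,
			     unsigned int mask, unsigned int val)
{
	word &= ~(mask << pos);
	word |= (val & mask) << pos;
	return word;
}

int main(void)
{
	unsigned int w0 = 0;

	w0 = set_bits(w0, 17, 1, 1);	/* analogous to msg_set_syn(m, 1) */
	printf("syn bit: %u\n", get_bits(w0, 17, 1));
	w0 = set_bits(w0, 17, 1, 0);
	printf("syn bit: %u\n", get_bits(w0, 17, 1));
	return 0;
}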
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 3cfeb9df64b0..61219f0b9677 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -94,8 +94,9 @@ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ)
94 list_add_tail_rcu(&publ->binding_node, &nt->node_scope); 94 list_add_tail_rcu(&publ->binding_node, &nt->node_scope);
95 return NULL; 95 return NULL;
96 } 96 }
97 list_add_tail_rcu(&publ->binding_node, &nt->cluster_scope); 97 write_lock_bh(&nt->cluster_scope_lock);
98 98 list_add_tail(&publ->binding_node, &nt->cluster_scope);
99 write_unlock_bh(&nt->cluster_scope_lock);
99 skb = named_prepare_buf(net, PUBLICATION, ITEM_SIZE, 0); 100 skb = named_prepare_buf(net, PUBLICATION, ITEM_SIZE, 0);
100 if (!skb) { 101 if (!skb) {
101 pr_warn("Publication distribution failure\n"); 102 pr_warn("Publication distribution failure\n");
@@ -112,11 +113,13 @@ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ)
112 */ 113 */
113struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ) 114struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ)
114{ 115{
116 struct name_table *nt = tipc_name_table(net);
115 struct sk_buff *buf; 117 struct sk_buff *buf;
116 struct distr_item *item; 118 struct distr_item *item;
117 119
118 list_del_rcu(&publ->binding_node); 120 write_lock_bh(&nt->cluster_scope_lock);
119 121 list_del(&publ->binding_node);
122 write_unlock_bh(&nt->cluster_scope_lock);
120 if (publ->scope == TIPC_NODE_SCOPE) 123 if (publ->scope == TIPC_NODE_SCOPE)
121 return NULL; 124 return NULL;
122 125
@@ -147,7 +150,7 @@ static void named_distribute(struct net *net, struct sk_buff_head *list,
147 ITEM_SIZE) * ITEM_SIZE; 150 ITEM_SIZE) * ITEM_SIZE;
148 u32 msg_rem = msg_dsz; 151 u32 msg_rem = msg_dsz;
149 152
150 list_for_each_entry_rcu(publ, pls, binding_node) { 153 list_for_each_entry(publ, pls, binding_node) {
151 /* Prepare next buffer: */ 154 /* Prepare next buffer: */
152 if (!skb) { 155 if (!skb) {
153 skb = named_prepare_buf(net, PUBLICATION, msg_rem, 156 skb = named_prepare_buf(net, PUBLICATION, msg_rem,
@@ -189,11 +192,10 @@ void tipc_named_node_up(struct net *net, u32 dnode)
189 192
190 __skb_queue_head_init(&head); 193 __skb_queue_head_init(&head);
191 194
192 rcu_read_lock(); 195 read_lock_bh(&nt->cluster_scope_lock);
193 named_distribute(net, &head, dnode, &nt->cluster_scope); 196 named_distribute(net, &head, dnode, &nt->cluster_scope);
194 rcu_read_unlock();
195
196 tipc_node_xmit(net, &head, dnode, 0); 197 tipc_node_xmit(net, &head, dnode, 0);
198 read_unlock_bh(&nt->cluster_scope_lock);
197} 199}
198 200
199/** 201/**
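The name_distr.c change above stops relying on RCU for the cluster_scope publication list and instead serializes writers with the new cluster_scope_lock rwlock, holding the read side across the whole walk in tipc_named_node_up() so entries cannot be unlinked mid-distribution. A userspace sketch of the same reader/writer discipline using pthread rwlocks follows; the list and helper names are invented for the example.

#include <pthread.h>
#include <stdio.h>

struct publication {
	int key;
	struct publication *next;
};

/* Writers take the lock exclusively; the distribution walk holds the read
 * side for its whole duration, mirroring the patch's cluster_scope_lock. */
static struct publication *cluster_scope;
static pthread_rwlock_t cluster_scope_lock = PTHREAD_RWLOCK_INITIALIZER;

static void publish(struct publication *p)
{
	pthread_rwlock_wrlock(&cluster_scope_lock);
	p->next = cluster_scope;
	cluster_scope = p;
	pthread_rwlock_unlock(&cluster_scope_lock);
}

static void distribute(void)
{
	pthread_rwlock_rdlock(&cluster_scope_lock);
	for (struct publication *p = cluster_scope; p; p = p->next)
		printf("distribute publication %d\n", p->key);
	pthread_rwlock_unlock(&cluster_scope_lock);
}

int main(void)
{
	struct publication a = { .key = 1 }, b = { .key = 2 };

	publish(&a);
	publish(&b);
	distribute();
	return 0;
}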
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 66d5b2c5987a..bff241f03525 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -744,6 +744,7 @@ int tipc_nametbl_init(struct net *net)
744 744
745 INIT_LIST_HEAD(&nt->node_scope); 745 INIT_LIST_HEAD(&nt->node_scope);
746 INIT_LIST_HEAD(&nt->cluster_scope); 746 INIT_LIST_HEAD(&nt->cluster_scope);
747 rwlock_init(&nt->cluster_scope_lock);
747 tn->nametbl = nt; 748 tn->nametbl = nt;
748 spin_lock_init(&tn->nametbl_lock); 749 spin_lock_init(&tn->nametbl_lock);
749 return 0; 750 return 0;
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 892bd750b85f..f79066334cc8 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -100,6 +100,7 @@ struct name_table {
100 struct hlist_head services[TIPC_NAMETBL_SIZE]; 100 struct hlist_head services[TIPC_NAMETBL_SIZE];
101 struct list_head node_scope; 101 struct list_head node_scope;
102 struct list_head cluster_scope; 102 struct list_head cluster_scope;
103 rwlock_t cluster_scope_lock;
103 u32 local_publ_count; 104 u32 local_publ_count;
104}; 105};
105 106
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 48b3298a248d..03f5efb62cfb 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -45,6 +45,7 @@
45/* Optional capabilities supported by this code version 45/* Optional capabilities supported by this code version
46 */ 46 */
47enum { 47enum {
48 TIPC_SYN_BIT = (1),
48 TIPC_BCAST_SYNCH = (1 << 1), 49 TIPC_BCAST_SYNCH = (1 << 1),
49 TIPC_BCAST_STATE_NACK = (1 << 2), 50 TIPC_BCAST_STATE_NACK = (1 << 2),
50 TIPC_BLOCK_FLOWCTL = (1 << 3), 51 TIPC_BLOCK_FLOWCTL = (1 << 3),
@@ -53,11 +54,12 @@ enum {
53 TIPC_LINK_PROTO_SEQNO = (1 << 6) 54 TIPC_LINK_PROTO_SEQNO = (1 << 6)
54}; 55};
55 56
56#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ 57#define TIPC_NODE_CAPABILITIES (TIPC_SYN_BIT | \
57 TIPC_BCAST_STATE_NACK | \ 58 TIPC_BCAST_SYNCH | \
58 TIPC_BCAST_RCAST | \ 59 TIPC_BCAST_STATE_NACK | \
59 TIPC_BLOCK_FLOWCTL | \ 60 TIPC_BCAST_RCAST | \
60 TIPC_NODE_ID128 | \ 61 TIPC_BLOCK_FLOWCTL | \
62 TIPC_NODE_ID128 | \
61 TIPC_LINK_PROTO_SEQNO) 63 TIPC_LINK_PROTO_SEQNO)
62#define INVALID_BEARER_ID -1 64#define INVALID_BEARER_ID -1
63 65
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 49810fdff4c5..636e6131769d 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -47,7 +47,7 @@
47#include "netlink.h" 47#include "netlink.h"
48#include "group.h" 48#include "group.h"
49 49
50#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ 50#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */
51#define CONN_PROBING_INTV msecs_to_jiffies(3600000) /* [ms] => 1 h */ 51#define CONN_PROBING_INTV msecs_to_jiffies(3600000) /* [ms] => 1 h */
52#define TIPC_FWD_MSG 1 52#define TIPC_FWD_MSG 1
53#define TIPC_MAX_PORT 0xffffffff 53#define TIPC_MAX_PORT 0xffffffff
@@ -80,7 +80,6 @@ struct sockaddr_pair {
80 * @publications: list of publications for port 80 * @publications: list of publications for port
81 * @blocking_link: address of the congested link we are currently sleeping on 81 * @blocking_link: address of the congested link we are currently sleeping on
82 * @pub_count: total # of publications port has made during its lifetime 82 * @pub_count: total # of publications port has made during its lifetime
83 * @probing_state:
84 * @conn_timeout: the time we can wait for an unresponded setup request 83 * @conn_timeout: the time we can wait for an unresponded setup request
85 * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue 84 * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue
86 * @cong_link_cnt: number of congested links 85 * @cong_link_cnt: number of congested links
@@ -102,8 +101,8 @@ struct tipc_sock {
102 struct list_head cong_links; 101 struct list_head cong_links;
103 struct list_head publications; 102 struct list_head publications;
104 u32 pub_count; 103 u32 pub_count;
105 uint conn_timeout;
106 atomic_t dupl_rcvcnt; 104 atomic_t dupl_rcvcnt;
105 u16 conn_timeout;
107 bool probe_unacked; 106 bool probe_unacked;
108 u16 cong_link_cnt; 107 u16 cong_link_cnt;
109 u16 snt_unacked; 108 u16 snt_unacked;
@@ -507,6 +506,9 @@ static void __tipc_shutdown(struct socket *sock, int error)
507 tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt && 506 tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt &&
508 !tsk_conn_cong(tsk))); 507 !tsk_conn_cong(tsk)));
509 508
509 /* Remove any pending SYN message */
510 __skb_queue_purge(&sk->sk_write_queue);
511
510 /* Reject all unreceived messages, except on an active connection 512 /* Reject all unreceived messages, except on an active connection
511 * (which disconnects locally & sends a 'FIN+' to peer). 513 * (which disconnects locally & sends a 'FIN+' to peer).
512 */ 514 */
@@ -715,7 +717,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock,
715 struct tipc_sock *tsk = tipc_sk(sk); 717 struct tipc_sock *tsk = tipc_sk(sk);
716 __poll_t revents = 0; 718 __poll_t revents = 0;
717 719
718 sock_poll_wait(file, wait); 720 sock_poll_wait(file, sock, wait);
719 721
720 if (sk->sk_shutdown & RCV_SHUTDOWN) 722 if (sk->sk_shutdown & RCV_SHUTDOWN)
721 revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 723 revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
@@ -1329,6 +1331,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
1329 tsk->conn_type = dest->addr.name.name.type; 1331 tsk->conn_type = dest->addr.name.name.type;
1330 tsk->conn_instance = dest->addr.name.name.instance; 1332 tsk->conn_instance = dest->addr.name.name.instance;
1331 } 1333 }
1334 msg_set_syn(hdr, 1);
1332 } 1335 }
1333 1336
1334 seq = &dest->addr.nameseq; 1337 seq = &dest->addr.nameseq;
@@ -1371,6 +1374,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
1371 rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); 1374 rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
1372 if (unlikely(rc != dlen)) 1375 if (unlikely(rc != dlen))
1373 return rc; 1376 return rc;
1377 if (unlikely(syn && !tipc_msg_skb_clone(&pkts, &sk->sk_write_queue)))
1378 return -ENOMEM;
1374 1379
1375 rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); 1380 rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
1376 if (unlikely(rc == -ELINKCONG)) { 1381 if (unlikely(rc == -ELINKCONG)) {
@@ -1490,6 +1495,7 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port,
1490 struct net *net = sock_net(sk); 1495 struct net *net = sock_net(sk);
1491 struct tipc_msg *msg = &tsk->phdr; 1496 struct tipc_msg *msg = &tsk->phdr;
1492 1497
1498 msg_set_syn(msg, 0);
1493 msg_set_destnode(msg, peer_node); 1499 msg_set_destnode(msg, peer_node);
1494 msg_set_destport(msg, peer_port); 1500 msg_set_destport(msg, peer_port);
1495 msg_set_type(msg, TIPC_CONN_MSG); 1501 msg_set_type(msg, TIPC_CONN_MSG);
@@ -1501,6 +1507,7 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port,
1501 tipc_node_add_conn(net, peer_node, tsk->portid, peer_port); 1507 tipc_node_add_conn(net, peer_node, tsk->portid, peer_port);
1502 tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid); 1508 tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid);
1503 tsk->peer_caps = tipc_node_get_capabilities(net, peer_node); 1509 tsk->peer_caps = tipc_node_get_capabilities(net, peer_node);
1510 __skb_queue_purge(&sk->sk_write_queue);
1504 if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) 1511 if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL)
1505 return; 1512 return;
1506 1513
@@ -1971,91 +1978,90 @@ static void tipc_sk_proto_rcv(struct sock *sk,
1971} 1978}
1972 1979
1973/** 1980/**
1974 * tipc_filter_connect - Handle incoming message for a connection-based socket 1981 * tipc_sk_filter_connect - check incoming message for a connection-based socket
1975 * @tsk: TIPC socket 1982 * @tsk: TIPC socket
1976 * @skb: pointer to message buffer. Set to NULL if buffer is consumed 1983 * @skb: pointer to message buffer.
1977 * 1984 * Returns true if message should be added to receive queue, false otherwise
1978 * Returns true if everything ok, false otherwise
1979 */ 1985 */
1980static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) 1986static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
1981{ 1987{
1982 struct sock *sk = &tsk->sk; 1988 struct sock *sk = &tsk->sk;
1983 struct net *net = sock_net(sk); 1989 struct net *net = sock_net(sk);
1984 struct tipc_msg *hdr = buf_msg(skb); 1990 struct tipc_msg *hdr = buf_msg(skb);
1985 u32 pport = msg_origport(hdr); 1991 bool con_msg = msg_connected(hdr);
1986 u32 pnode = msg_orignode(hdr); 1992 u32 pport = tsk_peer_port(tsk);
1993 u32 pnode = tsk_peer_node(tsk);
1994 u32 oport = msg_origport(hdr);
1995 u32 onode = msg_orignode(hdr);
1996 int err = msg_errcode(hdr);
1997 unsigned long delay;
1987 1998
1988 if (unlikely(msg_mcast(hdr))) 1999 if (unlikely(msg_mcast(hdr)))
1989 return false; 2000 return false;
1990 2001
1991 switch (sk->sk_state) { 2002 switch (sk->sk_state) {
1992 case TIPC_CONNECTING: 2003 case TIPC_CONNECTING:
1993 /* Accept only ACK or NACK message */ 2004 /* Setup ACK */
1994 if (unlikely(!msg_connected(hdr))) { 2005 if (likely(con_msg)) {
1995 if (pport != tsk_peer_port(tsk) || 2006 if (err)
1996 pnode != tsk_peer_node(tsk)) 2007 break;
1997 return false; 2008 tipc_sk_finish_conn(tsk, oport, onode);
1998 2009 msg_set_importance(&tsk->phdr, msg_importance(hdr));
1999 tipc_set_sk_state(sk, TIPC_DISCONNECTING); 2010 /* ACK+ message with data is added to receive queue */
2000 sk->sk_err = ECONNREFUSED; 2011 if (msg_data_sz(hdr))
2001 sk->sk_state_change(sk); 2012 return true;
2002 return true; 2013 /* Empty ACK-, - wake up sleeping connect() and drop */
2003 } 2014 sk->sk_data_ready(sk);
2004 2015 msg_set_dest_droppable(hdr, 1);
2005 if (unlikely(msg_errcode(hdr))) { 2016 return false;
2006 tipc_set_sk_state(sk, TIPC_DISCONNECTING);
2007 sk->sk_err = ECONNREFUSED;
2008 sk->sk_state_change(sk);
2009 return true;
2010 }
2011
2012 if (unlikely(!msg_isdata(hdr))) {
2013 tipc_set_sk_state(sk, TIPC_DISCONNECTING);
2014 sk->sk_err = EINVAL;
2015 sk->sk_state_change(sk);
2016 return true;
2017 } 2017 }
2018 /* Ignore connectionless message if not from listening socket */
2019 if (oport != pport || onode != pnode)
2020 return false;
2018 2021
2019 tipc_sk_finish_conn(tsk, msg_origport(hdr), msg_orignode(hdr)); 2022 /* Rejected SYN */
2020 msg_set_importance(&tsk->phdr, msg_importance(hdr)); 2023 if (err != TIPC_ERR_OVERLOAD)
2021 2024 break;
2022 /* If 'ACK+' message, add to socket receive queue */
2023 if (msg_data_sz(hdr))
2024 return true;
2025
2026 /* If empty 'ACK-' message, wake up sleeping connect() */
2027 sk->sk_data_ready(sk);
2028 2025
2029 /* 'ACK-' message is neither accepted nor rejected: */ 2026 /* Prepare for new setup attempt if we have a SYN clone */
2030 msg_set_dest_droppable(hdr, 1); 2027 if (skb_queue_empty(&sk->sk_write_queue))
2028 break;
2029 get_random_bytes(&delay, 2);
2030 delay %= (tsk->conn_timeout / 4);
2031 delay = msecs_to_jiffies(delay + 100);
2032 sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
2031 return false; 2033 return false;
2032
2033 case TIPC_OPEN: 2034 case TIPC_OPEN:
2034 case TIPC_DISCONNECTING: 2035 case TIPC_DISCONNECTING:
2035 break; 2036 return false;
2036 case TIPC_LISTEN: 2037 case TIPC_LISTEN:
2037 /* Accept only SYN message */ 2038 /* Accept only SYN message */
2038 if (!msg_connected(hdr) && !(msg_errcode(hdr))) 2039 if (!msg_is_syn(hdr) &&
2040 tipc_node_get_capabilities(net, onode) & TIPC_SYN_BIT)
2041 return false;
2042 if (!con_msg && !err)
2039 return true; 2043 return true;
2040 break; 2044 return false;
2041 case TIPC_ESTABLISHED: 2045 case TIPC_ESTABLISHED:
2042 /* Accept only connection-based messages sent by peer */ 2046 /* Accept only connection-based messages sent by peer */
2043 if (unlikely(!tsk_peer_msg(tsk, hdr))) 2047 if (likely(con_msg && !err && pport == oport && pnode == onode))
2048 return true;
2049 if (!tsk_peer_msg(tsk, hdr))
2044 return false; 2050 return false;
2045 2051 if (!err)
2046 if (unlikely(msg_errcode(hdr))) { 2052 return true;
2047 tipc_set_sk_state(sk, TIPC_DISCONNECTING); 2053 tipc_set_sk_state(sk, TIPC_DISCONNECTING);
2048 /* Let timer expire on it's own */ 2054 tipc_node_remove_conn(net, pnode, tsk->portid);
2049 tipc_node_remove_conn(net, tsk_peer_node(tsk), 2055 sk->sk_state_change(sk);
2050 tsk->portid);
2051 sk->sk_state_change(sk);
2052 }
2053 return true; 2056 return true;
2054 default: 2057 default:
2055 pr_err("Unknown sk_state %u\n", sk->sk_state); 2058 pr_err("Unknown sk_state %u\n", sk->sk_state);
2056 } 2059 }
2057 2060 /* Abort connection setup attempt */
2058 return false; 2061 tipc_set_sk_state(sk, TIPC_DISCONNECTING);
2062 sk->sk_err = ECONNREFUSED;
2063 sk->sk_state_change(sk);
2064 return true;
2059} 2065}
2060 2066
2061/** 2067/**
@@ -2557,43 +2563,78 @@ static int tipc_shutdown(struct socket *sock, int how)
2557 return res; 2563 return res;
2558} 2564}
2559 2565
2566static void tipc_sk_check_probing_state(struct sock *sk,
2567 struct sk_buff_head *list)
2568{
2569 struct tipc_sock *tsk = tipc_sk(sk);
2570 u32 pnode = tsk_peer_node(tsk);
2571 u32 pport = tsk_peer_port(tsk);
2572 u32 self = tsk_own_node(tsk);
2573 u32 oport = tsk->portid;
2574 struct sk_buff *skb;
2575
2576 if (tsk->probe_unacked) {
2577 tipc_set_sk_state(sk, TIPC_DISCONNECTING);
2578 sk->sk_err = ECONNABORTED;
2579 tipc_node_remove_conn(sock_net(sk), pnode, pport);
2580 sk->sk_state_change(sk);
2581 return;
2582 }
2583 /* Prepare new probe */
2584 skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE, INT_H_SIZE, 0,
2585 pnode, self, pport, oport, TIPC_OK);
2586 if (skb)
2587 __skb_queue_tail(list, skb);
2588 tsk->probe_unacked = true;
2589 sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV);
2590}
2591
2592static void tipc_sk_retry_connect(struct sock *sk, struct sk_buff_head *list)
2593{
2594 struct tipc_sock *tsk = tipc_sk(sk);
2595
2596 /* Try again later if dest link is congested */
2597 if (tsk->cong_link_cnt) {
2598 sk_reset_timer(sk, &sk->sk_timer, msecs_to_jiffies(100));
2599 return;
2600 }
2601 /* Prepare SYN for retransmit */
2602 tipc_msg_skb_clone(&sk->sk_write_queue, list);
2603}
2604
2560static void tipc_sk_timeout(struct timer_list *t) 2605static void tipc_sk_timeout(struct timer_list *t)
2561{ 2606{
2562 struct sock *sk = from_timer(sk, t, sk_timer); 2607 struct sock *sk = from_timer(sk, t, sk_timer);
2563 struct tipc_sock *tsk = tipc_sk(sk); 2608 struct tipc_sock *tsk = tipc_sk(sk);
2564 u32 peer_port = tsk_peer_port(tsk); 2609 u32 pnode = tsk_peer_node(tsk);
2565 u32 peer_node = tsk_peer_node(tsk); 2610 struct sk_buff_head list;
2566 u32 own_node = tsk_own_node(tsk); 2611 int rc = 0;
2567 u32 own_port = tsk->portid;
2568 struct net *net = sock_net(sk);
2569 struct sk_buff *skb = NULL;
2570 2612
2613 skb_queue_head_init(&list);
2571 bh_lock_sock(sk); 2614 bh_lock_sock(sk);
2572 if (!tipc_sk_connected(sk))
2573 goto exit;
2574 2615
2575 /* Try again later if socket is busy */ 2616 /* Try again later if socket is busy */
2576 if (sock_owned_by_user(sk)) { 2617 if (sock_owned_by_user(sk)) {
2577 sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 20); 2618 sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 20);
2578 goto exit; 2619 bh_unlock_sock(sk);
2620 return;
2579 } 2621 }
2580 2622
2581 if (tsk->probe_unacked) { 2623 if (sk->sk_state == TIPC_ESTABLISHED)
2582 tipc_set_sk_state(sk, TIPC_DISCONNECTING); 2624 tipc_sk_check_probing_state(sk, &list);
2583 tipc_node_remove_conn(net, peer_node, peer_port); 2625 else if (sk->sk_state == TIPC_CONNECTING)
2584 sk->sk_state_change(sk); 2626 tipc_sk_retry_connect(sk, &list);
2585 goto exit; 2627
2586 }
2587 /* Send new probe */
2588 skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE, INT_H_SIZE, 0,
2589 peer_node, own_node, peer_port, own_port,
2590 TIPC_OK);
2591 tsk->probe_unacked = true;
2592 sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV);
2593exit:
2594 bh_unlock_sock(sk); 2628 bh_unlock_sock(sk);
2595 if (skb) 2629
2596 tipc_node_xmit_skb(net, skb, peer_node, own_port); 2630 if (!skb_queue_empty(&list))
2631 rc = tipc_node_xmit(sock_net(sk), &list, pnode, tsk->portid);
2632
2633 /* SYN messages may cause link congestion */
2634 if (rc == -ELINKCONG) {
2635 tipc_dest_push(&tsk->cong_links, pnode, 0);
2636 tsk->cong_link_cnt = 1;
2637 }
2597 sock_put(sk); 2638 sock_put(sk);
2598} 2639}
2599 2640
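The socket.c changes above keep a clone of the outgoing SYN on sk_write_queue and, when the peer rejects it with TIPC_ERR_OVERLOAD (and advertises TIPC_SYN_BIT), arm the connection timer with a small randomized delay of 100 ms plus up to a quarter of the connect timeout before retransmitting. Below is a hedged sketch of just that delay computation; rand() stands in for get_random_bytes(), the function name is invented, and the 8000 ms timeout is only the default value used for illustration.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Retry delay in milliseconds: a random slice of up to a quarter of the
 * configured connect timeout, plus a fixed 100 ms floor. */
static unsigned long syn_retry_delay_ms(unsigned long conn_timeout_ms)
{
	unsigned long delay = (unsigned long)rand();
	unsigned long span = conn_timeout_ms / 4;

	if (span)
		delay %= span;
	else
		delay = 0;	/* degenerate timeout: retry after the floor */
	return delay + 100;
}

int main(void)
{
	srand((unsigned int)time(NULL));
	for (int i = 0; i < 3; i++)
		printf("retry SYN in %lu ms\n", syn_retry_delay_ms(8000));
	return 0;
}

Randomizing the delay keeps many overloaded clients from retransmitting their SYNs in lockstep against the same listener.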
diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c
index 2627b5d812e9..4bdea0057171 100644
--- a/net/tipc/topsrv.c
+++ b/net/tipc/topsrv.c
@@ -57,16 +57,12 @@
57 * @idr_lock: protect the connection identifier set 57 * @idr_lock: protect the connection identifier set
 58 * @idr_in_use: number of allocated identifier entries 58 * @idr_in_use: number of allocated identifier entries
 59 * @net: network namespace instance 59 * @net: network namespace instance
60 * @rcvbuf_cache: memory cache of server receive buffer 60 * @awork: accept work item
61 * @rcv_wq: receive workqueue 61 * @rcv_wq: receive workqueue
62 * @send_wq: send workqueue 62 * @send_wq: send workqueue
63 * @max_rcvbuf_size: maximum permitted receive message length 63 * @max_rcvbuf_size: maximum permitted receive message length
64 * @tipc_conn_new: callback will be called when new connection is incoming 64 * @listener: topsrv listener socket
65 * @tipc_conn_release: callback will be called before releasing the connection
66 * @tipc_conn_recvmsg: callback will be called when message arrives
67 * @name: server name 65 * @name: server name
68 * @imp: message importance
69 * @type: socket type
70 */ 66 */
71struct tipc_topsrv { 67struct tipc_topsrv {
72 struct idr conn_idr; 68 struct idr conn_idr;
@@ -90,9 +86,7 @@ struct tipc_topsrv {
90 * @server: pointer to connected server 86 * @server: pointer to connected server
 91 * @sub_list: list of all pertaining subscriptions 87 * @sub_list: list of all pertaining subscriptions
92 * @sub_lock: lock protecting the subscription list 88 * @sub_lock: lock protecting the subscription list
93 * @outqueue_lock: control access to the outqueue
94 * @rwork: receive work item 89 * @rwork: receive work item
95 * @rx_action: what to do when connection socket is active
96 * @outqueue: pointer to first outbound message in queue 90 * @outqueue: pointer to first outbound message in queue
97 * @outqueue_lock: control access to the outqueue 91 * @outqueue_lock: control access to the outqueue
98 * @swork: send work item 92 * @swork: send work item
@@ -657,7 +651,7 @@ int tipc_topsrv_start(struct net *net)
657 srv->max_rcvbuf_size = sizeof(struct tipc_subscr); 651 srv->max_rcvbuf_size = sizeof(struct tipc_subscr);
658 INIT_WORK(&srv->awork, tipc_topsrv_accept); 652 INIT_WORK(&srv->awork, tipc_topsrv_accept);
659 653
660 strncpy(srv->name, name, strlen(name) + 1); 654 strscpy(srv->name, name, sizeof(srv->name));
661 tn->topsrv = srv; 655 tn->topsrv = srv;
662 atomic_set(&tn->subscription_count, 0); 656 atomic_set(&tn->subscription_count, 0);
663 657
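
The strscpy() conversion in the topsrv.c hunk above is the usual hardening step: strncpy() bounded by strlen(name) + 1 copies as much as the source demands, while strscpy() is bounded by the destination buffer, always NUL-terminates, and reports truncation with -E2BIG. As a rough userspace illustration only (copy_bounded() is a hypothetical stand-in that mimics the strscpy() contract, not the kernel helper):

    #include <stdio.h>
    #include <string.h>

    #define E2BIG_LOCAL (-7)   /* stand-in for the kernel's -E2BIG */

    /* Destination-bounded copy: never writes past dstsize, always
     * terminates, and signals truncation instead of hiding it.
     */
    static long copy_bounded(char *dst, const char *src, size_t dstsize)
    {
        size_t len;

        if (dstsize == 0)
            return E2BIG_LOCAL;
        len = strlen(src);
        if (len >= dstsize) {                /* would not fit: truncate */
            memcpy(dst, src, dstsize - 1);
            dst[dstsize - 1] = '\0';
            return E2BIG_LOCAL;
        }
        memcpy(dst, src, len + 1);           /* fits, incl. terminator */
        return (long)len;
    }

    int main(void)
    {
        char name[8];
        long n;

        n = copy_bounded(name, "topology_server", sizeof(name));
        printf("copied=\"%s\" rc=%ld\n", name, n);   /* truncated, rc < 0 */

        n = copy_bounded(name, "topsrv", sizeof(name));
        printf("copied=\"%s\" rc=%ld\n", name, n);   /* fits, rc == 6 */
        return 0;
    }

The key point is that the bound comes from sizeof(srv->name), not from the source string, so an over-long name can no longer overrun the destination.
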
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 9783101bc4a9..10dc59ce9c82 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -650,6 +650,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
650 struct udp_tunnel_sock_cfg tuncfg = {NULL}; 650 struct udp_tunnel_sock_cfg tuncfg = {NULL};
651 struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; 651 struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
652 u8 node_id[NODE_ID_LEN] = {0,}; 652 u8 node_id[NODE_ID_LEN] = {0,};
653 int rmcast = 0;
653 654
654 ub = kzalloc(sizeof(*ub), GFP_ATOMIC); 655 ub = kzalloc(sizeof(*ub), GFP_ATOMIC);
655 if (!ub) 656 if (!ub)
@@ -680,6 +681,9 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
680 if (err) 681 if (err)
681 goto err; 682 goto err;
682 683
684 /* Checking remote ip address */
685 rmcast = tipc_udp_is_mcast_addr(&remote);
686
683 /* Autoconfigure own node identity if needed */ 687 /* Autoconfigure own node identity if needed */
684 if (!tipc_own_id(net)) { 688 if (!tipc_own_id(net)) {
685 memcpy(node_id, local.ipv6.in6_u.u6_addr8, 16); 689 memcpy(node_id, local.ipv6.in6_u.u6_addr8, 16);
@@ -705,7 +709,12 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
705 goto err; 709 goto err;
706 } 710 }
707 udp_conf.family = AF_INET; 711 udp_conf.family = AF_INET;
708 udp_conf.local_ip.s_addr = htonl(INADDR_ANY); 712
713 /* Switch to use ANY to receive packets from group */
714 if (rmcast)
715 udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
716 else
717 udp_conf.local_ip.s_addr = local.ipv4.s_addr;
709 udp_conf.use_udp_checksums = false; 718 udp_conf.use_udp_checksums = false;
710 ub->ifindex = dev->ifindex; 719 ub->ifindex = dev->ifindex;
711 if (tipc_mtu_bad(dev, sizeof(struct iphdr) + 720 if (tipc_mtu_bad(dev, sizeof(struct iphdr) +
@@ -719,7 +728,10 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
719 udp_conf.family = AF_INET6; 728 udp_conf.family = AF_INET6;
720 udp_conf.use_udp6_tx_checksums = true; 729 udp_conf.use_udp6_tx_checksums = true;
721 udp_conf.use_udp6_rx_checksums = true; 730 udp_conf.use_udp6_rx_checksums = true;
722 udp_conf.local_ip6 = in6addr_any; 731 if (rmcast)
732 udp_conf.local_ip6 = in6addr_any;
733 else
734 udp_conf.local_ip6 = local.ipv6;
723 b->mtu = 1280; 735 b->mtu = 1280;
724#endif 736#endif
725 } else { 737 } else {
@@ -741,7 +753,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
741 * is used if it's a multicast address. 753 * is used if it's a multicast address.
742 */ 754 */
743 memcpy(&b->bcast_addr.value, &remote, sizeof(remote)); 755 memcpy(&b->bcast_addr.value, &remote, sizeof(remote));
744 if (tipc_udp_is_mcast_addr(&remote)) 756 if (rmcast)
745 err = enable_mcast(ub, &remote); 757 err = enable_mcast(ub, &remote);
746 else 758 else
747 err = tipc_udp_rcast_add(b, &remote); 759 err = tipc_udp_rcast_add(b, &remote);
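
The udp_media.c change above computes rmcast once and then binds the tunnel socket to the wildcard address only when the remote is a multicast group; for a unicast remote it binds the configured local address instead. A minimal userspace sketch of the same decision, assuming IPv4 and example addresses and port (this is not the kernel bearer code):

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void)
    {
        struct in_addr remote, local;
        struct sockaddr_in bindaddr;
        int fd, rmcast;

        inet_pton(AF_INET, "228.0.23.1", &remote);   /* example group addr */
        inet_pton(AF_INET, "192.0.2.10", &local);    /* example local addr */

        /* Same check the patch hoists into 'rmcast' */
        rmcast = IN_MULTICAST(ntohl(remote.s_addr));

        fd = socket(AF_INET, SOCK_DGRAM, 0);
        if (fd < 0)
            return 1;

        memset(&bindaddr, 0, sizeof(bindaddr));
        bindaddr.sin_family = AF_INET;
        bindaddr.sin_port = htons(6118);             /* example port */
        /* Multicast: receive on ANY so group traffic is delivered;
         * unicast: bind only the configured local address.
         */
        bindaddr.sin_addr.s_addr = rmcast ? htonl(INADDR_ANY)
                                          : local.s_addr;
        if (bind(fd, (struct sockaddr *)&bindaddr, sizeof(bindaddr)) < 0)
            perror("bind");

        if (rmcast) {
            struct ip_mreq mreq = {
                .imr_multiaddr = remote,
                .imr_interface = local,   /* join the group on this iface */
            };
            if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP,
                           &mreq, sizeof(mreq)) < 0)
                perror("IP_ADD_MEMBERSHIP");
        }

        close(fd);
        return 0;
    }

Binding a specific unicast address would keep group traffic from being received, which is why the patch switches to the wildcard only in the multicast case and keeps the tighter local binding otherwise.
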
diff --git a/net/tls/Kconfig b/net/tls/Kconfig
index 73f05ece53d0..99c1a19c17b1 100644
--- a/net/tls/Kconfig
+++ b/net/tls/Kconfig
@@ -8,6 +8,7 @@ config TLS
8 select CRYPTO_AES 8 select CRYPTO_AES
9 select CRYPTO_GCM 9 select CRYPTO_GCM
10 select STREAM_PARSER 10 select STREAM_PARSER
11 select NET_SOCK_MSG
11 default n 12 default n
12 ---help--- 13 ---help---
13 Enable kernel support for TLS protocol. This allows symmetric 14 Enable kernel support for TLS protocol. This allows symmetric
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 961b07d4d41c..276edbc04f38 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -421,7 +421,7 @@ last_record:
421 tls_push_record_flags = flags; 421 tls_push_record_flags = flags;
422 if (more) { 422 if (more) {
423 tls_ctx->pending_open_record_frags = 423 tls_ctx->pending_open_record_frags =
424 record->num_frags; 424 !!record->num_frags;
425 break; 425 break;
426 } 426 }
427 427
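
The one-line tls_device.c fix above uses the double-negation idiom: !!record->num_frags collapses any non-zero fragment count to 1, matching the boolean way pending_open_record_frags is now assigned elsewhere in this series. A tiny standalone C illustration of why the idiom matters when a count lands in a field used as a flag (the u8 field here is only an example, not the actual tls_context layout):

    #include <stdio.h>

    int main(void)
    {
        unsigned int num_frags = 256;                 /* example count */
        unsigned char pending_raw  = num_frags;       /* truncates to 0 */
        unsigned char pending_bool = !!num_frags;     /* normalises to 1 */

        printf("raw=%u bool=%u\n",
               (unsigned)pending_raw, (unsigned)pending_bool);
        return 0;
    }
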
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 523622dc74f8..311cec8e533d 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -141,7 +141,6 @@ retry:
141 size = sg->length; 141 size = sg->length;
142 } 142 }
143 143
144 clear_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags);
145 ctx->in_tcp_sendpages = false; 144 ctx->in_tcp_sendpages = false;
146 ctx->sk_write_space(sk); 145 ctx->sk_write_space(sk);
147 146
@@ -193,15 +192,12 @@ int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg,
193 return rc; 192 return rc;
194} 193}
195 194
196int tls_push_pending_closed_record(struct sock *sk, struct tls_context *ctx, 195int tls_push_partial_record(struct sock *sk, struct tls_context *ctx,
197 int flags, long *timeo) 196 int flags)
198{ 197{
199 struct scatterlist *sg; 198 struct scatterlist *sg;
200 u16 offset; 199 u16 offset;
201 200
202 if (!tls_is_partially_sent_record(ctx))
203 return ctx->push_pending_record(sk, flags);
204
205 sg = ctx->partially_sent_record; 201 sg = ctx->partially_sent_record;
206 offset = ctx->partially_sent_offset; 202 offset = ctx->partially_sent_offset;
207 203
@@ -209,9 +205,23 @@ int tls_push_pending_closed_record(struct sock *sk, struct tls_context *ctx,
209 return tls_push_sg(sk, ctx, sg, offset, flags); 205 return tls_push_sg(sk, ctx, sg, offset, flags);
210} 206}
211 207
208int tls_push_pending_closed_record(struct sock *sk,
209 struct tls_context *tls_ctx,
210 int flags, long *timeo)
211{
212 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
213
214 if (tls_is_partially_sent_record(tls_ctx) ||
215 !list_empty(&ctx->tx_list))
216 return tls_tx_records(sk, flags);
217 else
218 return tls_ctx->push_pending_record(sk, flags);
219}
220
212static void tls_write_space(struct sock *sk) 221static void tls_write_space(struct sock *sk)
213{ 222{
214 struct tls_context *ctx = tls_get_ctx(sk); 223 struct tls_context *ctx = tls_get_ctx(sk);
224 struct tls_sw_context_tx *tx_ctx = tls_sw_ctx_tx(ctx);
215 225
216 /* If in_tcp_sendpages call lower protocol write space handler 226 /* If in_tcp_sendpages call lower protocol write space handler
217 * to ensure we wake up any waiting operations there. For example 227 * to ensure we wake up any waiting operations there. For example
@@ -222,20 +232,11 @@ static void tls_write_space(struct sock *sk)
222 return; 232 return;
223 } 233 }
224 234
225 if (!sk->sk_write_pending && tls_is_pending_closed_record(ctx)) { 235 /* Schedule the transmission if tx list is ready */
226 gfp_t sk_allocation = sk->sk_allocation; 236 if (is_tx_ready(tx_ctx) && !sk->sk_write_pending) {
227 int rc; 237 /* Schedule the transmission */
228 long timeo = 0; 238 if (!test_and_set_bit(BIT_TX_SCHEDULED, &tx_ctx->tx_bitmask))
229 239 schedule_delayed_work(&tx_ctx->tx_work.work, 0);
230 sk->sk_allocation = GFP_ATOMIC;
231 rc = tls_push_pending_closed_record(sk, ctx,
232 MSG_DONTWAIT |
233 MSG_NOSIGNAL,
234 &timeo);
235 sk->sk_allocation = sk_allocation;
236
237 if (rc < 0)
238 return;
239 } 240 }
240 241
241 ctx->sk_write_space(sk); 242 ctx->sk_write_space(sk);
@@ -270,19 +271,6 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
270 if (!tls_complete_pending_work(sk, ctx, 0, &timeo)) 271 if (!tls_complete_pending_work(sk, ctx, 0, &timeo))
271 tls_handle_open_record(sk, 0); 272 tls_handle_open_record(sk, 0);
272 273
273 if (ctx->partially_sent_record) {
274 struct scatterlist *sg = ctx->partially_sent_record;
275
276 while (1) {
277 put_page(sg_page(sg));
278 sk_mem_uncharge(sk, sg->length);
279
280 if (sg_is_last(sg))
281 break;
282 sg++;
283 }
284 }
285
286 /* We need these for tls_sw_fallback handling of other packets */ 274 /* We need these for tls_sw_fallback handling of other packets */
287 if (ctx->tx_conf == TLS_SW) { 275 if (ctx->tx_conf == TLS_SW) {
288 kfree(ctx->tx.rec_seq); 276 kfree(ctx->tx.rec_seq);
@@ -632,12 +620,14 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
632 prot[TLS_SW][TLS_BASE].sendpage = tls_sw_sendpage; 620 prot[TLS_SW][TLS_BASE].sendpage = tls_sw_sendpage;
633 621
634 prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE]; 622 prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE];
635 prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg; 623 prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg;
636 prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close; 624 prot[TLS_BASE][TLS_SW].stream_memory_read = tls_sw_stream_read;
625 prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close;
637 626
638 prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE]; 627 prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE];
639 prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg; 628 prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg;
640 prot[TLS_SW][TLS_SW].close = tls_sk_proto_close; 629 prot[TLS_SW][TLS_SW].stream_memory_read = tls_sw_stream_read;
630 prot[TLS_SW][TLS_SW].close = tls_sk_proto_close;
641 631
642#ifdef CONFIG_TLS_DEVICE 632#ifdef CONFIG_TLS_DEVICE
643 prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; 633 prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
@@ -725,8 +715,6 @@ EXPORT_SYMBOL(tls_unregister_device);
725 715
726static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { 716static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
727 .name = "tls", 717 .name = "tls",
728 .uid = TCP_ULP_TLS,
729 .user_visible = true,
730 .owner = THIS_MODULE, 718 .owner = THIS_MODULE,
731 .init = tls_init, 719 .init = tls_init,
732}; 720};
@@ -736,7 +724,6 @@ static int __init tls_register(void)
736 build_protos(tls_prots[TLSV4], &tcp_prot); 724 build_protos(tls_prots[TLSV4], &tcp_prot);
737 725
738 tls_sw_proto_ops = inet_stream_ops; 726 tls_sw_proto_ops = inet_stream_ops;
739 tls_sw_proto_ops.poll = tls_sw_poll;
740 tls_sw_proto_ops.splice_read = tls_sw_splice_read; 727 tls_sw_proto_ops.splice_read = tls_sw_splice_read;
741 728
742#ifdef CONFIG_TLS_DEVICE 729#ifdef CONFIG_TLS_DEVICE
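
In the reworked tls_write_space() above, ready records are no longer pushed synchronously from the write-space callback; instead the transmit work is scheduled at most once, guarded by BIT_TX_SCHEDULED. A rough userspace sketch of that guard, using C11 atomics in place of test_and_set_bit() and a hypothetical schedule_tx_work() standing in for schedule_delayed_work() (not the kernel workqueue API):

    #include <stdatomic.h>
    #include <stdio.h>

    #define BIT_TX_SCHEDULED 0

    static atomic_ulong tx_bitmask;

    static void schedule_tx_work(void)
    {
        /* Placeholder for queuing the delayed transmit work. */
        puts("tx work queued");
    }

    static void write_space_wakeup(void)
    {
        unsigned long bit = 1UL << BIT_TX_SCHEDULED;
        unsigned long old;

        /* Equivalent of test_and_set_bit(): returns the previous value,
         * so only the caller that actually set the bit queues the work.
         */
        old = atomic_fetch_or(&tx_bitmask, bit);
        if (!(old & bit))
            schedule_tx_work();
    }

    int main(void)
    {
        write_space_wakeup();   /* first wakeup queues the work        */
        write_space_wakeup();   /* bit already set: no duplicate queue */
        atomic_fetch_and(&tx_bitmask, ~(1UL << BIT_TX_SCHEDULED));
        write_space_wakeup();   /* bit cleared by the worker: queue again */
        return 0;
    }

In the patch itself the worker side clears the bit (test_and_clear_bit in tls_sw_sendmsg/sendpage) before draining tls_tx_records(), so concurrent write-space wakeups cannot pile up duplicate work items.
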
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index b9c6ecfbcfea..5cd88ba8acd1 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -4,6 +4,7 @@
4 * Copyright (c) 2016-2017, Lance Chao <lancerchao@fb.com>. All rights reserved. 4 * Copyright (c) 2016-2017, Lance Chao <lancerchao@fb.com>. All rights reserved.
5 * Copyright (c) 2016, Fridolin Pokorny <fridolin.pokorny@gmail.com>. All rights reserved. 5 * Copyright (c) 2016, Fridolin Pokorny <fridolin.pokorny@gmail.com>. All rights reserved.
6 * Copyright (c) 2016, Nikos Mavrogiannopoulos <nmav@gnutls.org>. All rights reserved. 6 * Copyright (c) 2016, Nikos Mavrogiannopoulos <nmav@gnutls.org>. All rights reserved.
7 * Copyright (c) 2018, Covalent IO, Inc. http://covalent.io
7 * 8 *
8 * This software is available to you under a choice of one of two 9 * This software is available to you under a choice of one of two
9 * licenses. You may choose to be licensed under the terms of the GNU 10 * licenses. You may choose to be licensed under the terms of the GNU
@@ -43,12 +44,133 @@
43 44
44#define MAX_IV_SIZE TLS_CIPHER_AES_GCM_128_IV_SIZE 45#define MAX_IV_SIZE TLS_CIPHER_AES_GCM_128_IV_SIZE
45 46
47static int __skb_nsg(struct sk_buff *skb, int offset, int len,
48 unsigned int recursion_level)
49{
50 int start = skb_headlen(skb);
51 int i, chunk = start - offset;
52 struct sk_buff *frag_iter;
53 int elt = 0;
54
55 if (unlikely(recursion_level >= 24))
56 return -EMSGSIZE;
57
58 if (chunk > 0) {
59 if (chunk > len)
60 chunk = len;
61 elt++;
62 len -= chunk;
63 if (len == 0)
64 return elt;
65 offset += chunk;
66 }
67
68 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
69 int end;
70
71 WARN_ON(start > offset + len);
72
73 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
74 chunk = end - offset;
75 if (chunk > 0) {
76 if (chunk > len)
77 chunk = len;
78 elt++;
79 len -= chunk;
80 if (len == 0)
81 return elt;
82 offset += chunk;
83 }
84 start = end;
85 }
86
87 if (unlikely(skb_has_frag_list(skb))) {
88 skb_walk_frags(skb, frag_iter) {
89 int end, ret;
90
91 WARN_ON(start > offset + len);
92
93 end = start + frag_iter->len;
94 chunk = end - offset;
95 if (chunk > 0) {
96 if (chunk > len)
97 chunk = len;
98 ret = __skb_nsg(frag_iter, offset - start, chunk,
99 recursion_level + 1);
100 if (unlikely(ret < 0))
101 return ret;
102 elt += ret;
103 len -= chunk;
104 if (len == 0)
105 return elt;
106 offset += chunk;
107 }
108 start = end;
109 }
110 }
111 BUG_ON(len);
112 return elt;
113}
114
115/* Return the number of scatterlist elements required to completely map the
116 * skb, or -EMSGSIZE if the recursion depth is exceeded.
117 */
118static int skb_nsg(struct sk_buff *skb, int offset, int len)
119{
120 return __skb_nsg(skb, offset, len, 0);
121}
122
123static void tls_decrypt_done(struct crypto_async_request *req, int err)
124{
125 struct aead_request *aead_req = (struct aead_request *)req;
126 struct scatterlist *sgout = aead_req->dst;
127 struct tls_sw_context_rx *ctx;
128 struct tls_context *tls_ctx;
129 struct scatterlist *sg;
130 struct sk_buff *skb;
131 unsigned int pages;
132 int pending;
133
134 skb = (struct sk_buff *)req->data;
135 tls_ctx = tls_get_ctx(skb->sk);
136 ctx = tls_sw_ctx_rx(tls_ctx);
137 pending = atomic_dec_return(&ctx->decrypt_pending);
138
139 /* Propagate if there was an err */
140 if (err) {
141 ctx->async_wait.err = err;
142 tls_err_abort(skb->sk, err);
143 }
144
145 /* After using skb->sk to propagate sk through crypto async callback
146 * we need to NULL it again.
147 */
148 skb->sk = NULL;
149
150 /* Release the skb, pages and memory allocated for crypto req */
151 kfree_skb(skb);
152
153 /* Skip the first S/G entry as it points to AAD */
154 for_each_sg(sg_next(sgout), sg, UINT_MAX, pages) {
155 if (!sg)
156 break;
157 put_page(sg_page(sg));
158 }
159
160 kfree(aead_req);
161
162 if (!pending && READ_ONCE(ctx->async_notify))
163 complete(&ctx->async_wait.completion);
164}
165
46static int tls_do_decryption(struct sock *sk, 166static int tls_do_decryption(struct sock *sk,
167 struct sk_buff *skb,
47 struct scatterlist *sgin, 168 struct scatterlist *sgin,
48 struct scatterlist *sgout, 169 struct scatterlist *sgout,
49 char *iv_recv, 170 char *iv_recv,
50 size_t data_len, 171 size_t data_len,
51 struct aead_request *aead_req) 172 struct aead_request *aead_req,
173 bool async)
52{ 174{
53 struct tls_context *tls_ctx = tls_get_ctx(sk); 175 struct tls_context *tls_ctx = tls_get_ctx(sk);
54 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); 176 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
@@ -59,324 +181,657 @@ static int tls_do_decryption(struct sock *sk,
59 aead_request_set_crypt(aead_req, sgin, sgout, 181 aead_request_set_crypt(aead_req, sgin, sgout,
60 data_len + tls_ctx->rx.tag_size, 182 data_len + tls_ctx->rx.tag_size,
61 (u8 *)iv_recv); 183 (u8 *)iv_recv);
62 aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
63 crypto_req_done, &ctx->async_wait);
64 184
65 ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait); 185 if (async) {
66 return ret; 186 /* Using skb->sk to push sk through to crypto async callback
67} 187 * handler. This allows propagating errors up to the socket
68 188 * if needed. It _must_ be cleared in the async handler
69static void trim_sg(struct sock *sk, struct scatterlist *sg, 189 * before kfree_skb is called. We _know_ skb->sk is NULL
70 int *sg_num_elem, unsigned int *sg_size, int target_size) 190 * because it is a clone from strparser.
71{ 191 */
72 int i = *sg_num_elem - 1; 192 skb->sk = sk;
73 int trim = *sg_size - target_size; 193 aead_request_set_callback(aead_req,
74 194 CRYPTO_TFM_REQ_MAY_BACKLOG,
75 if (trim <= 0) { 195 tls_decrypt_done, skb);
76 WARN_ON(trim < 0); 196 atomic_inc(&ctx->decrypt_pending);
77 return; 197 } else {
198 aead_request_set_callback(aead_req,
199 CRYPTO_TFM_REQ_MAY_BACKLOG,
200 crypto_req_done, &ctx->async_wait);
78 } 201 }
79 202
80 *sg_size = target_size; 203 ret = crypto_aead_decrypt(aead_req);
81 while (trim >= sg[i].length) { 204 if (ret == -EINPROGRESS) {
82 trim -= sg[i].length; 205 if (async)
83 sk_mem_uncharge(sk, sg[i].length); 206 return ret;
84 put_page(sg_page(&sg[i]));
85 i--;
86 207
87 if (i < 0) 208 ret = crypto_wait_req(ret, &ctx->async_wait);
88 goto out;
89 } 209 }
90 210
91 sg[i].length -= trim; 211 if (async)
92 sk_mem_uncharge(sk, trim); 212 atomic_dec(&ctx->decrypt_pending);
93 213
94out: 214 return ret;
95 *sg_num_elem = i + 1;
96} 215}
97 216
98static void trim_both_sgl(struct sock *sk, int target_size) 217static void tls_trim_both_msgs(struct sock *sk, int target_size)
99{ 218{
100 struct tls_context *tls_ctx = tls_get_ctx(sk); 219 struct tls_context *tls_ctx = tls_get_ctx(sk);
101 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); 220 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
221 struct tls_rec *rec = ctx->open_rec;
102 222
103 trim_sg(sk, ctx->sg_plaintext_data, 223 sk_msg_trim(sk, &rec->msg_plaintext, target_size);
104 &ctx->sg_plaintext_num_elem,
105 &ctx->sg_plaintext_size,
106 target_size);
107
108 if (target_size > 0) 224 if (target_size > 0)
109 target_size += tls_ctx->tx.overhead_size; 225 target_size += tls_ctx->tx.overhead_size;
110 226 sk_msg_trim(sk, &rec->msg_encrypted, target_size);
111 trim_sg(sk, ctx->sg_encrypted_data,
112 &ctx->sg_encrypted_num_elem,
113 &ctx->sg_encrypted_size,
114 target_size);
115} 227}
116 228
117static int alloc_encrypted_sg(struct sock *sk, int len) 229static int tls_alloc_encrypted_msg(struct sock *sk, int len)
118{ 230{
119 struct tls_context *tls_ctx = tls_get_ctx(sk); 231 struct tls_context *tls_ctx = tls_get_ctx(sk);
120 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); 232 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
121 int rc = 0; 233 struct tls_rec *rec = ctx->open_rec;
234 struct sk_msg *msg_en = &rec->msg_encrypted;
122 235
123 rc = sk_alloc_sg(sk, len, 236 return sk_msg_alloc(sk, msg_en, len, 0);
124 ctx->sg_encrypted_data, 0, 237}
125 &ctx->sg_encrypted_num_elem,
126 &ctx->sg_encrypted_size, 0);
127 238
128 if (rc == -ENOSPC) 239static int tls_clone_plaintext_msg(struct sock *sk, int required)
129 ctx->sg_encrypted_num_elem = ARRAY_SIZE(ctx->sg_encrypted_data); 240{
241 struct tls_context *tls_ctx = tls_get_ctx(sk);
242 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
243 struct tls_rec *rec = ctx->open_rec;
244 struct sk_msg *msg_pl = &rec->msg_plaintext;
245 struct sk_msg *msg_en = &rec->msg_encrypted;
246 int skip, len;
247
248 /* We add page references worth len bytes from encrypted sg
249 * at the end of plaintext sg. It is guaranteed that msg_en
250 * has enough required room (ensured by caller).
251 */
252 len = required - msg_pl->sg.size;
130 253
131 return rc; 254 /* Skip initial bytes in msg_en's data to be able to use
255 * same offset of both plain and encrypted data.
256 */
257 skip = tls_ctx->tx.prepend_size + msg_pl->sg.size;
258
259 return sk_msg_clone(sk, msg_pl, msg_en, skip, len);
132} 260}
133 261
134static int alloc_plaintext_sg(struct sock *sk, int len) 262static struct tls_rec *tls_get_rec(struct sock *sk)
135{ 263{
136 struct tls_context *tls_ctx = tls_get_ctx(sk); 264 struct tls_context *tls_ctx = tls_get_ctx(sk);
137 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); 265 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
138 int rc = 0; 266 struct sk_msg *msg_pl, *msg_en;
267 struct tls_rec *rec;
268 int mem_size;
139 269
140 rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, 0, 270 mem_size = sizeof(struct tls_rec) + crypto_aead_reqsize(ctx->aead_send);
141 &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size,
142 tls_ctx->pending_open_record_frags);
143 271
144 if (rc == -ENOSPC) 272 rec = kzalloc(mem_size, sk->sk_allocation);
145 ctx->sg_plaintext_num_elem = ARRAY_SIZE(ctx->sg_plaintext_data); 273 if (!rec)
274 return NULL;
146 275
147 return rc; 276 msg_pl = &rec->msg_plaintext;
277 msg_en = &rec->msg_encrypted;
278
279 sk_msg_init(msg_pl);
280 sk_msg_init(msg_en);
281
282 sg_init_table(rec->sg_aead_in, 2);
283 sg_set_buf(&rec->sg_aead_in[0], rec->aad_space,
284 sizeof(rec->aad_space));
285 sg_unmark_end(&rec->sg_aead_in[1]);
286
287 sg_init_table(rec->sg_aead_out, 2);
288 sg_set_buf(&rec->sg_aead_out[0], rec->aad_space,
289 sizeof(rec->aad_space));
290 sg_unmark_end(&rec->sg_aead_out[1]);
291
292 return rec;
148} 293}
149 294
150static void free_sg(struct sock *sk, struct scatterlist *sg, 295static void tls_free_rec(struct sock *sk, struct tls_rec *rec)
151 int *sg_num_elem, unsigned int *sg_size)
152{ 296{
153 int i, n = *sg_num_elem; 297 sk_msg_free(sk, &rec->msg_encrypted);
298 sk_msg_free(sk, &rec->msg_plaintext);
299 kfree(rec);
300}
154 301
155 for (i = 0; i < n; ++i) { 302static void tls_free_open_rec(struct sock *sk)
156 sk_mem_uncharge(sk, sg[i].length); 303{
157 put_page(sg_page(&sg[i])); 304 struct tls_context *tls_ctx = tls_get_ctx(sk);
305 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
306 struct tls_rec *rec = ctx->open_rec;
307
308 if (rec) {
309 tls_free_rec(sk, rec);
310 ctx->open_rec = NULL;
158 } 311 }
159 *sg_num_elem = 0;
160 *sg_size = 0;
161} 312}
162 313
163static void tls_free_both_sg(struct sock *sk) 314int tls_tx_records(struct sock *sk, int flags)
315{
316 struct tls_context *tls_ctx = tls_get_ctx(sk);
317 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
318 struct tls_rec *rec, *tmp;
319 struct sk_msg *msg_en;
320 int tx_flags, rc = 0;
321
322 if (tls_is_partially_sent_record(tls_ctx)) {
323 rec = list_first_entry(&ctx->tx_list,
324 struct tls_rec, list);
325
326 if (flags == -1)
327 tx_flags = rec->tx_flags;
328 else
329 tx_flags = flags;
330
331 rc = tls_push_partial_record(sk, tls_ctx, tx_flags);
332 if (rc)
333 goto tx_err;
334
335 /* Full record has been transmitted.
336 * Remove the head of tx_list
337 */
338 list_del(&rec->list);
339 sk_msg_free(sk, &rec->msg_plaintext);
340 kfree(rec);
341 }
342
343 /* Tx all ready records */
344 list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) {
345 if (READ_ONCE(rec->tx_ready)) {
346 if (flags == -1)
347 tx_flags = rec->tx_flags;
348 else
349 tx_flags = flags;
350
351 msg_en = &rec->msg_encrypted;
352 rc = tls_push_sg(sk, tls_ctx,
353 &msg_en->sg.data[msg_en->sg.curr],
354 0, tx_flags);
355 if (rc)
356 goto tx_err;
357
358 list_del(&rec->list);
359 sk_msg_free(sk, &rec->msg_plaintext);
360 kfree(rec);
361 } else {
362 break;
363 }
364 }
365
366tx_err:
367 if (rc < 0 && rc != -EAGAIN)
368 tls_err_abort(sk, EBADMSG);
369
370 return rc;
371}
372
373static void tls_encrypt_done(struct crypto_async_request *req, int err)
164{ 374{
375 struct aead_request *aead_req = (struct aead_request *)req;
376 struct sock *sk = req->data;
165 struct tls_context *tls_ctx = tls_get_ctx(sk); 377 struct tls_context *tls_ctx = tls_get_ctx(sk);
166 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); 378 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
379 struct scatterlist *sge;
380 struct sk_msg *msg_en;
381 struct tls_rec *rec;
382 bool ready = false;
383 int pending;
384
385 rec = container_of(aead_req, struct tls_rec, aead_req);
386 msg_en = &rec->msg_encrypted;
167 387
168 free_sg(sk, ctx->sg_encrypted_data, &ctx->sg_encrypted_num_elem, 388 sge = sk_msg_elem(msg_en, msg_en->sg.curr);
169 &ctx->sg_encrypted_size); 389 sge->offset -= tls_ctx->tx.prepend_size;
390 sge->length += tls_ctx->tx.prepend_size;
170 391
171 free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, 392 /* Check if error is previously set on socket */
172 &ctx->sg_plaintext_size); 393 if (err || sk->sk_err) {
394 rec = NULL;
395
396 /* If err is already set on socket, return the same code */
397 if (sk->sk_err) {
398 ctx->async_wait.err = sk->sk_err;
399 } else {
400 ctx->async_wait.err = err;
401 tls_err_abort(sk, err);
402 }
403 }
404
405 if (rec) {
406 struct tls_rec *first_rec;
407
408 /* Mark the record as ready for transmission */
409 smp_store_mb(rec->tx_ready, true);
410
411 /* If received record is at head of tx_list, schedule tx */
412 first_rec = list_first_entry(&ctx->tx_list,
413 struct tls_rec, list);
414 if (rec == first_rec)
415 ready = true;
416 }
417
418 pending = atomic_dec_return(&ctx->encrypt_pending);
419
420 if (!pending && READ_ONCE(ctx->async_notify))
421 complete(&ctx->async_wait.completion);
422
423 if (!ready)
424 return;
425
426 /* Schedule the transmission */
427 if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask))
428 schedule_delayed_work(&ctx->tx_work.work, 1);
173} 429}
174 430
175static int tls_do_encryption(struct tls_context *tls_ctx, 431static int tls_do_encryption(struct sock *sk,
432 struct tls_context *tls_ctx,
176 struct tls_sw_context_tx *ctx, 433 struct tls_sw_context_tx *ctx,
177 struct aead_request *aead_req, 434 struct aead_request *aead_req,
178 size_t data_len) 435 size_t data_len, u32 start)
179{ 436{
437 struct tls_rec *rec = ctx->open_rec;
438 struct sk_msg *msg_en = &rec->msg_encrypted;
439 struct scatterlist *sge = sk_msg_elem(msg_en, start);
180 int rc; 440 int rc;
181 441
182 ctx->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size; 442 sge->offset += tls_ctx->tx.prepend_size;
183 ctx->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size; 443 sge->length -= tls_ctx->tx.prepend_size;
444
445 msg_en->sg.curr = start;
184 446
185 aead_request_set_tfm(aead_req, ctx->aead_send); 447 aead_request_set_tfm(aead_req, ctx->aead_send);
186 aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); 448 aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
187 aead_request_set_crypt(aead_req, ctx->sg_aead_in, ctx->sg_aead_out, 449 aead_request_set_crypt(aead_req, rec->sg_aead_in,
450 rec->sg_aead_out,
188 data_len, tls_ctx->tx.iv); 451 data_len, tls_ctx->tx.iv);
189 452
190 aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, 453 aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
191 crypto_req_done, &ctx->async_wait); 454 tls_encrypt_done, sk);
192 455
193 rc = crypto_wait_req(crypto_aead_encrypt(aead_req), &ctx->async_wait); 456 /* Add the record in tx_list */
457 list_add_tail((struct list_head *)&rec->list, &ctx->tx_list);
458 atomic_inc(&ctx->encrypt_pending);
194 459
195 ctx->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size; 460 rc = crypto_aead_encrypt(aead_req);
196 ctx->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size; 461 if (!rc || rc != -EINPROGRESS) {
462 atomic_dec(&ctx->encrypt_pending);
463 sge->offset -= tls_ctx->tx.prepend_size;
464 sge->length += tls_ctx->tx.prepend_size;
465 }
466
467 if (!rc) {
468 WRITE_ONCE(rec->tx_ready, true);
469 } else if (rc != -EINPROGRESS) {
470 list_del(&rec->list);
471 return rc;
472 }
197 473
474 /* Unhook the record from context if encryption is not failure */
475 ctx->open_rec = NULL;
476 tls_advance_record_sn(sk, &tls_ctx->tx);
198 return rc; 477 return rc;
199} 478}
200 479
201static int tls_push_record(struct sock *sk, int flags, 480static int tls_split_open_record(struct sock *sk, struct tls_rec *from,
202 unsigned char record_type) 481 struct tls_rec **to, struct sk_msg *msg_opl,
482 struct sk_msg *msg_oen, u32 split_point,
483 u32 tx_overhead_size, u32 *orig_end)
203{ 484{
204 struct tls_context *tls_ctx = tls_get_ctx(sk); 485 u32 i, j, bytes = 0, apply = msg_opl->apply_bytes;
205 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); 486 struct scatterlist *sge, *osge, *nsge;
206 struct aead_request *req; 487 u32 orig_size = msg_opl->sg.size;
207 int rc; 488 struct scatterlist tmp = { };
489 struct sk_msg *msg_npl;
490 struct tls_rec *new;
491 int ret;
208 492
209 req = aead_request_alloc(ctx->aead_send, sk->sk_allocation); 493 new = tls_get_rec(sk);
210 if (!req) 494 if (!new)
211 return -ENOMEM; 495 return -ENOMEM;
496 ret = sk_msg_alloc(sk, &new->msg_encrypted, msg_opl->sg.size +
497 tx_overhead_size, 0);
498 if (ret < 0) {
499 tls_free_rec(sk, new);
500 return ret;
501 }
212 502
213 sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1); 503 *orig_end = msg_opl->sg.end;
214 sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1); 504 i = msg_opl->sg.start;
215 505 sge = sk_msg_elem(msg_opl, i);
216 tls_make_aad(ctx->aad_space, ctx->sg_plaintext_size, 506 while (apply && sge->length) {
217 tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size, 507 if (sge->length > apply) {
218 record_type); 508 u32 len = sge->length - apply;
219 509
220 tls_fill_prepend(tls_ctx, 510 get_page(sg_page(sge));
221 page_address(sg_page(&ctx->sg_encrypted_data[0])) + 511 sg_set_page(&tmp, sg_page(sge), len,
222 ctx->sg_encrypted_data[0].offset, 512 sge->offset + apply);
223 ctx->sg_plaintext_size, record_type); 513 sge->length = apply;
224 514 bytes += apply;
225 tls_ctx->pending_open_record_frags = 0; 515 apply = 0;
226 set_bit(TLS_PENDING_CLOSED_RECORD, &tls_ctx->flags); 516 } else {
517 apply -= sge->length;
518 bytes += sge->length;
519 }
227 520
228 rc = tls_do_encryption(tls_ctx, ctx, req, ctx->sg_plaintext_size); 521 sk_msg_iter_var_next(i);
229 if (rc < 0) { 522 if (i == msg_opl->sg.end)
230 /* If we are called from write_space and 523 break;
231 * we fail, we need to set this SOCK_NOSPACE 524 sge = sk_msg_elem(msg_opl, i);
232 * to trigger another write_space in the future.
233 */
234 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
235 goto out_req;
236 } 525 }
237 526
238 free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, 527 msg_opl->sg.end = i;
239 &ctx->sg_plaintext_size); 528 msg_opl->sg.curr = i;
529 msg_opl->sg.copybreak = 0;
530 msg_opl->apply_bytes = 0;
531 msg_opl->sg.size = bytes;
532
533 msg_npl = &new->msg_plaintext;
534 msg_npl->apply_bytes = apply;
535 msg_npl->sg.size = orig_size - bytes;
536
537 j = msg_npl->sg.start;
538 nsge = sk_msg_elem(msg_npl, j);
539 if (tmp.length) {
540 memcpy(nsge, &tmp, sizeof(*nsge));
541 sk_msg_iter_var_next(j);
542 nsge = sk_msg_elem(msg_npl, j);
543 }
240 544
241 ctx->sg_encrypted_num_elem = 0; 545 osge = sk_msg_elem(msg_opl, i);
242 ctx->sg_encrypted_size = 0; 546 while (osge->length) {
547 memcpy(nsge, osge, sizeof(*nsge));
548 sg_unmark_end(nsge);
549 sk_msg_iter_var_next(i);
550 sk_msg_iter_var_next(j);
551 if (i == *orig_end)
552 break;
553 osge = sk_msg_elem(msg_opl, i);
554 nsge = sk_msg_elem(msg_npl, j);
555 }
243 556
244 /* Only pass through MSG_DONTWAIT and MSG_NOSIGNAL flags */ 557 msg_npl->sg.end = j;
245 rc = tls_push_sg(sk, tls_ctx, ctx->sg_encrypted_data, 0, flags); 558 msg_npl->sg.curr = j;
246 if (rc < 0 && rc != -EAGAIN) 559 msg_npl->sg.copybreak = 0;
247 tls_err_abort(sk, EBADMSG);
248 560
249 tls_advance_record_sn(sk, &tls_ctx->tx); 561 *to = new;
250out_req: 562 return 0;
251 aead_request_free(req);
252 return rc;
253} 563}
254 564
255static int tls_sw_push_pending_record(struct sock *sk, int flags) 565static void tls_merge_open_record(struct sock *sk, struct tls_rec *to,
566 struct tls_rec *from, u32 orig_end)
256{ 567{
257 return tls_push_record(sk, flags, TLS_RECORD_TYPE_DATA); 568 struct sk_msg *msg_npl = &from->msg_plaintext;
569 struct sk_msg *msg_opl = &to->msg_plaintext;
570 struct scatterlist *osge, *nsge;
571 u32 i, j;
572
573 i = msg_opl->sg.end;
574 sk_msg_iter_var_prev(i);
575 j = msg_npl->sg.start;
576
577 osge = sk_msg_elem(msg_opl, i);
578 nsge = sk_msg_elem(msg_npl, j);
579
580 if (sg_page(osge) == sg_page(nsge) &&
581 osge->offset + osge->length == nsge->offset) {
582 osge->length += nsge->length;
583 put_page(sg_page(nsge));
584 }
585
586 msg_opl->sg.end = orig_end;
587 msg_opl->sg.curr = orig_end;
588 msg_opl->sg.copybreak = 0;
589 msg_opl->apply_bytes = msg_opl->sg.size + msg_npl->sg.size;
590 msg_opl->sg.size += msg_npl->sg.size;
591
592 sk_msg_free(sk, &to->msg_encrypted);
593 sk_msg_xfer_full(&to->msg_encrypted, &from->msg_encrypted);
594
595 kfree(from);
258} 596}
259 597
260static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, 598static int tls_push_record(struct sock *sk, int flags,
261 int length, int *pages_used, 599 unsigned char record_type)
262 unsigned int *size_used,
263 struct scatterlist *to, int to_max_pages,
264 bool charge)
265{ 600{
266 struct page *pages[MAX_SKB_FRAGS]; 601 struct tls_context *tls_ctx = tls_get_ctx(sk);
602 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
603 struct tls_rec *rec = ctx->open_rec, *tmp = NULL;
604 u32 i, split_point, uninitialized_var(orig_end);
605 struct sk_msg *msg_pl, *msg_en;
606 struct aead_request *req;
607 bool split;
608 int rc;
267 609
268 size_t offset; 610 if (!rec)
269 ssize_t copied, use; 611 return 0;
270 int i = 0;
271 unsigned int size = *size_used;
272 int num_elem = *pages_used;
273 int rc = 0;
274 int maxpages;
275 612
276 while (length > 0) { 613 msg_pl = &rec->msg_plaintext;
277 i = 0; 614 msg_en = &rec->msg_encrypted;
278 maxpages = to_max_pages - num_elem; 615
279 if (maxpages == 0) { 616 split_point = msg_pl->apply_bytes;
280 rc = -EFAULT; 617 split = split_point && split_point < msg_pl->sg.size;
281 goto out; 618 if (split) {
282 } 619 rc = tls_split_open_record(sk, rec, &tmp, msg_pl, msg_en,
283 copied = iov_iter_get_pages(from, pages, 620 split_point, tls_ctx->tx.overhead_size,
284 length, 621 &orig_end);
285 maxpages, &offset); 622 if (rc < 0)
286 if (copied <= 0) { 623 return rc;
287 rc = -EFAULT; 624 sk_msg_trim(sk, msg_en, msg_pl->sg.size +
288 goto out; 625 tls_ctx->tx.overhead_size);
289 } 626 }
290 627
291 iov_iter_advance(from, copied); 628 rec->tx_flags = flags;
629 req = &rec->aead_req;
292 630
293 length -= copied; 631 i = msg_pl->sg.end;
294 size += copied; 632 sk_msg_iter_var_prev(i);
295 while (copied) { 633 sg_mark_end(sk_msg_elem(msg_pl, i));
296 use = min_t(int, copied, PAGE_SIZE - offset);
297 634
298 sg_set_page(&to[num_elem], 635 i = msg_pl->sg.start;
299 pages[i], use, offset); 636 sg_chain(rec->sg_aead_in, 2, rec->inplace_crypto ?
300 sg_unmark_end(&to[num_elem]); 637 &msg_en->sg.data[i] : &msg_pl->sg.data[i]);
301 if (charge)
302 sk_mem_charge(sk, use);
303 638
304 offset = 0; 639 i = msg_en->sg.end;
305 copied -= use; 640 sk_msg_iter_var_prev(i);
641 sg_mark_end(sk_msg_elem(msg_en, i));
642
643 i = msg_en->sg.start;
644 sg_chain(rec->sg_aead_out, 2, &msg_en->sg.data[i]);
645
646 tls_make_aad(rec->aad_space, msg_pl->sg.size,
647 tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size,
648 record_type);
649
650 tls_fill_prepend(tls_ctx,
651 page_address(sg_page(&msg_en->sg.data[i])) +
652 msg_en->sg.data[i].offset, msg_pl->sg.size,
653 record_type);
654
655 tls_ctx->pending_open_record_frags = false;
306 656
307 ++i; 657 rc = tls_do_encryption(sk, tls_ctx, ctx, req, msg_pl->sg.size, i);
308 ++num_elem; 658 if (rc < 0) {
659 if (rc != -EINPROGRESS) {
660 tls_err_abort(sk, EBADMSG);
661 if (split) {
662 tls_ctx->pending_open_record_frags = true;
663 tls_merge_open_record(sk, rec, tmp, orig_end);
664 }
309 } 665 }
666 return rc;
667 } else if (split) {
668 msg_pl = &tmp->msg_plaintext;
669 msg_en = &tmp->msg_encrypted;
670 sk_msg_trim(sk, msg_en, msg_pl->sg.size +
671 tls_ctx->tx.overhead_size);
672 tls_ctx->pending_open_record_frags = true;
673 ctx->open_rec = tmp;
310 } 674 }
311 675
312 /* Mark the end in the last sg entry if newly added */ 676 return tls_tx_records(sk, flags);
313 if (num_elem > *pages_used)
314 sg_mark_end(&to[num_elem - 1]);
315out:
316 if (rc)
317 iov_iter_revert(from, size - *size_used);
318 *size_used = size;
319 *pages_used = num_elem;
320
321 return rc;
322} 677}
323 678
324static int memcopy_from_iter(struct sock *sk, struct iov_iter *from, 679static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,
325 int bytes) 680 bool full_record, u8 record_type,
681 size_t *copied, int flags)
326{ 682{
327 struct tls_context *tls_ctx = tls_get_ctx(sk); 683 struct tls_context *tls_ctx = tls_get_ctx(sk);
328 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); 684 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
329 struct scatterlist *sg = ctx->sg_plaintext_data; 685 struct sk_msg msg_redir = { };
330 int copy, i, rc = 0; 686 struct sk_psock *psock;
331 687 struct sock *sk_redir;
332 for (i = tls_ctx->pending_open_record_frags; 688 struct tls_rec *rec;
333 i < ctx->sg_plaintext_num_elem; ++i) { 689 int err = 0, send;
334 copy = sg[i].length; 690 bool enospc;
335 if (copy_from_iter( 691
336 page_address(sg_page(&sg[i])) + sg[i].offset, 692 psock = sk_psock_get(sk);
337 copy, from) != copy) { 693 if (!psock)
338 rc = -EFAULT; 694 return tls_push_record(sk, flags, record_type);
339 goto out; 695more_data:
696 enospc = sk_msg_full(msg);
697 if (psock->eval == __SK_NONE)
698 psock->eval = sk_psock_msg_verdict(sk, psock, msg);
699 if (msg->cork_bytes && msg->cork_bytes > msg->sg.size &&
700 !enospc && !full_record) {
701 err = -ENOSPC;
702 goto out_err;
703 }
704 msg->cork_bytes = 0;
705 send = msg->sg.size;
706 if (msg->apply_bytes && msg->apply_bytes < send)
707 send = msg->apply_bytes;
708
709 switch (psock->eval) {
710 case __SK_PASS:
711 err = tls_push_record(sk, flags, record_type);
712 if (err < 0) {
713 *copied -= sk_msg_free(sk, msg);
714 tls_free_open_rec(sk);
715 goto out_err;
716 }
717 break;
718 case __SK_REDIRECT:
719 sk_redir = psock->sk_redir;
720 memcpy(&msg_redir, msg, sizeof(*msg));
721 if (msg->apply_bytes < send)
722 msg->apply_bytes = 0;
723 else
724 msg->apply_bytes -= send;
725 sk_msg_return_zero(sk, msg, send);
726 msg->sg.size -= send;
727 release_sock(sk);
728 err = tcp_bpf_sendmsg_redir(sk_redir, &msg_redir, send, flags);
729 lock_sock(sk);
730 if (err < 0) {
731 *copied -= sk_msg_free_nocharge(sk, &msg_redir);
732 msg->sg.size = 0;
340 } 733 }
341 bytes -= copy; 734 if (msg->sg.size == 0)
735 tls_free_open_rec(sk);
736 break;
737 case __SK_DROP:
738 default:
739 sk_msg_free_partial(sk, msg, send);
740 if (msg->apply_bytes < send)
741 msg->apply_bytes = 0;
742 else
743 msg->apply_bytes -= send;
744 if (msg->sg.size == 0)
745 tls_free_open_rec(sk);
746 *copied -= send;
747 err = -EACCES;
748 }
342 749
343 ++tls_ctx->pending_open_record_frags; 750 if (likely(!err)) {
751 bool reset_eval = !ctx->open_rec;
344 752
345 if (!bytes) 753 rec = ctx->open_rec;
346 break; 754 if (rec) {
755 msg = &rec->msg_plaintext;
756 if (!msg->apply_bytes)
757 reset_eval = true;
758 }
759 if (reset_eval) {
760 psock->eval = __SK_NONE;
761 if (psock->sk_redir) {
762 sock_put(psock->sk_redir);
763 psock->sk_redir = NULL;
764 }
765 }
766 if (rec)
767 goto more_data;
347 } 768 }
769 out_err:
770 sk_psock_put(sk, psock);
771 return err;
772}
348 773
349out: 774static int tls_sw_push_pending_record(struct sock *sk, int flags)
350 return rc; 775{
776 struct tls_context *tls_ctx = tls_get_ctx(sk);
777 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
778 struct tls_rec *rec = ctx->open_rec;
779 struct sk_msg *msg_pl;
780 size_t copied;
781
782 if (!rec)
783 return 0;
784
785 msg_pl = &rec->msg_plaintext;
786 copied = msg_pl->sg.size;
787 if (!copied)
788 return 0;
789
790 return bpf_exec_tx_verdict(msg_pl, sk, true, TLS_RECORD_TYPE_DATA,
791 &copied, flags);
351} 792}
352 793
353int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) 794int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
354{ 795{
796 long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
355 struct tls_context *tls_ctx = tls_get_ctx(sk); 797 struct tls_context *tls_ctx = tls_get_ctx(sk);
356 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); 798 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
357 int ret = 0; 799 struct crypto_tfm *tfm = crypto_aead_tfm(ctx->aead_send);
358 int required_size; 800 bool async_capable = tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC;
359 long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 801 unsigned char record_type = TLS_RECORD_TYPE_DATA;
802 bool is_kvec = msg->msg_iter.type & ITER_KVEC;
360 bool eor = !(msg->msg_flags & MSG_MORE); 803 bool eor = !(msg->msg_flags & MSG_MORE);
361 size_t try_to_copy, copied = 0; 804 size_t try_to_copy, copied = 0;
362 unsigned char record_type = TLS_RECORD_TYPE_DATA; 805 struct sk_msg *msg_pl, *msg_en;
363 int record_room; 806 struct tls_rec *rec;
807 int required_size;
808 int num_async = 0;
364 bool full_record; 809 bool full_record;
810 int record_room;
811 int num_zc = 0;
365 int orig_size; 812 int orig_size;
366 bool is_kvec = msg->msg_iter.type & ITER_KVEC; 813 int ret = 0;
367 814
368 if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) 815 if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
369 return -ENOTSUPP; 816 return -ENOTSUPP;
370 817
371 lock_sock(sk); 818 lock_sock(sk);
372 819
373 if (tls_complete_pending_work(sk, tls_ctx, msg->msg_flags, &timeo)) 820 /* Wait till there is any pending write on socket */
374 goto send_end; 821 if (unlikely(sk->sk_write_pending)) {
822 ret = wait_on_pending_writer(sk, &timeo);
823 if (unlikely(ret))
824 goto send_end;
825 }
375 826
376 if (unlikely(msg->msg_controllen)) { 827 if (unlikely(msg->msg_controllen)) {
377 ret = tls_proccess_cmsg(sk, msg, &record_type); 828 ret = tls_proccess_cmsg(sk, msg, &record_type);
378 if (ret) 829 if (ret) {
379 goto send_end; 830 if (ret == -EINPROGRESS)
831 num_async++;
832 else if (ret != -EAGAIN)
833 goto send_end;
834 }
380 } 835 }
381 836
382 while (msg_data_left(msg)) { 837 while (msg_data_left(msg)) {
@@ -385,22 +840,35 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
385 goto send_end; 840 goto send_end;
386 } 841 }
387 842
388 orig_size = ctx->sg_plaintext_size; 843 if (ctx->open_rec)
844 rec = ctx->open_rec;
845 else
846 rec = ctx->open_rec = tls_get_rec(sk);
847 if (!rec) {
848 ret = -ENOMEM;
849 goto send_end;
850 }
851
852 msg_pl = &rec->msg_plaintext;
853 msg_en = &rec->msg_encrypted;
854
855 orig_size = msg_pl->sg.size;
389 full_record = false; 856 full_record = false;
390 try_to_copy = msg_data_left(msg); 857 try_to_copy = msg_data_left(msg);
391 record_room = TLS_MAX_PAYLOAD_SIZE - ctx->sg_plaintext_size; 858 record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size;
392 if (try_to_copy >= record_room) { 859 if (try_to_copy >= record_room) {
393 try_to_copy = record_room; 860 try_to_copy = record_room;
394 full_record = true; 861 full_record = true;
395 } 862 }
396 863
397 required_size = ctx->sg_plaintext_size + try_to_copy + 864 required_size = msg_pl->sg.size + try_to_copy +
398 tls_ctx->tx.overhead_size; 865 tls_ctx->tx.overhead_size;
399 866
400 if (!sk_stream_memory_free(sk)) 867 if (!sk_stream_memory_free(sk))
401 goto wait_for_sndbuf; 868 goto wait_for_sndbuf;
869
402alloc_encrypted: 870alloc_encrypted:
403 ret = alloc_encrypted_sg(sk, required_size); 871 ret = tls_alloc_encrypted_msg(sk, required_size);
404 if (ret) { 872 if (ret) {
405 if (ret != -ENOSPC) 873 if (ret != -ENOSPC)
406 goto wait_for_memory; 874 goto wait_for_memory;
@@ -409,66 +877,88 @@ alloc_encrypted:
409 * actually allocated. The difference is due 877 * actually allocated. The difference is due
410 * to max sg elements limit 878 * to max sg elements limit
411 */ 879 */
412 try_to_copy -= required_size - ctx->sg_encrypted_size; 880 try_to_copy -= required_size - msg_en->sg.size;
413 full_record = true; 881 full_record = true;
414 } 882 }
415 if (!is_kvec && (full_record || eor)) { 883
416 ret = zerocopy_from_iter(sk, &msg->msg_iter, 884 if (!is_kvec && (full_record || eor) && !async_capable) {
417 try_to_copy, &ctx->sg_plaintext_num_elem, 885 u32 first = msg_pl->sg.end;
418 &ctx->sg_plaintext_size, 886
419 ctx->sg_plaintext_data, 887 ret = sk_msg_zerocopy_from_iter(sk, &msg->msg_iter,
420 ARRAY_SIZE(ctx->sg_plaintext_data), 888 msg_pl, try_to_copy);
421 true);
422 if (ret) 889 if (ret)
423 goto fallback_to_reg_send; 890 goto fallback_to_reg_send;
424 891
892 rec->inplace_crypto = 0;
893
894 num_zc++;
425 copied += try_to_copy; 895 copied += try_to_copy;
426 ret = tls_push_record(sk, msg->msg_flags, record_type);
427 if (ret)
428 goto send_end;
429 continue;
430 896
897 sk_msg_sg_copy_set(msg_pl, first);
898 ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
899 record_type, &copied,
900 msg->msg_flags);
901 if (ret) {
902 if (ret == -EINPROGRESS)
903 num_async++;
904 else if (ret == -ENOMEM)
905 goto wait_for_memory;
906 else if (ret == -ENOSPC)
907 goto rollback_iter;
908 else if (ret != -EAGAIN)
909 goto send_end;
910 }
911 continue;
912rollback_iter:
913 copied -= try_to_copy;
914 sk_msg_sg_copy_clear(msg_pl, first);
915 iov_iter_revert(&msg->msg_iter,
916 msg_pl->sg.size - orig_size);
431fallback_to_reg_send: 917fallback_to_reg_send:
432 trim_sg(sk, ctx->sg_plaintext_data, 918 sk_msg_trim(sk, msg_pl, orig_size);
433 &ctx->sg_plaintext_num_elem,
434 &ctx->sg_plaintext_size,
435 orig_size);
436 } 919 }
437 920
438 required_size = ctx->sg_plaintext_size + try_to_copy; 921 required_size = msg_pl->sg.size + try_to_copy;
439alloc_plaintext: 922
440 ret = alloc_plaintext_sg(sk, required_size); 923 ret = tls_clone_plaintext_msg(sk, required_size);
441 if (ret) { 924 if (ret) {
442 if (ret != -ENOSPC) 925 if (ret != -ENOSPC)
443 goto wait_for_memory; 926 goto send_end;
444 927
445 /* Adjust try_to_copy according to the amount that was 928 /* Adjust try_to_copy according to the amount that was
446 * actually allocated. The difference is due 929 * actually allocated. The difference is due
447 * to max sg elements limit 930 * to max sg elements limit
448 */ 931 */
449 try_to_copy -= required_size - ctx->sg_plaintext_size; 932 try_to_copy -= required_size - msg_pl->sg.size;
450 full_record = true; 933 full_record = true;
451 934 sk_msg_trim(sk, msg_en, msg_pl->sg.size +
452 trim_sg(sk, ctx->sg_encrypted_data, 935 tls_ctx->tx.overhead_size);
453 &ctx->sg_encrypted_num_elem,
454 &ctx->sg_encrypted_size,
455 ctx->sg_plaintext_size +
456 tls_ctx->tx.overhead_size);
457 } 936 }
458 937
459 ret = memcopy_from_iter(sk, &msg->msg_iter, try_to_copy); 938 ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_pl,
460 if (ret) 939 try_to_copy);
940 if (ret < 0)
461 goto trim_sgl; 941 goto trim_sgl;
462 942
943 /* Open records defined only if successfully copied, otherwise
944 * we would trim the sg but not reset the open record frags.
945 */
946 tls_ctx->pending_open_record_frags = true;
463 copied += try_to_copy; 947 copied += try_to_copy;
464 if (full_record || eor) { 948 if (full_record || eor) {
465push_record: 949 ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
466 ret = tls_push_record(sk, msg->msg_flags, record_type); 950 record_type, &copied,
951 msg->msg_flags);
467 if (ret) { 952 if (ret) {
468 if (ret == -ENOMEM) 953 if (ret == -EINPROGRESS)
954 num_async++;
955 else if (ret == -ENOMEM)
469 goto wait_for_memory; 956 goto wait_for_memory;
470 957 else if (ret != -EAGAIN) {
471 goto send_end; 958 if (ret == -ENOSPC)
959 ret = 0;
960 goto send_end;
961 }
472 } 962 }
473 } 963 }
474 964
@@ -480,17 +970,37 @@ wait_for_memory:
480 ret = sk_stream_wait_memory(sk, &timeo); 970 ret = sk_stream_wait_memory(sk, &timeo);
481 if (ret) { 971 if (ret) {
482trim_sgl: 972trim_sgl:
483 trim_both_sgl(sk, orig_size); 973 tls_trim_both_msgs(sk, orig_size);
484 goto send_end; 974 goto send_end;
485 } 975 }
486 976
487 if (tls_is_pending_closed_record(tls_ctx)) 977 if (msg_en->sg.size < required_size)
488 goto push_record;
489
490 if (ctx->sg_encrypted_size < required_size)
491 goto alloc_encrypted; 978 goto alloc_encrypted;
979 }
492 980
493 goto alloc_plaintext; 981 if (!num_async) {
982 goto send_end;
983 } else if (num_zc) {
984 /* Wait for pending encryptions to get completed */
985 smp_store_mb(ctx->async_notify, true);
986
987 if (atomic_read(&ctx->encrypt_pending))
988 crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
989 else
990 reinit_completion(&ctx->async_wait.completion);
991
992 WRITE_ONCE(ctx->async_notify, false);
993
994 if (ctx->async_wait.err) {
995 ret = ctx->async_wait.err;
996 copied = 0;
997 }
998 }
999
1000 /* Transmit if any encryptions have completed */
1001 if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) {
1002 cancel_delayed_work(&ctx->tx_work.work);
1003 tls_tx_records(sk, msg->msg_flags);
494 } 1004 }
495 1005
496send_end: 1006send_end:
@@ -503,16 +1013,18 @@ send_end:
503int tls_sw_sendpage(struct sock *sk, struct page *page, 1013int tls_sw_sendpage(struct sock *sk, struct page *page,
504 int offset, size_t size, int flags) 1014 int offset, size_t size, int flags)
505{ 1015{
1016 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
506 struct tls_context *tls_ctx = tls_get_ctx(sk); 1017 struct tls_context *tls_ctx = tls_get_ctx(sk);
507 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); 1018 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
508 int ret = 0;
509 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
510 bool eor;
511 size_t orig_size = size;
512 unsigned char record_type = TLS_RECORD_TYPE_DATA; 1019 unsigned char record_type = TLS_RECORD_TYPE_DATA;
513 struct scatterlist *sg; 1020 struct sk_msg *msg_pl;
1021 struct tls_rec *rec;
1022 int num_async = 0;
1023 size_t copied = 0;
514 bool full_record; 1024 bool full_record;
515 int record_room; 1025 int record_room;
1026 int ret = 0;
1027 bool eor;
516 1028
517 if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | 1029 if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
518 MSG_SENDPAGE_NOTLAST)) 1030 MSG_SENDPAGE_NOTLAST))
@@ -525,8 +1037,12 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
525 1037
526 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 1038 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
527 1039
528 if (tls_complete_pending_work(sk, tls_ctx, flags, &timeo)) 1040 /* Wait till there is any pending write on socket */
529 goto sendpage_end; 1041 if (unlikely(sk->sk_write_pending)) {
1042 ret = wait_on_pending_writer(sk, &timeo);
1043 if (unlikely(ret))
1044 goto sendpage_end;
1045 }
530 1046
531 /* Call the sk_stream functions to manage the sndbuf mem. */ 1047 /* Call the sk_stream functions to manage the sndbuf mem. */
532 while (size > 0) { 1048 while (size > 0) {
@@ -537,20 +1053,33 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
537 goto sendpage_end; 1053 goto sendpage_end;
538 } 1054 }
539 1055
1056 if (ctx->open_rec)
1057 rec = ctx->open_rec;
1058 else
1059 rec = ctx->open_rec = tls_get_rec(sk);
1060 if (!rec) {
1061 ret = -ENOMEM;
1062 goto sendpage_end;
1063 }
1064
1065 msg_pl = &rec->msg_plaintext;
1066
540 full_record = false; 1067 full_record = false;
541 record_room = TLS_MAX_PAYLOAD_SIZE - ctx->sg_plaintext_size; 1068 record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size;
1069 copied = 0;
542 copy = size; 1070 copy = size;
543 if (copy >= record_room) { 1071 if (copy >= record_room) {
544 copy = record_room; 1072 copy = record_room;
545 full_record = true; 1073 full_record = true;
546 } 1074 }
547 required_size = ctx->sg_plaintext_size + copy + 1075
548 tls_ctx->tx.overhead_size; 1076 required_size = msg_pl->sg.size + copy +
1077 tls_ctx->tx.overhead_size;
549 1078
550 if (!sk_stream_memory_free(sk)) 1079 if (!sk_stream_memory_free(sk))
551 goto wait_for_sndbuf; 1080 goto wait_for_sndbuf;
552alloc_payload: 1081alloc_payload:
553 ret = alloc_encrypted_sg(sk, required_size); 1082 ret = tls_alloc_encrypted_msg(sk, required_size);
554 if (ret) { 1083 if (ret) {
555 if (ret != -ENOSPC) 1084 if (ret != -ENOSPC)
556 goto wait_for_memory; 1085 goto wait_for_memory;
@@ -559,33 +1088,32 @@ alloc_payload:
559 * actually allocated. The difference is due 1088 * actually allocated. The difference is due
560 * to max sg elements limit 1089 * to max sg elements limit
561 */ 1090 */
562 copy -= required_size - ctx->sg_plaintext_size; 1091 copy -= required_size - msg_pl->sg.size;
563 full_record = true; 1092 full_record = true;
564 } 1093 }
565 1094
566 get_page(page); 1095 sk_msg_page_add(msg_pl, page, copy, offset);
567 sg = ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem;
568 sg_set_page(sg, page, copy, offset);
569 sg_unmark_end(sg);
570
571 ctx->sg_plaintext_num_elem++;
572
573 sk_mem_charge(sk, copy); 1096 sk_mem_charge(sk, copy);
1097
574 offset += copy; 1098 offset += copy;
575 size -= copy; 1099 size -= copy;
576 ctx->sg_plaintext_size += copy; 1100 copied += copy;
577 tls_ctx->pending_open_record_frags = ctx->sg_plaintext_num_elem; 1101
578 1102 tls_ctx->pending_open_record_frags = true;
579 if (full_record || eor || 1103 if (full_record || eor || sk_msg_full(msg_pl)) {
580 ctx->sg_plaintext_num_elem == 1104 rec->inplace_crypto = 0;
581 ARRAY_SIZE(ctx->sg_plaintext_data)) { 1105 ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
582push_record: 1106 record_type, &copied, flags);
583 ret = tls_push_record(sk, flags, record_type);
584 if (ret) { 1107 if (ret) {
585 if (ret == -ENOMEM) 1108 if (ret == -EINPROGRESS)
1109 num_async++;
1110 else if (ret == -ENOMEM)
586 goto wait_for_memory; 1111 goto wait_for_memory;
587 1112 else if (ret != -EAGAIN) {
588 goto sendpage_end; 1113 if (ret == -ENOSPC)
1114 ret = 0;
1115 goto sendpage_end;
1116 }
589 } 1117 }
590 } 1118 }
591 continue; 1119 continue;
@@ -594,35 +1122,35 @@ wait_for_sndbuf:
594wait_for_memory: 1122wait_for_memory:
595 ret = sk_stream_wait_memory(sk, &timeo); 1123 ret = sk_stream_wait_memory(sk, &timeo);
596 if (ret) { 1124 if (ret) {
597 trim_both_sgl(sk, ctx->sg_plaintext_size); 1125 tls_trim_both_msgs(sk, msg_pl->sg.size);
598 goto sendpage_end; 1126 goto sendpage_end;
599 } 1127 }
600 1128
601 if (tls_is_pending_closed_record(tls_ctx))
602 goto push_record;
603
604 goto alloc_payload; 1129 goto alloc_payload;
605 } 1130 }
606 1131
1132 if (num_async) {
1133 /* Transmit if any encryptions have completed */
1134 if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) {
1135 cancel_delayed_work(&ctx->tx_work.work);
1136 tls_tx_records(sk, flags);
1137 }
1138 }
607sendpage_end: 1139sendpage_end:
608 if (orig_size > size) 1140 ret = sk_stream_error(sk, flags, ret);
609 ret = orig_size - size;
610 else
611 ret = sk_stream_error(sk, flags, ret);
612
613 release_sock(sk); 1141 release_sock(sk);
614 return ret; 1142 return copied ? copied : ret;
615} 1143}
616 1144
617static struct sk_buff *tls_wait_data(struct sock *sk, int flags, 1145static struct sk_buff *tls_wait_data(struct sock *sk, struct sk_psock *psock,
618 long timeo, int *err) 1146 int flags, long timeo, int *err)
619{ 1147{
620 struct tls_context *tls_ctx = tls_get_ctx(sk); 1148 struct tls_context *tls_ctx = tls_get_ctx(sk);
621 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); 1149 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
622 struct sk_buff *skb; 1150 struct sk_buff *skb;
623 DEFINE_WAIT_FUNC(wait, woken_wake_function); 1151 DEFINE_WAIT_FUNC(wait, woken_wake_function);
624 1152
625 while (!(skb = ctx->recv_pkt)) { 1153 while (!(skb = ctx->recv_pkt) && sk_psock_queue_empty(psock)) {
626 if (sk->sk_err) { 1154 if (sk->sk_err) {
627 *err = sock_error(sk); 1155 *err = sock_error(sk);
628 return NULL; 1156 return NULL;
@@ -641,7 +1169,10 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
641 1169
642 add_wait_queue(sk_sleep(sk), &wait); 1170 add_wait_queue(sk_sleep(sk), &wait);
643 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 1171 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
644 sk_wait_event(sk, &timeo, ctx->recv_pkt != skb, &wait); 1172 sk_wait_event(sk, &timeo,
1173 ctx->recv_pkt != skb ||
1174 !sk_psock_queue_empty(psock),
1175 &wait);
645 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 1176 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
646 remove_wait_queue(sk_sleep(sk), &wait); 1177 remove_wait_queue(sk_sleep(sk), &wait);
647 1178
@@ -655,6 +1186,64 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
655 return skb; 1186 return skb;
656} 1187}
657 1188
1189static int tls_setup_from_iter(struct sock *sk, struct iov_iter *from,
1190 int length, int *pages_used,
1191 unsigned int *size_used,
1192 struct scatterlist *to,
1193 int to_max_pages)
1194{
1195 int rc = 0, i = 0, num_elem = *pages_used, maxpages;
1196 struct page *pages[MAX_SKB_FRAGS];
1197 unsigned int size = *size_used;
1198 ssize_t copied, use;
1199 size_t offset;
1200
1201 while (length > 0) {
1202 i = 0;
1203 maxpages = to_max_pages - num_elem;
1204 if (maxpages == 0) {
1205 rc = -EFAULT;
1206 goto out;
1207 }
1208 copied = iov_iter_get_pages(from, pages,
1209 length,
1210 maxpages, &offset);
1211 if (copied <= 0) {
1212 rc = -EFAULT;
1213 goto out;
1214 }
1215
1216 iov_iter_advance(from, copied);
1217
1218 length -= copied;
1219 size += copied;
1220 while (copied) {
1221 use = min_t(int, copied, PAGE_SIZE - offset);
1222
1223 sg_set_page(&to[num_elem],
1224 pages[i], use, offset);
1225 sg_unmark_end(&to[num_elem]);
1226 /* We do not uncharge memory from this API */
1227
1228 offset = 0;
1229 copied -= use;
1230
1231 i++;
1232 num_elem++;
1233 }
1234 }
1235 /* Mark the end in the last sg entry if newly added */
1236 if (num_elem > *pages_used)
1237 sg_mark_end(&to[num_elem - 1]);
1238out:
1239 if (rc)
1240 iov_iter_revert(from, size - *size_used);
1241 *size_used = size;
1242 *pages_used = num_elem;
1243
1244 return rc;
1245}
1246
658/* This function decrypts the input skb into either out_iov or in out_sg 1247/* This function decrypts the input skb into either out_iov or in out_sg
659 * or in skb buffers itself. The input parameter 'zc' indicates if 1248 * or in skb buffers itself. The input parameter 'zc' indicates if
660 * zero-copy mode needs to be tried or not. With zero-copy mode, either 1249 * zero-copy mode needs to be tried or not. With zero-copy mode, either
@@ -684,12 +1273,14 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
684 n_sgout = iov_iter_npages(out_iov, INT_MAX) + 1; 1273 n_sgout = iov_iter_npages(out_iov, INT_MAX) + 1;
685 else 1274 else
686 n_sgout = sg_nents(out_sg); 1275 n_sgout = sg_nents(out_sg);
1276 n_sgin = skb_nsg(skb, rxm->offset + tls_ctx->rx.prepend_size,
1277 rxm->full_len - tls_ctx->rx.prepend_size);
687 } else { 1278 } else {
688 n_sgout = 0; 1279 n_sgout = 0;
689 *zc = false; 1280 *zc = false;
1281 n_sgin = skb_cow_data(skb, 0, &unused);
690 } 1282 }
691 1283
692 n_sgin = skb_cow_data(skb, 0, &unused);
693 if (n_sgin < 1) 1284 if (n_sgin < 1)
694 return -EBADMSG; 1285 return -EBADMSG;
695 1286
@@ -750,9 +1341,9 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
750 sg_set_buf(&sgout[0], aad, TLS_AAD_SPACE_SIZE); 1341 sg_set_buf(&sgout[0], aad, TLS_AAD_SPACE_SIZE);
751 1342
752 *chunk = 0; 1343 *chunk = 0;
753 err = zerocopy_from_iter(sk, out_iov, data_len, &pages, 1344 err = tls_setup_from_iter(sk, out_iov, data_len,
754 chunk, &sgout[1], 1345 &pages, chunk, &sgout[1],
755 (n_sgout - 1), false); 1346 (n_sgout - 1));
756 if (err < 0) 1347 if (err < 0)
757 goto fallback_to_reg_recv; 1348 goto fallback_to_reg_recv;
758 } else if (out_sg) { 1349 } else if (out_sg) {
@@ -769,7 +1360,10 @@ fallback_to_reg_recv:
769 } 1360 }
770 1361
771 /* Prepare and submit AEAD request */ 1362 /* Prepare and submit AEAD request */
772 err = tls_do_decryption(sk, sgin, sgout, iv, data_len, aead_req); 1363 err = tls_do_decryption(sk, skb, sgin, sgout, iv,
1364 data_len, aead_req, *zc);
1365 if (err == -EINPROGRESS)
1366 return err;
773 1367
774 /* Release the pages in case iov was mapped to pages */ 1368 /* Release the pages in case iov was mapped to pages */
775 for (; pages > 0; pages--) 1369 for (; pages > 0; pages--)
@@ -794,8 +1388,12 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
794#endif 1388#endif
795 if (!ctx->decrypted) { 1389 if (!ctx->decrypted) {
796 err = decrypt_internal(sk, skb, dest, NULL, chunk, zc); 1390 err = decrypt_internal(sk, skb, dest, NULL, chunk, zc);
797 if (err < 0) 1391 if (err < 0) {
1392 if (err == -EINPROGRESS)
1393 tls_advance_record_sn(sk, &tls_ctx->rx);
1394
798 return err; 1395 return err;
1396 }
799 } else { 1397 } else {
800 *zc = false; 1398 *zc = false;
801 } 1399 }
@@ -823,18 +1421,20 @@ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb,
823{ 1421{
824 struct tls_context *tls_ctx = tls_get_ctx(sk); 1422 struct tls_context *tls_ctx = tls_get_ctx(sk);
825 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); 1423 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
826 struct strp_msg *rxm = strp_msg(skb);
827 1424
828 if (len < rxm->full_len) { 1425 if (skb) {
829 rxm->offset += len; 1426 struct strp_msg *rxm = strp_msg(skb);
830 rxm->full_len -= len;
831 1427
832 return false; 1428 if (len < rxm->full_len) {
1429 rxm->offset += len;
1430 rxm->full_len -= len;
1431 return false;
1432 }
1433 kfree_skb(skb);
833 } 1434 }
834 1435
835 /* Finished with message */ 1436 /* Finished with message */
836 ctx->recv_pkt = NULL; 1437 ctx->recv_pkt = NULL;
837 kfree_skb(skb);
838 __strp_unpause(&ctx->strp); 1438 __strp_unpause(&ctx->strp);
839 1439
840 return true; 1440 return true;
@@ -849,6 +1449,7 @@ int tls_sw_recvmsg(struct sock *sk,
849{ 1449{
850 struct tls_context *tls_ctx = tls_get_ctx(sk); 1450 struct tls_context *tls_ctx = tls_get_ctx(sk);
851 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); 1451 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
1452 struct sk_psock *psock;
852 unsigned char control; 1453 unsigned char control;
853 struct strp_msg *rxm; 1454 struct strp_msg *rxm;
854 struct sk_buff *skb; 1455 struct sk_buff *skb;
@@ -857,25 +1458,40 @@ int tls_sw_recvmsg(struct sock *sk,
857 int target, err = 0; 1458 int target, err = 0;
858 long timeo; 1459 long timeo;
859 bool is_kvec = msg->msg_iter.type & ITER_KVEC; 1460 bool is_kvec = msg->msg_iter.type & ITER_KVEC;
1461 int num_async = 0;
860 1462
861 flags |= nonblock; 1463 flags |= nonblock;
862 1464
863 if (unlikely(flags & MSG_ERRQUEUE)) 1465 if (unlikely(flags & MSG_ERRQUEUE))
864 return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR); 1466 return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);
865 1467
1468 psock = sk_psock_get(sk);
866 lock_sock(sk); 1469 lock_sock(sk);
867 1470
868 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); 1471 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
869 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 1472 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
870 do { 1473 do {
871 bool zc = false; 1474 bool zc = false;
1475 bool async = false;
872 int chunk = 0; 1476 int chunk = 0;
873 1477
874 skb = tls_wait_data(sk, flags, timeo, &err); 1478 skb = tls_wait_data(sk, psock, flags, timeo, &err);
875 if (!skb) 1479 if (!skb) {
1480 if (psock) {
1481 int ret = __tcp_bpf_recvmsg(sk, psock,
1482 msg, len, flags);
1483
1484 if (ret > 0) {
1485 copied += ret;
1486 len -= ret;
1487 continue;
1488 }
1489 }
876 goto recv_end; 1490 goto recv_end;
1491 }
877 1492
878 rxm = strp_msg(skb); 1493 rxm = strp_msg(skb);
1494
879 if (!cmsg) { 1495 if (!cmsg) {
880 int cerr; 1496 int cerr;
881 1497
@@ -902,26 +1518,39 @@ int tls_sw_recvmsg(struct sock *sk,
902 1518
903 err = decrypt_skb_update(sk, skb, &msg->msg_iter, 1519 err = decrypt_skb_update(sk, skb, &msg->msg_iter,
904 &chunk, &zc); 1520 &chunk, &zc);
905 if (err < 0) { 1521 if (err < 0 && err != -EINPROGRESS) {
906 tls_err_abort(sk, EBADMSG); 1522 tls_err_abort(sk, EBADMSG);
907 goto recv_end; 1523 goto recv_end;
908 } 1524 }
1525
1526 if (err == -EINPROGRESS) {
1527 async = true;
1528 num_async++;
1529 goto pick_next_record;
1530 }
1531
909 ctx->decrypted = true; 1532 ctx->decrypted = true;
910 } 1533 }
911 1534
912 if (!zc) { 1535 if (!zc) {
913 chunk = min_t(unsigned int, rxm->full_len, len); 1536 chunk = min_t(unsigned int, rxm->full_len, len);
1537
914 err = skb_copy_datagram_msg(skb, rxm->offset, msg, 1538 err = skb_copy_datagram_msg(skb, rxm->offset, msg,
915 chunk); 1539 chunk);
916 if (err < 0) 1540 if (err < 0)
917 goto recv_end; 1541 goto recv_end;
918 } 1542 }
919 1543
1544pick_next_record:
920 copied += chunk; 1545 copied += chunk;
921 len -= chunk; 1546 len -= chunk;
922 if (likely(!(flags & MSG_PEEK))) { 1547 if (likely(!(flags & MSG_PEEK))) {
923 u8 control = ctx->control; 1548 u8 control = ctx->control;
924 1549
1550 /* For async, drop current skb reference */
1551 if (async)
1552 skb = NULL;
1553
925 if (tls_sw_advance_skb(sk, skb, chunk)) { 1554 if (tls_sw_advance_skb(sk, skb, chunk)) {
926 /* Return full control message to 1555 /* Return full control message to
927 * userspace before trying to parse 1556 * userspace before trying to parse
@@ -930,6 +1559,8 @@ int tls_sw_recvmsg(struct sock *sk,
930 msg->msg_flags |= MSG_EOR; 1559 msg->msg_flags |= MSG_EOR;
931 if (control != TLS_RECORD_TYPE_DATA) 1560 if (control != TLS_RECORD_TYPE_DATA)
932 goto recv_end; 1561 goto recv_end;
1562 } else {
1563 break;
933 } 1564 }
934 } else { 1565 } else {
935 /* MSG_PEEK right now cannot look beyond current skb 1566 /* MSG_PEEK right now cannot look beyond current skb
@@ -946,7 +1577,25 @@ int tls_sw_recvmsg(struct sock *sk,
946 } while (len); 1577 } while (len);
947 1578
948recv_end: 1579recv_end:
1580 if (num_async) {
1581 /* Wait for all previously submitted records to be decrypted */
1582 smp_store_mb(ctx->async_notify, true);
1583 if (atomic_read(&ctx->decrypt_pending)) {
1584 err = crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
1585 if (err) {
1586				/* one of the async decrypts failed */
1587 tls_err_abort(sk, err);
1588 copied = 0;
1589 }
1590 } else {
1591 reinit_completion(&ctx->async_wait.completion);
1592 }
1593 WRITE_ONCE(ctx->async_notify, false);
1594 }
1595
949 release_sock(sk); 1596 release_sock(sk);
1597 if (psock)
1598 sk_psock_put(sk, psock);
950 return copied ? : err; 1599 return copied ? : err;
951} 1600}
952 1601
@@ -969,7 +1618,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
969 1618
970 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 1619 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
971 1620
972 skb = tls_wait_data(sk, flags, timeo, &err); 1621 skb = tls_wait_data(sk, NULL, flags, timeo, &err);
973 if (!skb) 1622 if (!skb)
974 goto splice_read_end; 1623 goto splice_read_end;
975 1624
@@ -1003,23 +1652,20 @@ splice_read_end:
1003 return copied ? : err; 1652 return copied ? : err;
1004} 1653}
1005 1654
1006unsigned int tls_sw_poll(struct file *file, struct socket *sock, 1655bool tls_sw_stream_read(const struct sock *sk)
1007 struct poll_table_struct *wait)
1008{ 1656{
1009 unsigned int ret;
1010 struct sock *sk = sock->sk;
1011 struct tls_context *tls_ctx = tls_get_ctx(sk); 1657 struct tls_context *tls_ctx = tls_get_ctx(sk);
1012 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); 1658 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
1659 bool ingress_empty = true;
1660 struct sk_psock *psock;
1013 1661
1014 /* Grab POLLOUT and POLLHUP from the underlying socket */ 1662 rcu_read_lock();
1015 ret = ctx->sk_poll(file, sock, wait); 1663 psock = sk_psock(sk);
1016 1664 if (psock)
1017 /* Clear POLLIN bits, and set based on recv_pkt */ 1665 ingress_empty = list_empty(&psock->ingress_msg);
1018 ret &= ~(POLLIN | POLLRDNORM); 1666 rcu_read_unlock();
1019 if (ctx->recv_pkt)
1020 ret |= POLLIN | POLLRDNORM;
1021 1667
1022 return ret; 1668 return !ingress_empty || ctx->recv_pkt;
1023} 1669}
1024 1670
1025static int tls_read_size(struct strparser *strp, struct sk_buff *skb) 1671static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
@@ -1098,17 +1744,66 @@ static void tls_data_ready(struct sock *sk)
1098{ 1744{
1099 struct tls_context *tls_ctx = tls_get_ctx(sk); 1745 struct tls_context *tls_ctx = tls_get_ctx(sk);
1100 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); 1746 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
1747 struct sk_psock *psock;
1101 1748
1102 strp_data_ready(&ctx->strp); 1749 strp_data_ready(&ctx->strp);
1750
1751 psock = sk_psock_get(sk);
1752 if (psock && !list_empty(&psock->ingress_msg)) {
1753 ctx->saved_data_ready(sk);
1754 sk_psock_put(sk, psock);
1755 }
1103} 1756}
1104 1757
1105void tls_sw_free_resources_tx(struct sock *sk) 1758void tls_sw_free_resources_tx(struct sock *sk)
1106{ 1759{
1107 struct tls_context *tls_ctx = tls_get_ctx(sk); 1760 struct tls_context *tls_ctx = tls_get_ctx(sk);
1108 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); 1761 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
1762 struct tls_rec *rec, *tmp;
1763
1764 /* Wait for any pending async encryptions to complete */
1765 smp_store_mb(ctx->async_notify, true);
1766 if (atomic_read(&ctx->encrypt_pending))
1767 crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
1768
1769 cancel_delayed_work_sync(&ctx->tx_work.work);
1770
1771 /* Tx whatever records we can transmit and abandon the rest */
1772 tls_tx_records(sk, -1);
1773
1774 /* Free up un-sent records in tx_list. First, free
1775 * the partially sent record if any at head of tx_list.
1776 */
1777 if (tls_ctx->partially_sent_record) {
1778 struct scatterlist *sg = tls_ctx->partially_sent_record;
1779
1780 while (1) {
1781 put_page(sg_page(sg));
1782 sk_mem_uncharge(sk, sg->length);
1783
1784 if (sg_is_last(sg))
1785 break;
1786 sg++;
1787 }
1788
1789 tls_ctx->partially_sent_record = NULL;
1790
1791 rec = list_first_entry(&ctx->tx_list,
1792 struct tls_rec, list);
1793 list_del(&rec->list);
1794 sk_msg_free(sk, &rec->msg_plaintext);
1795 kfree(rec);
1796 }
1797
1798 list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) {
1799 list_del(&rec->list);
1800 sk_msg_free(sk, &rec->msg_encrypted);
1801 sk_msg_free(sk, &rec->msg_plaintext);
1802 kfree(rec);
1803 }
1109 1804
1110 crypto_free_aead(ctx->aead_send); 1805 crypto_free_aead(ctx->aead_send);
1111 tls_free_both_sg(sk); 1806 tls_free_open_rec(sk);
1112 1807
1113 kfree(ctx); 1808 kfree(ctx);
1114} 1809}
@@ -1142,6 +1837,24 @@ void tls_sw_free_resources_rx(struct sock *sk)
1142 kfree(ctx); 1837 kfree(ctx);
1143} 1838}
1144 1839
1840/* The work handler to transmit the encrypted records in tx_list */
1841static void tx_work_handler(struct work_struct *work)
1842{
1843 struct delayed_work *delayed_work = to_delayed_work(work);
1844 struct tx_work *tx_work = container_of(delayed_work,
1845 struct tx_work, work);
1846 struct sock *sk = tx_work->sk;
1847 struct tls_context *tls_ctx = tls_get_ctx(sk);
1848 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
1849
1850 if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask))
1851 return;
1852
1853 lock_sock(sk);
1854 tls_tx_records(sk, -1);
1855 release_sock(sk);
1856}
1857
1145int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) 1858int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
1146{ 1859{
1147 struct tls_crypto_info *crypto_info; 1860 struct tls_crypto_info *crypto_info;
@@ -1191,6 +1904,9 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
1191 crypto_info = &ctx->crypto_send.info; 1904 crypto_info = &ctx->crypto_send.info;
1192 cctx = &ctx->tx; 1905 cctx = &ctx->tx;
1193 aead = &sw_ctx_tx->aead_send; 1906 aead = &sw_ctx_tx->aead_send;
1907 INIT_LIST_HEAD(&sw_ctx_tx->tx_list);
1908 INIT_DELAYED_WORK(&sw_ctx_tx->tx_work.work, tx_work_handler);
1909 sw_ctx_tx->tx_work.sk = sk;
1194 } else { 1910 } else {
1195 crypto_init_wait(&sw_ctx_rx->async_wait); 1911 crypto_init_wait(&sw_ctx_rx->async_wait);
1196 crypto_info = &ctx->crypto_recv.info; 1912 crypto_info = &ctx->crypto_recv.info;
@@ -1241,26 +1957,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
1241 goto free_iv; 1957 goto free_iv;
1242 } 1958 }
1243 1959
1244 if (sw_ctx_tx) {
1245 sg_init_table(sw_ctx_tx->sg_encrypted_data,
1246 ARRAY_SIZE(sw_ctx_tx->sg_encrypted_data));
1247 sg_init_table(sw_ctx_tx->sg_plaintext_data,
1248 ARRAY_SIZE(sw_ctx_tx->sg_plaintext_data));
1249
1250 sg_init_table(sw_ctx_tx->sg_aead_in, 2);
1251 sg_set_buf(&sw_ctx_tx->sg_aead_in[0], sw_ctx_tx->aad_space,
1252 sizeof(sw_ctx_tx->aad_space));
1253 sg_unmark_end(&sw_ctx_tx->sg_aead_in[1]);
1254 sg_chain(sw_ctx_tx->sg_aead_in, 2,
1255 sw_ctx_tx->sg_plaintext_data);
1256 sg_init_table(sw_ctx_tx->sg_aead_out, 2);
1257 sg_set_buf(&sw_ctx_tx->sg_aead_out[0], sw_ctx_tx->aad_space,
1258 sizeof(sw_ctx_tx->aad_space));
1259 sg_unmark_end(&sw_ctx_tx->sg_aead_out[1]);
1260 sg_chain(sw_ctx_tx->sg_aead_out, 2,
1261 sw_ctx_tx->sg_encrypted_data);
1262 }
1263
1264 if (!*aead) { 1960 if (!*aead) {
1265 *aead = crypto_alloc_aead("gcm(aes)", 0, 0); 1961 *aead = crypto_alloc_aead("gcm(aes)", 0, 0);
1266 if (IS_ERR(*aead)) { 1962 if (IS_ERR(*aead)) {
@@ -1294,8 +1990,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
1294 sk->sk_data_ready = tls_data_ready; 1990 sk->sk_data_ready = tls_data_ready;
1295 write_unlock_bh(&sk->sk_callback_lock); 1991 write_unlock_bh(&sk->sk_callback_lock);
1296 1992
1297 sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll;
1298
1299 strp_check_rcv(&sw_ctx_rx->strp); 1993 strp_check_rcv(&sw_ctx_rx->strp);
1300 } 1994 }
1301 1995
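
The tls_setup_from_iter() hunk above splits the user buffer into scatterlist entries that never cross a page boundary: each entry takes use = min(copied, PAGE_SIZE - offset) bytes and every page after the first is used from offset zero. The following is a minimal standalone userspace sketch of that splitting arithmetic only (illustration, not kernel code; the 4 KiB page size and the demo offsets are assumptions):

#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumed page size for the demo */

/* Print the per-page (offset, length) segments for 'copied' bytes that
 * start 'offset' bytes into their first page, mirroring the inner loop
 * of tls_setup_from_iter().
 */
static void split_into_page_segments(unsigned long offset, unsigned long copied)
{
	int seg = 0;

	while (copied) {
		unsigned long use = copied < PAGE_SIZE - offset ?
				    copied : PAGE_SIZE - offset;

		printf("segment %d: page offset %lu, length %lu\n",
		       seg++, offset, use);

		offset = 0;	/* later pages are used from their start */
		copied -= use;
	}
}

int main(void)
{
	/* e.g. 10000 bytes of plaintext landing 300 bytes into a page */
	split_into_page_segments(300, 10000);
	return 0;
}
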
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index d1edfa3cad61..74d1eed7cbd4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -225,6 +225,8 @@ static inline void unix_release_addr(struct unix_address *addr)
225 225
226static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp) 226static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
227{ 227{
228 *hashp = 0;
229
228 if (len <= sizeof(short) || len > sizeof(*sunaddr)) 230 if (len <= sizeof(short) || len > sizeof(*sunaddr))
229 return -EINVAL; 231 return -EINVAL;
230 if (!sunaddr || sunaddr->sun_family != AF_UNIX) 232 if (!sunaddr || sunaddr->sun_family != AF_UNIX)
@@ -2640,7 +2642,7 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa
2640 struct sock *sk = sock->sk; 2642 struct sock *sk = sock->sk;
2641 __poll_t mask; 2643 __poll_t mask;
2642 2644
2643 sock_poll_wait(file, wait); 2645 sock_poll_wait(file, sock, wait);
2644 mask = 0; 2646 mask = 0;
2645 2647
2646 /* exceptional events? */ 2648 /* exceptional events? */
@@ -2677,7 +2679,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
2677 unsigned int writable; 2679 unsigned int writable;
2678 __poll_t mask; 2680 __poll_t mask;
2679 2681
2680 sock_poll_wait(file, wait); 2682 sock_poll_wait(file, sock, wait);
2681 mask = 0; 2683 mask = 0;
2682 2684
2683 /* exceptional events? */ 2685 /* exceptional events? */
diff --git a/net/wireless/core.c b/net/wireless/core.c
index a88551f3bc43..5bd01058b9e6 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -1019,36 +1019,49 @@ void cfg80211_cqm_config_free(struct wireless_dev *wdev)
1019 wdev->cqm_config = NULL; 1019 wdev->cqm_config = NULL;
1020} 1020}
1021 1021
1022void cfg80211_unregister_wdev(struct wireless_dev *wdev) 1022static void __cfg80211_unregister_wdev(struct wireless_dev *wdev, bool sync)
1023{ 1023{
1024 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); 1024 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
1025 1025
1026 ASSERT_RTNL(); 1026 ASSERT_RTNL();
1027 1027
1028 if (WARN_ON(wdev->netdev))
1029 return;
1030
1031 nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE); 1028 nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE);
1032 1029
1033 list_del_rcu(&wdev->list); 1030 list_del_rcu(&wdev->list);
1034 synchronize_rcu(); 1031 if (sync)
1032 synchronize_rcu();
1035 rdev->devlist_generation++; 1033 rdev->devlist_generation++;
1036 1034
1035 cfg80211_mlme_purge_registrations(wdev);
1036
1037 switch (wdev->iftype) { 1037 switch (wdev->iftype) {
1038 case NL80211_IFTYPE_P2P_DEVICE: 1038 case NL80211_IFTYPE_P2P_DEVICE:
1039 cfg80211_mlme_purge_registrations(wdev);
1040 cfg80211_stop_p2p_device(rdev, wdev); 1039 cfg80211_stop_p2p_device(rdev, wdev);
1041 break; 1040 break;
1042 case NL80211_IFTYPE_NAN: 1041 case NL80211_IFTYPE_NAN:
1043 cfg80211_stop_nan(rdev, wdev); 1042 cfg80211_stop_nan(rdev, wdev);
1044 break; 1043 break;
1045 default: 1044 default:
1046 WARN_ON_ONCE(1);
1047 break; 1045 break;
1048 } 1046 }
1049 1047
1048#ifdef CONFIG_CFG80211_WEXT
1049 kzfree(wdev->wext.keys);
1050#endif
1051 /* only initialized if we have a netdev */
1052 if (wdev->netdev)
1053 flush_work(&wdev->disconnect_wk);
1054
1050 cfg80211_cqm_config_free(wdev); 1055 cfg80211_cqm_config_free(wdev);
1051} 1056}
1057
1058void cfg80211_unregister_wdev(struct wireless_dev *wdev)
1059{
1060 if (WARN_ON(wdev->netdev))
1061 return;
1062
1063 __cfg80211_unregister_wdev(wdev, true);
1064}
1052EXPORT_SYMBOL(cfg80211_unregister_wdev); 1065EXPORT_SYMBOL(cfg80211_unregister_wdev);
1053 1066
1054static const struct device_type wiphy_type = { 1067static const struct device_type wiphy_type = {
@@ -1153,6 +1166,30 @@ void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev,
1153} 1166}
1154EXPORT_SYMBOL(cfg80211_stop_iface); 1167EXPORT_SYMBOL(cfg80211_stop_iface);
1155 1168
1169void cfg80211_init_wdev(struct cfg80211_registered_device *rdev,
1170 struct wireless_dev *wdev)
1171{
1172 mutex_init(&wdev->mtx);
1173 INIT_LIST_HEAD(&wdev->event_list);
1174 spin_lock_init(&wdev->event_lock);
1175 INIT_LIST_HEAD(&wdev->mgmt_registrations);
1176 spin_lock_init(&wdev->mgmt_registrations_lock);
1177
1178 /*
1179 * We get here also when the interface changes network namespaces,
1180 * as it's registered into the new one, but we don't want it to
1181 * change ID in that case. Checking if the ID is already assigned
1182 * works, because 0 isn't considered a valid ID and the memory is
1183 * 0-initialized.
1184 */
1185 if (!wdev->identifier)
1186 wdev->identifier = ++rdev->wdev_id;
1187 list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list);
1188 rdev->devlist_generation++;
1189
1190 nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
1191}
1192
1156static int cfg80211_netdev_notifier_call(struct notifier_block *nb, 1193static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
1157 unsigned long state, void *ptr) 1194 unsigned long state, void *ptr)
1158{ 1195{
@@ -1178,23 +1215,6 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
1178 * called within code protected by it when interfaces 1215 * called within code protected by it when interfaces
1179 * are added with nl80211. 1216 * are added with nl80211.
1180 */ 1217 */
1181 mutex_init(&wdev->mtx);
1182 INIT_LIST_HEAD(&wdev->event_list);
1183 spin_lock_init(&wdev->event_lock);
1184 INIT_LIST_HEAD(&wdev->mgmt_registrations);
1185 spin_lock_init(&wdev->mgmt_registrations_lock);
1186
1187 /*
1188 * We get here also when the interface changes network namespaces,
1189 * as it's registered into the new one, but we don't want it to
1190 * change ID in that case. Checking if the ID is already assigned
1191 * works, because 0 isn't considered a valid ID and the memory is
1192 * 0-initialized.
1193 */
1194 if (!wdev->identifier)
1195 wdev->identifier = ++rdev->wdev_id;
1196 list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list);
1197 rdev->devlist_generation++;
1198 /* can only change netns with wiphy */ 1218 /* can only change netns with wiphy */
1199 dev->features |= NETIF_F_NETNS_LOCAL; 1219 dev->features |= NETIF_F_NETNS_LOCAL;
1200 1220
@@ -1223,7 +1243,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
1223 1243
1224 INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk); 1244 INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk);
1225 1245
1226 nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); 1246 cfg80211_init_wdev(rdev, wdev);
1227 break; 1247 break;
1228 case NETDEV_GOING_DOWN: 1248 case NETDEV_GOING_DOWN:
1229 cfg80211_leave(rdev, wdev); 1249 cfg80211_leave(rdev, wdev);
@@ -1238,7 +1258,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
1238 1258
1239 list_for_each_entry_safe(pos, tmp, 1259 list_for_each_entry_safe(pos, tmp,
1240 &rdev->sched_scan_req_list, list) { 1260 &rdev->sched_scan_req_list, list) {
1241 if (WARN_ON(pos && pos->dev == wdev->netdev)) 1261 if (WARN_ON(pos->dev == wdev->netdev))
1242 cfg80211_stop_sched_scan_req(rdev, pos, false); 1262 cfg80211_stop_sched_scan_req(rdev, pos, false);
1243 } 1263 }
1244 1264
@@ -1302,17 +1322,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
1302 * remove and clean it up. 1322 * remove and clean it up.
1303 */ 1323 */
1304 if (!list_empty(&wdev->list)) { 1324 if (!list_empty(&wdev->list)) {
1305 nl80211_notify_iface(rdev, wdev, 1325 __cfg80211_unregister_wdev(wdev, false);
1306 NL80211_CMD_DEL_INTERFACE);
1307 sysfs_remove_link(&dev->dev.kobj, "phy80211"); 1326 sysfs_remove_link(&dev->dev.kobj, "phy80211");
1308 list_del_rcu(&wdev->list);
1309 rdev->devlist_generation++;
1310 cfg80211_mlme_purge_registrations(wdev);
1311#ifdef CONFIG_CFG80211_WEXT
1312 kzfree(wdev->wext.keys);
1313#endif
1314 flush_work(&wdev->disconnect_wk);
1315 cfg80211_cqm_config_free(wdev);
1316 } 1327 }
1317 /* 1328 /*
1318 * synchronise (so that we won't find this netdev 1329 * synchronise (so that we won't find this netdev
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 7f52ef569320..c61dbba8bf47 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -66,6 +66,7 @@ struct cfg80211_registered_device {
66 /* protected by RTNL only */ 66 /* protected by RTNL only */
67 int num_running_ifaces; 67 int num_running_ifaces;
68 int num_running_monitor_ifaces; 68 int num_running_monitor_ifaces;
69 u64 cookie_counter;
69 70
70 /* BSSes/scanning */ 71 /* BSSes/scanning */
71 spinlock_t bss_lock; 72 spinlock_t bss_lock;
@@ -133,6 +134,16 @@ cfg80211_rdev_free_wowlan(struct cfg80211_registered_device *rdev)
133#endif 134#endif
134} 135}
135 136
137static inline u64 cfg80211_assign_cookie(struct cfg80211_registered_device *rdev)
138{
139 u64 r = ++rdev->cookie_counter;
140
141 if (WARN_ON(r == 0))
142 r = ++rdev->cookie_counter;
143
144 return r;
145}
146
136extern struct workqueue_struct *cfg80211_wq; 147extern struct workqueue_struct *cfg80211_wq;
137extern struct list_head cfg80211_rdev_list; 148extern struct list_head cfg80211_rdev_list;
138extern int cfg80211_rdev_list_generation; 149extern int cfg80211_rdev_list_generation;
@@ -187,6 +198,9 @@ struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx);
187int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, 198int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
188 struct net *net); 199 struct net *net);
189 200
201void cfg80211_init_wdev(struct cfg80211_registered_device *rdev,
202 struct wireless_dev *wdev);
203
190static inline void wdev_lock(struct wireless_dev *wdev) 204static inline void wdev_lock(struct wireless_dev *wdev)
191 __acquires(wdev) 205 __acquires(wdev)
192{ 206{
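
The new cfg80211_assign_cookie() helper above hands out a monotonically increasing, never-zero cookie, so 0 can keep meaning "no cookie" even after the 64-bit counter wraps. A minimal userspace sketch of the same skip-zero allocator (illustration only; the kernel version additionally warns when the wrap is hit):

#include <stdint.h>
#include <stdio.h>

static uint64_t cookie_counter;

static uint64_t assign_cookie(void)
{
	uint64_t r = ++cookie_counter;

	if (r == 0)		/* wrapped after 2^64 allocations */
		r = ++cookie_counter;

	return r;
}

int main(void)
{
	printf("%llu %llu\n",
	       (unsigned long long)assign_cookie(),
	       (unsigned long long)assign_cookie());
	return 0;
}
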
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
index e6bce1f130c9..b5e235573c8a 100644
--- a/net/wireless/lib80211_crypt_tkip.c
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -30,7 +30,7 @@
30#include <net/iw_handler.h> 30#include <net/iw_handler.h>
31 31
32#include <crypto/hash.h> 32#include <crypto/hash.h>
33#include <crypto/skcipher.h> 33#include <linux/crypto.h>
34#include <linux/crc32.h> 34#include <linux/crc32.h>
35 35
36#include <net/lib80211.h> 36#include <net/lib80211.h>
@@ -64,9 +64,9 @@ struct lib80211_tkip_data {
64 64
65 int key_idx; 65 int key_idx;
66 66
67 struct crypto_skcipher *rx_tfm_arc4; 67 struct crypto_cipher *rx_tfm_arc4;
68 struct crypto_shash *rx_tfm_michael; 68 struct crypto_shash *rx_tfm_michael;
69 struct crypto_skcipher *tx_tfm_arc4; 69 struct crypto_cipher *tx_tfm_arc4;
70 struct crypto_shash *tx_tfm_michael; 70 struct crypto_shash *tx_tfm_michael;
71 71
72 /* scratch buffers for virt_to_page() (crypto API) */ 72 /* scratch buffers for virt_to_page() (crypto API) */
@@ -99,8 +99,7 @@ static void *lib80211_tkip_init(int key_idx)
99 99
100 priv->key_idx = key_idx; 100 priv->key_idx = key_idx;
101 101
102 priv->tx_tfm_arc4 = crypto_alloc_skcipher("ecb(arc4)", 0, 102 priv->tx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
103 CRYPTO_ALG_ASYNC);
104 if (IS_ERR(priv->tx_tfm_arc4)) { 103 if (IS_ERR(priv->tx_tfm_arc4)) {
105 priv->tx_tfm_arc4 = NULL; 104 priv->tx_tfm_arc4 = NULL;
106 goto fail; 105 goto fail;
@@ -112,8 +111,7 @@ static void *lib80211_tkip_init(int key_idx)
112 goto fail; 111 goto fail;
113 } 112 }
114 113
115 priv->rx_tfm_arc4 = crypto_alloc_skcipher("ecb(arc4)", 0, 114 priv->rx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
116 CRYPTO_ALG_ASYNC);
117 if (IS_ERR(priv->rx_tfm_arc4)) { 115 if (IS_ERR(priv->rx_tfm_arc4)) {
118 priv->rx_tfm_arc4 = NULL; 116 priv->rx_tfm_arc4 = NULL;
119 goto fail; 117 goto fail;
@@ -130,9 +128,9 @@ static void *lib80211_tkip_init(int key_idx)
130 fail: 128 fail:
131 if (priv) { 129 if (priv) {
132 crypto_free_shash(priv->tx_tfm_michael); 130 crypto_free_shash(priv->tx_tfm_michael);
133 crypto_free_skcipher(priv->tx_tfm_arc4); 131 crypto_free_cipher(priv->tx_tfm_arc4);
134 crypto_free_shash(priv->rx_tfm_michael); 132 crypto_free_shash(priv->rx_tfm_michael);
135 crypto_free_skcipher(priv->rx_tfm_arc4); 133 crypto_free_cipher(priv->rx_tfm_arc4);
136 kfree(priv); 134 kfree(priv);
137 } 135 }
138 136
@@ -144,9 +142,9 @@ static void lib80211_tkip_deinit(void *priv)
144 struct lib80211_tkip_data *_priv = priv; 142 struct lib80211_tkip_data *_priv = priv;
145 if (_priv) { 143 if (_priv) {
146 crypto_free_shash(_priv->tx_tfm_michael); 144 crypto_free_shash(_priv->tx_tfm_michael);
147 crypto_free_skcipher(_priv->tx_tfm_arc4); 145 crypto_free_cipher(_priv->tx_tfm_arc4);
148 crypto_free_shash(_priv->rx_tfm_michael); 146 crypto_free_shash(_priv->rx_tfm_michael);
149 crypto_free_skcipher(_priv->rx_tfm_arc4); 147 crypto_free_cipher(_priv->rx_tfm_arc4);
150 } 148 }
151 kfree(priv); 149 kfree(priv);
152} 150}
@@ -344,12 +342,10 @@ static int lib80211_tkip_hdr(struct sk_buff *skb, int hdr_len,
344static int lib80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv) 342static int lib80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
345{ 343{
346 struct lib80211_tkip_data *tkey = priv; 344 struct lib80211_tkip_data *tkey = priv;
347 SKCIPHER_REQUEST_ON_STACK(req, tkey->tx_tfm_arc4);
348 int len; 345 int len;
349 u8 rc4key[16], *pos, *icv; 346 u8 rc4key[16], *pos, *icv;
350 u32 crc; 347 u32 crc;
351 struct scatterlist sg; 348 int i;
352 int err;
353 349
354 if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) { 350 if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) {
355 struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; 351 struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
@@ -374,14 +370,10 @@ static int lib80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
374 icv[2] = crc >> 16; 370 icv[2] = crc >> 16;
375 icv[3] = crc >> 24; 371 icv[3] = crc >> 24;
376 372
377 crypto_skcipher_setkey(tkey->tx_tfm_arc4, rc4key, 16); 373 crypto_cipher_setkey(tkey->tx_tfm_arc4, rc4key, 16);
378 sg_init_one(&sg, pos, len + 4); 374 for (i = 0; i < len + 4; i++)
379 skcipher_request_set_tfm(req, tkey->tx_tfm_arc4); 375 crypto_cipher_encrypt_one(tkey->tx_tfm_arc4, pos + i, pos + i);
380 skcipher_request_set_callback(req, 0, NULL, NULL); 376 return 0;
381 skcipher_request_set_crypt(req, &sg, &sg, len + 4, NULL);
382 err = crypto_skcipher_encrypt(req);
383 skcipher_request_zero(req);
384 return err;
385} 377}
386 378
387/* 379/*
@@ -400,7 +392,6 @@ static inline int tkip_replay_check(u32 iv32_n, u16 iv16_n,
400static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) 392static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
401{ 393{
402 struct lib80211_tkip_data *tkey = priv; 394 struct lib80211_tkip_data *tkey = priv;
403 SKCIPHER_REQUEST_ON_STACK(req, tkey->rx_tfm_arc4);
404 u8 rc4key[16]; 395 u8 rc4key[16];
405 u8 keyidx, *pos; 396 u8 keyidx, *pos;
406 u32 iv32; 397 u32 iv32;
@@ -408,9 +399,8 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
408 struct ieee80211_hdr *hdr; 399 struct ieee80211_hdr *hdr;
409 u8 icv[4]; 400 u8 icv[4];
410 u32 crc; 401 u32 crc;
411 struct scatterlist sg;
412 int plen; 402 int plen;
413 int err; 403 int i;
414 404
415 hdr = (struct ieee80211_hdr *)skb->data; 405 hdr = (struct ieee80211_hdr *)skb->data;
416 406
@@ -463,18 +453,9 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
463 453
464 plen = skb->len - hdr_len - 12; 454 plen = skb->len - hdr_len - 12;
465 455
466 crypto_skcipher_setkey(tkey->rx_tfm_arc4, rc4key, 16); 456 crypto_cipher_setkey(tkey->rx_tfm_arc4, rc4key, 16);
467 sg_init_one(&sg, pos, plen + 4); 457 for (i = 0; i < plen + 4; i++)
468 skcipher_request_set_tfm(req, tkey->rx_tfm_arc4); 458 crypto_cipher_decrypt_one(tkey->rx_tfm_arc4, pos + i, pos + i);
469 skcipher_request_set_callback(req, 0, NULL, NULL);
470 skcipher_request_set_crypt(req, &sg, &sg, plen + 4, NULL);
471 err = crypto_skcipher_decrypt(req);
472 skcipher_request_zero(req);
473 if (err) {
474 net_dbg_ratelimited("TKIP: failed to decrypt received packet from %pM\n",
475 hdr->addr2);
476 return -7;
477 }
478 459
479 crc = ~crc32_le(~0, pos, plen); 460 crc = ~crc32_le(~0, pos, plen);
480 icv[0] = crc; 461 icv[0] = crc;
@@ -660,9 +641,9 @@ static int lib80211_tkip_set_key(void *key, int len, u8 * seq, void *priv)
660 struct lib80211_tkip_data *tkey = priv; 641 struct lib80211_tkip_data *tkey = priv;
661 int keyidx; 642 int keyidx;
662 struct crypto_shash *tfm = tkey->tx_tfm_michael; 643 struct crypto_shash *tfm = tkey->tx_tfm_michael;
663 struct crypto_skcipher *tfm2 = tkey->tx_tfm_arc4; 644 struct crypto_cipher *tfm2 = tkey->tx_tfm_arc4;
664 struct crypto_shash *tfm3 = tkey->rx_tfm_michael; 645 struct crypto_shash *tfm3 = tkey->rx_tfm_michael;
665 struct crypto_skcipher *tfm4 = tkey->rx_tfm_arc4; 646 struct crypto_cipher *tfm4 = tkey->rx_tfm_arc4;
666 647
667 keyidx = tkey->key_idx; 648 keyidx = tkey->key_idx;
668 memset(tkey, 0, sizeof(*tkey)); 649 memset(tkey, 0, sizeof(*tkey));
diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c
index d05f58b0fd04..6015f6b542a6 100644
--- a/net/wireless/lib80211_crypt_wep.c
+++ b/net/wireless/lib80211_crypt_wep.c
@@ -22,7 +22,7 @@
22 22
23#include <net/lib80211.h> 23#include <net/lib80211.h>
24 24
25#include <crypto/skcipher.h> 25#include <linux/crypto.h>
26#include <linux/crc32.h> 26#include <linux/crc32.h>
27 27
28MODULE_AUTHOR("Jouni Malinen"); 28MODULE_AUTHOR("Jouni Malinen");
@@ -35,8 +35,8 @@ struct lib80211_wep_data {
35 u8 key[WEP_KEY_LEN + 1]; 35 u8 key[WEP_KEY_LEN + 1];
36 u8 key_len; 36 u8 key_len;
37 u8 key_idx; 37 u8 key_idx;
38 struct crypto_skcipher *tx_tfm; 38 struct crypto_cipher *tx_tfm;
39 struct crypto_skcipher *rx_tfm; 39 struct crypto_cipher *rx_tfm;
40}; 40};
41 41
42static void *lib80211_wep_init(int keyidx) 42static void *lib80211_wep_init(int keyidx)
@@ -48,13 +48,13 @@ static void *lib80211_wep_init(int keyidx)
48 goto fail; 48 goto fail;
49 priv->key_idx = keyidx; 49 priv->key_idx = keyidx;
50 50
51 priv->tx_tfm = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); 51 priv->tx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
52 if (IS_ERR(priv->tx_tfm)) { 52 if (IS_ERR(priv->tx_tfm)) {
53 priv->tx_tfm = NULL; 53 priv->tx_tfm = NULL;
54 goto fail; 54 goto fail;
55 } 55 }
56 56
57 priv->rx_tfm = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); 57 priv->rx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
58 if (IS_ERR(priv->rx_tfm)) { 58 if (IS_ERR(priv->rx_tfm)) {
59 priv->rx_tfm = NULL; 59 priv->rx_tfm = NULL;
60 goto fail; 60 goto fail;
@@ -66,8 +66,8 @@ static void *lib80211_wep_init(int keyidx)
66 66
67 fail: 67 fail:
68 if (priv) { 68 if (priv) {
69 crypto_free_skcipher(priv->tx_tfm); 69 crypto_free_cipher(priv->tx_tfm);
70 crypto_free_skcipher(priv->rx_tfm); 70 crypto_free_cipher(priv->rx_tfm);
71 kfree(priv); 71 kfree(priv);
72 } 72 }
73 return NULL; 73 return NULL;
@@ -77,8 +77,8 @@ static void lib80211_wep_deinit(void *priv)
77{ 77{
78 struct lib80211_wep_data *_priv = priv; 78 struct lib80211_wep_data *_priv = priv;
79 if (_priv) { 79 if (_priv) {
80 crypto_free_skcipher(_priv->tx_tfm); 80 crypto_free_cipher(_priv->tx_tfm);
81 crypto_free_skcipher(_priv->rx_tfm); 81 crypto_free_cipher(_priv->rx_tfm);
82 } 82 }
83 kfree(priv); 83 kfree(priv);
84} 84}
@@ -129,12 +129,10 @@ static int lib80211_wep_build_iv(struct sk_buff *skb, int hdr_len,
129static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv) 129static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
130{ 130{
131 struct lib80211_wep_data *wep = priv; 131 struct lib80211_wep_data *wep = priv;
132 SKCIPHER_REQUEST_ON_STACK(req, wep->tx_tfm);
133 u32 crc, klen, len; 132 u32 crc, klen, len;
134 u8 *pos, *icv; 133 u8 *pos, *icv;
135 struct scatterlist sg;
136 u8 key[WEP_KEY_LEN + 3]; 134 u8 key[WEP_KEY_LEN + 3];
137 int err; 135 int i;
138 136
139 /* other checks are in lib80211_wep_build_iv */ 137 /* other checks are in lib80211_wep_build_iv */
140 if (skb_tailroom(skb) < 4) 138 if (skb_tailroom(skb) < 4)
@@ -162,14 +160,12 @@ static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
162 icv[2] = crc >> 16; 160 icv[2] = crc >> 16;
163 icv[3] = crc >> 24; 161 icv[3] = crc >> 24;
164 162
165 crypto_skcipher_setkey(wep->tx_tfm, key, klen); 163 crypto_cipher_setkey(wep->tx_tfm, key, klen);
166 sg_init_one(&sg, pos, len + 4); 164
167 skcipher_request_set_tfm(req, wep->tx_tfm); 165 for (i = 0; i < len + 4; i++)
168 skcipher_request_set_callback(req, 0, NULL, NULL); 166 crypto_cipher_encrypt_one(wep->tx_tfm, pos + i, pos + i);
169 skcipher_request_set_crypt(req, &sg, &sg, len + 4, NULL); 167
170 err = crypto_skcipher_encrypt(req); 168 return 0;
171 skcipher_request_zero(req);
172 return err;
173} 169}
174 170
175/* Perform WEP decryption on given buffer. Buffer includes whole WEP part of 171/* Perform WEP decryption on given buffer. Buffer includes whole WEP part of
@@ -182,12 +178,10 @@ static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
182static int lib80211_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv) 178static int lib80211_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
183{ 179{
184 struct lib80211_wep_data *wep = priv; 180 struct lib80211_wep_data *wep = priv;
185 SKCIPHER_REQUEST_ON_STACK(req, wep->rx_tfm);
186 u32 crc, klen, plen; 181 u32 crc, klen, plen;
187 u8 key[WEP_KEY_LEN + 3]; 182 u8 key[WEP_KEY_LEN + 3];
188 u8 keyidx, *pos, icv[4]; 183 u8 keyidx, *pos, icv[4];
189 struct scatterlist sg; 184 int i;
190 int err;
191 185
192 if (skb->len < hdr_len + 8) 186 if (skb->len < hdr_len + 8)
193 return -1; 187 return -1;
@@ -208,15 +202,9 @@ static int lib80211_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
208 /* Apply RC4 to data and compute CRC32 over decrypted data */ 202 /* Apply RC4 to data and compute CRC32 over decrypted data */
209 plen = skb->len - hdr_len - 8; 203 plen = skb->len - hdr_len - 8;
210 204
211 crypto_skcipher_setkey(wep->rx_tfm, key, klen); 205 crypto_cipher_setkey(wep->rx_tfm, key, klen);
212 sg_init_one(&sg, pos, plen + 4); 206 for (i = 0; i < plen + 4; i++)
213 skcipher_request_set_tfm(req, wep->rx_tfm); 207 crypto_cipher_decrypt_one(wep->rx_tfm, pos + i, pos + i);
214 skcipher_request_set_callback(req, 0, NULL, NULL);
215 skcipher_request_set_crypt(req, &sg, &sg, plen + 4, NULL);
216 err = crypto_skcipher_decrypt(req);
217 skcipher_request_zero(req);
218 if (err)
219 return -7;
220 208
221 crc = ~crc32_le(~0, pos, plen); 209 crc = ~crc32_le(~0, pos, plen);
222 icv[0] = crc; 210 icv[0] = crc;
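
Both lib80211 hunks above drop the "ecb(arc4)" skcipher request and instead call crypto_cipher_encrypt_one()/decrypt_one() once per payload byte, which for the stateful arc4 cipher effectively XORs each byte with the next RC4 keystream byte. The standalone sketch below shows that keystream generation and XOR in plain C (illustration only: the key and payload are placeholders, and RC4/WEP/TKIP are kept in the tree purely for legacy interoperability, not as anything to build on):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Apply the RC4 (ARC4) keystream to 'data' in place; running it twice
 * with the same key restores the original bytes, as for any stream cipher.
 */
static void rc4_crypt(const uint8_t *key, size_t keylen,
		      uint8_t *data, size_t len)
{
	uint8_t S[256];
	size_t n;
	int i, j = 0;

	for (i = 0; i < 256; i++)		/* key scheduling (KSA) */
		S[i] = i;
	for (i = 0; i < 256; i++) {
		uint8_t tmp;

		j = (j + S[i] + key[i % keylen]) & 0xff;
		tmp = S[i]; S[i] = S[j]; S[j] = tmp;
	}

	i = j = 0;
	for (n = 0; n < len; n++) {		/* keystream generation (PRGA) */
		uint8_t tmp;

		i = (i + 1) & 0xff;
		j = (j + S[i]) & 0xff;
		tmp = S[i]; S[i] = S[j]; S[j] = tmp;
		data[n] ^= S[(S[i] + S[j]) & 0xff];
	}
}

int main(void)
{
	uint8_t rc4key[16] = { 0 };		/* placeholder per-packet key */
	uint8_t payload[] = "example payload";

	rc4_crypt(rc4key, sizeof(rc4key), payload, sizeof(payload) - 1);
	rc4_crypt(rc4key, sizeof(rc4key), payload, sizeof(payload) - 1);
	printf("%s\n", payload);		/* encrypt+decrypt round trip */
	return 0;
}
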
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 176edfefcbaa..744b5851bbf9 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -200,7 +200,46 @@ cfg80211_get_dev_from_info(struct net *netns, struct genl_info *info)
200 return __cfg80211_rdev_from_attrs(netns, info->attrs); 200 return __cfg80211_rdev_from_attrs(netns, info->attrs);
201} 201}
202 202
203static int validate_ie_attr(const struct nlattr *attr,
204 struct netlink_ext_ack *extack)
205{
206 const u8 *pos;
207 int len;
208
209 pos = nla_data(attr);
210 len = nla_len(attr);
211
212 while (len) {
213 u8 elemlen;
214
215 if (len < 2)
216 goto error;
217 len -= 2;
218
219 elemlen = pos[1];
220 if (elemlen > len)
221 goto error;
222
223 len -= elemlen;
224 pos += 2 + elemlen;
225 }
226
227 return 0;
228error:
229 NL_SET_ERR_MSG_ATTR(extack, attr, "malformed information elements");
230 return -EINVAL;
231}
232
203/* policy for the attributes */ 233/* policy for the attributes */
234static const struct nla_policy
235nl80211_ftm_responder_policy[NL80211_FTM_RESP_ATTR_MAX + 1] = {
236 [NL80211_FTM_RESP_ATTR_ENABLED] = { .type = NLA_FLAG, },
237 [NL80211_FTM_RESP_ATTR_LCI] = { .type = NLA_BINARY,
238 .len = U8_MAX },
239 [NL80211_FTM_RESP_ATTR_CIVICLOC] = { .type = NLA_BINARY,
240 .len = U8_MAX },
241};
242
204static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { 243static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
205 [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, 244 [NL80211_ATTR_WIPHY] = { .type = NLA_U32 },
206 [NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING, 245 [NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING,
@@ -213,14 +252,14 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
213 [NL80211_ATTR_CENTER_FREQ1] = { .type = NLA_U32 }, 252 [NL80211_ATTR_CENTER_FREQ1] = { .type = NLA_U32 },
214 [NL80211_ATTR_CENTER_FREQ2] = { .type = NLA_U32 }, 253 [NL80211_ATTR_CENTER_FREQ2] = { .type = NLA_U32 },
215 254
216 [NL80211_ATTR_WIPHY_RETRY_SHORT] = { .type = NLA_U8 }, 255 [NL80211_ATTR_WIPHY_RETRY_SHORT] = NLA_POLICY_MIN(NLA_U8, 1),
217 [NL80211_ATTR_WIPHY_RETRY_LONG] = { .type = NLA_U8 }, 256 [NL80211_ATTR_WIPHY_RETRY_LONG] = NLA_POLICY_MIN(NLA_U8, 1),
218 [NL80211_ATTR_WIPHY_FRAG_THRESHOLD] = { .type = NLA_U32 }, 257 [NL80211_ATTR_WIPHY_FRAG_THRESHOLD] = { .type = NLA_U32 },
219 [NL80211_ATTR_WIPHY_RTS_THRESHOLD] = { .type = NLA_U32 }, 258 [NL80211_ATTR_WIPHY_RTS_THRESHOLD] = { .type = NLA_U32 },
220 [NL80211_ATTR_WIPHY_COVERAGE_CLASS] = { .type = NLA_U8 }, 259 [NL80211_ATTR_WIPHY_COVERAGE_CLASS] = { .type = NLA_U8 },
221 [NL80211_ATTR_WIPHY_DYN_ACK] = { .type = NLA_FLAG }, 260 [NL80211_ATTR_WIPHY_DYN_ACK] = { .type = NLA_FLAG },
222 261
223 [NL80211_ATTR_IFTYPE] = { .type = NLA_U32 }, 262 [NL80211_ATTR_IFTYPE] = NLA_POLICY_MAX(NLA_U32, NL80211_IFTYPE_MAX),
224 [NL80211_ATTR_IFINDEX] = { .type = NLA_U32 }, 263 [NL80211_ATTR_IFINDEX] = { .type = NLA_U32 },
225 [NL80211_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ-1 }, 264 [NL80211_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ-1 },
226 265
@@ -230,24 +269,28 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
230 [NL80211_ATTR_KEY] = { .type = NLA_NESTED, }, 269 [NL80211_ATTR_KEY] = { .type = NLA_NESTED, },
231 [NL80211_ATTR_KEY_DATA] = { .type = NLA_BINARY, 270 [NL80211_ATTR_KEY_DATA] = { .type = NLA_BINARY,
232 .len = WLAN_MAX_KEY_LEN }, 271 .len = WLAN_MAX_KEY_LEN },
233 [NL80211_ATTR_KEY_IDX] = { .type = NLA_U8 }, 272 [NL80211_ATTR_KEY_IDX] = NLA_POLICY_MAX(NLA_U8, 5),
234 [NL80211_ATTR_KEY_CIPHER] = { .type = NLA_U32 }, 273 [NL80211_ATTR_KEY_CIPHER] = { .type = NLA_U32 },
235 [NL80211_ATTR_KEY_DEFAULT] = { .type = NLA_FLAG }, 274 [NL80211_ATTR_KEY_DEFAULT] = { .type = NLA_FLAG },
236 [NL80211_ATTR_KEY_SEQ] = { .type = NLA_BINARY, .len = 16 }, 275 [NL80211_ATTR_KEY_SEQ] = { .type = NLA_BINARY, .len = 16 },
237 [NL80211_ATTR_KEY_TYPE] = { .type = NLA_U32 }, 276 [NL80211_ATTR_KEY_TYPE] =
277 NLA_POLICY_MAX(NLA_U32, NUM_NL80211_KEYTYPES),
238 278
239 [NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 }, 279 [NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 },
240 [NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 }, 280 [NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 },
241 [NL80211_ATTR_BEACON_HEAD] = { .type = NLA_BINARY, 281 [NL80211_ATTR_BEACON_HEAD] = { .type = NLA_BINARY,
242 .len = IEEE80211_MAX_DATA_LEN }, 282 .len = IEEE80211_MAX_DATA_LEN },
243 [NL80211_ATTR_BEACON_TAIL] = { .type = NLA_BINARY, 283 [NL80211_ATTR_BEACON_TAIL] =
244 .len = IEEE80211_MAX_DATA_LEN }, 284 NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr,
245 [NL80211_ATTR_STA_AID] = { .type = NLA_U16 }, 285 IEEE80211_MAX_DATA_LEN),
286 [NL80211_ATTR_STA_AID] =
287 NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID),
246 [NL80211_ATTR_STA_FLAGS] = { .type = NLA_NESTED }, 288 [NL80211_ATTR_STA_FLAGS] = { .type = NLA_NESTED },
247 [NL80211_ATTR_STA_LISTEN_INTERVAL] = { .type = NLA_U16 }, 289 [NL80211_ATTR_STA_LISTEN_INTERVAL] = { .type = NLA_U16 },
248 [NL80211_ATTR_STA_SUPPORTED_RATES] = { .type = NLA_BINARY, 290 [NL80211_ATTR_STA_SUPPORTED_RATES] = { .type = NLA_BINARY,
249 .len = NL80211_MAX_SUPP_RATES }, 291 .len = NL80211_MAX_SUPP_RATES },
250 [NL80211_ATTR_STA_PLINK_ACTION] = { .type = NLA_U8 }, 292 [NL80211_ATTR_STA_PLINK_ACTION] =
293 NLA_POLICY_MAX(NLA_U8, NUM_NL80211_PLINK_ACTIONS - 1),
251 [NL80211_ATTR_STA_VLAN] = { .type = NLA_U32 }, 294 [NL80211_ATTR_STA_VLAN] = { .type = NLA_U32 },
252 [NL80211_ATTR_MNTR_FLAGS] = { /* NLA_NESTED can't be empty */ }, 295 [NL80211_ATTR_MNTR_FLAGS] = { /* NLA_NESTED can't be empty */ },
253 [NL80211_ATTR_MESH_ID] = { .type = NLA_BINARY, 296 [NL80211_ATTR_MESH_ID] = { .type = NLA_BINARY,
@@ -270,8 +313,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
270 [NL80211_ATTR_HT_CAPABILITY] = { .len = NL80211_HT_CAPABILITY_LEN }, 313 [NL80211_ATTR_HT_CAPABILITY] = { .len = NL80211_HT_CAPABILITY_LEN },
271 314
272 [NL80211_ATTR_MGMT_SUBTYPE] = { .type = NLA_U8 }, 315 [NL80211_ATTR_MGMT_SUBTYPE] = { .type = NLA_U8 },
273 [NL80211_ATTR_IE] = { .type = NLA_BINARY, 316 [NL80211_ATTR_IE] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
274 .len = IEEE80211_MAX_DATA_LEN }, 317 validate_ie_attr,
318 IEEE80211_MAX_DATA_LEN),
275 [NL80211_ATTR_SCAN_FREQUENCIES] = { .type = NLA_NESTED }, 319 [NL80211_ATTR_SCAN_FREQUENCIES] = { .type = NLA_NESTED },
276 [NL80211_ATTR_SCAN_SSIDS] = { .type = NLA_NESTED }, 320 [NL80211_ATTR_SCAN_SSIDS] = { .type = NLA_NESTED },
277 321
@@ -281,7 +325,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
281 [NL80211_ATTR_REASON_CODE] = { .type = NLA_U16 }, 325 [NL80211_ATTR_REASON_CODE] = { .type = NLA_U16 },
282 [NL80211_ATTR_FREQ_FIXED] = { .type = NLA_FLAG }, 326 [NL80211_ATTR_FREQ_FIXED] = { .type = NLA_FLAG },
283 [NL80211_ATTR_TIMED_OUT] = { .type = NLA_FLAG }, 327 [NL80211_ATTR_TIMED_OUT] = { .type = NLA_FLAG },
284 [NL80211_ATTR_USE_MFP] = { .type = NLA_U32 }, 328 [NL80211_ATTR_USE_MFP] = NLA_POLICY_RANGE(NLA_U32,
329 NL80211_MFP_NO,
330 NL80211_MFP_OPTIONAL),
285 [NL80211_ATTR_STA_FLAGS2] = { 331 [NL80211_ATTR_STA_FLAGS2] = {
286 .len = sizeof(struct nl80211_sta_flag_update), 332 .len = sizeof(struct nl80211_sta_flag_update),
287 }, 333 },
@@ -301,7 +347,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
301 [NL80211_ATTR_FRAME] = { .type = NLA_BINARY, 347 [NL80211_ATTR_FRAME] = { .type = NLA_BINARY,
302 .len = IEEE80211_MAX_DATA_LEN }, 348 .len = IEEE80211_MAX_DATA_LEN },
303 [NL80211_ATTR_FRAME_MATCH] = { .type = NLA_BINARY, }, 349 [NL80211_ATTR_FRAME_MATCH] = { .type = NLA_BINARY, },
304 [NL80211_ATTR_PS_STATE] = { .type = NLA_U32 }, 350 [NL80211_ATTR_PS_STATE] = NLA_POLICY_RANGE(NLA_U32,
351 NL80211_PS_DISABLED,
352 NL80211_PS_ENABLED),
305 [NL80211_ATTR_CQM] = { .type = NLA_NESTED, }, 353 [NL80211_ATTR_CQM] = { .type = NLA_NESTED, },
306 [NL80211_ATTR_LOCAL_STATE_CHANGE] = { .type = NLA_FLAG }, 354 [NL80211_ATTR_LOCAL_STATE_CHANGE] = { .type = NLA_FLAG },
307 [NL80211_ATTR_AP_ISOLATE] = { .type = NLA_U8 }, 355 [NL80211_ATTR_AP_ISOLATE] = { .type = NLA_U8 },
@@ -314,15 +362,23 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
314 [NL80211_ATTR_OFFCHANNEL_TX_OK] = { .type = NLA_FLAG }, 362 [NL80211_ATTR_OFFCHANNEL_TX_OK] = { .type = NLA_FLAG },
315 [NL80211_ATTR_KEY_DEFAULT_TYPES] = { .type = NLA_NESTED }, 363 [NL80211_ATTR_KEY_DEFAULT_TYPES] = { .type = NLA_NESTED },
316 [NL80211_ATTR_WOWLAN_TRIGGERS] = { .type = NLA_NESTED }, 364 [NL80211_ATTR_WOWLAN_TRIGGERS] = { .type = NLA_NESTED },
317 [NL80211_ATTR_STA_PLINK_STATE] = { .type = NLA_U8 }, 365 [NL80211_ATTR_STA_PLINK_STATE] =
366 NLA_POLICY_MAX(NLA_U8, NUM_NL80211_PLINK_STATES - 1),
367 [NL80211_ATTR_MESH_PEER_AID] =
368 NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID),
318 [NL80211_ATTR_SCHED_SCAN_INTERVAL] = { .type = NLA_U32 }, 369 [NL80211_ATTR_SCHED_SCAN_INTERVAL] = { .type = NLA_U32 },
319 [NL80211_ATTR_REKEY_DATA] = { .type = NLA_NESTED }, 370 [NL80211_ATTR_REKEY_DATA] = { .type = NLA_NESTED },
320 [NL80211_ATTR_SCAN_SUPP_RATES] = { .type = NLA_NESTED }, 371 [NL80211_ATTR_SCAN_SUPP_RATES] = { .type = NLA_NESTED },
321 [NL80211_ATTR_HIDDEN_SSID] = { .type = NLA_U32 }, 372 [NL80211_ATTR_HIDDEN_SSID] =
322 [NL80211_ATTR_IE_PROBE_RESP] = { .type = NLA_BINARY, 373 NLA_POLICY_RANGE(NLA_U32,
323 .len = IEEE80211_MAX_DATA_LEN }, 374 NL80211_HIDDEN_SSID_NOT_IN_USE,
324 [NL80211_ATTR_IE_ASSOC_RESP] = { .type = NLA_BINARY, 375 NL80211_HIDDEN_SSID_ZERO_CONTENTS),
325 .len = IEEE80211_MAX_DATA_LEN }, 376 [NL80211_ATTR_IE_PROBE_RESP] =
377 NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr,
378 IEEE80211_MAX_DATA_LEN),
379 [NL80211_ATTR_IE_ASSOC_RESP] =
380 NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr,
381 IEEE80211_MAX_DATA_LEN),
326 [NL80211_ATTR_ROAM_SUPPORT] = { .type = NLA_FLAG }, 382 [NL80211_ATTR_ROAM_SUPPORT] = { .type = NLA_FLAG },
327 [NL80211_ATTR_SCHED_SCAN_MATCH] = { .type = NLA_NESTED }, 383 [NL80211_ATTR_SCHED_SCAN_MATCH] = { .type = NLA_NESTED },
328 [NL80211_ATTR_TX_NO_CCK_RATE] = { .type = NLA_FLAG }, 384 [NL80211_ATTR_TX_NO_CCK_RATE] = { .type = NLA_FLAG },
@@ -348,9 +404,12 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
348 [NL80211_ATTR_AUTH_DATA] = { .type = NLA_BINARY, }, 404 [NL80211_ATTR_AUTH_DATA] = { .type = NLA_BINARY, },
349 [NL80211_ATTR_VHT_CAPABILITY] = { .len = NL80211_VHT_CAPABILITY_LEN }, 405 [NL80211_ATTR_VHT_CAPABILITY] = { .len = NL80211_VHT_CAPABILITY_LEN },
350 [NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 }, 406 [NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 },
351 [NL80211_ATTR_P2P_CTWINDOW] = { .type = NLA_U8 }, 407 [NL80211_ATTR_P2P_CTWINDOW] = NLA_POLICY_MAX(NLA_U8, 127),
352 [NL80211_ATTR_P2P_OPPPS] = { .type = NLA_U8 }, 408 [NL80211_ATTR_P2P_OPPPS] = NLA_POLICY_MAX(NLA_U8, 1),
353 [NL80211_ATTR_LOCAL_MESH_POWER_MODE] = {. type = NLA_U32 }, 409 [NL80211_ATTR_LOCAL_MESH_POWER_MODE] =
410 NLA_POLICY_RANGE(NLA_U32,
411 NL80211_MESH_POWER_UNKNOWN + 1,
412 NL80211_MESH_POWER_MAX),
354 [NL80211_ATTR_ACL_POLICY] = {. type = NLA_U32 }, 413 [NL80211_ATTR_ACL_POLICY] = {. type = NLA_U32 },
355 [NL80211_ATTR_MAC_ADDRS] = { .type = NLA_NESTED }, 414 [NL80211_ATTR_MAC_ADDRS] = { .type = NLA_NESTED },
356 [NL80211_ATTR_STA_CAPABILITY] = { .type = NLA_U16 }, 415 [NL80211_ATTR_STA_CAPABILITY] = { .type = NLA_U16 },
@@ -363,7 +422,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
363 [NL80211_ATTR_MDID] = { .type = NLA_U16 }, 422 [NL80211_ATTR_MDID] = { .type = NLA_U16 },
364 [NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY, 423 [NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY,
365 .len = IEEE80211_MAX_DATA_LEN }, 424 .len = IEEE80211_MAX_DATA_LEN },
366 [NL80211_ATTR_PEER_AID] = { .type = NLA_U16 }, 425 [NL80211_ATTR_PEER_AID] =
426 NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID),
367 [NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 }, 427 [NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 },
368 [NL80211_ATTR_CH_SWITCH_BLOCK_TX] = { .type = NLA_FLAG }, 428 [NL80211_ATTR_CH_SWITCH_BLOCK_TX] = { .type = NLA_FLAG },
369 [NL80211_ATTR_CSA_IES] = { .type = NLA_NESTED }, 429 [NL80211_ATTR_CSA_IES] = { .type = NLA_NESTED },
@@ -384,8 +444,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
384 [NL80211_ATTR_SOCKET_OWNER] = { .type = NLA_FLAG }, 444 [NL80211_ATTR_SOCKET_OWNER] = { .type = NLA_FLAG },
385 [NL80211_ATTR_CSA_C_OFFSETS_TX] = { .type = NLA_BINARY }, 445 [NL80211_ATTR_CSA_C_OFFSETS_TX] = { .type = NLA_BINARY },
386 [NL80211_ATTR_USE_RRM] = { .type = NLA_FLAG }, 446 [NL80211_ATTR_USE_RRM] = { .type = NLA_FLAG },
387 [NL80211_ATTR_TSID] = { .type = NLA_U8 }, 447 [NL80211_ATTR_TSID] = NLA_POLICY_MAX(NLA_U8, IEEE80211_NUM_TIDS - 1),
388 [NL80211_ATTR_USER_PRIO] = { .type = NLA_U8 }, 448 [NL80211_ATTR_USER_PRIO] =
449 NLA_POLICY_MAX(NLA_U8, IEEE80211_NUM_UPS - 1),
389 [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 }, 450 [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 },
390 [NL80211_ATTR_SMPS_MODE] = { .type = NLA_U8 }, 451 [NL80211_ATTR_SMPS_MODE] = { .type = NLA_U8 },
391 [NL80211_ATTR_MAC_MASK] = { .len = ETH_ALEN }, 452 [NL80211_ATTR_MAC_MASK] = { .len = ETH_ALEN },
@@ -395,12 +456,13 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
395 [NL80211_ATTR_REG_INDOOR] = { .type = NLA_FLAG }, 456 [NL80211_ATTR_REG_INDOOR] = { .type = NLA_FLAG },
396 [NL80211_ATTR_PBSS] = { .type = NLA_FLAG }, 457 [NL80211_ATTR_PBSS] = { .type = NLA_FLAG },
397 [NL80211_ATTR_BSS_SELECT] = { .type = NLA_NESTED }, 458 [NL80211_ATTR_BSS_SELECT] = { .type = NLA_NESTED },
398 [NL80211_ATTR_STA_SUPPORT_P2P_PS] = { .type = NLA_U8 }, 459 [NL80211_ATTR_STA_SUPPORT_P2P_PS] =
460 NLA_POLICY_MAX(NLA_U8, NUM_NL80211_P2P_PS_STATUS - 1),
399 [NL80211_ATTR_MU_MIMO_GROUP_DATA] = { 461 [NL80211_ATTR_MU_MIMO_GROUP_DATA] = {
400 .len = VHT_MUMIMO_GROUPS_DATA_LEN 462 .len = VHT_MUMIMO_GROUPS_DATA_LEN
401 }, 463 },
402 [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN }, 464 [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN },
403 [NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 }, 465 [NL80211_ATTR_NAN_MASTER_PREF] = NLA_POLICY_MIN(NLA_U8, 1),
404 [NL80211_ATTR_BANDS] = { .type = NLA_U32 }, 466 [NL80211_ATTR_BANDS] = { .type = NLA_U32 },
405 [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, 467 [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED },
406 [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY, 468 [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY,
@@ -430,6 +492,11 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
430 [NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 }, 492 [NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 },
431 [NL80211_ATTR_HE_CAPABILITY] = { .type = NLA_BINARY, 493 [NL80211_ATTR_HE_CAPABILITY] = { .type = NLA_BINARY,
432 .len = NL80211_HE_MAX_CAPABILITY_LEN }, 494 .len = NL80211_HE_MAX_CAPABILITY_LEN },
495
496 [NL80211_ATTR_FTM_RESPONDER] = {
497 .type = NLA_NESTED,
498 .validation_data = nl80211_ftm_responder_policy,
499 },
433}; 500};
434 501
435/* policy for the key attributes */ 502/* policy for the key attributes */
@@ -440,7 +507,7 @@ static const struct nla_policy nl80211_key_policy[NL80211_KEY_MAX + 1] = {
440 [NL80211_KEY_SEQ] = { .type = NLA_BINARY, .len = 16 }, 507 [NL80211_KEY_SEQ] = { .type = NLA_BINARY, .len = 16 },
441 [NL80211_KEY_DEFAULT] = { .type = NLA_FLAG }, 508 [NL80211_KEY_DEFAULT] = { .type = NLA_FLAG },
442 [NL80211_KEY_DEFAULT_MGMT] = { .type = NLA_FLAG }, 509 [NL80211_KEY_DEFAULT_MGMT] = { .type = NLA_FLAG },
443 [NL80211_KEY_TYPE] = { .type = NLA_U32 }, 510 [NL80211_KEY_TYPE] = NLA_POLICY_MAX(NLA_U32, NUM_NL80211_KEYTYPES - 1),
444 [NL80211_KEY_DEFAULT_TYPES] = { .type = NLA_NESTED }, 511 [NL80211_KEY_DEFAULT_TYPES] = { .type = NLA_NESTED },
445}; 512};
446 513
@@ -491,7 +558,10 @@ nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = {
491static const struct nla_policy 558static const struct nla_policy
492nl80211_coalesce_policy[NUM_NL80211_ATTR_COALESCE_RULE] = { 559nl80211_coalesce_policy[NUM_NL80211_ATTR_COALESCE_RULE] = {
493 [NL80211_ATTR_COALESCE_RULE_DELAY] = { .type = NLA_U32 }, 560 [NL80211_ATTR_COALESCE_RULE_DELAY] = { .type = NLA_U32 },
494 [NL80211_ATTR_COALESCE_RULE_CONDITION] = { .type = NLA_U32 }, 561 [NL80211_ATTR_COALESCE_RULE_CONDITION] =
562 NLA_POLICY_RANGE(NLA_U32,
563 NL80211_COALESCE_CONDITION_MATCH,
564 NL80211_COALESCE_CONDITION_NO_MATCH),
495 [NL80211_ATTR_COALESCE_RULE_PKT_PATTERN] = { .type = NLA_NESTED }, 565 [NL80211_ATTR_COALESCE_RULE_PKT_PATTERN] = { .type = NLA_NESTED },
496}; 566};
497 567
@@ -567,8 +637,7 @@ nl80211_packet_pattern_policy[MAX_NL80211_PKTPAT + 1] = {
567 [NL80211_PKTPAT_OFFSET] = { .type = NLA_U32 }, 637 [NL80211_PKTPAT_OFFSET] = { .type = NLA_U32 },
568}; 638};
569 639
570static int nl80211_prepare_wdev_dump(struct sk_buff *skb, 640static int nl80211_prepare_wdev_dump(struct netlink_callback *cb,
571 struct netlink_callback *cb,
572 struct cfg80211_registered_device **rdev, 641 struct cfg80211_registered_device **rdev,
573 struct wireless_dev **wdev) 642 struct wireless_dev **wdev)
574{ 643{
@@ -582,7 +651,7 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
582 return err; 651 return err;
583 652
584 *wdev = __cfg80211_wdev_from_attrs( 653 *wdev = __cfg80211_wdev_from_attrs(
585 sock_net(skb->sk), 654 sock_net(cb->skb->sk),
586 genl_family_attrbuf(&nl80211_fam)); 655 genl_family_attrbuf(&nl80211_fam));
587 if (IS_ERR(*wdev)) 656 if (IS_ERR(*wdev))
588 return PTR_ERR(*wdev); 657 return PTR_ERR(*wdev);
@@ -614,36 +683,6 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
614 return 0; 683 return 0;
615} 684}
616 685
617/* IE validation */
618static bool is_valid_ie_attr(const struct nlattr *attr)
619{
620 const u8 *pos;
621 int len;
622
623 if (!attr)
624 return true;
625
626 pos = nla_data(attr);
627 len = nla_len(attr);
628
629 while (len) {
630 u8 elemlen;
631
632 if (len < 2)
633 return false;
634 len -= 2;
635
636 elemlen = pos[1];
637 if (elemlen > len)
638 return false;
639
640 len -= elemlen;
641 pos += 2 + elemlen;
642 }
643
644 return true;
645}
646
647/* message building helper */ 686/* message building helper */
648static inline void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq, 687static inline void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
649 int flags, u8 cmd) 688 int flags, u8 cmd)
@@ -858,12 +897,8 @@ static int nl80211_parse_key_new(struct genl_info *info, struct nlattr *key,
858 if (tb[NL80211_KEY_CIPHER]) 897 if (tb[NL80211_KEY_CIPHER])
859 k->p.cipher = nla_get_u32(tb[NL80211_KEY_CIPHER]); 898 k->p.cipher = nla_get_u32(tb[NL80211_KEY_CIPHER]);
860 899
861 if (tb[NL80211_KEY_TYPE]) { 900 if (tb[NL80211_KEY_TYPE])
862 k->type = nla_get_u32(tb[NL80211_KEY_TYPE]); 901 k->type = nla_get_u32(tb[NL80211_KEY_TYPE]);
863 if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES)
864 return genl_err_attr(info, -EINVAL,
865 tb[NL80211_KEY_TYPE]);
866 }
867 902
868 if (tb[NL80211_KEY_DEFAULT_TYPES]) { 903 if (tb[NL80211_KEY_DEFAULT_TYPES]) {
869 struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES]; 904 struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES];
@@ -910,13 +945,8 @@ static int nl80211_parse_key_old(struct genl_info *info, struct key_parse *k)
910 if (k->defmgmt) 945 if (k->defmgmt)
911 k->def_multi = true; 946 k->def_multi = true;
912 947
913 if (info->attrs[NL80211_ATTR_KEY_TYPE]) { 948 if (info->attrs[NL80211_ATTR_KEY_TYPE])
914 k->type = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]); 949 k->type = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]);
915 if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES) {
916 GENL_SET_ERR_MSG(info, "key type out of range");
917 return -EINVAL;
918 }
919 }
920 950
921 if (info->attrs[NL80211_ATTR_KEY_DEFAULT_TYPES]) { 951 if (info->attrs[NL80211_ATTR_KEY_DEFAULT_TYPES]) {
922 struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES]; 952 struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES];
@@ -2292,12 +2322,14 @@ static int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
2292 struct genl_info *info, 2322 struct genl_info *info,
2293 struct cfg80211_chan_def *chandef) 2323 struct cfg80211_chan_def *chandef)
2294{ 2324{
2325 struct netlink_ext_ack *extack = info->extack;
2326 struct nlattr **attrs = info->attrs;
2295 u32 control_freq; 2327 u32 control_freq;
2296 2328
2297 if (!info->attrs[NL80211_ATTR_WIPHY_FREQ]) 2329 if (!attrs[NL80211_ATTR_WIPHY_FREQ])
2298 return -EINVAL; 2330 return -EINVAL;
2299 2331
2300 control_freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]); 2332 control_freq = nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ]);
2301 2333
2302 chandef->chan = ieee80211_get_channel(&rdev->wiphy, control_freq); 2334 chandef->chan = ieee80211_get_channel(&rdev->wiphy, control_freq);
2303 chandef->width = NL80211_CHAN_WIDTH_20_NOHT; 2335 chandef->width = NL80211_CHAN_WIDTH_20_NOHT;
@@ -2305,14 +2337,16 @@ static int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
2305 chandef->center_freq2 = 0; 2337 chandef->center_freq2 = 0;
2306 2338
2307 /* Primary channel not allowed */ 2339 /* Primary channel not allowed */
2308 if (!chandef->chan || chandef->chan->flags & IEEE80211_CHAN_DISABLED) 2340 if (!chandef->chan || chandef->chan->flags & IEEE80211_CHAN_DISABLED) {
2341 NL_SET_ERR_MSG_ATTR(extack, attrs[NL80211_ATTR_WIPHY_FREQ],
2342 "Channel is disabled");
2309 return -EINVAL; 2343 return -EINVAL;
2344 }
2310 2345
2311 if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) { 2346 if (attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) {
2312 enum nl80211_channel_type chantype; 2347 enum nl80211_channel_type chantype;
2313 2348
2314 chantype = nla_get_u32( 2349 chantype = nla_get_u32(attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]);
2315 info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]);
2316 2350
2317 switch (chantype) { 2351 switch (chantype) {
2318 case NL80211_CHAN_NO_HT: 2352 case NL80211_CHAN_NO_HT:
@@ -2322,42 +2356,56 @@ static int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
2322 cfg80211_chandef_create(chandef, chandef->chan, 2356 cfg80211_chandef_create(chandef, chandef->chan,
2323 chantype); 2357 chantype);
2324 /* user input for center_freq is incorrect */ 2358 /* user input for center_freq is incorrect */
2325 if (info->attrs[NL80211_ATTR_CENTER_FREQ1] && 2359 if (attrs[NL80211_ATTR_CENTER_FREQ1] &&
2326 chandef->center_freq1 != nla_get_u32( 2360 chandef->center_freq1 != nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1])) {
2327 info->attrs[NL80211_ATTR_CENTER_FREQ1])) 2361 NL_SET_ERR_MSG_ATTR(extack,
2362 attrs[NL80211_ATTR_CENTER_FREQ1],
2363 "bad center frequency 1");
2328 return -EINVAL; 2364 return -EINVAL;
2365 }
2329 /* center_freq2 must be zero */ 2366 /* center_freq2 must be zero */
2330 if (info->attrs[NL80211_ATTR_CENTER_FREQ2] && 2367 if (attrs[NL80211_ATTR_CENTER_FREQ2] &&
2331 nla_get_u32(info->attrs[NL80211_ATTR_CENTER_FREQ2])) 2368 nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2])) {
2369 NL_SET_ERR_MSG_ATTR(extack,
2370 attrs[NL80211_ATTR_CENTER_FREQ2],
2371 "center frequency 2 can't be used");
2332 return -EINVAL; 2372 return -EINVAL;
2373 }
2333 break; 2374 break;
2334 default: 2375 default:
2376 NL_SET_ERR_MSG_ATTR(extack,
2377 attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE],
2378 "invalid channel type");
2335 return -EINVAL; 2379 return -EINVAL;
2336 } 2380 }
2337 } else if (info->attrs[NL80211_ATTR_CHANNEL_WIDTH]) { 2381 } else if (attrs[NL80211_ATTR_CHANNEL_WIDTH]) {
2338 chandef->width = 2382 chandef->width =
2339 nla_get_u32(info->attrs[NL80211_ATTR_CHANNEL_WIDTH]); 2383 nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]);
2340 if (info->attrs[NL80211_ATTR_CENTER_FREQ1]) 2384 if (attrs[NL80211_ATTR_CENTER_FREQ1])
2341 chandef->center_freq1 = 2385 chandef->center_freq1 =
2342 nla_get_u32( 2386 nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]);
2343 info->attrs[NL80211_ATTR_CENTER_FREQ1]); 2387 if (attrs[NL80211_ATTR_CENTER_FREQ2])
2344 if (info->attrs[NL80211_ATTR_CENTER_FREQ2])
2345 chandef->center_freq2 = 2388 chandef->center_freq2 =
2346 nla_get_u32( 2389 nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]);
2347 info->attrs[NL80211_ATTR_CENTER_FREQ2]);
2348 } 2390 }
2349 2391
2350 if (!cfg80211_chandef_valid(chandef)) 2392 if (!cfg80211_chandef_valid(chandef)) {
2393 NL_SET_ERR_MSG(extack, "invalid channel definition");
2351 return -EINVAL; 2394 return -EINVAL;
2395 }
2352 2396
2353 if (!cfg80211_chandef_usable(&rdev->wiphy, chandef, 2397 if (!cfg80211_chandef_usable(&rdev->wiphy, chandef,
2354 IEEE80211_CHAN_DISABLED)) 2398 IEEE80211_CHAN_DISABLED)) {
2399 NL_SET_ERR_MSG(extack, "(extension) channel is disabled");
2355 return -EINVAL; 2400 return -EINVAL;
2401 }
2356 2402
2357 if ((chandef->width == NL80211_CHAN_WIDTH_5 || 2403 if ((chandef->width == NL80211_CHAN_WIDTH_5 ||
2358 chandef->width == NL80211_CHAN_WIDTH_10) && 2404 chandef->width == NL80211_CHAN_WIDTH_10) &&
2359 !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_5_10_MHZ)) 2405 !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_5_10_MHZ)) {
2406 NL_SET_ERR_MSG(extack, "5/10 MHz not supported");
2360 return -EINVAL; 2407 return -EINVAL;
2408 }
2361 2409
2362 return 0; 2410 return 0;
2363} 2411}
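The chandef parsing hunk above also adds extack reporting: rather than returning a bare -EINVAL, the handler points at the offending attribute and attaches a message that userspace can display. A minimal sketch of the same idea in a generic netlink doit handler, with a made-up attribute index:

/* Illustrative only: attaching per-attribute error strings via extack. */
#include <net/genetlink.h>

#define EXAMPLE_ATTR_FREQ 1	/* hypothetical attribute index */

static int example_doit(struct sk_buff *skb, struct genl_info *info)
{
	struct netlink_ext_ack *extack = info->extack;
	struct nlattr *freq = info->attrs[EXAMPLE_ATTR_FREQ];

	if (!freq) {
		NL_SET_ERR_MSG(extack, "frequency attribute is required");
		return -EINVAL;
	}

	if (!nla_get_u32(freq)) {
		NL_SET_ERR_MSG_ATTR(extack, freq,
				    "frequency must be non-zero");
		return -EINVAL;
	}

	return 0;
}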
@@ -2617,8 +2665,6 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
2617 if (info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]) { 2665 if (info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]) {
2618 retry_short = nla_get_u8( 2666 retry_short = nla_get_u8(
2619 info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]); 2667 info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]);
2620 if (retry_short == 0)
2621 return -EINVAL;
2622 2668
2623 changed |= WIPHY_PARAM_RETRY_SHORT; 2669 changed |= WIPHY_PARAM_RETRY_SHORT;
2624 } 2670 }
@@ -2626,8 +2672,6 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
2626 if (info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]) { 2672 if (info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]) {
2627 retry_long = nla_get_u8( 2673 retry_long = nla_get_u8(
2628 info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]); 2674 info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]);
2629 if (retry_long == 0)
2630 return -EINVAL;
2631 2675
2632 changed |= WIPHY_PARAM_RETRY_LONG; 2676 changed |= WIPHY_PARAM_RETRY_LONG;
2633 } 2677 }
@@ -3119,8 +3163,6 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
3119 ntype = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]); 3163 ntype = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]);
3120 if (otype != ntype) 3164 if (otype != ntype)
3121 change = true; 3165 change = true;
3122 if (ntype > NL80211_IFTYPE_MAX)
3123 return -EINVAL;
3124 } 3166 }
3125 3167
3126 if (info->attrs[NL80211_ATTR_MESH_ID]) { 3168 if (info->attrs[NL80211_ATTR_MESH_ID]) {
@@ -3185,11 +3227,8 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
3185 if (!info->attrs[NL80211_ATTR_IFNAME]) 3227 if (!info->attrs[NL80211_ATTR_IFNAME])
3186 return -EINVAL; 3228 return -EINVAL;
3187 3229
3188 if (info->attrs[NL80211_ATTR_IFTYPE]) { 3230 if (info->attrs[NL80211_ATTR_IFTYPE])
3189 type = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]); 3231 type = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]);
3190 if (type > NL80211_IFTYPE_MAX)
3191 return -EINVAL;
3192 }
3193 3232
3194 if (!rdev->ops->add_virtual_intf || 3233 if (!rdev->ops->add_virtual_intf ||
3195 !(rdev->wiphy.interface_modes & (1 << type))) 3234 !(rdev->wiphy.interface_modes & (1 << type)))
@@ -3252,15 +3291,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
3252 * P2P Device and NAN do not have a netdev, so don't go 3291 * P2P Device and NAN do not have a netdev, so don't go
3253 * through the netdev notifier and must be added here 3292 * through the netdev notifier and must be added here
3254 */ 3293 */
3255 mutex_init(&wdev->mtx); 3294 cfg80211_init_wdev(rdev, wdev);
3256 INIT_LIST_HEAD(&wdev->event_list);
3257 spin_lock_init(&wdev->event_lock);
3258 INIT_LIST_HEAD(&wdev->mgmt_registrations);
3259 spin_lock_init(&wdev->mgmt_registrations_lock);
3260
3261 wdev->identifier = ++rdev->wdev_id;
3262 list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list);
3263 rdev->devlist_generation++;
3264 break; 3295 break;
3265 default: 3296 default:
3266 break; 3297 break;
@@ -3272,15 +3303,6 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
3272 return -ENOBUFS; 3303 return -ENOBUFS;
3273 } 3304 }
3274 3305
3275 /*
3276 * For wdevs which have no associated netdev object (e.g. of type
3277 * NL80211_IFTYPE_P2P_DEVICE), emit the NEW_INTERFACE event here.
3278 * For all other types, the event will be generated from the
3279 * netdev notifier
3280 */
3281 if (!wdev->netdev)
3282 nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
3283
3284 return genlmsg_reply(msg, info); 3306 return genlmsg_reply(msg, info);
3285} 3307}
3286 3308
@@ -3359,7 +3381,7 @@ static void get_key_callback(void *c, struct key_params *params)
3359 params->cipher))) 3381 params->cipher)))
3360 goto nla_put_failure; 3382 goto nla_put_failure;
3361 3383
3362 if (nla_put_u8(cookie->msg, NL80211_ATTR_KEY_IDX, cookie->idx)) 3384 if (nla_put_u8(cookie->msg, NL80211_KEY_IDX, cookie->idx))
3363 goto nla_put_failure; 3385 goto nla_put_failure;
3364 3386
3365 nla_nest_end(cookie->msg, key); 3387 nla_nest_end(cookie->msg, key);
@@ -3386,9 +3408,6 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
3386 if (info->attrs[NL80211_ATTR_KEY_IDX]) 3408 if (info->attrs[NL80211_ATTR_KEY_IDX])
3387 key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); 3409 key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]);
3388 3410
3389 if (key_idx > 5)
3390 return -EINVAL;
3391
3392 if (info->attrs[NL80211_ATTR_MAC]) 3411 if (info->attrs[NL80211_ATTR_MAC])
3393 mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); 3412 mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
3394 3413
@@ -3396,8 +3415,6 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
3396 if (info->attrs[NL80211_ATTR_KEY_TYPE]) { 3415 if (info->attrs[NL80211_ATTR_KEY_TYPE]) {
3397 u32 kt = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]); 3416 u32 kt = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]);
3398 3417
3399 if (kt >= NUM_NL80211_KEYTYPES)
3400 return -EINVAL;
3401 if (kt != NL80211_KEYTYPE_GROUP && 3418 if (kt != NL80211_KEYTYPE_GROUP &&
3402 kt != NL80211_KEYTYPE_PAIRWISE) 3419 kt != NL80211_KEYTYPE_PAIRWISE)
3403 return -EINVAL; 3420 return -EINVAL;
@@ -3998,16 +4015,12 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev,
3998 return 0; 4015 return 0;
3999} 4016}
4000 4017
4001static int nl80211_parse_beacon(struct nlattr *attrs[], 4018static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev,
4019 struct nlattr *attrs[],
4002 struct cfg80211_beacon_data *bcn) 4020 struct cfg80211_beacon_data *bcn)
4003{ 4021{
4004 bool haveinfo = false; 4022 bool haveinfo = false;
4005 4023 int err;
4006 if (!is_valid_ie_attr(attrs[NL80211_ATTR_BEACON_TAIL]) ||
4007 !is_valid_ie_attr(attrs[NL80211_ATTR_IE]) ||
4008 !is_valid_ie_attr(attrs[NL80211_ATTR_IE_PROBE_RESP]) ||
4009 !is_valid_ie_attr(attrs[NL80211_ATTR_IE_ASSOC_RESP]))
4010 return -EINVAL;
4011 4024
4012 memset(bcn, 0, sizeof(*bcn)); 4025 memset(bcn, 0, sizeof(*bcn));
4013 4026
@@ -4052,6 +4065,35 @@ static int nl80211_parse_beacon(struct nlattr *attrs[],
4052 bcn->probe_resp_len = nla_len(attrs[NL80211_ATTR_PROBE_RESP]); 4065 bcn->probe_resp_len = nla_len(attrs[NL80211_ATTR_PROBE_RESP]);
4053 } 4066 }
4054 4067
4068 if (attrs[NL80211_ATTR_FTM_RESPONDER]) {
4069 struct nlattr *tb[NL80211_FTM_RESP_ATTR_MAX + 1];
4070
4071 err = nla_parse_nested(tb, NL80211_FTM_RESP_ATTR_MAX,
4072 attrs[NL80211_ATTR_FTM_RESPONDER],
4073 NULL, NULL);
4074 if (err)
4075 return err;
4076
4077 if (tb[NL80211_FTM_RESP_ATTR_ENABLED] &&
4078 wiphy_ext_feature_isset(&rdev->wiphy,
4079 NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER))
4080 bcn->ftm_responder = 1;
4081 else
4082 return -EOPNOTSUPP;
4083
4084 if (tb[NL80211_FTM_RESP_ATTR_LCI]) {
4085 bcn->lci = nla_data(tb[NL80211_FTM_RESP_ATTR_LCI]);
4086 bcn->lci_len = nla_len(tb[NL80211_FTM_RESP_ATTR_LCI]);
4087 }
4088
4089 if (tb[NL80211_FTM_RESP_ATTR_CIVICLOC]) {
4090 bcn->civicloc = nla_data(tb[NL80211_FTM_RESP_ATTR_CIVICLOC]);
4091 bcn->civicloc_len = nla_len(tb[NL80211_FTM_RESP_ATTR_CIVICLOC]);
4092 }
4093 } else {
4094 bcn->ftm_responder = -1;
4095 }
4096
4055 return 0; 4097 return 0;
4056} 4098}
4057 4099
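The FTM responder block added to nl80211_parse_beacon() is a standard nested-attribute parse: check the wiphy feature bit, then pull the individual fields out of the nest (the diff passes NULL for both the nested policy and extack). A rough sketch of that shape with hypothetical EX_RESP_* attributes:

/* Illustrative only: parsing a nested responder attribute.
 * EX_RESP_* names are hypothetical. */
#include <net/netlink.h>

enum {
	EX_RESP_ATTR_UNSPEC,
	EX_RESP_ATTR_ENABLED,	/* flag */
	EX_RESP_ATTR_LCI,	/* binary blob */
	__EX_RESP_ATTR_MAX,
};
#define EX_RESP_ATTR_MAX (__EX_RESP_ATTR_MAX - 1)

static int parse_responder(const struct nlattr *nest, bool *enabled,
			   const u8 **lci, unsigned int *lci_len)
{
	struct nlattr *tb[EX_RESP_ATTR_MAX + 1];
	int err;

	err = nla_parse_nested(tb, EX_RESP_ATTR_MAX, nest, NULL, NULL);
	if (err)
		return err;

	*enabled = nla_get_flag(tb[EX_RESP_ATTR_ENABLED]);
	if (tb[EX_RESP_ATTR_LCI]) {
		*lci = nla_data(tb[EX_RESP_ATTR_LCI]);
		*lci_len = nla_len(tb[EX_RESP_ATTR_LCI]);
	}
	return 0;
}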
@@ -4096,6 +4138,9 @@ static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params)
4096 cap = cfg80211_find_ie(WLAN_EID_VHT_CAPABILITY, ies, ies_len); 4138 cap = cfg80211_find_ie(WLAN_EID_VHT_CAPABILITY, ies, ies_len);
4097 if (cap && cap[1] >= sizeof(*params->vht_cap)) 4139 if (cap && cap[1] >= sizeof(*params->vht_cap))
4098 params->vht_cap = (void *)(cap + 2); 4140 params->vht_cap = (void *)(cap + 2);
4141 cap = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_CAPABILITY, ies, ies_len);
4142 if (cap && cap[1] >= sizeof(*params->he_cap) + 1)
4143 params->he_cap = (void *)(cap + 3);
4099} 4144}
4100 4145
4101static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev, 4146static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev,
@@ -4195,7 +4240,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
4195 !info->attrs[NL80211_ATTR_BEACON_HEAD]) 4240 !info->attrs[NL80211_ATTR_BEACON_HEAD])
4196 return -EINVAL; 4241 return -EINVAL;
4197 4242
4198 err = nl80211_parse_beacon(info->attrs, &params.beacon); 4243 err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon);
4199 if (err) 4244 if (err)
4200 return err; 4245 return err;
4201 4246
@@ -4225,14 +4270,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
4225 return -EINVAL; 4270 return -EINVAL;
4226 } 4271 }
4227 4272
4228 if (info->attrs[NL80211_ATTR_HIDDEN_SSID]) { 4273 if (info->attrs[NL80211_ATTR_HIDDEN_SSID])
4229 params.hidden_ssid = nla_get_u32( 4274 params.hidden_ssid = nla_get_u32(
4230 info->attrs[NL80211_ATTR_HIDDEN_SSID]); 4275 info->attrs[NL80211_ATTR_HIDDEN_SSID]);
4231 if (params.hidden_ssid != NL80211_HIDDEN_SSID_NOT_IN_USE &&
4232 params.hidden_ssid != NL80211_HIDDEN_SSID_ZERO_LEN &&
4233 params.hidden_ssid != NL80211_HIDDEN_SSID_ZERO_CONTENTS)
4234 return -EINVAL;
4235 }
4236 4276
4237 params.privacy = !!info->attrs[NL80211_ATTR_PRIVACY]; 4277 params.privacy = !!info->attrs[NL80211_ATTR_PRIVACY];
4238 4278
@@ -4262,8 +4302,6 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
4262 return -EINVAL; 4302 return -EINVAL;
4263 params.p2p_ctwindow = 4303 params.p2p_ctwindow =
4264 nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]); 4304 nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]);
4265 if (params.p2p_ctwindow > 127)
4266 return -EINVAL;
4267 if (params.p2p_ctwindow != 0 && 4305 if (params.p2p_ctwindow != 0 &&
4268 !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN)) 4306 !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN))
4269 return -EINVAL; 4307 return -EINVAL;
@@ -4275,8 +4313,6 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
4275 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) 4313 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
4276 return -EINVAL; 4314 return -EINVAL;
4277 tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]); 4315 tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]);
4278 if (tmp > 1)
4279 return -EINVAL;
4280 params.p2p_opp_ps = tmp; 4316 params.p2p_opp_ps = tmp;
4281 if (params.p2p_opp_ps != 0 && 4317 if (params.p2p_opp_ps != 0 &&
4282 !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS)) 4318 !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS))
@@ -4379,7 +4415,7 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info)
4379 if (!wdev->beacon_interval) 4415 if (!wdev->beacon_interval)
4380 return -EINVAL; 4416 return -EINVAL;
4381 4417
4382 err = nl80211_parse_beacon(info->attrs, &params); 4418 err = nl80211_parse_beacon(rdev, info->attrs, &params);
4383 if (err) 4419 if (err)
4384 return err; 4420 return err;
4385 4421
@@ -4725,10 +4761,13 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
4725 PUT_SINFO_U64(RX_DROP_MISC, rx_dropped_misc); 4761 PUT_SINFO_U64(RX_DROP_MISC, rx_dropped_misc);
4726 PUT_SINFO_U64(BEACON_RX, rx_beacon); 4762 PUT_SINFO_U64(BEACON_RX, rx_beacon);
4727 PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8); 4763 PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8);
4728 PUT_SINFO(ACK_SIGNAL, ack_signal, u8); 4764 PUT_SINFO(RX_MPDUS, rx_mpdu_count, u32);
4765 PUT_SINFO(FCS_ERROR_COUNT, fcs_err_count, u32);
4729 if (wiphy_ext_feature_isset(&rdev->wiphy, 4766 if (wiphy_ext_feature_isset(&rdev->wiphy,
4730 NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT)) 4767 NL80211_EXT_FEATURE_ACK_SIGNAL_SUPPORT)) {
4731 PUT_SINFO(DATA_ACK_SIGNAL_AVG, avg_ack_signal, s8); 4768 PUT_SINFO(ACK_SIGNAL, ack_signal, u8);
4769 PUT_SINFO(ACK_SIGNAL_AVG, avg_ack_signal, s8);
4770 }
4732 4771
4733#undef PUT_SINFO 4772#undef PUT_SINFO
4734#undef PUT_SINFO_U64 4773#undef PUT_SINFO_U64
@@ -4807,7 +4846,7 @@ static int nl80211_dump_station(struct sk_buff *skb,
4807 int err; 4846 int err;
4808 4847
4809 rtnl_lock(); 4848 rtnl_lock();
4810 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); 4849 err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev);
4811 if (err) 4850 if (err)
4812 goto out_err; 4851 goto out_err;
4813 4852
@@ -5212,17 +5251,11 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
5212 else 5251 else
5213 params.listen_interval = -1; 5252 params.listen_interval = -1;
5214 5253
5215 if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]) { 5254 if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS])
5216 u8 tmp; 5255 params.support_p2p_ps =
5217 5256 nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]);
5218 tmp = nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]); 5257 else
5219 if (tmp >= NUM_NL80211_P2P_PS_STATUS)
5220 return -EINVAL;
5221
5222 params.support_p2p_ps = tmp;
5223 } else {
5224 params.support_p2p_ps = -1; 5258 params.support_p2p_ps = -1;
5225 }
5226 5259
5227 if (!info->attrs[NL80211_ATTR_MAC]) 5260 if (!info->attrs[NL80211_ATTR_MAC])
5228 return -EINVAL; 5261 return -EINVAL;
@@ -5252,38 +5285,23 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
5252 if (parse_station_flags(info, dev->ieee80211_ptr->iftype, &params)) 5285 if (parse_station_flags(info, dev->ieee80211_ptr->iftype, &params))
5253 return -EINVAL; 5286 return -EINVAL;
5254 5287
5255 if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION]) { 5288 if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION])
5256 params.plink_action = 5289 params.plink_action =
5257 nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]); 5290 nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]);
5258 if (params.plink_action >= NUM_NL80211_PLINK_ACTIONS)
5259 return -EINVAL;
5260 }
5261 5291
5262 if (info->attrs[NL80211_ATTR_STA_PLINK_STATE]) { 5292 if (info->attrs[NL80211_ATTR_STA_PLINK_STATE]) {
5263 params.plink_state = 5293 params.plink_state =
5264 nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_STATE]); 5294 nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_STATE]);
5265 if (params.plink_state >= NUM_NL80211_PLINK_STATES) 5295 if (info->attrs[NL80211_ATTR_MESH_PEER_AID])
5266 return -EINVAL;
5267 if (info->attrs[NL80211_ATTR_MESH_PEER_AID]) {
5268 params.peer_aid = nla_get_u16( 5296 params.peer_aid = nla_get_u16(
5269 info->attrs[NL80211_ATTR_MESH_PEER_AID]); 5297 info->attrs[NL80211_ATTR_MESH_PEER_AID]);
5270 if (params.peer_aid > IEEE80211_MAX_AID)
5271 return -EINVAL;
5272 }
5273 params.sta_modify_mask |= STATION_PARAM_APPLY_PLINK_STATE; 5298 params.sta_modify_mask |= STATION_PARAM_APPLY_PLINK_STATE;
5274 } 5299 }
5275 5300
5276 if (info->attrs[NL80211_ATTR_LOCAL_MESH_POWER_MODE]) { 5301 if (info->attrs[NL80211_ATTR_LOCAL_MESH_POWER_MODE])
5277 enum nl80211_mesh_power_mode pm = nla_get_u32( 5302 params.local_pm = nla_get_u32(
5278 info->attrs[NL80211_ATTR_LOCAL_MESH_POWER_MODE]); 5303 info->attrs[NL80211_ATTR_LOCAL_MESH_POWER_MODE]);
5279 5304
5280 if (pm <= NL80211_MESH_POWER_UNKNOWN ||
5281 pm > NL80211_MESH_POWER_MAX)
5282 return -EINVAL;
5283
5284 params.local_pm = pm;
5285 }
5286
5287 if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) { 5305 if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) {
5288 params.opmode_notif_used = true; 5306 params.opmode_notif_used = true;
5289 params.opmode_notif = 5307 params.opmode_notif =
@@ -5360,13 +5378,8 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
5360 nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]); 5378 nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]);
5361 5379
5362 if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]) { 5380 if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]) {
5363 u8 tmp; 5381 params.support_p2p_ps =
5364 5382 nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]);
5365 tmp = nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]);
5366 if (tmp >= NUM_NL80211_P2P_PS_STATUS)
5367 return -EINVAL;
5368
5369 params.support_p2p_ps = tmp;
5370 } else { 5383 } else {
5371 /* 5384 /*
5372 * if not specified, assume it's supported for P2P GO interface, 5385 * if not specified, assume it's supported for P2P GO interface,
@@ -5380,8 +5393,6 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
5380 params.aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]); 5393 params.aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]);
5381 else 5394 else
5382 params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]); 5395 params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]);
5383 if (!params.aid || params.aid > IEEE80211_MAX_AID)
5384 return -EINVAL;
5385 5396
5386 if (info->attrs[NL80211_ATTR_STA_CAPABILITY]) { 5397 if (info->attrs[NL80211_ATTR_STA_CAPABILITY]) {
5387 params.capability = 5398 params.capability =
@@ -5421,12 +5432,9 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
5421 nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]); 5432 nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]);
5422 } 5433 }
5423 5434
5424 if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION]) { 5435 if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION])
5425 params.plink_action = 5436 params.plink_action =
5426 nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]); 5437 nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]);
5427 if (params.plink_action >= NUM_NL80211_PLINK_ACTIONS)
5428 return -EINVAL;
5429 }
5430 5438
5431 err = nl80211_parse_sta_channel_info(info, &params); 5439 err = nl80211_parse_sta_channel_info(info, &params);
5432 if (err) 5440 if (err)
@@ -5658,7 +5666,7 @@ static int nl80211_dump_mpath(struct sk_buff *skb,
5658 int err; 5666 int err;
5659 5667
5660 rtnl_lock(); 5668 rtnl_lock();
5661 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); 5669 err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev);
5662 if (err) 5670 if (err)
5663 goto out_err; 5671 goto out_err;
5664 5672
@@ -5854,7 +5862,7 @@ static int nl80211_dump_mpp(struct sk_buff *skb,
5854 int err; 5862 int err;
5855 5863
5856 rtnl_lock(); 5864 rtnl_lock();
5857 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); 5865 err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev);
5858 if (err) 5866 if (err)
5859 goto out_err; 5867 goto out_err;
5860 5868
@@ -5936,9 +5944,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
5936 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) 5944 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
5937 return -EINVAL; 5945 return -EINVAL;
5938 params.p2p_ctwindow = 5946 params.p2p_ctwindow =
5939 nla_get_s8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]); 5947 nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]);
5940 if (params.p2p_ctwindow < 0)
5941 return -EINVAL;
5942 if (params.p2p_ctwindow != 0 && 5948 if (params.p2p_ctwindow != 0 &&
5943 !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN)) 5949 !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN))
5944 return -EINVAL; 5950 return -EINVAL;
@@ -5950,8 +5956,6 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
5950 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) 5956 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
5951 return -EINVAL; 5957 return -EINVAL;
5952 tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]); 5958 tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]);
5953 if (tmp > 1)
5954 return -EINVAL;
5955 params.p2p_opp_ps = tmp; 5959 params.p2p_opp_ps = tmp;
5956 if (params.p2p_opp_ps && 5960 if (params.p2p_opp_ps &&
5957 !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS)) 5961 !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS))
@@ -6130,33 +6134,49 @@ static int nl80211_get_mesh_config(struct sk_buff *skb,
6130 return -ENOBUFS; 6134 return -ENOBUFS;
6131} 6135}
6132 6136
6133static const struct nla_policy nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = { 6137static const struct nla_policy
6134 [NL80211_MESHCONF_RETRY_TIMEOUT] = { .type = NLA_U16 }, 6138nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = {
6135 [NL80211_MESHCONF_CONFIRM_TIMEOUT] = { .type = NLA_U16 }, 6139 [NL80211_MESHCONF_RETRY_TIMEOUT] =
6136 [NL80211_MESHCONF_HOLDING_TIMEOUT] = { .type = NLA_U16 }, 6140 NLA_POLICY_RANGE(NLA_U16, 1, 255),
6137 [NL80211_MESHCONF_MAX_PEER_LINKS] = { .type = NLA_U16 }, 6141 [NL80211_MESHCONF_CONFIRM_TIMEOUT] =
6138 [NL80211_MESHCONF_MAX_RETRIES] = { .type = NLA_U8 }, 6142 NLA_POLICY_RANGE(NLA_U16, 1, 255),
6139 [NL80211_MESHCONF_TTL] = { .type = NLA_U8 }, 6143 [NL80211_MESHCONF_HOLDING_TIMEOUT] =
6140 [NL80211_MESHCONF_ELEMENT_TTL] = { .type = NLA_U8 }, 6144 NLA_POLICY_RANGE(NLA_U16, 1, 255),
6141 [NL80211_MESHCONF_AUTO_OPEN_PLINKS] = { .type = NLA_U8 }, 6145 [NL80211_MESHCONF_MAX_PEER_LINKS] =
6142 [NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR] = { .type = NLA_U32 }, 6146 NLA_POLICY_RANGE(NLA_U16, 0, 255),
6147 [NL80211_MESHCONF_MAX_RETRIES] = NLA_POLICY_MAX(NLA_U8, 16),
6148 [NL80211_MESHCONF_TTL] = NLA_POLICY_MIN(NLA_U8, 1),
6149 [NL80211_MESHCONF_ELEMENT_TTL] = NLA_POLICY_MIN(NLA_U8, 1),
6150 [NL80211_MESHCONF_AUTO_OPEN_PLINKS] = NLA_POLICY_MAX(NLA_U8, 1),
6151 [NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR] =
6152 NLA_POLICY_RANGE(NLA_U32, 1, 255),
6143 [NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES] = { .type = NLA_U8 }, 6153 [NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES] = { .type = NLA_U8 },
6144 [NL80211_MESHCONF_PATH_REFRESH_TIME] = { .type = NLA_U32 }, 6154 [NL80211_MESHCONF_PATH_REFRESH_TIME] = { .type = NLA_U32 },
6145 [NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT] = { .type = NLA_U16 }, 6155 [NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT] = NLA_POLICY_MIN(NLA_U16, 1),
6146 [NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT] = { .type = NLA_U32 }, 6156 [NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT] = { .type = NLA_U32 },
6147 [NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL] = { .type = NLA_U16 }, 6157 [NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL] =
6148 [NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL] = { .type = NLA_U16 }, 6158 NLA_POLICY_MIN(NLA_U16, 1),
6149 [NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME] = { .type = NLA_U16 }, 6159 [NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL] =
6150 [NL80211_MESHCONF_HWMP_ROOTMODE] = { .type = NLA_U8 }, 6160 NLA_POLICY_MIN(NLA_U16, 1),
6151 [NL80211_MESHCONF_HWMP_RANN_INTERVAL] = { .type = NLA_U16 }, 6161 [NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME] =
6152 [NL80211_MESHCONF_GATE_ANNOUNCEMENTS] = { .type = NLA_U8 }, 6162 NLA_POLICY_MIN(NLA_U16, 1),
6153 [NL80211_MESHCONF_FORWARDING] = { .type = NLA_U8 }, 6163 [NL80211_MESHCONF_HWMP_ROOTMODE] = NLA_POLICY_MAX(NLA_U8, 4),
6154 [NL80211_MESHCONF_RSSI_THRESHOLD] = { .type = NLA_U32 }, 6164 [NL80211_MESHCONF_HWMP_RANN_INTERVAL] =
6165 NLA_POLICY_MIN(NLA_U16, 1),
6166 [NL80211_MESHCONF_GATE_ANNOUNCEMENTS] = NLA_POLICY_MAX(NLA_U8, 1),
6167 [NL80211_MESHCONF_FORWARDING] = NLA_POLICY_MAX(NLA_U8, 1),
6168 [NL80211_MESHCONF_RSSI_THRESHOLD] =
6169 NLA_POLICY_RANGE(NLA_S32, -255, 0),
6155 [NL80211_MESHCONF_HT_OPMODE] = { .type = NLA_U16 }, 6170 [NL80211_MESHCONF_HT_OPMODE] = { .type = NLA_U16 },
6156 [NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT] = { .type = NLA_U32 }, 6171 [NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT] = { .type = NLA_U32 },
6157 [NL80211_MESHCONF_HWMP_ROOT_INTERVAL] = { .type = NLA_U16 }, 6172 [NL80211_MESHCONF_HWMP_ROOT_INTERVAL] =
6158 [NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL] = { .type = NLA_U16 }, 6173 NLA_POLICY_MIN(NLA_U16, 1),
6159 [NL80211_MESHCONF_POWER_MODE] = { .type = NLA_U32 }, 6174 [NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL] =
6175 NLA_POLICY_MIN(NLA_U16, 1),
6176 [NL80211_MESHCONF_POWER_MODE] =
6177 NLA_POLICY_RANGE(NLA_U32,
6178 NL80211_MESH_POWER_ACTIVE,
6179 NL80211_MESH_POWER_MAX),
6160 [NL80211_MESHCONF_AWAKE_WINDOW] = { .type = NLA_U16 }, 6180 [NL80211_MESHCONF_AWAKE_WINDOW] = { .type = NLA_U16 },
6161 [NL80211_MESHCONF_PLINK_TIMEOUT] = { .type = NLA_U32 }, 6181 [NL80211_MESHCONF_PLINK_TIMEOUT] = { .type = NLA_U32 },
6162}; 6182};
@@ -6169,68 +6189,12 @@ static const struct nla_policy
6169 [NL80211_MESH_SETUP_USERSPACE_AUTH] = { .type = NLA_FLAG }, 6189 [NL80211_MESH_SETUP_USERSPACE_AUTH] = { .type = NLA_FLAG },
6170 [NL80211_MESH_SETUP_AUTH_PROTOCOL] = { .type = NLA_U8 }, 6190 [NL80211_MESH_SETUP_AUTH_PROTOCOL] = { .type = NLA_U8 },
6171 [NL80211_MESH_SETUP_USERSPACE_MPM] = { .type = NLA_FLAG }, 6191 [NL80211_MESH_SETUP_USERSPACE_MPM] = { .type = NLA_FLAG },
6172 [NL80211_MESH_SETUP_IE] = { .type = NLA_BINARY, 6192 [NL80211_MESH_SETUP_IE] =
6173 .len = IEEE80211_MAX_DATA_LEN }, 6193 NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr,
6194 IEEE80211_MAX_DATA_LEN),
6174 [NL80211_MESH_SETUP_USERSPACE_AMPE] = { .type = NLA_FLAG }, 6195 [NL80211_MESH_SETUP_USERSPACE_AMPE] = { .type = NLA_FLAG },
6175}; 6196};
6176 6197
6177static int nl80211_check_bool(const struct nlattr *nla, u8 min, u8 max, bool *out)
6178{
6179 u8 val = nla_get_u8(nla);
6180 if (val < min || val > max)
6181 return -EINVAL;
6182 *out = val;
6183 return 0;
6184}
6185
6186static int nl80211_check_u8(const struct nlattr *nla, u8 min, u8 max, u8 *out)
6187{
6188 u8 val = nla_get_u8(nla);
6189 if (val < min || val > max)
6190 return -EINVAL;
6191 *out = val;
6192 return 0;
6193}
6194
6195static int nl80211_check_u16(const struct nlattr *nla, u16 min, u16 max, u16 *out)
6196{
6197 u16 val = nla_get_u16(nla);
6198 if (val < min || val > max)
6199 return -EINVAL;
6200 *out = val;
6201 return 0;
6202}
6203
6204static int nl80211_check_u32(const struct nlattr *nla, u32 min, u32 max, u32 *out)
6205{
6206 u32 val = nla_get_u32(nla);
6207 if (val < min || val > max)
6208 return -EINVAL;
6209 *out = val;
6210 return 0;
6211}
6212
6213static int nl80211_check_s32(const struct nlattr *nla, s32 min, s32 max, s32 *out)
6214{
6215 s32 val = nla_get_s32(nla);
6216 if (val < min || val > max)
6217 return -EINVAL;
6218 *out = val;
6219 return 0;
6220}
6221
6222static int nl80211_check_power_mode(const struct nlattr *nla,
6223 enum nl80211_mesh_power_mode min,
6224 enum nl80211_mesh_power_mode max,
6225 enum nl80211_mesh_power_mode *out)
6226{
6227 u32 val = nla_get_u32(nla);
6228 if (val < min || val > max)
6229 return -EINVAL;
6230 *out = val;
6231 return 0;
6232}
6233
6234static int nl80211_parse_mesh_config(struct genl_info *info, 6198static int nl80211_parse_mesh_config(struct genl_info *info,
6235 struct mesh_config *cfg, 6199 struct mesh_config *cfg,
6236 u32 *mask_out) 6200 u32 *mask_out)
@@ -6239,13 +6203,12 @@ static int nl80211_parse_mesh_config(struct genl_info *info,
6239 u32 mask = 0; 6203 u32 mask = 0;
6240 u16 ht_opmode; 6204 u16 ht_opmode;
6241 6205
6242#define FILL_IN_MESH_PARAM_IF_SET(tb, cfg, param, min, max, mask, attr, fn) \ 6206#define FILL_IN_MESH_PARAM_IF_SET(tb, cfg, param, mask, attr, fn) \
6243do { \ 6207do { \
6244 if (tb[attr]) { \ 6208 if (tb[attr]) { \
6245 if (fn(tb[attr], min, max, &cfg->param)) \ 6209 cfg->param = fn(tb[attr]); \
6246 return -EINVAL; \ 6210 mask |= BIT((attr) - 1); \
6247 mask |= (1 << (attr - 1)); \ 6211 } \
6248 } \
6249} while (0) 6212} while (0)
6250 6213
6251 if (!info->attrs[NL80211_ATTR_MESH_CONFIG]) 6214 if (!info->attrs[NL80211_ATTR_MESH_CONFIG])
@@ -6260,75 +6223,73 @@ do { \
6260 BUILD_BUG_ON(NL80211_MESHCONF_ATTR_MAX > 32); 6223 BUILD_BUG_ON(NL80211_MESHCONF_ATTR_MAX > 32);
6261 6224
6262 /* Fill in the params struct */ 6225 /* Fill in the params struct */
6263 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshRetryTimeout, 1, 255, 6226 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshRetryTimeout, mask,
6264 mask, NL80211_MESHCONF_RETRY_TIMEOUT, 6227 NL80211_MESHCONF_RETRY_TIMEOUT, nla_get_u16);
6265 nl80211_check_u16); 6228 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConfirmTimeout, mask,
6266 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConfirmTimeout, 1, 255, 6229 NL80211_MESHCONF_CONFIRM_TIMEOUT,
6267 mask, NL80211_MESHCONF_CONFIRM_TIMEOUT, 6230 nla_get_u16);
6268 nl80211_check_u16); 6231 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHoldingTimeout, mask,
6269 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHoldingTimeout, 1, 255, 6232 NL80211_MESHCONF_HOLDING_TIMEOUT,
6270 mask, NL80211_MESHCONF_HOLDING_TIMEOUT, 6233 nla_get_u16);
6271 nl80211_check_u16); 6234 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxPeerLinks, mask,
6272 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxPeerLinks, 0, 255, 6235 NL80211_MESHCONF_MAX_PEER_LINKS,
6273 mask, NL80211_MESHCONF_MAX_PEER_LINKS, 6236 nla_get_u16);
6274 nl80211_check_u16); 6237 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxRetries, mask,
6275 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxRetries, 0, 16, 6238 NL80211_MESHCONF_MAX_RETRIES, nla_get_u8);
6276 mask, NL80211_MESHCONF_MAX_RETRIES, 6239 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshTTL, mask,
6277 nl80211_check_u8); 6240 NL80211_MESHCONF_TTL, nla_get_u8);
6278 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshTTL, 1, 255, 6241 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, element_ttl, mask,
6279 mask, NL80211_MESHCONF_TTL, nl80211_check_u8); 6242 NL80211_MESHCONF_ELEMENT_TTL, nla_get_u8);
6280 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, element_ttl, 1, 255, 6243 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, auto_open_plinks, mask,
6281 mask, NL80211_MESHCONF_ELEMENT_TTL, 6244 NL80211_MESHCONF_AUTO_OPEN_PLINKS,
6282 nl80211_check_u8); 6245 nla_get_u8);
6283 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, auto_open_plinks, 0, 1,
6284 mask, NL80211_MESHCONF_AUTO_OPEN_PLINKS,
6285 nl80211_check_bool);
6286 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshNbrOffsetMaxNeighbor, 6246 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshNbrOffsetMaxNeighbor,
6287 1, 255, mask, 6247 mask,
6288 NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR, 6248 NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR,
6289 nl80211_check_u32); 6249 nla_get_u32);
6290 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPmaxPREQretries, 0, 255, 6250 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPmaxPREQretries, mask,
6291 mask, NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES, 6251 NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES,
6292 nl80211_check_u8); 6252 nla_get_u8);
6293 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, path_refresh_time, 1, 65535, 6253 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, path_refresh_time, mask,
6294 mask, NL80211_MESHCONF_PATH_REFRESH_TIME, 6254 NL80211_MESHCONF_PATH_REFRESH_TIME,
6295 nl80211_check_u32); 6255 nla_get_u32);
6296 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, min_discovery_timeout, 1, 65535, 6256 if (mask & BIT(NL80211_MESHCONF_PATH_REFRESH_TIME) &&
6297 mask, NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT, 6257 (cfg->path_refresh_time < 1 || cfg->path_refresh_time > 65535))
6298 nl80211_check_u16); 6258 return -EINVAL;
6259 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, min_discovery_timeout, mask,
6260 NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT,
6261 nla_get_u16);
6299 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathTimeout, 6262 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathTimeout,
6300 1, 65535, mask, 6263 mask,
6301 NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT, 6264 NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT,
6302 nl80211_check_u32); 6265 nla_get_u32);
6303 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPpreqMinInterval, 6266 if (mask & BIT(NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT) &&
6304 1, 65535, mask, 6267 (cfg->dot11MeshHWMPactivePathTimeout < 1 ||
6268 cfg->dot11MeshHWMPactivePathTimeout > 65535))
6269 return -EINVAL;
6270 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPpreqMinInterval, mask,
6305 NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL, 6271 NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL,
6306 nl80211_check_u16); 6272 nla_get_u16);
6307 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPperrMinInterval, 6273 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPperrMinInterval, mask,
6308 1, 65535, mask,
6309 NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL, 6274 NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL,
6310 nl80211_check_u16); 6275 nla_get_u16);
6311 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, 6276 FILL_IN_MESH_PARAM_IF_SET(tb, cfg,
6312 dot11MeshHWMPnetDiameterTraversalTime, 6277 dot11MeshHWMPnetDiameterTraversalTime, mask,
6313 1, 65535, mask,
6314 NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME, 6278 NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME,
6315 nl80211_check_u16); 6279 nla_get_u16);
6316 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRootMode, 0, 4, 6280 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRootMode, mask,
6317 mask, NL80211_MESHCONF_HWMP_ROOTMODE, 6281 NL80211_MESHCONF_HWMP_ROOTMODE, nla_get_u8);
6318 nl80211_check_u8); 6282 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRannInterval, mask,
6319 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRannInterval, 1, 65535, 6283 NL80211_MESHCONF_HWMP_RANN_INTERVAL,
6320 mask, NL80211_MESHCONF_HWMP_RANN_INTERVAL, 6284 nla_get_u16);
6321 nl80211_check_u16); 6285 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshGateAnnouncementProtocol,
6322 FILL_IN_MESH_PARAM_IF_SET(tb, cfg,
6323 dot11MeshGateAnnouncementProtocol, 0, 1,
6324 mask, NL80211_MESHCONF_GATE_ANNOUNCEMENTS, 6286 mask, NL80211_MESHCONF_GATE_ANNOUNCEMENTS,
6325 nl80211_check_bool); 6287 nla_get_u8);
6326 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshForwarding, 0, 1, 6288 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshForwarding, mask,
6327 mask, NL80211_MESHCONF_FORWARDING, 6289 NL80211_MESHCONF_FORWARDING, nla_get_u8);
6328 nl80211_check_bool); 6290 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, rssi_threshold, mask,
6329 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, rssi_threshold, -255, 0, 6291 NL80211_MESHCONF_RSSI_THRESHOLD,
6330 mask, NL80211_MESHCONF_RSSI_THRESHOLD, 6292 nla_get_s32);
6331 nl80211_check_s32);
6332 /* 6293 /*
6333 * Check HT operation mode based on 6294 * Check HT operation mode based on
6334 * IEEE 802.11-2016 9.4.2.57 HT Operation element. 6295 * IEEE 802.11-2016 9.4.2.57 HT Operation element.
@@ -6347,29 +6308,27 @@ do { \
6347 cfg->ht_opmode = ht_opmode; 6308 cfg->ht_opmode = ht_opmode;
6348 mask |= (1 << (NL80211_MESHCONF_HT_OPMODE - 1)); 6309 mask |= (1 << (NL80211_MESHCONF_HT_OPMODE - 1));
6349 } 6310 }
6350 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathToRootTimeout,
6351 1, 65535, mask,
6352 NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT,
6353 nl80211_check_u32);
6354 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMProotInterval, 1, 65535,
6355 mask, NL80211_MESHCONF_HWMP_ROOT_INTERVAL,
6356 nl80211_check_u16);
6357 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, 6311 FILL_IN_MESH_PARAM_IF_SET(tb, cfg,
6358 dot11MeshHWMPconfirmationInterval, 6312 dot11MeshHWMPactivePathToRootTimeout, mask,
6359 1, 65535, mask, 6313 NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT,
6314 nla_get_u32);
6315 if (mask & BIT(NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT) &&
6316 (cfg->dot11MeshHWMPactivePathToRootTimeout < 1 ||
6317 cfg->dot11MeshHWMPactivePathToRootTimeout > 65535))
6318 return -EINVAL;
6319 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMProotInterval, mask,
6320 NL80211_MESHCONF_HWMP_ROOT_INTERVAL,
6321 nla_get_u16);
6322 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPconfirmationInterval,
6323 mask,
6360 NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL, 6324 NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL,
6361 nl80211_check_u16); 6325 nla_get_u16);
6362 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, power_mode, 6326 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, power_mode, mask,
6363 NL80211_MESH_POWER_ACTIVE, 6327 NL80211_MESHCONF_POWER_MODE, nla_get_u32);
6364 NL80211_MESH_POWER_MAX, 6328 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration, mask,
6365 mask, NL80211_MESHCONF_POWER_MODE, 6329 NL80211_MESHCONF_AWAKE_WINDOW, nla_get_u16);
6366 nl80211_check_power_mode); 6330 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, mask,
6367 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration, 6331 NL80211_MESHCONF_PLINK_TIMEOUT, nla_get_u32);
6368 0, 65535, mask,
6369 NL80211_MESHCONF_AWAKE_WINDOW, nl80211_check_u16);
6370 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, 0, 0xffffffff,
6371 mask, NL80211_MESHCONF_PLINK_TIMEOUT,
6372 nl80211_check_u32);
6373 if (mask_out) 6332 if (mask_out)
6374 *mask_out = mask; 6333 *mask_out = mask;
6375 6334
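In the mesh config hunk above, FILL_IN_MESH_PARAM_IF_SET() no longer takes min/max arguments: the bounds now live in nl80211_meshconf_params_policy, and a few attributes (e.g. path_refresh_time) keep an explicit range check in the parser after the fill. A reduced sketch of that combination, with hypothetical names:

/* Illustrative only: fill-if-set with most validation done by the
 * nla_policy, plus one explicit range check kept in the parser.
 * EX_CONF_TIMEOUT and example_cfg are hypothetical. */
#include <net/netlink.h>
#include <linux/bits.h>

#define EX_CONF_TIMEOUT 1	/* hypothetical attribute index */

struct example_cfg {
	u32 timeout_ms;
};

#define FILL_IF_SET(tb, cfg, field, mask, attr, getter)		\
do {								\
	if (tb[attr]) {						\
		(cfg)->field = getter(tb[attr]);		\
		(mask) |= BIT((attr) - 1);			\
	}							\
} while (0)

static int parse_cfg(struct nlattr **tb, struct example_cfg *cfg,
		     u32 *mask_out)
{
	u32 mask = 0;

	FILL_IF_SET(tb, cfg, timeout_ms, mask, EX_CONF_TIMEOUT,
		    nla_get_u32);
	/* u32 attribute, but only 1..65535 is meaningful here */
	if (tb[EX_CONF_TIMEOUT] &&
	    (cfg->timeout_ms < 1 || cfg->timeout_ms > 65535))
		return -EINVAL;

	*mask_out = mask;
	return 0;
}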
@@ -6412,8 +6371,6 @@ static int nl80211_parse_mesh_setup(struct genl_info *info,
6412 if (tb[NL80211_MESH_SETUP_IE]) { 6371 if (tb[NL80211_MESH_SETUP_IE]) {
6413 struct nlattr *ieattr = 6372 struct nlattr *ieattr =
6414 tb[NL80211_MESH_SETUP_IE]; 6373 tb[NL80211_MESH_SETUP_IE];
6415 if (!is_valid_ie_attr(ieattr))
6416 return -EINVAL;
6417 setup->ie = nla_data(ieattr); 6374 setup->ie = nla_data(ieattr);
6418 setup->ie_len = nla_len(ieattr); 6375 setup->ie_len = nla_len(ieattr);
6419 } 6376 }
@@ -7046,9 +7003,6 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
7046 int err, tmp, n_ssids = 0, n_channels, i; 7003 int err, tmp, n_ssids = 0, n_channels, i;
7047 size_t ie_len; 7004 size_t ie_len;
7048 7005
7049 if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
7050 return -EINVAL;
7051
7052 wiphy = &rdev->wiphy; 7006 wiphy = &rdev->wiphy;
7053 7007
7054 if (wdev->iftype == NL80211_IFTYPE_NAN) 7008 if (wdev->iftype == NL80211_IFTYPE_NAN)
@@ -7402,9 +7356,6 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
7402 struct nlattr *tb[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1]; 7356 struct nlattr *tb[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1];
7403 s32 default_match_rssi = NL80211_SCAN_RSSI_THOLD_OFF; 7357 s32 default_match_rssi = NL80211_SCAN_RSSI_THOLD_OFF;
7404 7358
7405 if (!is_valid_ie_attr(attrs[NL80211_ATTR_IE]))
7406 return ERR_PTR(-EINVAL);
7407
7408 if (attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { 7359 if (attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
7409 n_channels = validate_scan_freqs( 7360 n_channels = validate_scan_freqs(
7410 attrs[NL80211_ATTR_SCAN_FREQUENCIES]); 7361 attrs[NL80211_ATTR_SCAN_FREQUENCIES]);
@@ -7764,7 +7715,7 @@ static int nl80211_start_sched_scan(struct sk_buff *skb,
7764 */ 7715 */
7765 if (want_multi && rdev->wiphy.max_sched_scan_reqs > 1) { 7716 if (want_multi && rdev->wiphy.max_sched_scan_reqs > 1) {
7766 while (!sched_scan_req->reqid) 7717 while (!sched_scan_req->reqid)
7767 sched_scan_req->reqid = rdev->wiphy.cookie_counter++; 7718 sched_scan_req->reqid = cfg80211_assign_cookie(rdev);
7768 } 7719 }
7769 7720
7770 err = rdev_sched_scan_start(rdev, dev, sched_scan_req); 7721 err = rdev_sched_scan_start(rdev, dev, sched_scan_req);
@@ -7940,7 +7891,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
7940 if (!need_new_beacon) 7891 if (!need_new_beacon)
7941 goto skip_beacons; 7892 goto skip_beacons;
7942 7893
7943 err = nl80211_parse_beacon(info->attrs, &params.beacon_after); 7894 err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon_after);
7944 if (err) 7895 if (err)
7945 return err; 7896 return err;
7946 7897
@@ -7950,7 +7901,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
7950 if (err) 7901 if (err)
7951 return err; 7902 return err;
7952 7903
7953 err = nl80211_parse_beacon(csa_attrs, &params.beacon_csa); 7904 err = nl80211_parse_beacon(rdev, csa_attrs, &params.beacon_csa);
7954 if (err) 7905 if (err)
7955 return err; 7906 return err;
7956 7907
@@ -8187,7 +8138,7 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb)
8187 int err; 8138 int err;
8188 8139
8189 rtnl_lock(); 8140 rtnl_lock();
8190 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); 8141 err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev);
8191 if (err) { 8142 if (err) {
8192 rtnl_unlock(); 8143 rtnl_unlock();
8193 return err; 8144 return err;
@@ -8308,7 +8259,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
8308 bool radio_stats; 8259 bool radio_stats;
8309 8260
8310 rtnl_lock(); 8261 rtnl_lock();
8311 res = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); 8262 res = nl80211_prepare_wdev_dump(cb, &rdev, &wdev);
8312 if (res) 8263 if (res)
8313 goto out_err; 8264 goto out_err;
8314 8265
@@ -8372,9 +8323,6 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
8372 struct key_parse key; 8323 struct key_parse key;
8373 bool local_state_change; 8324 bool local_state_change;
8374 8325
8375 if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
8376 return -EINVAL;
8377
8378 if (!info->attrs[NL80211_ATTR_MAC]) 8326 if (!info->attrs[NL80211_ATTR_MAC])
8379 return -EINVAL; 8327 return -EINVAL;
8380 8328
@@ -8613,9 +8561,6 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
8613 dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid) 8561 dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
8614 return -EPERM; 8562 return -EPERM;
8615 8563
8616 if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
8617 return -EINVAL;
8618
8619 if (!info->attrs[NL80211_ATTR_MAC] || 8564 if (!info->attrs[NL80211_ATTR_MAC] ||
8620 !info->attrs[NL80211_ATTR_SSID] || 8565 !info->attrs[NL80211_ATTR_SSID] ||
8621 !info->attrs[NL80211_ATTR_WIPHY_FREQ]) 8566 !info->attrs[NL80211_ATTR_WIPHY_FREQ])
@@ -8739,9 +8684,6 @@ static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info)
8739 dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid) 8684 dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
8740 return -EPERM; 8685 return -EPERM;
8741 8686
8742 if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
8743 return -EINVAL;
8744
8745 if (!info->attrs[NL80211_ATTR_MAC]) 8687 if (!info->attrs[NL80211_ATTR_MAC])
8746 return -EINVAL; 8688 return -EINVAL;
8747 8689
@@ -8790,9 +8732,6 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
8790 dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid) 8732 dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
8791 return -EPERM; 8733 return -EPERM;
8792 8734
8793 if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
8794 return -EINVAL;
8795
8796 if (!info->attrs[NL80211_ATTR_MAC]) 8735 if (!info->attrs[NL80211_ATTR_MAC])
8797 return -EINVAL; 8736 return -EINVAL;
8798 8737
@@ -8867,9 +8806,6 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
8867 8806
8868 memset(&ibss, 0, sizeof(ibss)); 8807 memset(&ibss, 0, sizeof(ibss));
8869 8808
8870 if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
8871 return -EINVAL;
8872
8873 if (!info->attrs[NL80211_ATTR_SSID] || 8809 if (!info->attrs[NL80211_ATTR_SSID] ||
8874 !nla_len(info->attrs[NL80211_ATTR_SSID])) 8810 !nla_len(info->attrs[NL80211_ATTR_SSID]))
8875 return -EINVAL; 8811 return -EINVAL;
@@ -9307,9 +9243,6 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
9307 9243
9308 memset(&connect, 0, sizeof(connect)); 9244 memset(&connect, 0, sizeof(connect));
9309 9245
9310 if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
9311 return -EINVAL;
9312
9313 if (!info->attrs[NL80211_ATTR_SSID] || 9246 if (!info->attrs[NL80211_ATTR_SSID] ||
9314 !nla_len(info->attrs[NL80211_ATTR_SSID])) 9247 !nla_len(info->attrs[NL80211_ATTR_SSID]))
9315 return -EINVAL; 9248 return -EINVAL;
@@ -9368,11 +9301,6 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
9368 !wiphy_ext_feature_isset(&rdev->wiphy, 9301 !wiphy_ext_feature_isset(&rdev->wiphy,
9369 NL80211_EXT_FEATURE_MFP_OPTIONAL)) 9302 NL80211_EXT_FEATURE_MFP_OPTIONAL))
9370 return -EOPNOTSUPP; 9303 return -EOPNOTSUPP;
9371
9372 if (connect.mfp != NL80211_MFP_REQUIRED &&
9373 connect.mfp != NL80211_MFP_NO &&
9374 connect.mfp != NL80211_MFP_OPTIONAL)
9375 return -EINVAL;
9376 } else { 9304 } else {
9377 connect.mfp = NL80211_MFP_NO; 9305 connect.mfp = NL80211_MFP_NO;
9378 } 9306 }
@@ -9545,8 +9473,6 @@ static int nl80211_update_connect_params(struct sk_buff *skb,
9545 return -EOPNOTSUPP; 9473 return -EOPNOTSUPP;
9546 9474
9547 if (info->attrs[NL80211_ATTR_IE]) { 9475 if (info->attrs[NL80211_ATTR_IE]) {
9548 if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
9549 return -EINVAL;
9550 connect.ie = nla_data(info->attrs[NL80211_ATTR_IE]); 9476 connect.ie = nla_data(info->attrs[NL80211_ATTR_IE]);
9551 connect.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); 9477 connect.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
9552 changed |= UPDATE_ASSOC_IES; 9478 changed |= UPDATE_ASSOC_IES;
@@ -10131,9 +10057,6 @@ static int nl80211_set_power_save(struct sk_buff *skb, struct genl_info *info)
10131 10057
10132 ps_state = nla_get_u32(info->attrs[NL80211_ATTR_PS_STATE]); 10058 ps_state = nla_get_u32(info->attrs[NL80211_ATTR_PS_STATE]);
10133 10059
10134 if (ps_state != NL80211_PS_DISABLED && ps_state != NL80211_PS_ENABLED)
10135 return -EINVAL;
10136
10137 wdev = dev->ieee80211_ptr; 10060 wdev = dev->ieee80211_ptr;
10138 10061
10139 if (!rdev->ops->set_power_mgmt) 10062 if (!rdev->ops->set_power_mgmt)
@@ -10696,8 +10619,7 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg,
10696 if (!scan_plan) 10619 if (!scan_plan)
10697 return -ENOBUFS; 10620 return -ENOBUFS;
10698 10621
10699 if (!scan_plan || 10622 if (nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_INTERVAL,
10700 nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_INTERVAL,
10701 req->scan_plans[i].interval) || 10623 req->scan_plans[i].interval) ||
10702 (req->scan_plans[i].iterations && 10624 (req->scan_plans[i].iterations &&
10703 nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_ITERATIONS, 10625 nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_ITERATIONS,
@@ -11295,9 +11217,6 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev,
11295 if (tb[NL80211_ATTR_COALESCE_RULE_CONDITION]) 11217 if (tb[NL80211_ATTR_COALESCE_RULE_CONDITION])
11296 new_rule->condition = 11218 new_rule->condition =
11297 nla_get_u32(tb[NL80211_ATTR_COALESCE_RULE_CONDITION]); 11219 nla_get_u32(tb[NL80211_ATTR_COALESCE_RULE_CONDITION]);
11298 if (new_rule->condition != NL80211_COALESCE_CONDITION_MATCH &&
11299 new_rule->condition != NL80211_COALESCE_CONDITION_NO_MATCH)
11300 return -EINVAL;
11301 11220
11302 if (!tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN]) 11221 if (!tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN])
11303 return -EINVAL; 11222 return -EINVAL;
@@ -11650,8 +11569,6 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info)
11650 11569
11651 conf.master_pref = 11570 conf.master_pref =
11652 nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); 11571 nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]);
11653 if (!conf.master_pref)
11654 return -EINVAL;
11655 11572
11656 if (info->attrs[NL80211_ATTR_BANDS]) { 11573 if (info->attrs[NL80211_ATTR_BANDS]) {
11657 u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); 11574 u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]);
@@ -11769,7 +11686,7 @@ static int nl80211_nan_add_func(struct sk_buff *skb,
11769 if (!func) 11686 if (!func)
11770 return -ENOMEM; 11687 return -ENOMEM;
11771 11688
11772 func->cookie = wdev->wiphy->cookie_counter++; 11689 func->cookie = cfg80211_assign_cookie(rdev);
11773 11690
11774 if (!tb[NL80211_NAN_FUNC_TYPE] || 11691 if (!tb[NL80211_NAN_FUNC_TYPE] ||
11775 nla_get_u8(tb[NL80211_NAN_FUNC_TYPE]) > NL80211_NAN_FUNC_MAX_TYPE) { 11692 nla_get_u8(tb[NL80211_NAN_FUNC_TYPE]) > NL80211_NAN_FUNC_MAX_TYPE) {
@@ -12215,8 +12132,7 @@ static int nl80211_update_ft_ies(struct sk_buff *skb, struct genl_info *info)
12215 return -EOPNOTSUPP; 12132 return -EOPNOTSUPP;
12216 12133
12217 if (!info->attrs[NL80211_ATTR_MDID] || 12134 if (!info->attrs[NL80211_ATTR_MDID] ||
12218 !info->attrs[NL80211_ATTR_IE] || 12135 !info->attrs[NL80211_ATTR_IE])
12219 !is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
12220 return -EINVAL; 12136 return -EINVAL;
12221 12137
12222 memset(&ft_params, 0, sizeof(ft_params)); 12138 memset(&ft_params, 0, sizeof(ft_params));
@@ -12636,12 +12552,7 @@ static int nl80211_add_tx_ts(struct sk_buff *skb, struct genl_info *info)
12636 return -EINVAL; 12552 return -EINVAL;
12637 12553
12638 tsid = nla_get_u8(info->attrs[NL80211_ATTR_TSID]); 12554 tsid = nla_get_u8(info->attrs[NL80211_ATTR_TSID]);
12639 if (tsid >= IEEE80211_NUM_TIDS)
12640 return -EINVAL;
12641
12642 up = nla_get_u8(info->attrs[NL80211_ATTR_USER_PRIO]); 12555 up = nla_get_u8(info->attrs[NL80211_ATTR_USER_PRIO]);
12643 if (up >= IEEE80211_NUM_UPS)
12644 return -EINVAL;
12645 12556
12646 /* WMM uses TIDs 0-7 even for TSPEC */ 12557 /* WMM uses TIDs 0-7 even for TSPEC */
12647 if (tsid >= IEEE80211_FIRST_TSPEC_TSID) { 12558 if (tsid >= IEEE80211_FIRST_TSPEC_TSID) {
@@ -12999,6 +12910,76 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info)
12999 return err; 12910 return err;
13000} 12911}
13001 12912
12913static int nl80211_get_ftm_responder_stats(struct sk_buff *skb,
12914 struct genl_info *info)
12915{
12916 struct cfg80211_registered_device *rdev = info->user_ptr[0];
12917 struct net_device *dev = info->user_ptr[1];
12918 struct wireless_dev *wdev = dev->ieee80211_ptr;
12919 struct cfg80211_ftm_responder_stats ftm_stats = {};
12920 struct sk_buff *msg;
12921 void *hdr;
12922 struct nlattr *ftm_stats_attr;
12923 int err;
12924
12925 if (wdev->iftype != NL80211_IFTYPE_AP || !wdev->beacon_interval)
12926 return -EOPNOTSUPP;
12927
12928 err = rdev_get_ftm_responder_stats(rdev, dev, &ftm_stats);
12929 if (err)
12930 return err;
12931
12932 if (!ftm_stats.filled)
12933 return -ENODATA;
12934
12935 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
12936 if (!msg)
12937 return -ENOMEM;
12938
12939 hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
12940 NL80211_CMD_GET_FTM_RESPONDER_STATS);
12941 if (!hdr)
12942 return -ENOBUFS;
12943
12944 if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex))
12945 goto nla_put_failure;
12946
12947 ftm_stats_attr = nla_nest_start(msg, NL80211_ATTR_FTM_RESPONDER_STATS);
12948 if (!ftm_stats_attr)
12949 goto nla_put_failure;
12950
12951#define SET_FTM(field, name, type) \
12952 do { if ((ftm_stats.filled & BIT(NL80211_FTM_STATS_ ## name)) && \
12953 nla_put_ ## type(msg, NL80211_FTM_STATS_ ## name, \
12954 ftm_stats.field)) \
12955 goto nla_put_failure; } while (0)
12956#define SET_FTM_U64(field, name) \
12957 do { if ((ftm_stats.filled & BIT(NL80211_FTM_STATS_ ## name)) && \
12958 nla_put_u64_64bit(msg, NL80211_FTM_STATS_ ## name, \
12959 ftm_stats.field, NL80211_FTM_STATS_PAD)) \
12960 goto nla_put_failure; } while (0)
12961
12962 SET_FTM(success_num, SUCCESS_NUM, u32);
12963 SET_FTM(partial_num, PARTIAL_NUM, u32);
12964 SET_FTM(failed_num, FAILED_NUM, u32);
12965 SET_FTM(asap_num, ASAP_NUM, u32);
12966 SET_FTM(non_asap_num, NON_ASAP_NUM, u32);
12967 SET_FTM_U64(total_duration_ms, TOTAL_DURATION_MSEC);
12968 SET_FTM(unknown_triggers_num, UNKNOWN_TRIGGERS_NUM, u32);
12969 SET_FTM(reschedule_requests_num, RESCHEDULE_REQUESTS_NUM, u32);
12970 SET_FTM(out_of_window_triggers_num, OUT_OF_WINDOW_TRIGGERS_NUM, u32);
12971#undef SET_FTM
12972
12973 nla_nest_end(msg, ftm_stats_attr);
12974
12975 genlmsg_end(msg, hdr);
12976 return genlmsg_reply(msg, info);
12977
12978nla_put_failure:
12979 nlmsg_free(msg);
12980 return -ENOBUFS;
12981}
12982
13002#define NL80211_FLAG_NEED_WIPHY 0x01 12983#define NL80211_FLAG_NEED_WIPHY 0x01
13003#define NL80211_FLAG_NEED_NETDEV 0x02 12984#define NL80211_FLAG_NEED_NETDEV 0x02
13004#define NL80211_FLAG_NEED_RTNL 0x04 12985#define NL80211_FLAG_NEED_RTNL 0x04
@@ -13910,6 +13891,13 @@ static const struct genl_ops nl80211_ops[] = {
13910 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | 13891 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
13911 NL80211_FLAG_NEED_RTNL, 13892 NL80211_FLAG_NEED_RTNL,
13912 }, 13893 },
13894 {
13895 .cmd = NL80211_CMD_GET_FTM_RESPONDER_STATS,
13896 .doit = nl80211_get_ftm_responder_stats,
13897 .policy = nl80211_policy,
13898 .internal_flags = NL80211_FLAG_NEED_NETDEV |
13899 NL80211_FLAG_NEED_RTNL,
13900 },
13913}; 13901};
13914 13902
13915static struct genl_family nl80211_fam __ro_after_init = { 13903static struct genl_family nl80211_fam __ro_after_init = {
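
Note: the new nl80211_get_ftm_responder_stats() handler above only emits attributes whose bit is set in ftm_stats.filled, via the SET_FTM()/SET_FTM_U64() helpers. Below is a minimal user-space sketch of that gating pattern; the enum, struct and macro names are illustrative stand-ins, not the kernel's.

/* "filled" bitmap pattern: emit an optional field only when its bit is set */
#include <stdint.h>
#include <stdio.h>

enum { STAT_SUCCESS, STAT_PARTIAL, STAT_FAILED };

struct ftm_stats {
        uint32_t filled;          /* bitmap of fields the driver reported */
        uint32_t success_num;
        uint32_t partial_num;
        uint32_t failed_num;
};

#define EMIT(s, field, bit) \
        do { if ((s)->filled & (1u << (bit))) \
                printf(#field "=%u\n", (s)->field); } while (0)

int main(void)
{
        struct ftm_stats s = { .filled = (1u << STAT_SUCCESS) | (1u << STAT_FAILED),
                               .success_num = 10, .failed_num = 2 };

        EMIT(&s, success_num, STAT_SUCCESS);
        EMIT(&s, partial_num, STAT_PARTIAL);  /* skipped: bit not set */
        EMIT(&s, failed_num, STAT_FAILED);
        return 0;
}
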
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 364f5d67f05b..51380b5c32f2 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -1232,4 +1232,19 @@ rdev_external_auth(struct cfg80211_registered_device *rdev,
1232 return ret; 1232 return ret;
1233} 1233}
1234 1234
1235static inline int
1236rdev_get_ftm_responder_stats(struct cfg80211_registered_device *rdev,
1237 struct net_device *dev,
1238 struct cfg80211_ftm_responder_stats *ftm_stats)
1239{
1240 int ret = -EOPNOTSUPP;
1241
1242 trace_rdev_get_ftm_responder_stats(&rdev->wiphy, dev, ftm_stats);
1243 if (rdev->ops->get_ftm_responder_stats)
1244 ret = rdev->ops->get_ftm_responder_stats(&rdev->wiphy, dev,
1245 ftm_stats);
1246 trace_rdev_return_int(&rdev->wiphy, ret);
1247 return ret;
1248}
1249
1235#endif /* __CFG80211_RDEV_OPS */ 1250#endif /* __CFG80211_RDEV_OPS */
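
Note: rdev_get_ftm_responder_stats() follows the usual rdev-ops shape: trace the call, default to -EOPNOTSUPP when the driver leaves the hook unimplemented, trace the result. A stand-alone sketch of that dispatch pattern follows; the names and the printf "tracing" are placeholders, not the kernel API.

#include <errno.h>
#include <stdio.h>

struct ops {
        int (*get_stats)(int ifindex);   /* optional driver hook */
};

/* trace entry, fall back to -EOPNOTSUPP if the hook is absent, trace result */
static int call_get_stats(const struct ops *ops, int ifindex)
{
        int ret = -EOPNOTSUPP;

        printf("enter get_stats(ifindex=%d)\n", ifindex);
        if (ops->get_stats)
                ret = ops->get_stats(ifindex);
        printf("return %d\n", ret);
        return ret;
}

static int drv_get_stats(int ifindex) { (void)ifindex; return 0; }

int main(void)
{
        struct ops with_hook = { .get_stats = drv_get_stats };
        struct ops without_hook = { 0 };

        call_get_stats(&with_hook, 1);
        call_get_stats(&without_hook, 1);   /* reports -EOPNOTSUPP */
        return 0;
}
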
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 24cfa2776f50..ecfb1a06dbb2 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -847,22 +847,36 @@ static bool valid_regdb(const u8 *data, unsigned int size)
847 return true; 847 return true;
848} 848}
849 849
850static void set_wmm_rule(struct ieee80211_reg_rule *rrule, 850static void set_wmm_rule(const struct fwdb_header *db,
851 struct fwdb_wmm_rule *wmm) 851 const struct fwdb_country *country,
852{ 852 const struct fwdb_rule *rule,
853 struct ieee80211_wmm_rule *rule = &rrule->wmm_rule; 853 struct ieee80211_reg_rule *rrule)
854 unsigned int i; 854{
855 struct ieee80211_wmm_rule *wmm_rule = &rrule->wmm_rule;
856 struct fwdb_wmm_rule *wmm;
857 unsigned int i, wmm_ptr;
858
859 wmm_ptr = be16_to_cpu(rule->wmm_ptr) << 2;
860 wmm = (void *)((u8 *)db + wmm_ptr);
861
862 if (!valid_wmm(wmm)) {
863 pr_err("Invalid regulatory WMM rule %u-%u in domain %c%c\n",
864 be32_to_cpu(rule->start), be32_to_cpu(rule->end),
865 country->alpha2[0], country->alpha2[1]);
866 return;
867 }
855 868
856 for (i = 0; i < IEEE80211_NUM_ACS; i++) { 869 for (i = 0; i < IEEE80211_NUM_ACS; i++) {
857 rule->client[i].cw_min = 870 wmm_rule->client[i].cw_min =
858 ecw2cw((wmm->client[i].ecw & 0xf0) >> 4); 871 ecw2cw((wmm->client[i].ecw & 0xf0) >> 4);
859 rule->client[i].cw_max = ecw2cw(wmm->client[i].ecw & 0x0f); 872 wmm_rule->client[i].cw_max = ecw2cw(wmm->client[i].ecw & 0x0f);
860 rule->client[i].aifsn = wmm->client[i].aifsn; 873 wmm_rule->client[i].aifsn = wmm->client[i].aifsn;
861 rule->client[i].cot = 1000 * be16_to_cpu(wmm->client[i].cot); 874 wmm_rule->client[i].cot =
862 rule->ap[i].cw_min = ecw2cw((wmm->ap[i].ecw & 0xf0) >> 4); 875 1000 * be16_to_cpu(wmm->client[i].cot);
863 rule->ap[i].cw_max = ecw2cw(wmm->ap[i].ecw & 0x0f); 876 wmm_rule->ap[i].cw_min = ecw2cw((wmm->ap[i].ecw & 0xf0) >> 4);
864 rule->ap[i].aifsn = wmm->ap[i].aifsn; 877 wmm_rule->ap[i].cw_max = ecw2cw(wmm->ap[i].ecw & 0x0f);
865 rule->ap[i].cot = 1000 * be16_to_cpu(wmm->ap[i].cot); 878 wmm_rule->ap[i].aifsn = wmm->ap[i].aifsn;
879 wmm_rule->ap[i].cot = 1000 * be16_to_cpu(wmm->ap[i].cot);
866 } 880 }
867 881
868 rrule->has_wmm = true; 882 rrule->has_wmm = true;
@@ -870,7 +884,7 @@ static void set_wmm_rule(struct ieee80211_reg_rule *rrule,
870 884
871static int __regdb_query_wmm(const struct fwdb_header *db, 885static int __regdb_query_wmm(const struct fwdb_header *db,
872 const struct fwdb_country *country, int freq, 886 const struct fwdb_country *country, int freq,
873 struct ieee80211_reg_rule *rule) 887 struct ieee80211_reg_rule *rrule)
874{ 888{
875 unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2; 889 unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2;
876 struct fwdb_collection *coll = (void *)((u8 *)db + ptr); 890 struct fwdb_collection *coll = (void *)((u8 *)db + ptr);
@@ -879,18 +893,14 @@ static int __regdb_query_wmm(const struct fwdb_header *db,
879 for (i = 0; i < coll->n_rules; i++) { 893 for (i = 0; i < coll->n_rules; i++) {
880 __be16 *rules_ptr = (void *)((u8 *)coll + ALIGN(coll->len, 2)); 894 __be16 *rules_ptr = (void *)((u8 *)coll + ALIGN(coll->len, 2));
881 unsigned int rule_ptr = be16_to_cpu(rules_ptr[i]) << 2; 895 unsigned int rule_ptr = be16_to_cpu(rules_ptr[i]) << 2;
882 struct fwdb_rule *rrule = (void *)((u8 *)db + rule_ptr); 896 struct fwdb_rule *rule = (void *)((u8 *)db + rule_ptr);
883 struct fwdb_wmm_rule *wmm;
884 unsigned int wmm_ptr;
885 897
886 if (rrule->len < offsetofend(struct fwdb_rule, wmm_ptr)) 898 if (rule->len < offsetofend(struct fwdb_rule, wmm_ptr))
887 continue; 899 continue;
888 900
889 if (freq >= KHZ_TO_MHZ(be32_to_cpu(rrule->start)) && 901 if (freq >= KHZ_TO_MHZ(be32_to_cpu(rule->start)) &&
890 freq <= KHZ_TO_MHZ(be32_to_cpu(rrule->end))) { 902 freq <= KHZ_TO_MHZ(be32_to_cpu(rule->end))) {
891 wmm_ptr = be16_to_cpu(rrule->wmm_ptr) << 2; 903 set_wmm_rule(db, country, rule, rrule);
892 wmm = (void *)((u8 *)db + wmm_ptr);
893 set_wmm_rule(rule, wmm);
894 return 0; 904 return 0;
895 } 905 }
896 } 906 }
@@ -972,12 +982,8 @@ static int regdb_query_country(const struct fwdb_header *db,
972 if (rule->len >= offsetofend(struct fwdb_rule, cac_timeout)) 982 if (rule->len >= offsetofend(struct fwdb_rule, cac_timeout))
973 rrule->dfs_cac_ms = 983 rrule->dfs_cac_ms =
974 1000 * be16_to_cpu(rule->cac_timeout); 984 1000 * be16_to_cpu(rule->cac_timeout);
975 if (rule->len >= offsetofend(struct fwdb_rule, wmm_ptr)) { 985 if (rule->len >= offsetofend(struct fwdb_rule, wmm_ptr))
976 u32 wmm_ptr = be16_to_cpu(rule->wmm_ptr) << 2; 986 set_wmm_rule(db, country, rule, rrule);
977 struct fwdb_wmm_rule *wmm = (void *)((u8 *)db + wmm_ptr);
978
979 set_wmm_rule(rrule, wmm);
980 }
981 } 987 }
982 988
983 return reg_schedule_apply(regdom); 989 return reg_schedule_apply(regdom);
@@ -3186,13 +3192,59 @@ static void restore_regulatory_settings(bool reset_user)
3186 schedule_work(&reg_work); 3192 schedule_work(&reg_work);
3187} 3193}
3188 3194
3195static bool is_wiphy_all_set_reg_flag(enum ieee80211_regulatory_flags flag)
3196{
3197 struct cfg80211_registered_device *rdev;
3198 struct wireless_dev *wdev;
3199
3200 list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
3201 list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
3202 wdev_lock(wdev);
3203 if (!(wdev->wiphy->regulatory_flags & flag)) {
3204 wdev_unlock(wdev);
3205 return false;
3206 }
3207 wdev_unlock(wdev);
3208 }
3209 }
3210
3211 return true;
3212}
3213
3189void regulatory_hint_disconnect(void) 3214void regulatory_hint_disconnect(void)
3190{ 3215{
3216 /* Restore of regulatory settings is not required when wiphy(s)
3217 * ignore IE from connected access point but clearance of beacon hints
3218 * is required when wiphy(s) supports beacon hints.
3219 */
3220 if (is_wiphy_all_set_reg_flag(REGULATORY_COUNTRY_IE_IGNORE)) {
3221 struct reg_beacon *reg_beacon, *btmp;
3222
3223 if (is_wiphy_all_set_reg_flag(REGULATORY_DISABLE_BEACON_HINTS))
3224 return;
3225
3226 spin_lock_bh(&reg_pending_beacons_lock);
3227 list_for_each_entry_safe(reg_beacon, btmp,
3228 &reg_pending_beacons, list) {
3229 list_del(&reg_beacon->list);
3230 kfree(reg_beacon);
3231 }
3232 spin_unlock_bh(&reg_pending_beacons_lock);
3233
3234 list_for_each_entry_safe(reg_beacon, btmp,
3235 &reg_beacon_list, list) {
3236 list_del(&reg_beacon->list);
3237 kfree(reg_beacon);
3238 }
3239
3240 return;
3241 }
3242
3191 pr_debug("All devices are disconnected, going to restore regulatory settings\n"); 3243 pr_debug("All devices are disconnected, going to restore regulatory settings\n");
3192 restore_regulatory_settings(false); 3244 restore_regulatory_settings(false);
3193} 3245}
3194 3246
3195static bool freq_is_chan_12_13_14(u16 freq) 3247static bool freq_is_chan_12_13_14(u32 freq)
3196{ 3248{
3197 if (freq == ieee80211_channel_to_frequency(12, NL80211_BAND_2GHZ) || 3249 if (freq == ieee80211_channel_to_frequency(12, NL80211_BAND_2GHZ) ||
3198 freq == ieee80211_channel_to_frequency(13, NL80211_BAND_2GHZ) || 3250 freq == ieee80211_channel_to_frequency(13, NL80211_BAND_2GHZ) ||
@@ -3779,6 +3831,15 @@ static int __init regulatory_init_db(void)
3779{ 3831{
3780 int err; 3832 int err;
3781 3833
3834 /*
3835 * It's possible that - due to other bugs/issues - cfg80211
3836 * never called regulatory_init() below, or that it failed;
3837 * in that case, don't try to do any further work here as
3838 * it's doomed to lead to crashes.
3839 */
3840 if (IS_ERR_OR_NULL(reg_pdev))
3841 return -EINVAL;
3842
3782 err = load_builtin_regdb_keys(); 3843 err = load_builtin_regdb_keys();
3783 if (err) 3844 if (err)
3784 return err; 3845 return err;
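
Note: the reworked set_wmm_rule() now resolves the WMM block itself from rule->wmm_ptr, a big-endian 16-bit offset counted in 4-byte units from the start of the firmware regdb blob, and validates it before use. A small user-space sketch of that offset scheme follows; the bounds check stands in for the validation step and the names are illustrative.

#include <arpa/inet.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static const void *regdb_deref(const uint8_t *db, size_t db_len,
                               uint16_t be_ptr, size_t obj_len)
{
        size_t off = (size_t)ntohs(be_ptr) << 2;   /* 4-byte units -> bytes */

        if (off + obj_len > db_len)                /* reject out-of-blob pointers */
                return NULL;
        return db + off;
}

int main(void)
{
        uint8_t blob[64] = { 0 };
        uint16_t ptr = htons(4);                   /* 4 units * 4 bytes = offset 16 */

        printf("in bounds: %d\n", regdb_deref(blob, sizeof(blob), ptr, 8) != NULL);
        printf("out of bounds: %d\n",
               regdb_deref(blob, sizeof(blob), htons(20), 8) != NULL);
        return 0;
}
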
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 7c73510b161f..c6a9446b4e6b 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -112,7 +112,7 @@
112 } while (0) 112 } while (0)
113 113
114#define CHAN_ENTRY __field(enum nl80211_band, band) \ 114#define CHAN_ENTRY __field(enum nl80211_band, band) \
115 __field(u16, center_freq) 115 __field(u32, center_freq)
116#define CHAN_ASSIGN(chan) \ 116#define CHAN_ASSIGN(chan) \
117 do { \ 117 do { \
118 if (chan) { \ 118 if (chan) { \
@@ -2368,6 +2368,140 @@ TRACE_EVENT(rdev_external_auth,
2368 __entry->bssid, __entry->ssid, __entry->status) 2368 __entry->bssid, __entry->ssid, __entry->status)
2369); 2369);
2370 2370
2371TRACE_EVENT(rdev_start_radar_detection,
2372 TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
2373 struct cfg80211_chan_def *chandef,
2374 u32 cac_time_ms),
2375 TP_ARGS(wiphy, netdev, chandef, cac_time_ms),
2376 TP_STRUCT__entry(
2377 WIPHY_ENTRY
2378 NETDEV_ENTRY
2379 CHAN_DEF_ENTRY
2380 __field(u32, cac_time_ms)
2381 ),
2382 TP_fast_assign(
2383 WIPHY_ASSIGN;
2384 NETDEV_ASSIGN;
2385 CHAN_DEF_ASSIGN(chandef);
2386 __entry->cac_time_ms = cac_time_ms;
2387 ),
2388 TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT
2389 ", cac_time_ms=%u",
2390 WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
2391 __entry->cac_time_ms)
2392);
2393
2394TRACE_EVENT(rdev_set_mcast_rate,
2395 TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
2396 int *mcast_rate),
2397 TP_ARGS(wiphy, netdev, mcast_rate),
2398 TP_STRUCT__entry(
2399 WIPHY_ENTRY
2400 NETDEV_ENTRY
2401 __array(int, mcast_rate, NUM_NL80211_BANDS)
2402 ),
2403 TP_fast_assign(
2404 WIPHY_ASSIGN;
2405 NETDEV_ASSIGN;
2406 memcpy(__entry->mcast_rate, mcast_rate,
2407 sizeof(int) * NUM_NL80211_BANDS);
2408 ),
2409 TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", "
2410 "mcast_rates [2.4GHz=0x%x, 5.2GHz=0x%x, 60GHz=0x%x]",
2411 WIPHY_PR_ARG, NETDEV_PR_ARG,
2412 __entry->mcast_rate[NL80211_BAND_2GHZ],
2413 __entry->mcast_rate[NL80211_BAND_5GHZ],
2414 __entry->mcast_rate[NL80211_BAND_60GHZ])
2415);
2416
2417TRACE_EVENT(rdev_set_coalesce,
2418 TP_PROTO(struct wiphy *wiphy, struct cfg80211_coalesce *coalesce),
2419 TP_ARGS(wiphy, coalesce),
2420 TP_STRUCT__entry(
2421 WIPHY_ENTRY
2422 __field(int, n_rules)
2423 ),
2424 TP_fast_assign(
2425 WIPHY_ASSIGN;
2426 __entry->n_rules = coalesce ? coalesce->n_rules : 0;
2427 ),
2428 TP_printk(WIPHY_PR_FMT ", n_rules=%d",
2429 WIPHY_PR_ARG, __entry->n_rules)
2430);
2431
2432DEFINE_EVENT(wiphy_wdev_evt, rdev_abort_scan,
2433 TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
2434 TP_ARGS(wiphy, wdev)
2435);
2436
2437TRACE_EVENT(rdev_set_multicast_to_unicast,
2438 TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
2439 const bool enabled),
2440 TP_ARGS(wiphy, netdev, enabled),
2441 TP_STRUCT__entry(
2442 WIPHY_ENTRY
2443 NETDEV_ENTRY
2444 __field(bool, enabled)
2445 ),
2446 TP_fast_assign(
2447 WIPHY_ASSIGN;
2448 NETDEV_ASSIGN;
2449 __entry->enabled = enabled;
2450 ),
2451 TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", unicast: %s",
2452 WIPHY_PR_ARG, NETDEV_PR_ARG,
2453 BOOL_TO_STR(__entry->enabled))
2454);
2455
2456DEFINE_EVENT(wiphy_wdev_evt, rdev_get_txq_stats,
2457 TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
2458 TP_ARGS(wiphy, wdev)
2459);
2460
2461TRACE_EVENT(rdev_get_ftm_responder_stats,
2462 TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
2463 struct cfg80211_ftm_responder_stats *ftm_stats),
2464
2465 TP_ARGS(wiphy, netdev, ftm_stats),
2466
2467 TP_STRUCT__entry(
2468 WIPHY_ENTRY
2469 NETDEV_ENTRY
2470 __field(u64, timestamp)
2471 __field(u32, success_num)
2472 __field(u32, partial_num)
2473 __field(u32, failed_num)
2474 __field(u32, asap_num)
2475 __field(u32, non_asap_num)
2476 __field(u64, duration)
2477 __field(u32, unknown_triggers)
2478 __field(u32, reschedule)
2479 __field(u32, out_of_window)
2480 ),
2481
2482 TP_fast_assign(
2483 WIPHY_ASSIGN;
2484 NETDEV_ASSIGN;
2485 __entry->success_num = ftm_stats->success_num;
2486 __entry->partial_num = ftm_stats->partial_num;
2487 __entry->failed_num = ftm_stats->failed_num;
2488 __entry->asap_num = ftm_stats->asap_num;
2489 __entry->non_asap_num = ftm_stats->non_asap_num;
2490 __entry->duration = ftm_stats->total_duration_ms;
2491 __entry->unknown_triggers = ftm_stats->unknown_triggers_num;
2492 __entry->reschedule = ftm_stats->reschedule_requests_num;
2493 __entry->out_of_window = ftm_stats->out_of_window_triggers_num;
2494 ),
2495
2496 TP_printk(WIPHY_PR_FMT "Ftm responder stats: success %u, partial %u, "
2497 "failed %u, asap %u, non asap %u, total duration %llu, unknown "
2498 "triggers %u, rescheduled %u, out of window %u", WIPHY_PR_ARG,
2499 __entry->success_num, __entry->partial_num, __entry->failed_num,
2500 __entry->asap_num, __entry->non_asap_num, __entry->duration,
2501 __entry->unknown_triggers, __entry->reschedule,
2502 __entry->out_of_window)
2503);
2504
2371/************************************************************* 2505/*************************************************************
2372 * cfg80211 exported functions traces * 2506 * cfg80211 exported functions traces *
2373 *************************************************************/ 2507 *************************************************************/
@@ -3160,105 +3294,6 @@ TRACE_EVENT(cfg80211_stop_iface,
3160 TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, 3294 TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT,
3161 WIPHY_PR_ARG, WDEV_PR_ARG) 3295 WIPHY_PR_ARG, WDEV_PR_ARG)
3162); 3296);
3163
3164TRACE_EVENT(rdev_start_radar_detection,
3165 TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
3166 struct cfg80211_chan_def *chandef,
3167 u32 cac_time_ms),
3168 TP_ARGS(wiphy, netdev, chandef, cac_time_ms),
3169 TP_STRUCT__entry(
3170 WIPHY_ENTRY
3171 NETDEV_ENTRY
3172 CHAN_DEF_ENTRY
3173 __field(u32, cac_time_ms)
3174 ),
3175 TP_fast_assign(
3176 WIPHY_ASSIGN;
3177 NETDEV_ASSIGN;
3178 CHAN_DEF_ASSIGN(chandef);
3179 __entry->cac_time_ms = cac_time_ms;
3180 ),
3181 TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT
3182 ", cac_time_ms=%u",
3183 WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
3184 __entry->cac_time_ms)
3185);
3186
3187TRACE_EVENT(rdev_set_mcast_rate,
3188 TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
3189 int *mcast_rate),
3190 TP_ARGS(wiphy, netdev, mcast_rate),
3191 TP_STRUCT__entry(
3192 WIPHY_ENTRY
3193 NETDEV_ENTRY
3194 __array(int, mcast_rate, NUM_NL80211_BANDS)
3195 ),
3196 TP_fast_assign(
3197 WIPHY_ASSIGN;
3198 NETDEV_ASSIGN;
3199 memcpy(__entry->mcast_rate, mcast_rate,
3200 sizeof(int) * NUM_NL80211_BANDS);
3201 ),
3202 TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", "
3203 "mcast_rates [2.4GHz=0x%x, 5.2GHz=0x%x, 60GHz=0x%x]",
3204 WIPHY_PR_ARG, NETDEV_PR_ARG,
3205 __entry->mcast_rate[NL80211_BAND_2GHZ],
3206 __entry->mcast_rate[NL80211_BAND_5GHZ],
3207 __entry->mcast_rate[NL80211_BAND_60GHZ])
3208);
3209
3210TRACE_EVENT(rdev_set_coalesce,
3211 TP_PROTO(struct wiphy *wiphy, struct cfg80211_coalesce *coalesce),
3212 TP_ARGS(wiphy, coalesce),
3213 TP_STRUCT__entry(
3214 WIPHY_ENTRY
3215 __field(int, n_rules)
3216 ),
3217 TP_fast_assign(
3218 WIPHY_ASSIGN;
3219 __entry->n_rules = coalesce ? coalesce->n_rules : 0;
3220 ),
3221 TP_printk(WIPHY_PR_FMT ", n_rules=%d",
3222 WIPHY_PR_ARG, __entry->n_rules)
3223);
3224
3225DEFINE_EVENT(wiphy_wdev_evt, rdev_abort_scan,
3226 TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
3227 TP_ARGS(wiphy, wdev)
3228);
3229
3230TRACE_EVENT(rdev_set_multicast_to_unicast,
3231 TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
3232 const bool enabled),
3233 TP_ARGS(wiphy, netdev, enabled),
3234 TP_STRUCT__entry(
3235 WIPHY_ENTRY
3236 NETDEV_ENTRY
3237 __field(bool, enabled)
3238 ),
3239 TP_fast_assign(
3240 WIPHY_ASSIGN;
3241 NETDEV_ASSIGN;
3242 __entry->enabled = enabled;
3243 ),
3244 TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", unicast: %s",
3245 WIPHY_PR_ARG, NETDEV_PR_ARG,
3246 BOOL_TO_STR(__entry->enabled))
3247);
3248
3249TRACE_EVENT(rdev_get_txq_stats,
3250 TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
3251 TP_ARGS(wiphy, wdev),
3252 TP_STRUCT__entry(
3253 WIPHY_ENTRY
3254 WDEV_ENTRY
3255 ),
3256 TP_fast_assign(
3257 WIPHY_ASSIGN;
3258 WDEV_ASSIGN;
3259 ),
3260 TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG)
3261);
3262#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ 3297#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
3263 3298
3264#undef TRACE_INCLUDE_PATH 3299#undef TRACE_INCLUDE_PATH
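
Note: the trace.h hunk widens center_freq from u16 to u32. The motivation is inferred from the accompanying 60 GHz changes in util.c below: channel 6 in the extended range sits at 69120 MHz, which overflows a 16-bit field. A tiny sketch of that overflow:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t freq = 56160 + 6 * 2160;     /* 60 GHz channel 6: 69120 MHz */
        uint16_t truncated = (uint16_t)freq;  /* what a u16 field would store */

        printf("%u MHz stored as u16 -> %u MHz\n", freq, truncated);
        return 0;
}
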
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 959ed3acd240..ef14d80ca03e 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -5,17 +5,20 @@
5 * Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net> 5 * Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net>
6 * Copyright 2013-2014 Intel Mobile Communications GmbH 6 * Copyright 2013-2014 Intel Mobile Communications GmbH
7 * Copyright 2017 Intel Deutschland GmbH 7 * Copyright 2017 Intel Deutschland GmbH
8 * Copyright (C) 2018 Intel Corporation
8 */ 9 */
9#include <linux/export.h> 10#include <linux/export.h>
10#include <linux/bitops.h> 11#include <linux/bitops.h>
11#include <linux/etherdevice.h> 12#include <linux/etherdevice.h>
12#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/ieee80211.h>
13#include <net/cfg80211.h> 15#include <net/cfg80211.h>
14#include <net/ip.h> 16#include <net/ip.h>
15#include <net/dsfield.h> 17#include <net/dsfield.h>
16#include <linux/if_vlan.h> 18#include <linux/if_vlan.h>
17#include <linux/mpls.h> 19#include <linux/mpls.h>
18#include <linux/gcd.h> 20#include <linux/gcd.h>
21#include <linux/bitfield.h>
19#include "core.h" 22#include "core.h"
20#include "rdev-ops.h" 23#include "rdev-ops.h"
21 24
@@ -88,7 +91,7 @@ int ieee80211_channel_to_frequency(int chan, enum nl80211_band band)
88 return 5000 + chan * 5; 91 return 5000 + chan * 5;
89 break; 92 break;
90 case NL80211_BAND_60GHZ: 93 case NL80211_BAND_60GHZ:
91 if (chan < 5) 94 if (chan < 7)
92 return 56160 + chan * 2160; 95 return 56160 + chan * 2160;
93 break; 96 break;
94 default: 97 default:
@@ -109,7 +112,7 @@ int ieee80211_frequency_to_channel(int freq)
109 return (freq - 4000) / 5; 112 return (freq - 4000) / 5;
110 else if (freq <= 45000) /* DMG band lower limit */ 113 else if (freq <= 45000) /* DMG band lower limit */
111 return (freq - 5000) / 5; 114 return (freq - 5000) / 5;
112 else if (freq >= 58320 && freq <= 64800) 115 else if (freq >= 58320 && freq <= 70200)
113 return (freq - 56160) / 2160; 116 return (freq - 56160) / 2160;
114 else 117 else
115 return 0; 118 return 0;
@@ -1568,7 +1571,7 @@ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef,
1568 } 1571 }
1569 1572
1570 /* 56.16 GHz, channel 1..4 */ 1573 /* 56.16 GHz, channel 1..4 */
1571 if (freq >= 56160 + 2160 * 1 && freq <= 56160 + 2160 * 4) { 1574 if (freq >= 56160 + 2160 * 1 && freq <= 56160 + 2160 * 6) {
1572 if (chandef->width >= NL80211_CHAN_WIDTH_40) 1575 if (chandef->width >= NL80211_CHAN_WIDTH_40)
1573 return false; 1576 return false;
1574 1577
@@ -1893,3 +1896,154 @@ EXPORT_SYMBOL(rfc1042_header);
1893const unsigned char bridge_tunnel_header[] __aligned(2) = 1896const unsigned char bridge_tunnel_header[] __aligned(2) =
1894 { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 }; 1897 { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 };
1895EXPORT_SYMBOL(bridge_tunnel_header); 1898EXPORT_SYMBOL(bridge_tunnel_header);
1899
1900/* Layer 2 Update frame (802.2 Type 1 LLC XID Update response) */
1901struct iapp_layer2_update {
1902 u8 da[ETH_ALEN]; /* broadcast */
1903 u8 sa[ETH_ALEN]; /* STA addr */
1904 __be16 len; /* 6 */
1905 u8 dsap; /* 0 */
1906 u8 ssap; /* 0 */
1907 u8 control;
1908 u8 xid_info[3];
1909} __packed;
1910
1911void cfg80211_send_layer2_update(struct net_device *dev, const u8 *addr)
1912{
1913 struct iapp_layer2_update *msg;
1914 struct sk_buff *skb;
1915
1916 /* Send Level 2 Update Frame to update forwarding tables in layer 2
1917 * bridge devices */
1918
1919 skb = dev_alloc_skb(sizeof(*msg));
1920 if (!skb)
1921 return;
1922 msg = skb_put(skb, sizeof(*msg));
1923
1924 /* 802.2 Type 1 Logical Link Control (LLC) Exchange Identifier (XID)
1925 * Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1 */
1926
1927 eth_broadcast_addr(msg->da);
1928 ether_addr_copy(msg->sa, addr);
1929 msg->len = htons(6);
1930 msg->dsap = 0;
1931 msg->ssap = 0x01; /* NULL LSAP, CR Bit: Response */
1932 msg->control = 0xaf; /* XID response lsb.1111F101.
1933 * F=0 (no poll command; unsolicited frame) */
1934 msg->xid_info[0] = 0x81; /* XID format identifier */
1935 msg->xid_info[1] = 1; /* LLC types/classes: Type 1 LLC */
1936 msg->xid_info[2] = 0; /* XID sender's receive window size (RW) */
1937
1938 skb->dev = dev;
1939 skb->protocol = eth_type_trans(skb, dev);
1940 memset(skb->cb, 0, sizeof(skb->cb));
1941 netif_rx_ni(skb);
1942}
1943EXPORT_SYMBOL(cfg80211_send_layer2_update);
1944
1945int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
1946 enum ieee80211_vht_chanwidth bw,
1947 int mcs, bool ext_nss_bw_capable)
1948{
1949 u16 map = le16_to_cpu(cap->supp_mcs.rx_mcs_map);
1950 int max_vht_nss = 0;
1951 int ext_nss_bw;
1952 int supp_width;
1953 int i, mcs_encoding;
1954
1955 if (map == 0xffff)
1956 return 0;
1957
1958 if (WARN_ON(mcs > 9))
1959 return 0;
1960 if (mcs <= 7)
1961 mcs_encoding = 0;
1962 else if (mcs == 8)
1963 mcs_encoding = 1;
1964 else
1965 mcs_encoding = 2;
1966
1967 /* find max_vht_nss for the given MCS */
1968 for (i = 7; i >= 0; i--) {
1969 int supp = (map >> (2 * i)) & 3;
1970
1971 if (supp == 3)
1972 continue;
1973
1974 if (supp >= mcs_encoding) {
1975 max_vht_nss = i;
1976 break;
1977 }
1978 }
1979
1980 if (!(cap->supp_mcs.tx_mcs_map &
1981 cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE)))
1982 return max_vht_nss;
1983
1984 ext_nss_bw = le32_get_bits(cap->vht_cap_info,
1985 IEEE80211_VHT_CAP_EXT_NSS_BW_MASK);
1986 supp_width = le32_get_bits(cap->vht_cap_info,
1987 IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK);
1988
1989 /* if not capable, treat ext_nss_bw as 0 */
1990 if (!ext_nss_bw_capable)
1991 ext_nss_bw = 0;
1992
1993 /* This is invalid */
1994 if (supp_width == 3)
1995 return 0;
1996
1997 /* This is an invalid combination so pretend nothing is supported */
1998 if (supp_width == 2 && (ext_nss_bw == 1 || ext_nss_bw == 2))
1999 return 0;
2000
2001 /*
2002 * Cover all the special cases according to IEEE 802.11-2016
2003 * Table 9-250. All other cases are either factor of 1 or not
2004 * valid/supported.
2005 */
2006 switch (bw) {
2007 case IEEE80211_VHT_CHANWIDTH_USE_HT:
2008 case IEEE80211_VHT_CHANWIDTH_80MHZ:
2009 if ((supp_width == 1 || supp_width == 2) &&
2010 ext_nss_bw == 3)
2011 return 2 * max_vht_nss;
2012 break;
2013 case IEEE80211_VHT_CHANWIDTH_160MHZ:
2014 if (supp_width == 0 &&
2015 (ext_nss_bw == 1 || ext_nss_bw == 2))
2016 return DIV_ROUND_UP(max_vht_nss, 2);
2017 if (supp_width == 0 &&
2018 ext_nss_bw == 3)
2019 return DIV_ROUND_UP(3 * max_vht_nss, 4);
2020 if (supp_width == 1 &&
2021 ext_nss_bw == 3)
2022 return 2 * max_vht_nss;
2023 break;
2024 case IEEE80211_VHT_CHANWIDTH_80P80MHZ:
2025 if (supp_width == 0 &&
2026 (ext_nss_bw == 1 || ext_nss_bw == 2))
2027 return 0; /* not possible */
2028 if (supp_width == 0 &&
2029 ext_nss_bw == 2)
2030 return DIV_ROUND_UP(max_vht_nss, 2);
2031 if (supp_width == 0 &&
2032 ext_nss_bw == 3)
2033 return DIV_ROUND_UP(3 * max_vht_nss, 4);
2034 if (supp_width == 1 &&
2035 ext_nss_bw == 0)
2036 return 0; /* not possible */
2037 if (supp_width == 1 &&
2038 ext_nss_bw == 1)
2039 return DIV_ROUND_UP(max_vht_nss, 2);
2040 if (supp_width == 1 &&
2041 ext_nss_bw == 2)
2042 return DIV_ROUND_UP(3 * max_vht_nss, 4);
2043 break;
2044 }
2045
2046 /* not covered or invalid combination received */
2047 return max_vht_nss;
2048}
2049EXPORT_SYMBOL(ieee80211_get_vht_max_nss);
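
Note: the util.c hunks extend the 60 GHz band from 4 to 6 channels and widen the accepted frequency window to 58320..70200 MHz. A stand-alone sketch of the channel/frequency arithmetic used above (validation of channel 0 and band context is omitted here):

#include <stdio.h>

static int chan_to_freq_60ghz(int chan)
{
        return (chan >= 1 && chan < 7) ? 56160 + chan * 2160 : 0;
}

static int freq_to_chan_60ghz(int freq)
{
        return (freq >= 58320 && freq <= 70200) ? (freq - 56160) / 2160 : 0;
}

int main(void)
{
        for (int chan = 1; chan <= 6; chan++) {
                int freq = chan_to_freq_60ghz(chan);

                printf("chan %d -> %d MHz -> chan %d\n",
                       chan, freq, freq_to_chan_60ghz(freq));
        }
        return 0;
}
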
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index bfe2dbea480b..a264cf2accd0 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -32,37 +32,49 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
32{ 32{
33 unsigned long flags; 33 unsigned long flags;
34 34
35 if (xs->dev) { 35 spin_lock_irqsave(&umem->xsk_list_lock, flags);
36 spin_lock_irqsave(&umem->xsk_list_lock, flags); 36 list_del_rcu(&xs->list);
37 list_del_rcu(&xs->list); 37 spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
38 spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
39
40 if (umem->zc)
41 synchronize_net();
42 }
43} 38}
44 39
45int xdp_umem_query(struct net_device *dev, u16 queue_id) 40/* The umem is stored both in the _rx struct and the _tx struct as we do
41 * not know if the device has more tx queues than rx, or the opposite.
42 * This might also change during run time.
43 */
44static void xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem,
45 u16 queue_id)
46{ 46{
47 struct netdev_bpf bpf; 47 if (queue_id < dev->real_num_rx_queues)
48 dev->_rx[queue_id].umem = umem;
49 if (queue_id < dev->real_num_tx_queues)
50 dev->_tx[queue_id].umem = umem;
51}
48 52
49 ASSERT_RTNL(); 53struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
54 u16 queue_id)
55{
56 if (queue_id < dev->real_num_rx_queues)
57 return dev->_rx[queue_id].umem;
58 if (queue_id < dev->real_num_tx_queues)
59 return dev->_tx[queue_id].umem;
50 60
51 memset(&bpf, 0, sizeof(bpf)); 61 return NULL;
52 bpf.command = XDP_QUERY_XSK_UMEM; 62}
53 bpf.xsk.queue_id = queue_id;
54 63
55 if (!dev->netdev_ops->ndo_bpf) 64static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
56 return 0; 65{
57 return dev->netdev_ops->ndo_bpf(dev, &bpf) ?: !!bpf.xsk.umem; 66 if (queue_id < dev->real_num_rx_queues)
67 dev->_rx[queue_id].umem = NULL;
68 if (queue_id < dev->real_num_tx_queues)
69 dev->_tx[queue_id].umem = NULL;
58} 70}
59 71
60int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, 72int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
61 u32 queue_id, u16 flags) 73 u16 queue_id, u16 flags)
62{ 74{
63 bool force_zc, force_copy; 75 bool force_zc, force_copy;
64 struct netdev_bpf bpf; 76 struct netdev_bpf bpf;
65 int err; 77 int err = 0;
66 78
67 force_zc = flags & XDP_ZEROCOPY; 79 force_zc = flags & XDP_ZEROCOPY;
68 force_copy = flags & XDP_COPY; 80 force_copy = flags & XDP_COPY;
@@ -70,19 +82,23 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
70 if (force_zc && force_copy) 82 if (force_zc && force_copy)
71 return -EINVAL; 83 return -EINVAL;
72 84
73 if (force_copy) 85 rtnl_lock();
74 return 0; 86 if (xdp_get_umem_from_qid(dev, queue_id)) {
75 87 err = -EBUSY;
76 if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_async_xmit) 88 goto out_rtnl_unlock;
77 return force_zc ? -EOPNOTSUPP : 0; /* fail or fallback */ 89 }
78 90
79 bpf.command = XDP_QUERY_XSK_UMEM; 91 xdp_reg_umem_at_qid(dev, umem, queue_id);
92 umem->dev = dev;
93 umem->queue_id = queue_id;
94 if (force_copy)
95 /* For copy-mode, we are done. */
96 goto out_rtnl_unlock;
80 97
81 rtnl_lock(); 98 if (!dev->netdev_ops->ndo_bpf ||
82 err = xdp_umem_query(dev, queue_id); 99 !dev->netdev_ops->ndo_xsk_async_xmit) {
83 if (err) { 100 err = -EOPNOTSUPP;
84 err = err < 0 ? -EOPNOTSUPP : -EBUSY; 101 goto err_unreg_umem;
85 goto err_rtnl_unlock;
86 } 102 }
87 103
88 bpf.command = XDP_SETUP_XSK_UMEM; 104 bpf.command = XDP_SETUP_XSK_UMEM;
@@ -91,18 +107,20 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
91 107
92 err = dev->netdev_ops->ndo_bpf(dev, &bpf); 108 err = dev->netdev_ops->ndo_bpf(dev, &bpf);
93 if (err) 109 if (err)
94 goto err_rtnl_unlock; 110 goto err_unreg_umem;
95 rtnl_unlock(); 111 rtnl_unlock();
96 112
97 dev_hold(dev); 113 dev_hold(dev);
98 umem->dev = dev;
99 umem->queue_id = queue_id;
100 umem->zc = true; 114 umem->zc = true;
101 return 0; 115 return 0;
102 116
103err_rtnl_unlock: 117err_unreg_umem:
118 xdp_clear_umem_at_qid(dev, queue_id);
119 if (!force_zc)
120 err = 0; /* fallback to copy mode */
121out_rtnl_unlock:
104 rtnl_unlock(); 122 rtnl_unlock();
105 return force_zc ? err : 0; /* fail or fallback */ 123 return err;
106} 124}
107 125
108static void xdp_umem_clear_dev(struct xdp_umem *umem) 126static void xdp_umem_clear_dev(struct xdp_umem *umem)
@@ -110,7 +128,7 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem)
110 struct netdev_bpf bpf; 128 struct netdev_bpf bpf;
111 int err; 129 int err;
112 130
113 if (umem->dev) { 131 if (umem->zc) {
114 bpf.command = XDP_SETUP_XSK_UMEM; 132 bpf.command = XDP_SETUP_XSK_UMEM;
115 bpf.xsk.umem = NULL; 133 bpf.xsk.umem = NULL;
116 bpf.xsk.queue_id = umem->queue_id; 134 bpf.xsk.queue_id = umem->queue_id;
@@ -121,9 +139,17 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem)
121 139
122 if (err) 140 if (err)
123 WARN(1, "failed to disable umem!\n"); 141 WARN(1, "failed to disable umem!\n");
142 }
143
144 if (umem->dev) {
145 rtnl_lock();
146 xdp_clear_umem_at_qid(umem->dev, umem->queue_id);
147 rtnl_unlock();
148 }
124 149
150 if (umem->zc) {
125 dev_put(umem->dev); 151 dev_put(umem->dev);
126 umem->dev = NULL; 152 umem->zc = false;
127 } 153 }
128} 154}
129 155
@@ -167,6 +193,8 @@ static void xdp_umem_release(struct xdp_umem *umem)
167 umem->cq = NULL; 193 umem->cq = NULL;
168 } 194 }
169 195
196 xsk_reuseq_destroy(umem);
197
170 xdp_umem_unpin_pages(umem); 198 xdp_umem_unpin_pages(umem);
171 199
172 task = get_pid_task(umem->pid, PIDTYPE_PID); 200 task = get_pid_task(umem->pid, PIDTYPE_PID);
@@ -314,8 +342,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
314 342
315 umem->pid = get_task_pid(current, PIDTYPE_PID); 343 umem->pid = get_task_pid(current, PIDTYPE_PID);
316 umem->address = (unsigned long)addr; 344 umem->address = (unsigned long)addr;
317 umem->props.chunk_mask = ~((u64)chunk_size - 1); 345 umem->chunk_mask = ~((u64)chunk_size - 1);
318 umem->props.size = size; 346 umem->size = size;
319 umem->headroom = headroom; 347 umem->headroom = headroom;
320 umem->chunk_size_nohr = chunk_size - headroom; 348 umem->chunk_size_nohr = chunk_size - headroom;
321 umem->npgs = size / PAGE_SIZE; 349 umem->npgs = size / PAGE_SIZE;
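
Note: the per-queue registration added in xdp_reg_umem_at_qid()/xdp_get_umem_from_qid() parks the same umem pointer in both the RX and the TX slot for queue_id, since either array may be the shorter one. A simplified stand-in using plain arrays (the kernel stores these on the netdev's per-queue structs):

#include <stddef.h>
#include <stdio.h>

#define NUM_RX 4
#define NUM_TX 2

struct umem { int id; };

static struct umem *rx_umem[NUM_RX];
static struct umem *tx_umem[NUM_TX];

static void reg_umem_at_qid(struct umem *umem, unsigned int qid)
{
        if (qid < NUM_RX)
                rx_umem[qid] = umem;
        if (qid < NUM_TX)
                tx_umem[qid] = umem;
}

static struct umem *get_umem_from_qid(unsigned int qid)
{
        if (qid < NUM_RX)
                return rx_umem[qid];
        if (qid < NUM_TX)
                return tx_umem[qid];
        return NULL;
}

int main(void)
{
        struct umem u = { .id = 1 };

        reg_umem_at_qid(&u, 3);                                  /* RX-only queue id */
        printf("qid 3 -> %p\n", (void *)get_umem_from_qid(3));
        printf("qid 1 -> %p\n", (void *)get_umem_from_qid(1));   /* unbound */
        return 0;
}
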
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index f11560334f88..27603227601b 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -8,18 +8,8 @@
8 8
9#include <net/xdp_sock.h> 9#include <net/xdp_sock.h>
10 10
11static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
12{
13 return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
14}
15
16static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
17{
18 return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1));
19}
20
21int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, 11int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
22 u32 queue_id, u16 flags); 12 u16 queue_id, u16 flags);
23bool xdp_umem_validate_queues(struct xdp_umem *umem); 13bool xdp_umem_validate_queues(struct xdp_umem *umem);
24void xdp_get_umem(struct xdp_umem *umem); 14void xdp_get_umem(struct xdp_umem *umem);
25void xdp_put_umem(struct xdp_umem *umem); 15void xdp_put_umem(struct xdp_umem *umem);
diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h
deleted file mode 100644
index 40eab10dfc49..000000000000
--- a/net/xdp/xdp_umem_props.h
+++ /dev/null
@@ -1,14 +0,0 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/* XDP user-space packet buffer
3 * Copyright(c) 2018 Intel Corporation.
4 */
5
6#ifndef XDP_UMEM_PROPS_H_
7#define XDP_UMEM_PROPS_H_
8
9struct xdp_umem_props {
10 u64 chunk_mask;
11 u64 size;
12};
13
14#endif /* XDP_UMEM_PROPS_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 661504042d30..07156f43d295 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -55,20 +55,30 @@ EXPORT_SYMBOL(xsk_umem_discard_addr);
55 55
56static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 56static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
57{ 57{
58 void *buffer; 58 void *to_buf, *from_buf;
59 u32 metalen;
59 u64 addr; 60 u64 addr;
60 int err; 61 int err;
61 62
62 if (!xskq_peek_addr(xs->umem->fq, &addr) || 63 if (!xskq_peek_addr(xs->umem->fq, &addr) ||
63 len > xs->umem->chunk_size_nohr) { 64 len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
64 xs->rx_dropped++; 65 xs->rx_dropped++;
65 return -ENOSPC; 66 return -ENOSPC;
66 } 67 }
67 68
68 addr += xs->umem->headroom; 69 addr += xs->umem->headroom;
69 70
70 buffer = xdp_umem_get_data(xs->umem, addr); 71 if (unlikely(xdp_data_meta_unsupported(xdp))) {
71 memcpy(buffer, xdp->data, len); 72 from_buf = xdp->data;
73 metalen = 0;
74 } else {
75 from_buf = xdp->data_meta;
76 metalen = xdp->data - xdp->data_meta;
77 }
78
79 to_buf = xdp_umem_get_data(xs->umem, addr);
80 memcpy(to_buf, from_buf, len + metalen);
81 addr += metalen;
72 err = xskq_produce_batch_desc(xs->rx, addr, len); 82 err = xskq_produce_batch_desc(xs->rx, addr, len);
73 if (!err) { 83 if (!err) {
74 xskq_discard_addr(xs->umem->fq); 84 xskq_discard_addr(xs->umem->fq);
@@ -111,6 +121,7 @@ void xsk_flush(struct xdp_sock *xs)
111 121
112int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) 122int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
113{ 123{
124 u32 metalen = xdp->data - xdp->data_meta;
114 u32 len = xdp->data_end - xdp->data; 125 u32 len = xdp->data_end - xdp->data;
115 void *buffer; 126 void *buffer;
116 u64 addr; 127 u64 addr;
@@ -120,7 +131,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
120 return -EINVAL; 131 return -EINVAL;
121 132
122 if (!xskq_peek_addr(xs->umem->fq, &addr) || 133 if (!xskq_peek_addr(xs->umem->fq, &addr) ||
123 len > xs->umem->chunk_size_nohr) { 134 len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
124 xs->rx_dropped++; 135 xs->rx_dropped++;
125 return -ENOSPC; 136 return -ENOSPC;
126 } 137 }
@@ -128,7 +139,8 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
128 addr += xs->umem->headroom; 139 addr += xs->umem->headroom;
129 140
130 buffer = xdp_umem_get_data(xs->umem, addr); 141 buffer = xdp_umem_get_data(xs->umem, addr);
131 memcpy(buffer, xdp->data, len); 142 memcpy(buffer, xdp->data_meta, len + metalen);
143 addr += metalen;
132 err = xskq_produce_batch_desc(xs->rx, addr, len); 144 err = xskq_produce_batch_desc(xs->rx, addr, len);
133 if (!err) { 145 if (!err) {
134 xskq_discard_addr(xs->umem->fq); 146 xskq_discard_addr(xs->umem->fq);
@@ -343,12 +355,18 @@ static int xsk_release(struct socket *sock)
343 local_bh_enable(); 355 local_bh_enable();
344 356
345 if (xs->dev) { 357 if (xs->dev) {
358 struct net_device *dev = xs->dev;
359
346 /* Wait for driver to stop using the xdp socket. */ 360 /* Wait for driver to stop using the xdp socket. */
347 synchronize_net(); 361 xdp_del_sk_umem(xs->umem, xs);
348 dev_put(xs->dev);
349 xs->dev = NULL; 362 xs->dev = NULL;
363 synchronize_net();
364 dev_put(dev);
350 } 365 }
351 366
367 xskq_destroy(xs->rx);
368 xskq_destroy(xs->tx);
369
352 sock_orphan(sk); 370 sock_orphan(sk);
353 sock->sk = NULL; 371 sock->sk = NULL;
354 372
@@ -407,13 +425,6 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
407 } 425 }
408 426
409 qid = sxdp->sxdp_queue_id; 427 qid = sxdp->sxdp_queue_id;
410
411 if ((xs->rx && qid >= dev->real_num_rx_queues) ||
412 (xs->tx && qid >= dev->real_num_tx_queues)) {
413 err = -EINVAL;
414 goto out_unlock;
415 }
416
417 flags = sxdp->sxdp_flags; 428 flags = sxdp->sxdp_flags;
418 429
419 if (flags & XDP_SHARED_UMEM) { 430 if (flags & XDP_SHARED_UMEM) {
@@ -458,8 +469,10 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
458 goto out_unlock; 469 goto out_unlock;
459 } else { 470 } else {
460 /* This xsk has its own umem. */ 471 /* This xsk has its own umem. */
461 xskq_set_umem(xs->umem->fq, &xs->umem->props); 472 xskq_set_umem(xs->umem->fq, xs->umem->size,
462 xskq_set_umem(xs->umem->cq, &xs->umem->props); 473 xs->umem->chunk_mask);
474 xskq_set_umem(xs->umem->cq, xs->umem->size,
475 xs->umem->chunk_mask);
463 476
464 err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); 477 err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
465 if (err) 478 if (err)
@@ -469,8 +482,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
469 xs->dev = dev; 482 xs->dev = dev;
470 xs->zc = xs->umem->zc; 483 xs->zc = xs->umem->zc;
471 xs->queue_id = qid; 484 xs->queue_id = qid;
472 xskq_set_umem(xs->rx, &xs->umem->props); 485 xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
473 xskq_set_umem(xs->tx, &xs->umem->props); 486 xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
474 xdp_add_sk_umem(xs->umem, xs); 487 xdp_add_sk_umem(xs->umem, xs);
475 488
476out_unlock: 489out_unlock:
@@ -707,9 +720,6 @@ static void xsk_destruct(struct sock *sk)
707 if (!sock_flag(sk, SOCK_DEAD)) 720 if (!sock_flag(sk, SOCK_DEAD))
708 return; 721 return;
709 722
710 xskq_destroy(xs->rx);
711 xskq_destroy(xs->tx);
712 xdp_del_sk_umem(xs->umem, xs);
713 xdp_put_umem(xs->umem); 723 xdp_put_umem(xs->umem);
714 724
715 sk_refcnt_debug_dec(sk); 725 sk_refcnt_debug_dec(sk);
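
Note: the receive-path hunks above copy XDP metadata together with the packet and then advance the descriptor address past it, so user space sees the packet at addr with the metadata sitting just in front of it. A small sketch of that copy-and-offset step with a fixed 4-byte metadata area; the buffer layout and sizes here are illustrative.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        uint8_t frame[] = "MMMMpayload";        /* 4 bytes of metadata + packet */
        uint32_t metalen = 4, len = sizeof(frame) - 1 - 4;
        uint8_t umem[64];
        uint64_t addr = 16;                     /* chunk offset after headroom */

        memcpy(umem + addr, frame, len + metalen);   /* copy meta + data together */
        addr += metalen;                             /* descriptor points at data */

        printf("desc addr=%llu len=%u data=%.*s\n",
               (unsigned long long)addr, len, (int)len,
               (const char *)(umem + addr));
        return 0;
}
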
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index 6c32e92e98fc..b66504592d9b 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -3,16 +3,19 @@
3 * Copyright(c) 2018 Intel Corporation. 3 * Copyright(c) 2018 Intel Corporation.
4 */ 4 */
5 5
6#include <linux/log2.h>
6#include <linux/slab.h> 7#include <linux/slab.h>
8#include <linux/overflow.h>
7 9
8#include "xsk_queue.h" 10#include "xsk_queue.h"
9 11
10void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props) 12void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask)
11{ 13{
12 if (!q) 14 if (!q)
13 return; 15 return;
14 16
15 q->umem_props = *umem_props; 17 q->size = size;
18 q->chunk_mask = chunk_mask;
16} 19}
17 20
18static u32 xskq_umem_get_ring_size(struct xsk_queue *q) 21static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
@@ -61,3 +64,56 @@ void xskq_destroy(struct xsk_queue *q)
61 page_frag_free(q->ring); 64 page_frag_free(q->ring);
62 kfree(q); 65 kfree(q);
63} 66}
67
68struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries)
69{
70 struct xdp_umem_fq_reuse *newq;
71
72 /* Check for overflow */
73 if (nentries > (u32)roundup_pow_of_two(nentries))
74 return NULL;
75 nentries = roundup_pow_of_two(nentries);
76
77 newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL);
78 if (!newq)
79 return NULL;
80 memset(newq, 0, offsetof(typeof(*newq), handles));
81
82 newq->nentries = nentries;
83 return newq;
84}
85EXPORT_SYMBOL_GPL(xsk_reuseq_prepare);
86
87struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
88 struct xdp_umem_fq_reuse *newq)
89{
90 struct xdp_umem_fq_reuse *oldq = umem->fq_reuse;
91
92 if (!oldq) {
93 umem->fq_reuse = newq;
94 return NULL;
95 }
96
97 if (newq->nentries < oldq->length)
98 return newq;
99
100 memcpy(newq->handles, oldq->handles,
101 array_size(oldq->length, sizeof(u64)));
102 newq->length = oldq->length;
103
104 umem->fq_reuse = newq;
105 return oldq;
106}
107EXPORT_SYMBOL_GPL(xsk_reuseq_swap);
108
109void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq)
110{
111 kvfree(rq);
112}
113EXPORT_SYMBOL_GPL(xsk_reuseq_free);
114
115void xsk_reuseq_destroy(struct xdp_umem *umem)
116{
117 xsk_reuseq_free(umem->fq_reuse);
118 umem->fq_reuse = NULL;
119}
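
Note: xsk_reuseq_prepare()/xsk_reuseq_swap() round the requested size up to a power of two (rejecting 32-bit overflow) and only replace an existing reuse queue when the new one can hold every stashed handle; whichever queue is not kept is handed back to the caller to free. A stand-alone sketch of that logic, using libc allocation in place of kvmalloc():

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct fq_reuse {
        uint32_t nentries;          /* capacity, power of two */
        uint32_t length;            /* handles currently stashed */
        uint64_t handles[];
};

static uint32_t roundup_pow_of_two32(uint32_t n)
{
        uint32_t p = 1;

        while (p && p < n)
                p <<= 1;
        return p;                   /* 0 if n does not fit in 32 bits */
}

struct fq_reuse *reuseq_prepare(uint32_t nentries)
{
        struct fq_reuse *q;

        if (nentries > roundup_pow_of_two32(nentries))
                return NULL;        /* would overflow */
        nentries = roundup_pow_of_two32(nentries);

        q = calloc(1, sizeof(*q) + (size_t)nentries * sizeof(uint64_t));
        if (q)
                q->nentries = nentries;
        return q;
}

/* returns the queue the caller should free */
struct fq_reuse *reuseq_swap(struct fq_reuse **slot, struct fq_reuse *newq)
{
        struct fq_reuse *oldq = *slot;

        if (!oldq) {
                *slot = newq;
                return NULL;
        }
        if (newq->nentries < oldq->length)
                return newq;        /* too small; keep the old queue */

        memcpy(newq->handles, oldq->handles, oldq->length * sizeof(uint64_t));
        newq->length = oldq->length;
        *slot = newq;
        return oldq;
}

int main(void)
{
        struct fq_reuse *slot = NULL, *freed;

        freed = reuseq_swap(&slot, reuseq_prepare(100));   /* rounds up to 128 */
        free(freed);                                       /* NULL: nothing replaced */
        free(slot);
        return 0;
}
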
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 8a64b150be54..bcb5cbb40419 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -31,7 +31,8 @@ struct xdp_umem_ring {
31}; 31};
32 32
33struct xsk_queue { 33struct xsk_queue {
34 struct xdp_umem_props umem_props; 34 u64 chunk_mask;
35 u64 size;
35 u32 ring_mask; 36 u32 ring_mask;
36 u32 nentries; 37 u32 nentries;
37 u32 prod_head; 38 u32 prod_head;
@@ -78,7 +79,7 @@ static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
78 79
79static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) 80static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
80{ 81{
81 if (addr >= q->umem_props.size) { 82 if (addr >= q->size) {
82 q->invalid_descs++; 83 q->invalid_descs++;
83 return false; 84 return false;
84 } 85 }
@@ -92,7 +93,7 @@ static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
92 struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; 93 struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
93 unsigned int idx = q->cons_tail & q->ring_mask; 94 unsigned int idx = q->cons_tail & q->ring_mask;
94 95
95 *addr = READ_ONCE(ring->desc[idx]) & q->umem_props.chunk_mask; 96 *addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask;
96 if (xskq_is_valid_addr(q, *addr)) 97 if (xskq_is_valid_addr(q, *addr))
97 return addr; 98 return addr;
98 99
@@ -173,8 +174,8 @@ static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
173 if (!xskq_is_valid_addr(q, d->addr)) 174 if (!xskq_is_valid_addr(q, d->addr))
174 return false; 175 return false;
175 176
176 if (((d->addr + d->len) & q->umem_props.chunk_mask) != 177 if (((d->addr + d->len) & q->chunk_mask) !=
177 (d->addr & q->umem_props.chunk_mask)) { 178 (d->addr & q->chunk_mask)) {
178 q->invalid_descs++; 179 q->invalid_descs++;
179 return false; 180 return false;
180 } 181 }
@@ -253,8 +254,11 @@ static inline bool xskq_empty_desc(struct xsk_queue *q)
253 return xskq_nb_free(q, q->prod_tail, q->nentries) == q->nentries; 254 return xskq_nb_free(q, q->prod_tail, q->nentries) == q->nentries;
254} 255}
255 256
256void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props); 257void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask);
257struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); 258struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
258void xskq_destroy(struct xsk_queue *q_ops); 259void xskq_destroy(struct xsk_queue *q_ops);
259 260
261/* Executed by the core when the entire UMEM gets freed */
262void xsk_reuseq_destroy(struct xdp_umem *umem);
263
260#endif /* _LINUX_XSK_QUEUE_H */ 264#endif /* _LINUX_XSK_QUEUE_H */
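
Note: with size and chunk_mask now held directly in struct xsk_queue, the descriptor checks themselves are unchanged: the address must lie inside the umem and the buffer must not cross a chunk boundary. A user-space sketch of those two checks, with an example 2 KiB chunk size:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct xskq { uint64_t size; uint64_t chunk_mask; };

static bool addr_is_valid(const struct xskq *q, uint64_t addr)
{
        return addr < q->size;
}

static bool desc_is_valid(const struct xskq *q, uint64_t addr, uint32_t len)
{
        if (!addr_is_valid(q, addr))
                return false;
        /* start and end must land in the same fixed-size chunk */
        return ((addr + len) & q->chunk_mask) == (addr & q->chunk_mask);
}

int main(void)
{
        /* 64 KiB umem split into 2 KiB chunks: chunk_mask = ~(2048 - 1) */
        struct xskq q = { .size = 64 * 1024, .chunk_mask = ~(uint64_t)(2048 - 1) };

        printf("%d %d\n", desc_is_valid(&q, 0, 1500),      /* 1: fits in chunk 0 */
               desc_is_valid(&q, 2000, 1500));             /* 0: crosses into chunk 1 */
        return 0;
}
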
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 5611b7521020..144c137886b1 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -99,7 +99,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
99 99
100 do { 100 do {
101 struct sk_buff *nskb = skb2->next; 101 struct sk_buff *nskb = skb2->next;
102 skb2->next = NULL; 102 skb_mark_not_on_list(skb2);
103 103
104 xo = xfrm_offload(skb2); 104 xo = xfrm_offload(skb2);
105 xo->flags |= XFRM_DEV_RESUME; 105 xo->flags |= XFRM_DEV_RESUME;
@@ -192,9 +192,13 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
192 192
193 err = dev->xfrmdev_ops->xdo_dev_state_add(x); 193 err = dev->xfrmdev_ops->xdo_dev_state_add(x);
194 if (err) { 194 if (err) {
195 xso->num_exthdrs = 0;
196 xso->flags = 0;
195 xso->dev = NULL; 197 xso->dev = NULL;
196 dev_put(dev); 198 dev_put(dev);
197 return err; 199
200 if (err != -EOPNOTSUPP)
201 return err;
198 } 202 }
199 203
200 return 0; 204 return 0;
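
Note: the xfrm_dev_state_add() change clears the partially set up offload state on failure and, when the driver returns -EOPNOTSUPP, lets the state install continue in software instead of failing outright. A minimal sketch of that fallback shape; the driver hook here is a stand-in, not the xfrmdev_ops API.

#include <errno.h>
#include <stdio.h>

static int try_hw_offload(int supported)
{
        return supported ? 0 : -EOPNOTSUPP;
}

static int state_add(int hw_supported)
{
        int err = try_hw_offload(hw_supported);

        if (err) {
                /* clear the partial offload state here, then decide */
                if (err != -EOPNOTSUPP)
                        return err;      /* real failure */
                printf("offload unsupported, falling back to software\n");
        }
        return 0;
}

int main(void)
{
        return state_add(0);
}
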
diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h
index 61be810389d8..ce66323102f9 100644
--- a/net/xfrm/xfrm_hash.h
+++ b/net/xfrm/xfrm_hash.h
@@ -13,7 +13,7 @@ static inline unsigned int __xfrm4_addr_hash(const xfrm_address_t *addr)
13 13
14static inline unsigned int __xfrm6_addr_hash(const xfrm_address_t *addr) 14static inline unsigned int __xfrm6_addr_hash(const xfrm_address_t *addr)
15{ 15{
16 return ntohl(addr->a6[2] ^ addr->a6[3]); 16 return jhash2((__force u32 *)addr->a6, 4, 0);
17} 17}
18 18
19static inline unsigned int __xfrm4_daddr_saddr_hash(const xfrm_address_t *daddr, 19static inline unsigned int __xfrm4_daddr_saddr_hash(const xfrm_address_t *daddr,
@@ -26,8 +26,7 @@ static inline unsigned int __xfrm4_daddr_saddr_hash(const xfrm_address_t *daddr,
26static inline unsigned int __xfrm6_daddr_saddr_hash(const xfrm_address_t *daddr, 26static inline unsigned int __xfrm6_daddr_saddr_hash(const xfrm_address_t *daddr,
27 const xfrm_address_t *saddr) 27 const xfrm_address_t *saddr)
28{ 28{
29 return ntohl(daddr->a6[2] ^ daddr->a6[3] ^ 29 return __xfrm6_addr_hash(daddr) ^ __xfrm6_addr_hash(saddr);
30 saddr->a6[2] ^ saddr->a6[3]);
31} 30}
32 31
33static inline u32 __bits2mask32(__u8 bits) 32static inline u32 __bits2mask32(__u8 bits)
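
Note: the switch to jhash2() matters because the old __xfrm6_addr_hash() only mixed words 2 and 3 of the IPv6 address, so addresses differing only in the upper 64 bits hashed identically. A tiny demonstration of that collision (jhash2 itself is not reproduced here):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

static unsigned int old_hash(const uint32_t a6[4])
{
        return ntohl(a6[2] ^ a6[3]);
}

int main(void)
{
        uint32_t a[4] = { htonl(0x20010db8), 0, 0, htonl(1) };
        uint32_t b[4] = { htonl(0xfd000000), 0, 0, htonl(1) };   /* differs only in word 0 */

        printf("collision: %u == %u\n", old_hash(a), old_hash(b));
        return 0;
}
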
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index be3520e429c9..684c0bc01e2c 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -131,7 +131,7 @@ struct sec_path *secpath_dup(struct sec_path *src)
131 sp->len = 0; 131 sp->len = 0;
132 sp->olen = 0; 132 sp->olen = 0;
133 133
134 memset(sp->ovec, 0, sizeof(sp->ovec[XFRM_MAX_OFFLOAD_DEPTH])); 134 memset(sp->ovec, 0, sizeof(sp->ovec));
135 135
136 if (src) { 136 if (src) {
137 int i; 137 int i;
diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index 6f05e831a73e..d679fa0f44b3 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -472,9 +472,9 @@ static int xfrmi4_err(struct sk_buff *skb, u32 info)
472 } 472 }
473 473
474 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) 474 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
475 ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0); 475 ipv4_update_pmtu(skb, net, info, 0, protocol);
476 else 476 else
477 ipv4_redirect(skb, net, 0, 0, protocol, 0); 477 ipv4_redirect(skb, net, 0, protocol);
478 xfrm_state_put(x); 478 xfrm_state_put(x);
479 479
480 return 0; 480 return 0;
@@ -564,9 +564,6 @@ static void xfrmi_get_stats64(struct net_device *dev,
564{ 564{
565 int cpu; 565 int cpu;
566 566
567 if (!dev->tstats)
568 return;
569
570 for_each_possible_cpu(cpu) { 567 for_each_possible_cpu(cpu) {
571 struct pcpu_sw_netstats *stats; 568 struct pcpu_sw_netstats *stats;
572 struct pcpu_sw_netstats tmp; 569 struct pcpu_sw_netstats tmp;
@@ -745,7 +742,7 @@ nla_put_failure:
745 return -EMSGSIZE; 742 return -EMSGSIZE;
746} 743}
747 744
748struct net *xfrmi_get_link_net(const struct net_device *dev) 745static struct net *xfrmi_get_link_net(const struct net_device *dev)
749{ 746{
750 struct xfrm_if *xi = netdev_priv(dev); 747 struct xfrm_if *xi = netdev_priv(dev);
751 748
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 261995d37ced..4ae87c5ce2e3 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -193,7 +193,7 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb
193 struct sk_buff *nskb = segs->next; 193 struct sk_buff *nskb = segs->next;
194 int err; 194 int err;
195 195
196 segs->next = NULL; 196 skb_mark_not_on_list(segs);
197 err = xfrm_output2(net, sk, segs); 197 err = xfrm_output2(net, sk, segs);
198 198
199 if (unlikely(err)) { 199 if (unlikely(err)) {
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index df7ca2dabc48..ca7a207b81a9 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1007,7 +1007,7 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb)
1007 int err; 1007 int err;
1008 1008
1009 err = nlmsg_parse(cb->nlh, 0, attrs, XFRMA_MAX, xfrma_policy, 1009 err = nlmsg_parse(cb->nlh, 0, attrs, XFRMA_MAX, xfrma_policy,
1010 NULL); 1010 cb->extack);
1011 if (err < 0) 1011 if (err < 0)
1012 return err; 1012 return err;
1013 1013